### Importing packages

In [1]:
import pandas as pd

### Reading csv file

In [2]:
census = pd.read_csv('census_data.csv', index_col=0)
census.head()

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status
0,Denise,Ratke,2005,False,0,92129.41,disagree,single
1,Hali,Cummerata,1987,False,0,75649.17,neutral,divorced
2,Salomon,Orn,1992,True,2,166313.45,agree,single
3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,married
4,Gust,Abernathy,1945,False,2,143316.08,agree,married


### EDA

In [3]:
# Getting information about each column
census.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   first_name      100 non-null    object 
 1   last_name       100 non-null    object 
 2   birth_year      100 non-null    object 
 3   voted           100 non-null    bool   
 4   num_children    100 non-null    int64  
 5   income_year     100 non-null    float64
 6   higher_tax      100 non-null    object 
 7   marital_status  100 non-null    object 
dtypes: bool(1), float64(1), int64(1), object(5)
memory usage: 6.3+ KB


In [4]:
# Viewing unique enteries in birth_year column
census.birth_year.unique()

array(['2005', '1987', '1992', '1965', '1945', '1951', '1963', '1949',
       '1950', '1971', '2007', '1944', '1995', '1973', '1946', '1954',
       '1994', '1989', '1947', '1993', '1976', '1984', 'missing', '1966',
       '1941', '2000', '1953', '1956', '1960', '2001', '1980', '1955',
       '1985', '1996', '1968', '1979', '2006', '1962', '1981', '1959',
       '1977', '1978', '1983', '1957', '1961', '1982', '2002', '1998',
       '1999', '1952', '1940', '1986', '1958'], dtype=object)

In [5]:
# Replace missing with 1967 in the birth_year column
census.birth_year = census.birth_year.replace("missing", 1967)
census.birth_year.unique()

array(['2005', '1987', '1992', '1965', '1945', '1951', '1963', '1949',
       '1950', '1971', '2007', '1944', '1995', '1973', '1946', '1954',
       '1994', '1989', '1947', '1993', '1976', '1984', 1967, '1966',
       '1941', '2000', '1953', '1956', '1960', '2001', '1980', '1955',
       '1985', '1996', '1968', '1979', '2006', '1962', '1981', '1959',
       '1977', '1978', '1983', '1957', '1961', '1982', '2002', '1998',
       '1999', '1952', '1940', '1986', '1958'], dtype=object)

In [6]:
# cast the birth_year column to int
census.birth_year = census.birth_year.astype(int)
census.dtypes

first_name         object
last_name          object
birth_year          int32
voted                bool
num_children        int64
income_year       float64
higher_tax         object
marital_status     object
dtype: object

In [7]:
# Average of birth_year
census.birth_year.mean()

1973.4

In [8]:
# Ordering the higher_tax column enteries
census.higher_tax = pd.Categorical(census.higher_tax, ['strongly disagree', 'disagree', 'neutral', 'agree', 'strongly agree'], ordered = True)
census.higher_tax.unique()

['disagree', 'neutral', 'agree', 'strongly agree', 'strongly disagree']
Categories (5, object): ['strongly disagree' < 'disagree' < 'neutral' < 'agree' < 'strongly agree']

In [10]:
# Label encode higher_tax column to get median sentiment
census['tax_codes'] = census.higher_tax.cat.codes
census.tax_codes.median()

2.0

In [11]:
# One-Hot Encode marital_status to create binary variables of each category
census_marital = pd.get_dummies(data = census, columns = ['marital_status'])
census_marital.head()

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,tax_codes,marital_status_divorced,marital_status_married,marital_status_single,marital_status_widowed
0,Denise,Ratke,2005,False,0,92129.41,disagree,1,0,0,1,0
1,Hali,Cummerata,1987,False,0,75649.17,neutral,2,1,0,0,0
2,Salomon,Orn,1992,True,2,166313.45,agree,3,0,0,1,0
3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,4,0,1,0,0
4,Gust,Abernathy,1945,False,2,143316.08,agree,3,0,1,0,0


### Extra Steps

In [12]:
# Ordering the marital_status column enteries
census.marital_status = pd.Categorical(census.marital_status, ['single', 'divorced', 'married', 'widowed'], ordered = False)
census.marital_status.unique()

['single', 'divorced', 'married', 'widowed']
Categories (4, object): ['single', 'divorced', 'married', 'widowed']

In [14]:
# Label encode marital_status column
census['marital_codes'] = census.marital_status.cat.codes
census.head()

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status,tax_codes,marital_codes
0,Denise,Ratke,2005,False,0,92129.41,disagree,single,1,0
1,Hali,Cummerata,1987,False,0,75649.17,neutral,divorced,2,1
2,Salomon,Orn,1992,True,2,166313.45,agree,single,3,0
3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,married,4,2
4,Gust,Abernathy,1945,False,2,143316.08,agree,married,3,2


In [17]:
# Creating year groups to group birth_year
bins = list(range(census.birth_year.min(), census.birth_year.max() + 1, 5))
bins

[1940,
 1945,
 1950,
 1955,
 1960,
 1965,
 1970,
 1975,
 1980,
 1985,
 1990,
 1995,
 2000,
 2005]

In [18]:
# Label encoding year_group column
census['year_group'] = pd.cut(census.birth_year, bins)
census['year_group_codes'] = census.age_group.cat.codes
census

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status,tax_codes,marital_codes,age_group,age_group_codes,year_group,year_group_codes
0,Denise,Ratke,2005,False,0,92129.41,disagree,single,1,0,"(2000, 2005]",13,"(2000, 2005]",13
1,Hali,Cummerata,1987,False,0,75649.17,neutral,divorced,2,1,"(1985, 1990]",10,"(1985, 1990]",10
2,Salomon,Orn,1992,True,2,166313.45,agree,single,3,0,"(1990, 1995]",11,"(1990, 1995]",11
3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,married,4,2,"(1960, 1965]",5,"(1960, 1965]",5
4,Gust,Abernathy,1945,False,2,143316.08,agree,married,3,2,"(1940, 1945]",1,"(1940, 1945]",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Carisa,Hills,1958,False,3,157117.14,agree,married,3,2,"(1955, 1960]",4,"(1955, 1960]",4
96,Tameka,Collins,2001,False,1,61518.34,strongly disagree,single,0,0,"(2000, 2005]",13,"(2000, 2005]",13
97,Adams,Leuschke,1987,False,0,41784.87,strongly agree,single,4,0,"(1985, 1990]",10,"(1985, 1990]",10
98,Earnestine,Gutmann,1985,True,4,79021.46,disagree,widowed,1,3,"(1980, 1985]",9,"(1980, 1985]",9
