# Census Income Analysis



In [117]:
# Import modules
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [118]:
# Load the Adult Income dataset into DataFrame.
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]

df = pd.read_csv("adult.data", header=None, names=column_names, na_values=" ?", skipinitialspace=True)




In [119]:
# Rename the column names in the DataFrame using the list given above.

# Create the list
column_name =['age', 'workclass', 'fnlwgt', 'education', 'education-years', 'marital-status', 'occupation', 'relationship', 'race','sex','capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income-group']


# Rename the columns using 'rename()'
for i in range(len(column_name)):
  df.rename(columns={df.columns[i]:column_name[i]},inplace=True)
# Print the first five rows of the DataFrame
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-years,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-group
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [120]:
# Print the number of rows and columns of the DataFrame
df.shape


(32561, 15)

In [121]:
# Get the information of the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1   workclass        32561 non-null  object
 2   fnlwgt           32561 non-null  int64 
 3   education        32561 non-null  object
 4   education-years  32561 non-null  int64 
 5   marital-status   32561 non-null  object
 6   occupation       32561 non-null  object
 7   relationship     32561 non-null  object
 8   race             32561 non-null  object
 9   sex              32561 non-null  object
 10  capital-gain     32561 non-null  int64 
 11  capital-loss     32561 non-null  int64 
 12  hours-per-week   32561 non-null  int64 
 13  native-country   32561 non-null  object
 14  income-group     32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [122]:
# Check the distribution of the labels in the target column.
print(df['income-group'])
print('\nDistribution of labels in the target column:\n',df['income-group'].value_counts())

0        <=50K
1        <=50K
2        <=50K
3        <=50K
4        <=50K
         ...  
32556    <=50K
32557     >50K
32558    <=50K
32559    <=50K
32560     >50K
Name: income-group, Length: 32561, dtype: object

Distribution of labels in the target column:
 income-group
<=50K    24720
>50K      7841
Name: count, dtype: int64


In [123]:
# Check for null values in the DataFrame.
df.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
education-years    0
marital-status     0
occupation         0
relationship       0
race               0
sex                0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income-group       0
dtype: int64

In [124]:
# Print the distribution of the columns mentioned to find the invalid values.

# Print the categories in column 'native-country'
print(f"Categories in the column Native country are :\n{df['native-country'].unique()}")
# Print the categories in column 'workclass'
print(f"\nCategories in the column workclass are :\n{df['workclass'].unique()}")
# Print the categories in column 'occupation'
print(f"\nCategories in the column occupation are :\n{df['occupation'].unique()}")


Categories in the column Native country are :
['United-States' 'Cuba' 'Jamaica' 'India' '?' 'Mexico' 'South'
 'Puerto-Rico' 'Honduras' 'England' 'Canada' 'Germany' 'Iran'
 'Philippines' 'Italy' 'Poland' 'Columbia' 'Cambodia' 'Thailand' 'Ecuador'
 'Laos' 'Taiwan' 'Haiti' 'Portugal' 'Dominican-Republic' 'El-Salvador'
 'France' 'Guatemala' 'China' 'Japan' 'Yugoslavia' 'Peru'
 'Outlying-US(Guam-USVI-etc)' 'Scotland' 'Trinadad&Tobago' 'Greece'
 'Nicaragua' 'Vietnam' 'Hong' 'Ireland' 'Hungary' 'Holand-Netherlands']

Categories in the column workclass are :
['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked']

Categories in the column occupation are :
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']


In [125]:
# Replace the invalid values ' ?' with 'np.nan'.
df['native-country'].replace(to_replace=' ?',value=np.nan,inplace=True)
df['occupation'].replace(to_replace=' ?',value=np.nan,inplace=True)
df['workclass'].replace(to_replace=' ?',value=np.nan,inplace=True)
# Check for null values in the DataFrame again.
df.isnull().sum()


age                0
workclass          0
fnlwgt             0
education          0
education-years    0
marital-status     0
occupation         0
relationship       0
race               0
sex                0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income-group       0
dtype: int64

In [126]:
# Delete the rows with invalid values and the column not required

# Delete the rows with the 'dropna()' function
df = df.dropna()
# Delete the column with the 'drop()' function
df.drop(columns={'fnlwgt'},inplace=True)

# Print the number of rows and columns in the DataFrame.
print('Number of rows and columns in the dataframe are : ',df.shape)

Number of rows and columns in the dataframe are :  (32561, 14)


In [127]:
# Create a list of numeric columns names using 'select_dtypes()'.
numeric_columns = list(df.select_dtypes(include=['int64','float64']).columns)
numeric_columns

['age', 'education-years', 'capital-gain', 'capital-loss', 'hours-per-week']

In [128]:
# Map the 'sex' column and verify the distribution of labels.

# Print the distribution before mapping
print('Distribution before mapping:',df['sex'].value_counts(),sep='\n')
# Map the values of the column to convert the categorical values to integer
sex_dict = {"Male":0,"Female":1}
df['sex'] = df['sex'].map(sex_dict)
# Print the distribution after mapping
print('\nDistribution after mapping:',df['sex'].value_counts(),sep='\n')


Distribution before mapping:
sex
Male      21790
Female    10771
Name: count, dtype: int64

Distribution after mapping:
sex
0    21790
1    10771
Name: count, dtype: int64


In [129]:
# Map the 'income-group' column and verify the distribution of labels.

# Print the distribution before mapping
print('Distribution before mapping:',df['income-group'].value_counts(),sep='\n')

# Map the values of the column to convert the categorical values to integer
income_dict = {'<=50K':0,'>50K':1}
df['income-group'] = df['income-group'].map(income_dict)
# Print the distribution after mapping
print('\nDistribution after mapping:',df['income-group'].value_counts(),sep='\n')

Distribution before mapping:
income-group
<=50K    24720
>50K      7841
Name: count, dtype: int64

Distribution after mapping:
income-group
0    24720
1     7841
Name: count, dtype: int64


In [130]:
# Create the list of categorical columns names using 'select_dtypes()'.
lst_category = list(df.select_dtypes(include=['object']).columns)
lst_category

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'native-country']

In [131]:
# Create a 'income_dummies_df' DataFrame using the 'get_dummies()' function on the non-numeric categorical columns
income_dummies_df = pd.get_dummies(df[lst_category],dtype=int,drop_first=True)
income_dummies_df

Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_11th,education_12th,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32557,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32558,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32559,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [132]:
# Drop the categorical columns from the Income DataFrame `income_df`
df.drop(columns=lst_category,axis=1,inplace=True)
df

Unnamed: 0,age,education-years,sex,capital-gain,capital-loss,hours-per-week,income-group
0,39,13,0,2174,0,40,0
1,50,13,0,0,0,13,0
2,38,9,0,0,0,40,0
3,53,7,0,0,0,40,0
4,28,13,1,0,0,40,0
...,...,...,...,...,...,...,...
32556,27,12,1,0,0,38,0
32557,40,9,0,0,0,40,1
32558,58,9,1,0,0,40,0
32559,22,9,0,0,0,20,0


In [133]:
# Concat the income DataFrame and dummy DataFrame using 'concat()' function
final_df = pd.concat([df,income_dummies_df],axis=1)
final_df

Unnamed: 0,age,education-years,sex,capital-gain,capital-loss,hours-per-week,income-group,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,13,0,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,13,0,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,9,0,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,7,0,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,13,1,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,12,1,0,0,38,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32557,40,9,0,0,0,40,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32558,58,9,1,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32559,22,9,0,0,0,20,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [134]:
# Get the information of the DataFrame
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 100 columns):
 #   Column                                     Non-Null Count  Dtype
---  ------                                     --------------  -----
 0   age                                        32561 non-null  int64
 1   education-years                            32561 non-null  int64
 2   sex                                        32561 non-null  int64
 3   capital-gain                               32561 non-null  int64
 4   capital-loss                               32561 non-null  int64
 5   hours-per-week                             32561 non-null  int64
 6   income-group                               32561 non-null  int64
 7   workclass_Federal-gov                      32561 non-null  int32
 8   workclass_Local-gov                        32561 non-null  int32
 9   workclass_Never-worked                     32561 non-null  int32
 10  workclass_Private                          32

In [135]:
# Split the training and testing data
from sklearn.model_selection import train_test_split
features = list(final_df.columns)
features.remove('income-group')
X = final_df[features]
y = final_df['income-group']
# Import the module
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [136]:
# Normalise the train and test data-frames using the standard normalisation method.

# Define the 'standard_scalar()' function for calculating Z-scores
def standard_scalar(X):
  X_mean = X.mean()
  X_std = X.std()
  X_norm = (X-X_mean)/X_std
  return X_norm

# Create the DataFrames norm_X_train and norm_X_test

norm_X_train = X_train[numeric_columns].apply(standard_scalar,axis=0)
norm_X_test = X_test[numeric_columns].apply(standard_scalar,axis=0)
# Apply the 'standard_scalar()' on X_train on numeric columns using apply() function and get the descriptive statistics of the normalised X_train
norm_X_train.describe()


Unnamed: 0,age,education-years,capital-gain,capital-loss,hours-per-week
count,22792.0,22792.0,22792.0,22792.0,22792.0
mean,1.903239e-16,1.939091e-16,3.1175090000000005e-17,-2.6187080000000002e-17,-7.107921000000001e-17
std,1.0,1.0,1.0,1.0,1.0
min,-1.580586,-3.525874,-0.1463458,-0.2199164,-3.204928
25%,-0.7744656,-0.4232097,-0.1463458,-0.2199164,-0.03129405
50%,-0.1149125,-0.03537666,-0.1463458,-0.2199164,-0.03129405
75%,0.691208,0.7402893,-0.1463458,-0.2199164,0.3755821
max,3.769123,2.291621,13.85023,10.48701,4.769845


In [137]:
# Apply the 'standard_scalar()' on X_test on numeric columns using apply() function and get the descriptive statistics of the normalised X_test
norm_X_test.describe()

Unnamed: 0,age,education-years,capital-gain,capital-loss,hours-per-week
count,9769.0,9769.0,9769.0,9769.0,9769.0
mean,2.087478e-16,-1.327404e-16,-8.36446e-18,6.382447e-17,2.487518e-16
std,1.0,1.0,1.0,1.0,1.0
min,-1.585836,-3.538486,-0.145544,-0.2088927,-3.169174
25%,-0.7787338,-0.412648,-0.145544,-0.2088927,-0.0449295
50%,-0.1183776,-0.0219183,-0.145544,-0.2088927,-0.0449295
75%,0.6887244,0.7595411,-0.145544,-0.2088927,0.3556146
max,3.770387,2.32246,12.48304,10.85438,4.681491


In [138]:
# Deploy the 'LogisticRegression' model using the 'fit()' function.
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(norm_X_train,y_train)
log_reg.score(norm_X_train,y_train)

0.8135749385749386

In [139]:
# Make predictions on the test dataset by using the 'predict()' function.
y_test_pred = pd.Series(log_reg.predict(norm_X_test))
y_test_pred

0       0
1       0
2       0
3       0
4       0
       ..
9764    0
9765    0
9766    0
9767    0
9768    1
Length: 9769, dtype: int64

In [140]:
# Display the results of confusion_matrix
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(y_test,y_test_pred))

[[7049  406]
 [1390  924]]


In [141]:
# Display the results of classification_report
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      7455
           1       0.69      0.40      0.51      2314

    accuracy                           0.82      9769
   macro avg       0.77      0.67      0.70      9769
weighted avg       0.80      0.82      0.80      9769



In [142]:
# Create a dictionary containing the different combination of features selected by RFE and their corresponding f1-scores.

# Import the libraries
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score
# Create the empty dictionary.
dict_rfe = {}
# Create a 'for' loop.
for i in range(1,len(norm_X_train.columns)+1):
  # Create the Logistic Regression Model
  log_reg2 = LogisticRegression()
  # Create the RFE model with 'i' number of features
  rfe = RFE(estimator=log_reg2, n_features_to_select=i)
  # Train the rfe model on the normalised training data using 'fit()'
  rfe.fit(norm_X_train,y_train)
  # Create a list of important features chosen by RFE.
  rfe_features = list(norm_X_train.columns[rfe.support_])
  # Create the normalised training DataFrame with rfe features
  X_train_rfe = norm_X_train[rfe_features]
  # Create the logistic regression
  log_reg3 = LogisticRegression()
  # Train the model normalised training DataFrame with rfe features using 'fit()'
  log_reg3.fit(X_train_rfe,y_train)
  # Predict 'y' values only for the test set as generally, they are predicted quite accurately for the train set.
  y_test_pred = log_reg3.predict(norm_X_test[rfe_features])
  # Calculate the f1-score
  f1_scores_array = f1_score(y_test,y_test_pred,average=None)
  # Add the name of features and f1-scores in the dictionary
  dict_rfe[i] = {'features':list(rfe_features),'f1_score':f1_scores_array}


In [143]:
dict_rfe

{1: {'features': ['capital-gain'],
  'f1_score': array([0.88521447, 0.32677442])},
 2: {'features': ['education-years', 'capital-gain'],
  'f1_score': array([0.88774272, 0.39502943])},
 3: {'features': ['age', 'education-years', 'capital-gain'],
  'f1_score': array([0.88411377, 0.45692666])},
 4: {'features': ['age', 'education-years', 'capital-gain', 'hours-per-week'],
  'f1_score': array([0.88515336, 0.4906815 ])},
 5: {'features': ['age',
   'education-years',
   'capital-gain',
   'capital-loss',
   'hours-per-week'],
  'f1_score': array([0.88700138, 0.50713502])}}

In [144]:
# Logistic Regression with the ideal number of features and predict the target.

# Create the Logistic Regression Model
log_reg4 = LogisticRegression()

rfe1 = RFE(estimator=log_reg4, n_features_to_select=5)
# Train the rfe model on the normalised training data
rfe1.fit(norm_X_train,y_train)
# Create a list of important features chosen by RFE.
rfe1_features = norm_X_train.columns[rfe1.support_]
print(rfe1_features)
# Create the normalised training DataFrame with rfe features
X_train_final = norm_X_train[rfe1_features]
# Create the Regression Model again
log_reg4 = LogisticRegression()
# Train the model with the normalised training features DataFrame with best rfe features and target training DataFrame
log_reg4.fit(X_train_final,y_train)
# Predict the target using the normalised test DataFrame with rfe features
y_test_pred_final = log_reg4.predict(norm_X_test[rfe1_features])
# Calculate the final f1-score and print it
scores_array_final = f1_score(y_test,y_test_pred_final,average=None)
print(scores_array_final)

Index(['age', 'education-years', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')
[0.88700138 0.50713502]
