# Census Income Modeling

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
import pipeline_utilities as p_util
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Import the data
original_data = pd.read_csv("../Project2_Resources/census-income-test.csv")
original_data.head()
original_data.columns

Index(['Age', 'Class', 'Industry', 'Occupation', 'Education', 'AHGA',
       'AHRSPAY', 'Marital', 'Major Industry', 'Major Occupation', 'Race',
       'Hispanic', 'Gender', 'ASEX', 'AUNMEM', 'AUNTYPE', 'Capital Gains',
       'Capital Loss', 'Dividends', 'Tax Status', 'Region', 'State',
       'General Status', 'Other Status', 'HHDFMX', 'HHDREL', 'MARSUPWT',
       'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'MIGSAME', 'MIGSUN', 'NOEMP', 'PARENT',
       'PEARNVAL', 'Citizenship', 'PEMNTVTY', 'PENATVTY', 'PRCITSHP',
       'Weeks Worked', 'Year', 'Above50K'],
      dtype='object')

In [3]:
original_data.describe()

Unnamed: 0,Age,Industry,Occupation,AHGA,Capital Gains,Capital Loss,Dividends,HHDFMX,MIGSAME,PEMNTVTY,PRCITSHP,Weeks Worked,Year
count,99762.0,99762.0,99762.0,99762.0,99762.0,99762.0,99762.0,99762.0,99762.0,99762.0,99762.0,99762.0,99762.0
mean,34.628596,15.332812,11.331118,54.461268,425.788577,35.919458,192.494727,1739.542838,1.956156,0.178505,1.519827,23.185331,94.500652
std,22.333449,18.026709,14.459589,270.202643,4616.795578,265.525252,1841.728492,994.900902,2.364331,0.558316,0.848229,24.389184,0.500002
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.26,0.0,0.0,0.0,0.0,94.0
25%,16.0,0.0,0.0,0.0,0.0,0.0,0.0,1058.05,0.0,0.0,2.0,0.0,94.0
50%,33.0,0.0,0.0,0.0,0.0,0.0,0.0,1615.395,1.0,0.0,2.0,8.0,95.0
75%,50.0,33.0,26.0,0.0,0.0,0.0,0.0,2187.2775,4.0,0.0,2.0,52.0,95.0
max,90.0,51.0,46.0,9900.0,99999.0,4608.0,99999.0,16258.2,6.0,2.0,2.0,52.0,95.0


In [4]:
# Define a function
def set_target(above50k):
    if above50k == '-50000':
        return 0
    return 1

# "Apply" the function to the amount column
original_data['KTarget'] = original_data['Above50K'].apply(set_target)
original_data.head()

Unnamed: 0,Age,Class,Industry,Occupation,Education,AHGA,AHRSPAY,Marital,Major Industry,Major Occupation,...,PARENT,PEARNVAL,Citizenship,PEMNTVTY,PENATVTY,PRCITSHP,Weeks Worked,Year,Above50K,KTarget
0,38,Private,6,36,1st 2nd 3rd or 4th grade,0,Not in universe,Married-civilian spouse present,Manufacturing-durable goods,Machine operators assmblrs & inspctrs,...,Mexico,Mexico,Foreign born- Not a citizen of U S,0,Not in universe,2,12,95,-50000,0
1,44,Self-employed-not incorporated,37,12,Associates degree-occup /vocational,0,Not in universe,Married-civilian spouse present,Business and repair services,Professional specialty,...,United-States,United-States,Native- Born in the United States,0,Not in universe,2,26,95,-50000,0
2,2,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,95,-50000,0
3,35,Private,29,3,High school graduate,0,Not in universe,Divorced,Transportation,Executive admin and managerial,...,United-States,United-States,Native- Born in the United States,2,Not in universe,2,52,94,-50000,0
4,49,Private,4,34,High school graduate,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,Native- Born in the United States,0,Not in universe,2,50,95,-50000,0


In [5]:
original_data[['KTarget', 'Age', 'Industry', 'Occupation']].corr()

Unnamed: 0,KTarget,Age,Industry,Occupation
KTarget,1.0,0.137499,0.194715,0.014418
Age,0.137499,1.0,0.150886,0.110132
Industry,0.194715,0.150886,1.0,0.564211
Occupation,0.014418,0.110132,0.564211,1.0


In [6]:
columns_to_encode = ['Class', 'Education', 'AHRSPAY', 'Marital',
                     'Major Industry', 'Major Occupation', 'Race',
                     'Hispanic', 'Gender', 'ASEX', 'AUNMEM', 'AUNTYPE', 
                     'Tax Status', 'Region', 'State', 'General Status', 
                     'Other Status', 'HHDREL', 'MARSUPWT', 'MIGMTR1',
                     'MIGMTR3', 'MIGMTR4', 'MIGSUN', 'NOEMP', 'PARENT', 
                     'PEARNVAL', 'Citizenship', 'PENATVTY' 
                    ]

# Make a copy of the dataset
data_copy = original_data.copy()

# Loop through columns_to_encode and convert the columns to category codes
for column in columns_to_encode:
    data_copy[column] = data_copy[column].astype("category").cat.codes

data_copy.head()

Unnamed: 0,Age,Class,Industry,Occupation,Education,AHGA,AHRSPAY,Marital,Major Industry,Major Occupation,...,PARENT,PEARNVAL,Citizenship,PEMNTVTY,PENATVTY,PRCITSHP,Weeks Worked,Year,Above50K,KTarget
0,38,4,6,36,3,0,2,2,10,5,...,26,26,0,0,1,2,12,95,-50000,0
1,44,6,37,12,8,0,2,2,2,10,...,40,40,4,0,1,2,26,95,-50000,0
2,2,3,0,0,10,0,2,4,14,6,...,40,40,4,0,1,0,0,95,-50000,0
3,35,4,29,3,12,0,2,0,21,2,...,40,40,4,2,1,2,52,94,-50000,0
4,49,4,4,34,12,0,2,0,4,8,...,40,40,4,0,1,2,50,95,-50000,0


In [7]:
data_copy = data_copy.drop(columns='Above50K')

In [8]:
data_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99762 entries, 0 to 99761
Data columns (total 42 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               99762 non-null  int64  
 1   Class             99762 non-null  int8   
 2   Industry          99762 non-null  int64  
 3   Occupation        99762 non-null  int64  
 4   Education         99762 non-null  int8   
 5   AHGA              99762 non-null  int64  
 6   AHRSPAY           99762 non-null  int8   
 7   Marital           99762 non-null  int8   
 8   Major Industry    99762 non-null  int8   
 9   Major Occupation  99762 non-null  int8   
 10  Race              99762 non-null  int8   
 11  Hispanic          99762 non-null  int8   
 12  Gender            99762 non-null  int8   
 13  ASEX              99762 non-null  int8   
 14  AUNMEM            99762 non-null  int8   
 15  AUNTYPE           99762 non-null  int8   
 16  Capital Gains     99762 non-null  int64 

## Split the Data into Training and Testing Sets

In [9]:
# Create the labels set `y` and features DataFrame `X`
y = data_copy['KTarget']

X = data_copy.copy()
X = X.drop(columns='KTarget')

In [10]:
# Check the balance of the labels variable (`y`) by using the `value_counts` function.
y.value_counts()

KTarget
0    93576
1     6186
Name: count, dtype: int64

In [11]:
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Scale the Features

Use the `StandardScaler` to scale the features data. Remember that only `X_train` and `X_test` DataFrames should be scaled.

In [12]:
# Transforming the test dataset based on the fit from the training dataset
X_train_scaled, X_test_scaled = p_util.scale_data_with_StandardScaler(X_train, X_test)

Scaled X_train data: [[ 0.15308503  0.45918772 -0.79444332 ...  0.56716782  1.18124735
   0.99729053]
 [ 0.33235871  0.45918772  1.4795827  ...  0.56716782  1.18124735
   0.99729053]
 [-0.29509918  0.45918772  0.98040626 ...  0.56716782 -0.41792117
   0.99729053]
 ...
 [ 0.69090608  1.36312595  1.4795827  ...  0.56716782  1.18124735
  -1.00271683]
 [ 0.01862977  0.45918772  1.09133436 ...  0.56716782  1.18124735
   0.99729053]
 [ 0.10826661  0.45918772 -0.18433878 ...  0.56716782  1.18124735
  -1.00271683]]
Scaled X_train data: [[ 0.9598166   0.45918772  1.64597485 ...  0.56716782  1.18124735
   0.99729053]
 [-1.46037812 -0.44475051 -0.84990737 ... -1.78901681 -0.95097735
  -1.00271683]
 [ 0.28754029 -2.25262698  1.42411866 ...  0.56716782  1.18124735
   0.99729053]
 ...
 [-0.02618865  0.45918772  1.09133436 ...  0.56716782  1.18124735
   0.99729053]
 [-0.29509918  0.45918772 -0.18433878 ...  0.56716782  1.18124735
  -1.00271683]
 [-0.07100708 -3.15656521  1.42411866 ...  0.56716782  1

## Create and Fit a Logistic Regression Model

Create a Logistic Regression model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. You may choose any starting settings you like. 

In [13]:
# All requirements above have been coded into pipeline_utilities python program file

random_state = 1
p_util.logistic_regression_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state)


Logistic Regression Training Data Score: 0.9468063778885607
Logistic Regression Testing Data Score: 0.9502826670943426
Logistic Regression Predictions Accuracy Score: 0.9502826670943426
              precision    recall  f1-score   support

           1       0.67      0.28      0.40      1447
           0       0.96      0.99      0.97     23494

    accuracy                           0.95     24941
   macro avg       0.81      0.64      0.68     24941
weighted avg       0.94      0.95      0.94     24941

Logistic Regression Balanced Accuracy Score: 0.6360551318400398


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Create and Fit a Random Forest Classifier Model

Create a Random Forest Classifier model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. You may choose any starting settings you like. 

In [14]:
# All requirements above have been coded into pipeline_utilities python program file

random_state = 1
n_estimators = 100
#p_util.random_forest_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state, n_estimators, X.columns)

model = RandomForestClassifier(n_estimators=100, random_state=1)

# Fit the model and print the training and testing scores
model.fit(X_train, y_train)
print(f"Random Forest Training Data Score: {model.score(X_train, y_train)}")
print(f"Random Forest Testing Data Score: {model.score(X_test, y_test)}")

# Make predictions using testing data
predictions = model.predict(X_test)
#print(f"Random Forest Predictions: {predictions}")

# Print the accuracy score
print(f"Random Forest Predictions Accuracy Score: {accuracy_score(y_test, predictions)}")
print(classification_report(y_test, predictions, labels = [1, 0]))
print(f"Random Forest Balanced Accuracy Score: {balanced_accuracy_score(y_test, predictions)}")
    
# Get the feature importance array
importances = model.feature_importances_

# List the top 10 most important features
importances_sorted = sorted(zip(model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]



Random Forest Training Data Score: 0.9999732695366275
Random Forest Testing Data Score: 0.9557756304879516
Random Forest Predictions Accuracy Score: 0.9557756304879516


NameError: name 'classification_report' is not defined

In [17]:
print(f"Random Forest Predictions Accuracy Score: {accuracy_score(y_test, predictions)}")
print(classification_report(y_test, predictions, labels = [1, 0]))
print(f"Random Forest Balanced Accuracy Score: {balanced_accuracy_score(y_test, predictions)}")
    
# Get the feature importance array
importances = model.feature_importances_

# List the top 10 most important features
importances_sorted = sorted(zip(model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]


Random Forest Predictions Accuracy Score: 0.9557756304879516
              precision    recall  f1-score   support

           1       0.72      0.39      0.51      1447
           0       0.96      0.99      0.98     23494

    accuracy                           0.96     24941
   macro avg       0.84      0.69      0.74     24941
weighted avg       0.95      0.96      0.95     24941

Random Forest Balanced Accuracy Score: 0.6898796640222041


[(0.09684750675688562, 'Occupation'),
 (0.09303529681607177, 'Dividends'),
 (0.09298539057667281, 'Age'),
 (0.08749220870601834, 'HHDFMX'),
 (0.08236579832238548, 'Capital Gains'),
 (0.05361188639857999, 'Industry'),
 (0.04921398147543852, 'Education'),
 (0.04062353187375138, 'MIGSAME'),
 (0.03918421014663092, 'Weeks Worked'),
 (0.03874078297854178, 'Major Industry')]

## Evaluate the Models

Which model performed better? How does that compare to your prediction? Write down your results and thoughts in the following markdown cell.

* Both models had over 90% accuracy score, indicating that both models performed reasonably well, but the Random Forest model performed better than the Logistic Regression model by 2 percentage points. The Random Forest model achieved an accuracy of 95.22% with testing data and predictions while the Logistic Regression model achieved a score of 93.22%.
* My prediction that Logistic Regression model would perform better was incorrect!