# Census Income Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
import pipeline_utilities as p_util
from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
#from sklearn.ensemble import RandomForestClassifier


In [2]:
# Import and examine the data
original_data = pd.read_csv("../Project2_Resources/census-income-test.csv")
original_data.head()

Unnamed: 0,Age,Class,Industry,Occupation,Education,AHGA,AHRSPAY,Marital,Major Industry,Major Occupation,...,NOEMP,PARENT,PEARNVAL,Citizenship,PEMNTVTY,PENATVTY,PRCITSHP,Weeks Worked,Year,Above50K
0,38,Private,6,36,1st 2nd 3rd or 4th grade,0,Not in universe,Married-civilian spouse present,Manufacturing-durable goods,Machine operators assmblrs & inspctrs,...,Mexico,Mexico,Mexico,Foreign born- Not a citizen of U S,0,Not in universe,2,12,95,-50000
1,44,Self-employed-not incorporated,37,12,Associates degree-occup /vocational,0,Not in universe,Married-civilian spouse present,Business and repair services,Professional specialty,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,26,95,-50000
2,2,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,95,-50000
3,35,Private,29,3,High school graduate,0,Not in universe,Divorced,Transportation,Executive admin and managerial,...,United-States,United-States,United-States,Native- Born in the United States,2,Not in universe,2,52,94,-50000
4,49,Private,4,34,High school graduate,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,50,95,-50000


In [3]:
# Review the columns
#original_data.columns

In [4]:
# Review the values
#original_data.describe()

In [5]:
# Define a function
def set_target(above50k):
    if above50k == '-50000':
        return 0
    return 1

# "Apply" the function to the amount column
original_data['KTarget'] = original_data['Above50K'].apply(set_target)
original_data['KTarget'].value_counts()

KTarget
0    93576
1     6186
Name: count, dtype: int64

In [6]:
#original_data[['KTarget', 'Age', 'Industry', 'Occupation']].corr()

In [7]:
columns_to_encode = ['Class', 'Education', 'AHRSPAY', 'Marital',
                     'Major Industry', 'Major Occupation', 'Race',
                     'Hispanic', 'Gender', 'ASEX', 'AUNMEM', 'AUNTYPE', 
                     'Tax Status', 'Region', 'State', 'General Status', 
                     'Other Status', 'HHDREL', 'MARSUPWT', 'MIGMTR1',
                     'MIGMTR3', 'MIGMTR4', 'MIGSUN', 'NOEMP', 'PARENT', 
                     'PEARNVAL', 'Citizenship', 'PENATVTY' 
                    ]

# Make a copy of the dataset
data_copy = original_data.copy()

# Loop through columns_to_encode and convert the columns to category codes
for column in columns_to_encode:
    data_copy[column] = data_copy[column].astype("category").cat.codes

data_copy.head()

Unnamed: 0,Age,Class,Industry,Occupation,Education,AHGA,AHRSPAY,Marital,Major Industry,Major Occupation,...,PARENT,PEARNVAL,Citizenship,PEMNTVTY,PENATVTY,PRCITSHP,Weeks Worked,Year,Above50K,KTarget
0,38,4,6,36,3,0,2,2,10,5,...,26,26,0,0,1,2,12,95,-50000,0
1,44,6,37,12,8,0,2,2,2,10,...,40,40,4,0,1,2,26,95,-50000,0
2,2,3,0,0,10,0,2,4,14,6,...,40,40,4,0,1,0,0,95,-50000,0
3,35,4,29,3,12,0,2,0,21,2,...,40,40,4,2,1,2,52,94,-50000,0
4,49,4,4,34,12,0,2,0,4,8,...,40,40,4,0,1,2,50,95,-50000,0


In [8]:
data_copy = data_copy.drop(columns='Above50K')

In [9]:
data_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99762 entries, 0 to 99761
Data columns (total 42 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               99762 non-null  int64  
 1   Class             99762 non-null  int8   
 2   Industry          99762 non-null  int64  
 3   Occupation        99762 non-null  int64  
 4   Education         99762 non-null  int8   
 5   AHGA              99762 non-null  int64  
 6   AHRSPAY           99762 non-null  int8   
 7   Marital           99762 non-null  int8   
 8   Major Industry    99762 non-null  int8   
 9   Major Occupation  99762 non-null  int8   
 10  Race              99762 non-null  int8   
 11  Hispanic          99762 non-null  int8   
 12  Gender            99762 non-null  int8   
 13  ASEX              99762 non-null  int8   
 14  AUNMEM            99762 non-null  int8   
 15  AUNTYPE           99762 non-null  int8   
 16  Capital Gains     99762 non-null  int64 

## Split the Data into Training and Testing Sets

In [10]:
# Create the labels set `y` and features DataFrame `X`
y = data_copy['KTarget']

X = data_copy.copy()
X = X.drop(columns='KTarget')

In [11]:
# Check the balance of the labels variable (`y`) by using the `value_counts` function.
y.value_counts()

KTarget
0    93576
1     6186
Name: count, dtype: int64

In [12]:
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Scale the Features

Use the `StandardScaler` to scale the features data. Remember that only `X_train` and `X_test` DataFrames should be scaled.

In [13]:
# Transforming the test dataset based on the fit from the training dataset
X_train_scaled, X_test_scaled = p_util.scale_data_with_StandardScaler(X_train, X_test)

Scaled X_train data: [[ 1.17841165  0.45602893  0.59130483 ...  0.56588911  1.18126583
  -0.99817064]
 [ 0.55180672  1.36177979  1.64506522 ...  0.56588911  1.18126583
   1.00183272]
 [-0.43285818  0.45602893  0.75768805 ...  0.56588911  1.18126583
   1.00183272]
 ...
 [ 0.23850425  0.45602893  1.09045449 ...  0.56588911  1.18126583
   1.00183272]
 [-0.79091814  0.45602893  0.97953234 ...  0.56588911 -0.37716786
  -0.99817064]
 [ 0.10423177 -0.44972193 -0.85068307 ...  0.56588911 -0.95132763
   1.00183272]]
Scaled X_train data: [[-0.1643132   0.45602893  1.478682   ...  0.56588911  1.18126583
   1.00183272]
 [-0.96994812 -0.44972193 -0.85068307 ... -1.79231575 -0.95132763
  -0.99817064]
 [-1.37276557 -0.44972193 -0.85068307 ... -1.79231575 -0.95132763
   1.00183272]
 ...
 [ 1.89453157 -0.44972193 -0.85068307 ...  0.56588911 -0.95132763
   1.00183272]
 [ 0.14898926  1.36177979 -0.12968912 ...  0.56588911  1.18126583
   1.00183272]
 [-0.38810068  0.45602893  1.09045449 ... -0.61321332  1

## Create and Fit a Logistic Regression Model

Create a Logistic Regression model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. You may choose any starting settings you like. 

In [14]:
# All requirements above have been coded into pipeline_utilities python program file

random_state = 1
p_util.logistic_regression_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state)


Logistic Regression Training Data Score: 0.9474211785461301
Logistic Regression Testing Data Score: 0.9474359488392606
Logistic Regression Predictions Accuracy Score: 0.9474359488392606
              precision    recall  f1-score   support

           1       0.71      0.26      0.38      1553
           0       0.95      0.99      0.97     23388

    accuracy                           0.95     24941
   macro avg       0.83      0.63      0.68     24941
weighted avg       0.94      0.95      0.94     24941

Logistic Regression Balanced Accuracy Score: 0.6275092531808377
Logistic Regression roc_auc_score: 0.92972219865586


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Create and Fit a Random Forest Classifier Model

Create a Random Forest Classifier model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. You may choose any starting settings you like. 

In [15]:
# All details have been coded into pipeline_utilities python program file

random_state = 1
n_estimators = 100
p_util.random_forest_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state, n_estimators, X.columns)


Random Forest Training Data Score: 1.0
Random Forest Testing Data Score: 0.954171845555511
Random Forest Predictions Accuracy Score: 0.954171845555511
              precision    recall  f1-score   support

           1       0.77      0.37      0.50      1553
           0       0.96      0.99      0.98     23388

    accuracy                           0.95     24941
   macro avg       0.87      0.68      0.74     24941
weighted avg       0.95      0.95      0.95     24941

Random Forest Balanced Accuracy Score: 0.6831010085358659
[(0.09239906963881157, 'Dividends'), (0.09227470086776152, 'Age'), (0.09105557707209763, 'Occupation'), (0.08789696383758598, 'HHDFMX'), (0.08771592311103271, 'Capital Gains'), (0.052904446841888664, 'Industry'), (0.048350757650258355, 'Education'), (0.04116697347035583, 'MIGSAME'), (0.03964275403793694, 'Weeks Worked'), (0.03761638659715492, 'Major Industry')]


## Create and Fit an SVM Model

Create a Support Vector Machine model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. 

In [16]:
# All details have been coded into pipeline_utilities python program file

kernel_type = 'linear'
p_util.svm_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, kernel_type)


SVM Training Data Score: 0.9442803490998517
SVM Testing Data Score: 0.9429453510284271
SVM Predictions Accuracy Score: 0.9429453510284271
              precision    recall  f1-score   support

           1       0.88      0.10      0.18      1553
           0       0.94      1.00      0.97     23388

    accuracy                           0.94     24941
   macro avg       0.91      0.55      0.57     24941
weighted avg       0.94      0.94      0.92     24941

SVM Balanced Accuracy Score: 0.5481666345645247


AttributeError: predict_proba is not available when  probability=False

## Evaluate the Models

Which model performed better? How does that compare to your prediction? Write down your results and thoughts in the following markdown cell.

* Both models had over 90% accuracy score, indicating that both models performed reasonably well, but the Random Forest model performed better than the Logistic Regression model by 2 percentage points. The Random Forest model achieved an accuracy of 95.22% with testing data and predictions while the Logistic Regression model achieved a score of 93.22%.
* My prediction that Logistic Regression model would perform better was incorrect!