# Census Income Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
import pipeline_utilities as p_util
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Load the census income CSV into a DataFrame and preview the first rows
census_csv_path = "../Project2_Resources/census-income-test.csv"
original_data = pd.read_csv(census_csv_path)
original_data.head()

Unnamed: 0,Age,Class,Industry,Occupation,Education,AHGA,AHRSPAY,Marital,Major Industry,Major Occupation,...,NOEMP,PARENT,PEARNVAL,Citizenship,PEMNTVTY,PENATVTY,PRCITSHP,Weeks Worked,Year,Above50K
0,38,Private,6,36,1st 2nd 3rd or 4th grade,0,Not in universe,Married-civilian spouse present,Manufacturing-durable goods,Machine operators assmblrs & inspctrs,...,Mexico,Mexico,Mexico,Foreign born- Not a citizen of U S,0,Not in universe,2,12,95,-50000
1,44,Self-employed-not incorporated,37,12,Associates degree-occup /vocational,0,Not in universe,Married-civilian spouse present,Business and repair services,Professional specialty,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,26,95,-50000
2,2,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,95,-50000
3,35,Private,29,3,High school graduate,0,Not in universe,Divorced,Transportation,Executive admin and managerial,...,United-States,United-States,United-States,Native- Born in the United States,2,Not in universe,2,52,94,-50000
4,49,Private,4,34,High school graduate,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,50,95,-50000


In [3]:
# Summarise the column names, non-null counts and dtypes of the raw frame
original_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99762 entries, 0 to 99761
Data columns (total 42 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               99762 non-null  int64  
 1   Class             99762 non-null  object 
 2   Industry          99762 non-null  int64  
 3   Occupation        99762 non-null  int64  
 4   Education         99762 non-null  object 
 5   AHGA              99762 non-null  int64  
 6   AHRSPAY           99762 non-null  object 
 7   Marital           99762 non-null  object 
 8   Major Industry    99762 non-null  object 
 9   Major Occupation  99762 non-null  object 
 10  Race              99762 non-null  object 
 11  Hispanic          99762 non-null  object 
 12  Gender            99762 non-null  object 
 13  ASEX              99762 non-null  object 
 14  AUNMEM            99762 non-null  object 
 15  AUNTYPE           99762 non-null  object 
 16  Capital Gains     99762 non-null  int64 

In [7]:
# Helper that converts the raw income label into a binary class
def set_target(above50k):
    """Map a raw ``Above50K`` label to a binary target.

    Parameters
    ----------
    above50k : str or int
        Raw label value. ``'-50000'`` (optionally padded with whitespace,
        or given as the integer ``-50000``) marks the below-50K bracket.

    Returns
    -------
    int
        ``0`` for the below-50K label, ``1`` for any other value.
    """
    # Normalise to a stripped string first so a padded label (' -50000')
    # or an integer -50000 is not silently classified as 1.
    if str(above50k).strip() == '-50000':
        return 0
    return 1

# Derive the binary `KTarget` label from the raw `Above50K` column
income_bracket = original_data['Above50K']
original_data['KTarget'] = income_bracket.map(set_target)
original_data.head()

Unnamed: 0,Age,Class,Industry,Occupation,Education,AHGA,AHRSPAY,Marital,Major Industry,Major Occupation,...,PARENT,PEARNVAL,Citizenship,PEMNTVTY,PENATVTY,PRCITSHP,Weeks Worked,Year,Above50K,KTarget
0,38,Private,6,36,1st 2nd 3rd or 4th grade,0,Not in universe,Married-civilian spouse present,Manufacturing-durable goods,Machine operators assmblrs & inspctrs,...,Mexico,Mexico,Foreign born- Not a citizen of U S,0,Not in universe,2,12,95,-50000,0
1,44,Self-employed-not incorporated,37,12,Associates degree-occup /vocational,0,Not in universe,Married-civilian spouse present,Business and repair services,Professional specialty,...,United-States,United-States,Native- Born in the United States,0,Not in universe,2,26,95,-50000,0
2,2,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,95,-50000,0
3,35,Private,29,3,High school graduate,0,Not in universe,Divorced,Transportation,Executive admin and managerial,...,United-States,United-States,Native- Born in the United States,2,Not in universe,2,52,94,-50000,0
4,49,Private,4,34,High school graduate,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,Native- Born in the United States,0,Not in universe,2,50,95,-50000,0


In [9]:
# Pairwise correlation between the target and a few numeric columns
corr_columns = ['KTarget', 'Age', 'Industry', 'Occupation']
original_data[corr_columns].corr()

Unnamed: 0,KTarget,Age,Industry,Occupation
KTarget,1.0,0.137499,0.194715,0.014418
Age,0.137499,1.0,0.150886,0.110132
Industry,0.194715,0.150886,1.0,0.564211
Occupation,0.014418,0.110132,0.564211,1.0


In [18]:
# Inspect the distinct values stored in the AHRSPAY column
original_data['AHRSPAY'].unique()

array([' Not in universe', ' College or university', ' High school'],
      dtype=object)

In [13]:
# Text columns to replace with integer category codes
columns_to_encode = ['Class', 'Education', 'AHRSPAY']

# Work on a copy so `original_data` keeps its raw text values
data_copy = original_data.copy()

# Convert every listed column to its category codes in a single pass
data_copy[columns_to_encode] = data_copy[columns_to_encode].apply(
    lambda series: series.astype("category").cat.codes
)

data_copy.head()

Unnamed: 0,Age,Class,Industry,Occupation,Education,AHGA,AHRSPAY,Marital,Major Industry,Major Occupation,...,PARENT,PEARNVAL,Citizenship,PEMNTVTY,PENATVTY,PRCITSHP,Weeks Worked,Year,Above50K,KTarget
0,38,Private,6,36,1st 2nd 3rd or 4th grade,0,Not in universe,2,Manufacturing-durable goods,Machine operators assmblrs & inspctrs,...,Mexico,Mexico,Foreign born- Not a citizen of U S,0,Not in universe,2,12,95,-50000,0
1,44,Self-employed-not incorporated,37,12,Associates degree-occup /vocational,0,Not in universe,2,Business and repair services,Professional specialty,...,United-States,United-States,Native- Born in the United States,0,Not in universe,2,26,95,-50000,0
2,2,Not in universe,0,0,Children,0,Not in universe,4,Not in universe or children,Not in universe,...,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,95,-50000,0
3,35,Private,29,3,High school graduate,0,Not in universe,0,Transportation,Executive admin and managerial,...,United-States,United-States,Native- Born in the United States,2,Not in universe,2,52,94,-50000,0
4,49,Private,4,34,High school graduate,0,Not in universe,0,Construction,Precision production craft & repair,...,United-States,United-States,Native- Born in the United States,0,Not in universe,2,50,95,-50000,0


In [None]:
# Build the modelling frame from the encoded copy (`data` was previously
# never defined, and 'Column5'/'Column6' are not census columns).
# - `Above50K` is the raw label `KTarget` was derived from; keeping it
#   would leak the answer into the features.
# - Columns that are still text cannot be scaled or fitted, so only numeric
#   (including category-coded) columns are kept.
data = data_copy.drop(columns=['Above50K']).select_dtypes(include='number')

In [None]:
# Rank the numeric columns by their correlation with the target.
# (The placeholder names 'Column2'...'Column10' are not census columns.)
data.corr(numeric_only=True)['KTarget'].sort_values(ascending=False)

In [None]:
# FIXME(review): 'Column10' looks like a placeholder name — it does not appear
# among the census columns listed by `info()` above (that listing is truncated,
# so confirm). Replace it with the real column to drop, or remove this cell.
data = data.drop(columns='Column10')

## Split the Data into Training and Testing Sets

In [None]:
# Create the labels set `y` and features DataFrame `X`.
# The target is the binary `KTarget` column derived from `Above50K`
# (there is no 'spam' column in the census data).
y = data['KTarget']

# Remove the target and, if it is still present, the raw label it was derived
# from, so the answer cannot leak into the features. `drop` returns a new
# frame, so `data` itself is left untouched.
X = data.drop(columns=['KTarget', 'Above50K'], errors='ignore')

In [None]:
# Check the balance of the labels variable (`y`) by using the `value_counts` function.
# NOTE(review): if the classes turn out to be heavily imbalanced, accuracy alone
# will overstate model quality — consider a per-class metric as well.
y.value_counts()

In [None]:
# Split the data into X_train, X_test, y_train, y_test.
# A fixed seed makes the split repeatable on Restart & Run All; stratifying on
# `y` keeps the class ratio the same in the training and testing partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

## Scale the Features

Use the `StandardScaler` to scale the features data. Remember that only the `X_train` and `X_test` DataFrames should be scaled; the labels are left unchanged.

In [None]:
# Scale both feature sets through the project helper.
# NOTE(review): the helper is expected to fit StandardScaler on X_train only and
# reuse that fit to transform X_test — confirm in pipeline_utilities.
X_train_scaled, X_test_scaled = p_util.scale_data_with_StandardScaler(X_train, X_test)

## Create and Fit a Logistic Regression Model

Create a Logistic Regression model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. You may choose any starting settings you like. 

In [None]:
# Train and evaluate a Logistic Regression model.
# Per the original author's note, the training, prediction and accuracy-scoring
# steps are all implemented inside the `pipeline_utilities` helper; its
# internals are not shown in this notebook.

# NOTE(review): presumably forwarded to the model as its random seed — confirm in pipeline_utilities.

random_state = 1
p_util.logistic_regression_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state)


## Create and Fit a Random Forest Classifier Model

Create a Random Forest Classifier model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. You may choose any starting settings you like. 

In [None]:
# Train and evaluate a Random Forest Classifier model.
# Per the original author's note, the training, prediction (with the random
# forest model, not the logistic regression one) and accuracy-scoring steps are
# all implemented inside the `pipeline_utilities` helper.

# NOTE(review): `X.columns` is presumably used by the helper to label feature importances — confirm.

random_state = 1
n_estimators = 100
p_util.random_forest_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state, n_estimators, X.columns)


## Evaluate the Models

Which model performed better? How does that compare to your prediction? Write down your results and thoughts in the following markdown cell.

* Both models achieved accuracy scores above 90%, indicating that both performed reasonably well, but the Random Forest model outperformed the Logistic Regression model by 2 percentage points: 95.22% accuracy on the testing data versus 93.22% for Logistic Regression. (TODO: the model cells above have no recorded output in this notebook — re-run them and confirm these figures.)
* My prediction that the Logistic Regression model would perform better was incorrect!