# Census Income Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
import pipeline_utilities as p_util


In [3]:
# Import and examine the training dataset
train_data = pd.read_csv("../Project2_Resources/census-income-train.csv")
train_data.head()

Unnamed: 0,Age,Class,Industry,Occupation,Education,Wage per hour,Education last week,Marital,Major Industry,Major Occupation,...,FatherCountry,MotherCountry,BirthCountry,Citizenship,Self employed,VetQtnaire,Veteran,Weeks Worked,Year,Above50K
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,-50000
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,-50000
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,-50000
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,-50000
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,-50000


In [4]:
# Import and examine the test dataset
test_data = pd.read_csv("../Project2_Resources/census-income-test.csv")
test_data.head()

Unnamed: 0,Age,Class,Industry,Occupation,Education,Wage per hour,Education last week,Marital,Major Industry,Major Occupation,...,FatherCountry,MotherCountry,BirthCountry,Citizenship,Self employed,VetQtnaire,Veteran,Weeks Worked,Year,Above50K
0,38,Private,6,36,1st 2nd 3rd or 4th grade,0,Not in universe,Married-civilian spouse present,Manufacturing-durable goods,Machine operators assmblrs & inspctrs,...,Mexico,Mexico,Mexico,Foreign born- Not a citizen of U S,0,Not in universe,2,12,95,-50000
1,44,Self-employed-not incorporated,37,12,Associates degree-occup /vocational,0,Not in universe,Married-civilian spouse present,Business and repair services,Professional specialty,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,26,95,-50000
2,2,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,95,-50000
3,35,Private,29,3,High school graduate,0,Not in universe,Divorced,Transportation,Executive admin and managerial,...,United-States,United-States,United-States,Native- Born in the United States,2,Not in universe,2,52,94,-50000
4,49,Private,4,34,High school graduate,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,50,95,-50000


In [5]:
# Review the columns
train_data.columns

Index(['Age', 'Class', 'Industry', 'Occupation', 'Education', 'Wage per hour',
       'Education last week', 'Marital', 'Major Industry', 'Major Occupation',
       'Race', 'Hispanic', 'Gender', 'Labor union', 'Unemployment Reason',
       'Employment Status', 'Capital Gains', 'Capital Loss', 'Dividends',
       'TaxFiler', 'Previous Region', 'Previous State', 'Family Status',
       'Household', 'Instance Weight', 'MIGMSA', 'MIGRegion', 'MIGMove',
       '1YearAgo', 'PrevSunBelt', 'Under18', 'Parents', 'FatherCountry',
       'MotherCountry', 'BirthCountry', 'Citizenship', 'Self employed',
       'VetQtnaire', 'Veteran', 'Weeks Worked', 'Year', 'Above50K'],
      dtype='object')

In [6]:
# Review the values
#train_data.describe()

In [7]:
# Define a function
def set_target(above50k):
    if above50k == '-50000':
        return 0
    return 1

# "Apply" the function to the amount column in the two data sets
train_data['KTarget'] = train_data['Above50K'].apply(set_target)
train_data['KTarget'].value_counts()

KTarget
0    187141
1     12382
Name: count, dtype: int64

In [8]:
test_data['KTarget'] = test_data['Above50K'].apply(set_target)
test_data['KTarget'].value_counts()

KTarget
0    93576
1     6186
Name: count, dtype: int64

In [12]:
columns_to_encode = ['Class', 'Education', 'Education last week', 'Marital',
                     'Major Industry', 'Major Occupation', 'Race', 'Employment Status',
                     'Hispanic', 'Gender', 'Labor union', 'Unemployment Reason', 
                     'TaxFiler', 'Previous Region', 'Previous State', 'Family Status', 
                     'Household', 'MIGMSA', 'MIGRegion', 'MIGMove',
                     '1YearAgo', 'PrevSunBelt', 'Parents', 'FatherCountry', 'MotherCountry', 
                     'BirthCountry', 'Citizenship', 'Self employed', 
                     'VetQtnaire', 'Veteran' 
                    ]

# Make a copy of the datasets
train_data_copy = train_data.copy()
test_data_copy = test_data.copy()

# Loop through columns_to_encode and convert the columns to category codes
for column in columns_to_encode:
    train_data_copy[column] = train_data_copy[column].astype("category").cat.codes
    test_data_copy[column] = test_data_copy[column].astype("category").cat.codes

train_data_copy.head()

Unnamed: 0,Age,Class,Industry,Occupation,Education,Wage per hour,Education last week,Marital,Major Industry,Major Occupation,...,MotherCountry,BirthCountry,Citizenship,Self employed,VetQtnaire,Veteran,Weeks Worked,Year,Above50K,KTarget
0,73,3,0,0,12,0,2,6,14,6,...,40,40,4,0,1,2,0,95,-50000,0
1,58,6,4,34,16,0,2,0,4,8,...,40,40,4,0,1,2,52,94,-50000,0
2,18,3,0,0,0,0,1,4,14,6,...,41,41,0,0,1,2,0,95,-50000,0
3,9,3,0,0,10,0,2,4,14,6,...,40,40,4,0,1,0,0,94,-50000,0
4,10,3,0,0,10,0,2,4,14,6,...,40,40,4,0,1,0,0,94,-50000,0


In [13]:
X_train = train_data_copy.drop(columns=['Above50K', 'KTarget'])
X_test = test_data_copy.drop(columns=['Above50K', 'KTarget'])

In [14]:
#train_data_copy[['KTarget', 'Capital Gains', 'Parents', 'Veteran', 'Family Status', 'Industry', 'Weeks Worked', 'Wage per hour']].corr()
print(X_train.corr())

                          Age     Class  Industry  Occupation  Education  \
Age                  1.000000  0.087751  0.157822    0.117414   0.019436   
Class                0.087751  1.000000  0.305050    0.377095   0.072739   
Industry             0.157822  0.305050  1.000000    0.563493   0.179267   
Occupation           0.117414  0.377095  0.563493    1.000000   0.072849   
Education            0.019436  0.072739  0.179267    0.072849   1.000000   
Wage per hour        0.036938  0.036878  0.165980    0.194498   0.046064   
Education last week  0.166565 -0.022285 -0.042954   -0.043606   0.017986   
Marital             -0.295850 -0.166327 -0.311821   -0.262534  -0.137404   
Major Industry      -0.060803 -0.136778 -0.027164   -0.187754  -0.035434   
Major Occupation     0.010021  0.056247  0.118254    0.074611  -0.017060   
Race                 0.083260  0.055184  0.026234    0.024634   0.037215   
Hispanic            -0.103743 -0.037743 -0.057725    0.002744  -0.101863   
Gender      

In [None]:
X_train.info()

## Split the Data into Training and Testing Sets

In [16]:
# Create the labels set `y` and features DataFrame `X`
y_train = train_data_copy['KTarget']
y_test = test_data_copy['KTarget']

In [17]:
# Check the balance of the labels variable (`y`) by using the `value_counts` function.
y_train.value_counts()

KTarget
0    187141
1     12382
Name: count, dtype: int64

In [18]:
# Split the data into X_train, X_test, y_train, y_test
#X_train, X_test, y_train, y_test = train_test_split(X, y)

## Scale the Features

Use the `StandardScaler` to scale the features data. Remember that only `X_train` and `X_test` DataFrames should be scaled.

In [19]:
# Transforming the test dataset based on the fit from the training dataset
X_train_scaled, X_test_scaled = p_util.scale_data_with_StandardScaler(X_train, X_test)

Scaled X_train data: [[ 1.72587866 -0.43914626 -0.84973982 ...  0.56979856 -0.94934627
   1.00065678]
 [ 1.05355971  2.27510902 -0.62834271 ...  0.56979856  1.18080368
  -0.99934365]
 [-0.73929082 -0.43914626 -0.84973982 ...  0.56979856 -0.94934627
   1.00065678]
 ...
 [ 0.56052581 -0.43914626 -0.84973982 ...  0.56979856  1.18080368
   1.00065678]
 [-0.82893335 -0.43914626 -0.84973982 ...  0.56979856 -0.94934627
   1.00065678]
 [-0.11179313  0.4656055   1.47492979 ...  0.56979856  1.18080368
  -0.99934365]]
Scaled X_train data: [[ 0.15713444  0.4656055  -0.51764416 ...  0.56979856 -0.4577732
   1.00065678]
 [ 0.42606202  2.27510902  1.19818341 ...  0.56979856  0.11572871
   1.00065678]
 [-1.45643103 -0.43914626 -0.84973982 ... -1.77907682 -0.94934627
   1.00065678]
 ...
 [-0.47036324  2.27510902 -0.79439054 ...  0.56979856  1.18080368
  -0.99934365]
 [-0.20143566  0.4656055   1.64097762 ...  0.56979856  1.18080368
   1.00065678]
 [ 1.45695108 -0.43914626 -0.84973982 ...  0.56979856 -0.

In [None]:
random_state = 1
p_util.gradient_boost_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state)

In [None]:
random_state = 1
p_util.ada_boost_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state)

In [None]:
%%time
random_state = 1
p_util.extra_trees_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state)

## Try a Decision Tree model

In [None]:
p_util.decision_tree_model_generator(X_train_scaled, X_test_scaled, y_train, y_test)

## Create and Fit a PCA Model

Try a PCA model. 

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_data_pca = pca.fit_transform(X_train_scaled)
X_data_pca[:5]
pca_df = pd.DataFrame(
    X_data_pca,
    columns=["PCA1", "PCA2"]
)

In [None]:
X_data_pca[:5]

In [None]:
pca.explained_variance_ratio_

In [None]:
from sklearn.cluster import KMeans
inertia = []
k = list(range(1, 11))
for i in k:
    k_model = KMeans(n_clusters=i, n_init='auto', random_state=1)
    k_model.fit(pca_df)
    inertia.append(k_model.inertia_)

# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)


In [None]:
# Review the DataFrame
df_elbow.head()
df_elbow.plot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

In [None]:
# Determine the rate of decrease between each k value
k = df_elbow["k"]
inertia = df_elbow["inertia"]
for i in range(1, len(k)):
    percentage_decrease = (inertia[i-1] - inertia[i]) / inertia[i-1] * 100
    print(f"Percentage decrease from k={k[i-1]} to k={k[i]}: {percentage_decrease:.2f}%")

In [None]:
# Define the model with 5 clusters
model = KMeans(n_clusters=4, n_init='auto', random_state=0)

# Fit the model
model.fit(pca_df)

# Make predictions
k_3 = model.predict(pca_df)

# Create a copy of the PCA DataFrame
pca_predictions_df = pca_df.copy()

# Add a class column with the labels
pca_predictions_df["income_segments"] = k_3

In [None]:
pca_predictions_df.plot.scatter(
    x="PCA1",
    y="PCA2",
    c="income_segments",
    colormap='rainbow')

In [None]:
pca_component_weights = pd.DataFrame(pca.components_.T, columns=['PCA1', 'PCA2'], index=X_train.columns)
pca_component_weights.sort_values('PCA1', ascending=False)

## Create and Fit a Logistic Regression Model

Create a Logistic Regression model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. You may choose any starting settings you like. 

In [None]:
# All requirements above have been coded into pipeline_utilities python program file

random_state = 1
p_util.logistic_regression_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state)


## Create and Fit a Random Forest Classifier Model

Create a Random Forest Classifier model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. You may choose any starting settings you like. 

In [20]:
# All details have been coded into pipeline_utilities python program file
# Tried 500 estimators, almost no difference and balanced accuracy score was slightly worse, putting it back to 100

#random_state = 1
#n_estimators = 100
#p_util.random_forest_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state, n_estimators, X_train.columns)

# Import RandomUnderSampler from imblearn
from imblearn.under_sampling import RandomUnderSampler

# Instantiate the RandomUnderSampler instance
rus = RandomUnderSampler(random_state=1)

# Fit the data to the model
X_resampled, y_resampled = rus.fit_resample(X_train_scaled, y_train)

# Count distinct resampled values
y_resampled.value_counts()


KTarget
0    12382
1    12382
Name: count, dtype: int64

In [22]:
# Instantiate a second RamdonForestClassifier instance
random_state = 1
n_estimators = 100
p_util.random_forest_model_generator(X_resampled, X_test_scaled, y_resampled, y_test, random_state, n_estimators, X_train.columns)


Random Forest Training Data Score: 0.9999192376029721
Random Forest Testing Data Score: 0.8607485816242657
Random Forest Predictions Accuracy Score: 0.8607485816242657
              precision    recall  f1-score   support

           1       0.29      0.89      0.44      6186
           0       0.99      0.86      0.92     93576

    accuracy                           0.86     99762
   macro avg       0.64      0.87      0.68     99762
weighted avg       0.95      0.86      0.89     99762

Random Forest Balanced Accuracy Score: 0.8724040768907312
[(0.120355017798187, 'Occupation'), (0.09667998585404697, 'Age'), (0.09547593286477608, 'Weeks Worked'), (0.0730341365736758, 'Under18'), (0.0669678799733952, 'Dividends'), (0.06368943498046313, 'Industry'), (0.05528971996256306, 'Instance Weight'), (0.041941953202647424, 'Education'), (0.03804338135999633, 'Family Status'), (0.03655995234678803, 'Gender')]


## Create and Fit an SVM Model

Create a Support Vector Machine model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. 

In [None]:
# All details have been coded into pipeline_utilities python program file, takes FOREVER to run

#kernel_type = 'linear'
#p_util.svm_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, kernel_type)


## Findings and Conclusions

What was the result of your analysis? Which model performed better?

* All models that I tried achieved over 90% accuracy score with testing data and predictions. The Random Forest model had the best F1 scores of 52%-98% with Gradient Boost running a close second at 51%-98%. PCA analysis resulted in a very pretty set of 4 segments, but the total explained variance by PCA1 and PCA2 was 0.27, so did not reduce dimensionality effectively. My SVC model took forever to run. I ran a total of 6 models.