# Census Income Modeling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
import pipeline_utilities as p_util
#import pickle
#from sklearn.model_selection import train_test_split


In [None]:
#with open('probabilities.pkl', 'rb') as file:
#    model = pickle.load(file)
#    print(model)

In [None]:
# Import the census income training split from the shared resources folder
# and preview its first rows
train_data = pd.read_csv("../Project2_Resources/census-income-train.csv")
train_data.head()

In [None]:
# Import the census income test split from the shared resources folder
# and preview its first rows
test_data = pd.read_csv("../Project2_Resources/census-income-test.csv")
test_data.head()

In [None]:
# Review the column names of the training dataset
train_data.columns

In [None]:
# Review the values
#train_data.describe()

In [None]:
# Map an income-bracket label to a binary target value
def set_target(above50k):
    """Return 0 when the label is exactly the string '-50000', otherwise 1.

    The comparison is an exact string match: any other value (including a
    numeric -50000, a padded string, or a missing value) yields 1.
    """
    return 0 if above50k == '-50000' else 1

# Apply set_target to the 'Above50K' column of the training set to build the
# binary 'KTarget' column, then count how many rows fall in each class
train_data['KTarget'] = train_data['Above50K'].apply(set_target)
train_data['KTarget'].value_counts()

In [None]:
# Build the same binary 'KTarget' column for the test set and count each class
test_data['KTarget'] = test_data['Above50K'].apply(set_target)
test_data['KTarget'].value_counts()

In [None]:
# Names of the columns that are converted to integer category codes below
columns_to_encode = ['Class', 'Education', 'Education last week', 'Marital',
                     'Major Industry', 'Major Occupation', 'Race', 'Employment Status',
                     'Hispanic', 'Gender', 'Labor union', 'Unemployment Reason',
                     'TaxFiler', 'Previous Region', 'Previous State', 'Family Status',
                     'Household', 'MIGMSA', 'MIGRegion', 'MIGMove',
                     '1YearAgo', 'PrevSunBelt', 'Parents', 'FatherCountry', 'MotherCountry',
                     'BirthCountry', 'Citizenship', 'Self employed',
                     'VetQtnaire', 'Veteran'
                    ]

# Make a copy of the datasets so the encoding does not modify the frames loaded from CSV
train_data_copy = train_data.copy()
test_data_copy = test_data.copy()

# Loop through columns_to_encode and convert the columns to category codes.
# The value -> code mapping is learned from the training data only and then
# reused for the test data, so a given value receives the same integer in both
# frames even when the two files do not contain the same set of values.
# (Encoding each frame independently would silently shift the codes whenever a
# value is present in one file but not the other.)
# Missing values, and test values never seen in training, are encoded as -1.
for column in columns_to_encode:
    train_categories = train_data_copy[column].astype("category").cat.categories
    shared_dtype = pd.CategoricalDtype(categories=train_categories)
    train_data_copy[column] = train_data_copy[column].astype(shared_dtype).cat.codes
    test_data_copy[column] = test_data_copy[column].astype(shared_dtype).cat.codes

train_data_copy.head()

In [None]:
# Build the feature matrices by dropping the raw label column and the derived binary target
X_train = train_data_copy.drop(['Above50K', 'KTarget'], axis=1)
X_test = test_data_copy.drop(['Above50K', 'KTarget'], axis=1)

In [None]:
# Print the pairwise correlation matrix of every column except the raw 'Above50K' label
# (earlier variant restricted to a hand-picked subset of columns, kept for reference):
#train_data_copy[['KTarget', 'Capital Gains', 'Parents', 'Veteran', 'Family Status', 'Industry', 'Weeks Worked', 'Wage per hour']].corr()
print(train_data_copy.drop(columns='Above50K').corr())

In [None]:
# Summarise the training features (column dtypes and non-null counts)
X_train.info()

## Split the Data into Training and Testing Sets

In [None]:
# Create the label vectors from the binary 'KTarget' column of each dataset.
# The datasets arrive already split into train and test files, so no
# train_test_split is performed here.
y_train = train_data_copy['KTarget']
y_test = test_data_copy['KTarget']

In [None]:
# Check the balance of the training labels (`y_train`) by using the `value_counts` function.
y_train.value_counts()

In [None]:
# Split the data into X_train, X_test, y_train, y_test
#X_train, X_test, y_train, y_test = train_test_split(X, y)

## Scale the Features

Use the `StandardScaler` to scale the features data. Remember that only `X_train` and `X_test` DataFrames should be scaled.

In [None]:
# Scale both feature sets through the pipeline_utilities StandardScaler helper.
# Per the original note, the test set is transformed using the fit from the
# training set — the helper's implementation is not visible here; confirm in
# pipeline_utilities.
X_train_scaled, X_test_scaled = p_util.scale_data_with_StandardScaler(X_train, X_test)

In [None]:
# Run the gradient boosting helper from pipeline_utilities on the scaled data
# (what it trains and reports is defined in pipeline_utilities, not shown here)
random_state = 1
p_util.gradient_boost_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state)

In [None]:
# Run the AdaBoost helper from pipeline_utilities on the scaled data
# (what it trains and reports is defined in pipeline_utilities, not shown here)
random_state = 1
p_util.ada_boost_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state)

In [None]:
%%time
# Run the extra-trees helper from pipeline_utilities on the scaled data
# (what it trains and reports is defined in pipeline_utilities, not shown here)
random_state = 1
p_util.extra_trees_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state)

## Try a Decision Tree model

In [None]:
# Run the decision tree helper from pipeline_utilities on the scaled data.
# NOTE(review): unlike the other helper calls, no random_state is passed here —
# confirm whether the helper seeds the model itself.
p_util.decision_tree_model_generator(X_train_scaled, X_test_scaled, y_train, y_test)

## Create and Fit a PCA Model

Try a PCA model. 

In [None]:
from sklearn.decomposition import PCA

# Project the scaled training features onto their first two principal components.
# random_state is fixed (same seed as the other models in this notebook) so the
# projection is reproducible when scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=1)
X_data_pca = pca.fit_transform(X_train_scaled)

# Wrap the projection in a DataFrame with named component columns
pca_df = pd.DataFrame(
    X_data_pca,
    columns=["PCA1", "PCA2"]
)

In [None]:
# Preview the first five rows of the two-component PCA projection
X_data_pca[:5]

In [None]:
# Fraction of the total variance explained by each of the two principal components
pca.explained_variance_ratio_

In [None]:
from sklearn.cluster import KMeans

# Fit K-Means on the PCA DataFrame for k = 1..10 and record each model's inertia
k = [*range(1, 11)]
inertia = []
for n_clusters in k:
    k_model = KMeans(n_clusters=n_clusters, n_init='auto', random_state=1).fit(pca_df)
    inertia.append(k_model.inertia_)

# Tabulate k against its inertia for the elbow curve
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)


In [None]:
# Review the DataFrame and plot inertia against k to look for the elbow.
# NOTE(review): when run as a notebook cell, the head() result below is not
# displayed because it is not the cell's last expression — confirm whether it
# is still wanted.
df_elbow.head()
df_elbow.plot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

In [None]:
# Report how much the inertia drops, in percent, between each pair of consecutive k values
k = df_elbow["k"]
inertia = df_elbow["inertia"]
k_values = k.to_numpy()
inertia_values = inertia.to_numpy()
for k_prev, k_next, inertia_prev, inertia_next in zip(
    k_values[:-1], k_values[1:], inertia_values[:-1], inertia_values[1:]
):
    percentage_decrease = (inertia_prev - inertia_next) / inertia_prev * 100
    print(f"Percentage decrease from k={k_prev} to k={k_next}: {percentage_decrease:.2f}%")

In [None]:
# Define the K-Means model with 4 clusters
model = KMeans(n_clusters=4, n_init='auto', random_state=0)

# Fit the model on the two PCA components
model.fit(pca_df)

# Make predictions: one cluster label per row of the PCA DataFrame
# NOTE: despite its name, k_3 holds the labels of the 4-cluster model above
k_3 = model.predict(pca_df)

# Create a copy of the PCA DataFrame
pca_predictions_df = pca_df.copy()

# Add a column with the cluster labels
pca_predictions_df["income_segments"] = k_3

In [None]:
# Scatter-plot the two PCA components, coloring each point by its K-Means cluster label
pca_predictions_df.plot.scatter(
    x="PCA1",
    y="PCA2",
    c="income_segments",
    colormap='rainbow')

In [None]:
# Tabulate the weight of each original feature in each principal component and
# sort by the PCA1 weight, largest first.
# Assumes X_train_scaled kept the column order of X_train — TODO confirm in pipeline_utilities.
pca_component_weights = pd.DataFrame(pca.components_.T, columns=['PCA1', 'PCA2'], index=X_train.columns)
pca_component_weights.sort_values('PCA1', ascending=False)

## Create and Fit a Logistic Regression Model

Create a Logistic Regression model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. You may choose any starting settings you like. 

In [None]:
# All requirements above have been coded into the pipeline_utilities python program file;
# this cell only calls its logistic regression helper on the scaled data

random_state = 1
p_util.logistic_regression_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state)


## Create and Fit a Random Forest Classifier Model

Create a Random Forest Classifier model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. You may choose any starting settings you like. 

In [None]:
# All details have been coded into the pipeline_utilities python program file
# Tried 500 estimators: almost no difference, and the balanced accuracy score was slightly worse, so it is back to 100
# The original (unscaled) feature names are passed as the last argument
# (presumably for feature-importance reporting — confirm in pipeline_utilities)

random_state = 1
n_estimators = 100
p_util.random_forest_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, random_state, n_estimators, X_train.columns)


## Create and Fit an SVM Model

Create a Support Vector Machine model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. 

In [None]:
# All details have been coded into pipeline_utilities python program file, takes FOREVER to run

#kernel_type = 'linear'
#p_util.svm_model_generator(X_train_scaled, X_test_scaled, y_train, y_test, kernel_type)


## Findings and Conclusions

What was the result of your analysis? Which model performed better?

* All of the models that I tried achieved an accuracy score above 90% on the test data. The Random Forest model had the best F1 scores (52%-98%), with Gradient Boost running a close second (51%-98%). The PCA analysis resulted in a very pretty set of 4 segments, but PCA1 and PCA2 together explained only 0.27 of the total variance, so PCA did not reduce dimensionality effectively. My SVC model took forever to run. I ran a total of 6 models.