<a href="https://colab.research.google.com/github/bnnguyen/USIncomePredictionTraining/blob/main/US_Income_Data_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data pre-processing



## Feature Encoding

Some of our features are categorical values stored as strings. So let's first start by encoding them as integers!

In [None]:
def featureEncoding(df):
  """Label-encode every string-typed column of ``df`` in place.

  Each feature column whose dtype is ``object`` is replaced by the integer
  codes produced by ``LabelEncoder``. When such a column contains the
  placeholder ``'?'``, the code assigned to it is rewritten to ``-1``. The
  target column ``'income_50k'``, when present, is encoded last and its
  label-to-code mapping is printed.

  Args:
    df: pandas DataFrame to encode. It is modified in place.

  Returns:
    The same DataFrame object with the encoded columns.
  """
  from sklearn.preprocessing import LabelEncoder

  target = 'income_50k'
  le = LabelEncoder()

  # Pick the feature columns by name, not by position: for a frame that has
  # no target column (the else-branch below), slicing off the last column
  # would silently leave the last feature unencoded.
  for column in df.columns:
    if column == target or df[column].dtype != 'object':
      continue
    codes = le.fit_transform(df[column])
    column_map = dict(zip(le.classes_, le.transform(le.classes_)))
    missing_code = column_map.get('?')
    if missing_code is not None:
      # Re-tag the '?' placeholder with -1 instead of its ordinary class code
      codes[codes == missing_code] = -1
    df[column] = codes

  # Check if the 'income_50k' column exists before transforming
  if target in df.columns:
    df[target] = le.fit_transform(df[target])
    class_map = dict(zip(le.classes_, le.transform(le.classes_)))
    print(f"income_50k encoding: {class_map}")
  else:
    print("No 'income_50k' column found in the DataFrame.")
  return df


## Feature Imputation

Notice that there are some values that are missing. Therefore we will first perform data imputation over the missing data.  

In [None]:
def simpleImputation(df):
  """Replace the -1 markers in the feature columns by each column's mode.

  Every column except 'income_50k' is run through a SimpleImputer that
  treats -1 as the missing value and fills it with the most frequent value
  of that column. The result is written back into ``df``.

  Args:
    df: pandas DataFrame to impute. It is modified in place.

  Returns:
    The same DataFrame object with the imputed feature columns.
  """
  from sklearn.impute import SimpleImputer

  # Everything but the target column takes part in the imputation
  feature_columns = df.columns[df.columns != 'income_50k'].tolist()

  mode_imputer = SimpleImputer(strategy='most_frequent', missing_values=-1)
  mode_imputer.fit(df[feature_columns])
  df[feature_columns] = mode_imputer.transform(df[feature_columns])
  return df



You can also use the KNNImputer! In fact, you are encouraged to go through the functions listed here: https://scikit-learn.org/stable/modules/impute.html#

## Feature Scaling

Now we see that the values are in different ranges! We can handle this by scaling the values of the DataFrame! Note that this might not always be necessary, so don't assume that this is something that you have to do for all your problems!

In [None]:
def featureScaling(df):
  """Rescale every column of ``df`` with a default ``MinMaxScaler``.

  The scaler is fitted on all columns of the frame (the target column, if
  present, is scaled as well).

  Args:
    df: pandas DataFrame holding only numeric values.

  Returns:
    A new DataFrame with the scaled values, the original column labels and
    a fresh default index.
  """
  # Imported locally so the function does not depend on a global `pd`
  # having been created by some other notebook cell.
  import pandas as pd
  from sklearn.preprocessing import MinMaxScaler

  scaled_array = MinMaxScaler().fit_transform(df.values)
  return pd.DataFrame(scaled_array, columns=df.columns)

## Data Balancing

### Using imbalanced-learn

imbalanced-learn is a library for working with imbalanced-data. Find more at: https://imbalanced-learn.org/stable/

In this example, we will see 3 popular methods of working with imbalanced data using the library imbalanced-learn

Below is an example to use `RandomUnderSampler`. If you specify a number between 0 and 1 for sampling_strategy, it will make sure that the proportion of minority/majority samples is equal to that number.

In [None]:
def Undersampling(df):
  """Balance the classes by randomly dropping rows of the larger classes.

  The last column of ``df`` is taken as the label and every other column as
  a feature. Class counts after resampling are printed.

  Args:
    df: fully numeric pandas DataFrame whose last column is the label.

  Returns:
    A new DataFrame with the resampled rows and the original column labels.
    Feature values are floats, labels are ints.
  """
  # Imported locally so the function does not depend on a global `pd`
  # having been created by some other notebook cell.
  import pandas as pd
  from collections import Counter
  from imblearn.under_sampling import RandomUnderSampler

  column_names = df.columns
  # Split the frame into the feature matrix and the label vector
  X = df[column_names[:-1]].values.astype(float)
  y = df[column_names[-1]].values.astype(int)

  sampler = RandomUnderSampler(random_state=0, sampling_strategy='not minority')
  X_resampled, y_resampled = sampler.fit_resample(X, y)
  print(sorted(Counter(y_resampled).items()))

  resampled = pd.DataFrame(X_resampled)
  # Temporary label; the original column names are restored on the next line
  resampled["income_50k"] = y_resampled
  resampled.columns = column_names
  return resampled

Similarly for oversampling...

In [None]:
def Oversampling(df):
  """Balance the classes by randomly duplicating rows of the smaller classes.

  The last column of ``df`` is taken as the label and every other column as
  a feature. Class counts after resampling are printed.

  Args:
    df: fully numeric pandas DataFrame whose last column is the label.

  Returns:
    A new DataFrame with the resampled rows and the original column labels.
    Feature values are floats, labels are ints.
  """
  # Imported locally so the function does not depend on a global `pd`
  # having been created by some other notebook cell.
  import pandas as pd
  from collections import Counter
  from imblearn.over_sampling import RandomOverSampler

  column_names = df.columns
  # Split the frame into the feature matrix and the label vector
  X = df[column_names[:-1]].values.astype(float)
  y = df[column_names[-1]].values.astype(int)

  sampler = RandomOverSampler(random_state=0, sampling_strategy='not majority')
  X_resampled, y_resampled = sampler.fit_resample(X, y)
  print(sorted(Counter(y_resampled).items()))

  resampled = pd.DataFrame(X_resampled)
  # Temporary label; the original column names are restored on the next line
  resampled["income_50k"] = y_resampled
  resampled.columns = column_names
  return resampled

Using the SMOTE method is also simple using imbalanced-learn:

In [None]:
def smote(df):
  """Balance the classes by synthesising new minority rows with SMOTE.

  The last column of ``df`` is taken as the label and every other column as
  a feature. SMOTE is run with ``k_neighbors=5`` and a fixed random state.
  Class counts after resampling are printed.

  Args:
    df: fully numeric pandas DataFrame whose last column is the label.

  Returns:
    A new DataFrame with the resampled rows and the original column labels.
    Feature values are floats, labels are ints.
  """
  # Imported locally so the function does not depend on a global `pd`
  # having been created by some other notebook cell.
  import pandas as pd
  from collections import Counter
  from imblearn.over_sampling import SMOTE

  column_names = df.columns
  # Split the frame into the feature matrix and the label vector
  X = df[column_names[:-1]].values.astype(float)
  y = df[column_names[-1]].values.astype(int)

  sampler = SMOTE(random_state=0, sampling_strategy='not majority', k_neighbors=5)
  X_resampled, y_resampled = sampler.fit_resample(X, y)
  print(sorted(Counter(y_resampled).items()))

  resampled = pd.DataFrame(X_resampled)
  # Temporary label; the original column names are restored on the next line
  resampled["income_50k"] = y_resampled
  resampled.columns = column_names
  return resampled

## Try to apply PCA

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

def plot_num_component_vs_explained_variance(X_train):
    """Plot cumulative explained variance against the number of PCA components.

    A PCA with as many components as the data allows is fitted on
    ``X_train``; the running total of ``explained_variance_ratio_`` (in
    percent) is then drawn for 1, 2, ... components.

    Args:
        X_train: 2-D array-like of shape (n_samples, n_features).
    """
    # PCA cannot extract more components than min(n_samples, n_features),
    # so cap the count instead of always asking for n_features.
    n_components = min(X_train.shape)
    pca = PCA(n_components=n_components).fit(X_train)

    x_axis = np.arange(1, n_components + 1)
    # Running total of the per-component ratios, expressed in percent
    y_axis = np.cumsum(pca.explained_variance_ratio_) * 100

    plt.clf()
    plt.plot(x_axis, y_axis)
    plt.xlabel("Number of selected features")
    plt.ylabel("%age of explained variance")
    plt.title("Num Features vs Explained Variance")
    plt.show()

# Draw the cumulative explained-variance curve for the training features.
# NOTE(review): X_train is first assigned by train_test_split in a later
# cell, so that cell has to be run before this call.
plot_num_component_vs_explained_variance(X_train)

Looking at this graph, we can tell that beyond about 5 components, additional components explain almost no further variance.

Let's continue with our PCA of 5 features and modified train and test sets.

In [None]:
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
import time

# Fetch the CSV from Google Drive through its file id
file_id = '1ArItH0Fuovf5VtsV1p9waK6hMg4uLVTa'
url = f'https://drive.google.com/uc?id={file_id}'
df = pd.read_csv(url)

# Preprocessing chain: encode -> impute -> SMOTE (featureScaling is not applied)
df = smote(simpleImputation(featureEncoding(df)))

# All columns but the last are features; the last column is the label
X = df.iloc[:, :-1].values.astype(float)
y = df.iloc[:, -1].values.astype(int)

# Keep 20% of the rows aside as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

def make_pca(X_train, num_components):
    """Fit a PCA on ``X_train`` and return it with the projected data.

    Args:
        X_train: data the PCA is fitted on.
        num_components: number of principal components to keep.

    Returns:
        A ``(fitted_pca, projected_X_train)`` tuple.
    """
    reducer = PCA(n_components=num_components)
    projected = reducer.fit_transform(X_train)
    return reducer, projected

# Fit a 5-component PCA on the training split only ...
pca_5, X_train_pca_5 = make_pca(X_train, num_components = 5)
# ... and project the test split with that same fitted PCA.
X_test_pca_5 = pca_5.transform(X_test)

# Models

## KNN

Looking at the explained-variance graph, we can tell that beyond about 5 components, additional components explain almost no further variance.

Let's continue with our PCA of 5 features and modified train and test sets.

In [None]:
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
import time

# Download the dataset from Google Drive by file id
file_id = '1ArItH0Fuovf5VtsV1p9waK6hMg4uLVTa'
url = f'https://drive.google.com/uc?id={file_id}'
df = pd.read_csv(url)

# Encode the string columns, fill the missing values, then rebalance with
# SMOTE (featureScaling is not applied)
df = featureEncoding(df)
df = smote(simpleImputation(df))

# Feature matrix = every column except the last; label vector = last column
X = df.iloc[:, :-1].values.astype(float)
y = df.iloc[:, -1].values.astype(int)

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Plotting function
def plot_accuracy(accuracy_per_k, classifier_name):
    """Show a bar chart of the accuracy, in percent, reached for each k.

    Args:
        accuracy_per_k: mapping from k to the accuracy obtained with that k,
            given as a fraction (e.g. 0.83), as returned by a classifier's
            ``score`` method.
        classifier_name: label inserted into the chart title.
    """
    k_values = list(accuracy_per_k)
    # The axis is labelled in percent, so convert the fractions to match
    accuracy_pct = [100 * value for value in accuracy_per_k.values()]

    plt.bar(k_values, accuracy_pct)
    plt.xlabel("k")
    plt.ylabel("Accuracy (%)")
    plt.title(f"Accuracy of KNN per k for {classifier_name}")
    plt.show()

def find_best_k(X_train, X_test, y_train, y_test):
    """Try k = 1..10 for a KNN classifier and report the best one.

    For each k a KNeighborsClassifier is fitted on the training data and
    scored on the test data. The scores are plotted with plot_accuracy.

    Args:
        X_train: training features.
        X_test: features used for scoring.
        y_train: training labels.
        y_test: labels used for scoring.

    Returns:
        A ``(best_k, best_accuracy)`` tuple. On ties the smallest k wins;
        ``(0, 0)`` is returned if no k scores above zero.
    """
    accuracy_per_k = {}
    for n_neighbors in range(1, 11):
        model = KNeighborsClassifier(n_neighbors)
        model.fit(X_train, y_train)
        accuracy_per_k[n_neighbors] = model.score(X_test, y_test)

    # Strict ">" keeps the first (smallest) k among equal scores
    best_k, best_accuracy = 0, 0
    for n_neighbors, accuracy in accuracy_per_k.items():
        if accuracy > best_accuracy:
            best_k, best_accuracy = n_neighbors, accuracy

    plot_accuracy(accuracy_per_k, "Classifier")
    return best_k, best_accuracy

# Project THIS cell's split onto 5 principal components. Re-using the
# X_train_pca_5 / X_test_pca_5 arrays computed in the earlier PCA cell would
# pair them with the y_train / y_test of the new random split made above, so
# features and labels would no longer describe the same rows.
pca_5, X_train_pca_5 = make_pca(X_train, num_components=5)
X_test_pca_5 = pca_5.transform(X_test)
X_train = X_train_pca_5
X_test = X_test_pca_5

# Search k = 1..10 and time the whole search
train_start_time = time.perf_counter()
best_k, best_score = find_best_k(X_train, X_test, y_train, y_test)
train_total_time = time.perf_counter() - train_start_time
print(f"The best value of k is {best_k} with accuracy {best_score * 100}%")
print(f"Time for evaluate the best k: {train_total_time}seconds")

# Train and score one final classifier with the selected k
clf = KNeighborsClassifier(best_k)

train_start_time = time.perf_counter()
clf.fit(X_train, y_train)
train_total_time = time.perf_counter() - train_start_time

test_start_time = time.perf_counter()
score = clf.score(X_test, y_test)
test_total_time = time.perf_counter() - test_start_time

print(f"Training finished in {train_total_time} seconds")
print(f"Score: {score*100}%. Scoring took {test_total_time} seconds")

# Time the prediction of a single sample (orig_score holds the predicted class)
orig_test_start_time = time.perf_counter()
orig_score = clf.predict([X_test[2]])
orig_test_total_time = time.perf_counter() - orig_test_start_time
print(f"Predicting the class of 1 sample took {orig_test_total_time} seconds")

## Decision Tree

In [None]:
from sklearn import tree
from sklearn.model_selection import train_test_split
import time
import pandas as pd

# Load the dataset from Google Drive
file_id = '1ArItH0Fuovf5VtsV1p9waK6hMg4uLVTa'
url = 'https://drive.google.com/uc?id={}'.format(file_id)
df = pd.read_csv(url)

# Apply preprocessing steps (featureScaling is not applied)
df = featureEncoding(df)
df = simpleImputation(df)
df = smote(df)

# Every column but the last is a feature; the last column is the label
X, y = df[df.columns[:-1]].values.astype(float), df[df.columns[-1]].values.astype(int)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

import numpy as np

# Project THIS cell's split onto 5 principal components. Re-using the
# X_train_pca_5 / X_test_pca_5 arrays left over from an earlier cell would
# pair them with the y_train / y_test of the new random split made above, so
# features and labels would no longer describe the same rows.
pca_5, X_train_pca_5 = make_pca(X_train, num_components=5)
X_test_pca_5 = pca_5.transform(X_test)
X_train = X_train_pca_5
X_test = X_test_pca_5

clf = tree.DecisionTreeClassifier()

# Time the training
orig_train_start_time = time.perf_counter()
clf.fit(X_train, y_train)
orig_train_total_time = time.perf_counter() - orig_train_start_time

# Time the scoring on the test split
orig_test_start_time = time.perf_counter()
orig_score = clf.score(X_test, y_test)
orig_test_total_time = time.perf_counter() - orig_test_start_time

print(f"Training finished in {orig_train_total_time} seconds")
print(f"Score: {orig_score*100}%. Scoring took {orig_test_total_time} seconds")

# Time the prediction of a single sample (orig_score now holds the predicted class)
orig_test_start_time = time.perf_counter()
orig_score = clf.predict([X_test[2]])
orig_test_total_time = time.perf_counter() - orig_test_start_time
print(f"Predicting the class of 1 sample took {orig_test_total_time} seconds")

## Logistic Regression

In [None]:
import pandas as pd
from IPython.display import display
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import time
# Load the dataset from Google Drive
file_id = '1ArItH0Fuovf5VtsV1p9waK6hMg4uLVTa'
url = 'https://drive.google.com/uc?id={}'.format(file_id)
df = pd.read_csv(url)

# Apply preprocessing steps (featureScaling is not applied)
df = featureEncoding(df)
df = simpleImputation(df)
df = smote(df)

# Every column but the last is a feature; the last column is the label
X, y = df[df.columns[:-1]].values.astype(float), df[df.columns[-1]].values.astype(int)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Project THIS cell's split onto 5 principal components. Re-using the
# X_train_pca_5 / X_test_pca_5 arrays left over from an earlier cell would
# pair them with the y_train / y_test of the new random split made above, so
# features and labels would no longer describe the same rows.
pca_5, X_train_pca_5 = make_pca(X_train, num_components=5)
X_test_pca_5 = pca_5.transform(X_test)
X_train = X_train_pca_5
X_test = X_test_pca_5

# Initialize and train the LogisticRegression model
clf = LogisticRegression()

orig_train_start_time = time.perf_counter()
clf.fit(X_train, y_train)
orig_train_total_time = time.perf_counter() - orig_train_start_time

# Time the scoring on the test split
orig_test_start_time = time.perf_counter()
orig_score = clf.score(X_test, y_test)
orig_test_total_time = time.perf_counter() - orig_test_start_time

print(f"Training finished in {orig_train_total_time} seconds")
print(f"Score: {orig_score*100}%. Scoring took {orig_test_total_time} seconds")

# Time the prediction of a single sample (orig_score now holds the predicted class)
orig_test_start_time = time.perf_counter()
orig_score = clf.predict([X_test[2]])
orig_test_total_time = time.perf_counter() - orig_test_start_time
print(f"Predicting the class of 1 sample took {orig_test_total_time} seconds")

## Neural Networks

In [None]:
from sklearn.datasets import make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from IPython.display import display
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
# Pull the CSV from Google Drive via its file id
file_id = '1ArItH0Fuovf5VtsV1p9waK6hMg4uLVTa'
url = f'https://drive.google.com/uc?id={file_id}'
df = pd.read_csv(url)

# Preprocessing for this experiment: encoding and imputation only
# (neither featureScaling nor smote is applied here)
df = simpleImputation(featureEncoding(df))

# Features are all columns except the last one; the last one is the label
X = df.iloc[:, :-1].values.astype(float)
y = df.iloc[:, -1].values.astype(int)

# Reserve 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [None]:

# One hyperparameter set shared by both networks, so the run on the
# PCA-reduced data and the run on the original data differ only in input.
mlp_settings = dict(
    hidden_layer_sizes=(5, 3),
    activation='relu',
    solver='adam',
    alpha=0.001,
    learning_rate_init=0.001,
    tol=0.0001,
    n_iter_no_change=10,
    max_iter=10000,
)

nn_clf = MLPClassifier(**mlp_settings)
nn_clf_pca_5 = MLPClassifier(**mlp_settings)

# Fit the 5-component PCA on the training split and project both splits
pca_5, X_train_pca_5 = make_pca(X_train, num_components=5)
X_test_pca_5 = pca_5.transform(X_test)

import time

# --- Network trained on the PCA-reduced features ---
pca_5_train_start_time = time.perf_counter()
nn_clf_pca_5.fit(X_train_pca_5, y_train)
pca_5_train_total_time = time.perf_counter() - pca_5_train_start_time

pca_5_test_start_time = time.perf_counter()
pca_5_score = nn_clf_pca_5.score(X_test_pca_5, y_test)
pca_5_test_total_time = time.perf_counter() - pca_5_test_start_time

print("With reduced dataframe:")
print(f"Training finished in {pca_5_train_total_time} seconds")
print(f"Score: {pca_5_score*100}%. Scoring took {pca_5_test_total_time} seconds")

# Time the prediction of a single sample (orig_score holds the predicted class)
orig_test_start_time = time.perf_counter()
orig_score = nn_clf_pca_5.predict([X_test_pca_5[2]])
orig_test_total_time = time.perf_counter() - orig_test_start_time
print(f"Predicting the class of 1 sample took {orig_test_total_time} seconds")

# --- Network trained on the original features ---
orig_train_start_time = time.perf_counter()
nn_clf.fit(X_train, y_train)
orig_train_total_time = time.perf_counter() - orig_train_start_time

orig_test_start_time = time.perf_counter()
orig_score = nn_clf.score(X_test, y_test)
orig_test_total_time = time.perf_counter() - orig_test_start_time

print("With original dataframe:")
print(f"Training finished in {orig_train_total_time} seconds")
print(f"Score: {orig_score*100}%. Scoring took {orig_test_total_time} seconds")

# Time a single prediction on a held-out sample, as the other model cells
# do, rather than on a row of the full dataset X.
orig_test_start_time = time.perf_counter()
orig_score = nn_clf.predict([X_test[2]])
orig_test_total_time = time.perf_counter() - orig_test_start_time
print(f"Predicting the class of 1 sample took {orig_test_total_time} seconds")

# Testing program

In [None]:
import pandas as pd
from IPython.display import display
from collections import Counter

# Download the dataset from Google Drive through its file id
file_id = '1ArItH0Fuovf5VtsV1p9waK6hMg4uLVTa'
url = f'https://drive.google.com/uc?id={file_id}'
df = pd.read_csv(url)

# Preprocessing: only the label encoding is applied here
df = featureEncoding(df)

# Features are all columns except the last one; the last one is the label
X = df.iloc[:, :-1].values.astype(float)
y = df.iloc[:, -1].values.astype(int)

# Print how many rows fall into each class
print(sorted(Counter(y).items()))
