Task: Develop a machine learning classification model to predict if a person's salary is over or under 50k.

In [8]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

## Import data

In [9]:
df = pd.read_excel('./data/Ingresos_por_persona.xlsx')
df.head()

Unnamed: 0,edad,nivel_educ,raza,sexo,ganancias_capital,perdidas_capital,Horas_semana,Ingresos
0,39,13,White,Male,2174,0,40,<=50K
1,50,13,White,Male,0,0,13,<=50K
2,38,9,White,Male,0,0,40,<=50K
3,53,7,Black,Male,0,0,40,<=50K
4,28,13,Black,Female,0,0,40,<=50K


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   edad               32561 non-null  int64 
 1   nivel_educ         32561 non-null  int64 
 2   raza               32561 non-null  object
 3   sexo               32561 non-null  object
 4   ganancias_capital  32561 non-null  int64 
 5   perdidas_capital   32561 non-null  int64 
 6   Horas_semana       32561 non-null  int64 
 7   Ingresos           32561 non-null  object
dtypes: int64(5), object(3)
memory usage: 2.0+ MB


## Pre-processing

In [11]:
df = pd.get_dummies(df, columns=['raza', 'sexo'], drop_first=True)
df.head()

Unnamed: 0,edad,nivel_educ,ganancias_capital,perdidas_capital,Horas_semana,Ingresos,raza_ Asian-Pac-Islander,raza_ Black,raza_ Other,raza_ White,sexo_ Male
0,39,13,2174,0,40,<=50K,False,False,False,True,True
1,50,13,0,0,13,<=50K,False,False,False,True,True
2,38,9,0,0,40,<=50K,False,False,False,True,True
3,53,7,0,0,40,<=50K,False,True,False,False,True
4,28,13,0,0,40,<=50K,False,True,False,False,False


In [13]:
X = df.drop(columns=['Ingresos']).values
y = df['Ingresos'].values

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

## Classifier
### Model selection

In [17]:
models = []
models.append(('LR', LogisticRegression(max_iter=200)))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

#### Training - Parameter optimization

In [18]:
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10)
    cv_results = model_selection.cross_val_score(model, X, y, cv=kfold)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LR: 0.820982 (0.004938)
LDA: 0.811032 (0.006049)
KNN: 0.828691 (0.007374)
CART: 0.815761 (0.005940)
NB: 0.796290 (0.005260)
SVM: 0.802341 (0.005970)


Decission tree seems to have the best potential, although it doesn't have the lowest std.

### Final model - optimization

In [26]:
CART = DecisionTreeClassifier(random_state=0)
criterion = ['gini', 'entropy']
max_depth = [None, 10, 20, 30, 40, 50]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
param_grid = {
    'criterion': criterion,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}
cart_cv = GridSearchCV(CART, param_grid, verbose = 3, cv=5)
cart_cv.fit(X, y)
print("Best parameters: ", cart_cv.best_params_)
print("Best score: ", cart_cv.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=0.815 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=0.814 total time=   0.1s
[CV 3/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=0.814 total time=   0.1s
[CV 4/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=0.811 total time=   0.1s
[CV 5/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=0.812 total time=   0.1s
[CV 1/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5;, score=0.822 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5;, score=0.818 total time=   0.1s
[CV 3/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5;, score=0.822 total time= 

In [27]:
CART_final = DecisionTreeClassifier(
    criterion=cart_cv.best_params_['criterion'],
    max_depth=cart_cv.best_params_['max_depth'],
    min_samples_split=cart_cv.best_params_['min_samples_split'],
    min_samples_leaf=cart_cv.best_params_['min_samples_leaf']
)
CART_final.fit(X, y)

## New predictions

In [30]:
df_new = pd.read_excel('./data/Ingresos_nuevos_datos.xlsx')
df_new = pd.get_dummies(df_new, columns=['raza', 'sexo'], drop_first=True)
X_new = df_new.values
y_prediction = CART_final.predict(X_new)
print(f"Predictions: {y_prediction}")

Predictions: [' <=50K' ' <=50K' ' <=50K' ' <=50K' ' <=50K' ' <=50K' ' >50K' ' >50K'
 ' <=50K' ' >50K' ' <=50K' ' <=50K' ' <=50K' ' >50K']
