In [1]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import graphviz
import subprocess
from sklearn import datasets, tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, make_scorer, plot_roc_curve, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from os import system
from IPython.display import Image

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



# Read the csv file from the local machine

In [None]:
# Load the training split of the Adult dataset.
df = pd.read_csv("../input/adult-data/adult-dataset.csv")
# Preview the first rows only; a `df.head()` that is not the cell's last expression
# is silently discarded, and displaying the whole frame adds nothing to the narrative.
df.head()

# load test data
The test set is pre-processed in the same way as the training set.

In [None]:
# Column names are passed explicitly to read_csv for the test file.
names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "class-label",
]
df_test = pd.read_csv("../input/adult-data/adult.test", names=names)
df_test

# Pre-processing data
We found that many instances contain a question mark (missing data), so we decided to remove these instances from our analysis.

In [None]:
# Treat the ' ?' placeholder as missing, then discard every row with any missing value.
df = df.replace(to_replace=' ?', value=np.nan)
df = df.dropna(how='any')

df

# Univariate analysis 

In [None]:
# Frequency of each class label.
df['class-label'].value_counts().plot.bar()


In [None]:
# Age distribution in 20 bins.
df.hist("age", bins=20)


In [None]:
# Frequency of each workclass value.
df['workclass'].value_counts().plot.bar()

In [None]:
# Frequency of each education value.
df['education'].value_counts().plot.bar()

In [None]:
# Frequency of each marital-status value.
df['marital-status'].value_counts().plot.bar()

In [None]:
# Frequency of each occupation value.
df['occupation'].value_counts().plot.bar()

In [None]:
# Frequency of each relationship value.
df['relationship'].value_counts().plot.bar()

In [None]:
# Frequency of each race value.
df['race'].value_counts().plot.bar()

In [None]:
# Frequency of each sex value.
df['sex'].value_counts().plot.bar()

Most of the records have `native-country` equal to United-States, so this feature carries little information and we can remove it from our analysis.

Because the capital-loss and capital-gain are numeric (most of them distributed in a wide range), we decided to normalize the data. 

In [None]:
# Min-max normalise capital-gain to the range [0, 1].
gain = df["capital-gain"]
df["capital-gain"] = (gain - gain.min()) / (gain.max() - gain.min())

# 20 equal-width bins of 0.05. The previous hand-written edge list skipped 0.2,
# which merged [0.15, 0.25) into one double-width bin and distorted the histogram.
bins = np.linspace(0, 1, 21)
plt.hist(df["capital-gain"], bins=bins, edgecolor="k")
plt.xlabel('Capital_Gain')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Min-max normalise capital-loss to the range [0, 1].
loss_values = df["capital-loss"]
df["capital-loss"] = (loss_values - loss_values.min()) / (loss_values.max() - loss_values.min())

# 20 equal-width bins of 0.05. The previous hand-written edge list skipped 0.2,
# which merged [0.15, 0.25) into one double-width bin and distorted the histogram.
bins = np.linspace(0, 1, 21)
plt.hist(df["capital-loss"], bins=bins, edgecolor="k")
plt.xlabel('Capital_Loss')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Min-max normalise fnlwgt to the range [0, 1].
weights = df["fnlwgt"]
df["fnlwgt"] = (weights - weights.min()) / (weights.max() - weights.min())

# 20 equal-width bins of 0.05. The previous hand-written edge list skipped 0.2,
# which merged [0.15, 0.25) into one double-width bin and left a gap in the x ticks.
bins = np.linspace(0, 1, 21)
plt.hist(df["fnlwgt"], bins=bins, edgecolor="k")
plt.xticks(bins)
plt.show()

In [None]:
# Frequency of each education-num value.
df['education-num'].value_counts().plot.bar()


In [None]:
# Histogram of weekly working hours in 10-hour bins from 0 to 100.
bins = list(range(0, 101, 10))
plt.hist(x=df["hours-per-week"], bins=bins, edgecolor="k")
plt.xticks(ticks=bins)
plt.show()




In [None]:
# Box plot of age to inspect spread and outliers.
df.boxplot(column='age')

In [None]:
# Box plot of the normalised fnlwgt to inspect spread and outliers.
df.boxplot(column='fnlwgt')

In [None]:
# Box plot of hours-per-week to inspect spread and outliers.
df.boxplot(column='hours-per-week')

We can drop 'native-country', 'workclass', 'capital-loss', and 'capital-gain' as features because they are very imbalanced (dominated by one value), and we assume that the occupation feature is more important than workclass.

In [None]:
# Remove the four features that were judged uninformative.
dropped_features = ["native-country", "workclass", "capital-loss", "capital-gain"]
df = df.drop(columns=dropped_features)

# Binarise class-label: one-hot encode it and keep the second dummy column.
df_bin = pd.get_dummies(df["class-label"])
df["class-label"] = df_bin.iloc[:, 1]
df

In [None]:
# Feature Transformation
# Collapse the raw education levels into six coarse groups.
# Both ' prof-school' and ' Prof-school' are listed: the dataset spells the value with a
# capital P, so the lower-case spelling alone never matched and left the category ungrouped.
education_groups = {
    'school': [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th', ' 11th', ' 12th'],
    'high school': [' HS-grad'],
    'higher': [' Assoc-voc', ' Assoc-acdm', ' prof-school', ' Prof-school', ' Some-college'],
    'undergrad': [' Bachelors'],
    'grad': [' Masters'],
    'doc': [' Doctorate'],
}
education_map = {raw: group for group, raws in education_groups.items() for raw in raws}
df['education'] = df['education'].replace(education_map)

# Collapse marital-status into married / not-married / other.
marital_groups = {
    'married': [' Married-civ-spouse', ' Married-AF-spouse'],
    'not-married': [' Never-married'],
    'other': [' Divorced', ' Separated', ' Widowed', ' Married-spouse-absent'],
}
marital_map = {raw: group for group, raws in marital_groups.items() for raw in raws}
df['marital-status'] = df['marital-status'].replace(marital_map)
df

In [None]:
# Class-label counts split by marital status. The column is passed by name via `x=`:
# passing the Series positionally together with `data=` fails on current seaborn releases.
sns.countplot(x='class-label', hue='marital-status', palette='coolwarm', data=df)


In [None]:
# Class-label counts split by education group. The column is passed by name via `x=`:
# passing the Series positionally together with `data=` fails on current seaborn releases.
sns.countplot(x='class-label', hue='education', palette='coolwarm', data=df)


In [None]:
# Class-label counts split by relationship. The column is passed by name via `x=`:
# passing the Series positionally together with `data=` fails on current seaborn releases.
sns.countplot(x='class-label', hue='relationship', palette='coolwarm', data=df)


# Bivariate analysis

In [None]:

plt.figure(figsize=(12, 9))

# At this point the frame still holds string-valued categorical columns, and recent pandas
# versions raise when corr() meets non-numeric data; restrict to numeric/boolean columns.
numeric_df = df.select_dtypes(include=['number', 'bool'])
hm = sns.heatmap(numeric_df.corr(), annot = True)

hm.set(title = "Correlation matrix of Adult dataset\n")

plt.show()

# Pre-processing test set data

In [None]:
# Mark ' ?' placeholders as missing and discard every incomplete row.
df_test = df_test.replace(to_replace=' ?', value=np.nan).dropna(how='any')

# Remove the same four features that were dropped from the training frame.
df_test = df_test.drop(columns=["native-country", "workclass", "capital-loss", "capital-gain"])

# Min-max normalise fnlwgt.
# NOTE(review): this scales with the test set's own min/max rather than the training
# set's values — confirm that is intended.
test_weights = df_test["fnlwgt"]
df_test["fnlwgt"] = (test_weights - test_weights.min()) / (test_weights.max() - test_weights.min())

# Binarise class-label: one-hot encode it and keep the second dummy column.
df_bin = pd.get_dummies(df_test["class-label"])
df_test["class-label"] = df_bin.iloc[:, 1]
df_test

In [None]:
# Feature Transformation of test set
# Collapse the raw education levels into six coarse groups.
# Both ' prof-school' and ' Prof-school' are listed: the dataset spells the value with a
# capital P, so the lower-case spelling alone never matched and left the category ungrouped.
education_groups = {
    'school': [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th', ' 11th', ' 12th'],
    'high school': [' HS-grad'],
    'higher': [' Assoc-voc', ' Assoc-acdm', ' prof-school', ' Prof-school', ' Some-college'],
    'undergrad': [' Bachelors'],
    'grad': [' Masters'],
    'doc': [' Doctorate'],
}
education_map = {raw: group for group, raws in education_groups.items() for raw in raws}
df_test['education'] = df_test['education'].replace(education_map)

# Collapse marital-status into married / not-married / other.
marital_groups = {
    'married': [' Married-civ-spouse', ' Married-AF-spouse'],
    'not-married': [' Never-married'],
    'other': [' Divorced', ' Separated', ' Widowed', ' Married-spouse-absent'],
}
marital_map = {raw: group for group, raws in marital_groups.items() for raw in raws}
df_test['marital-status'] = df_test['marital-status'].replace(marital_map)
df_test

In [None]:
# Label-encode the categorical columns of the train and test frames.
# One encoder per column is fitted on the values of BOTH frames, so a given category always
# maps to the same integer in train and test. Fitting independent encoders per frame (as
# before) yields mismatched codes whenever the two frames do not contain exactly the same
# set of categories. Each encoder is also fitted on its own column only, instead of
# re-encoding the whole frame once per column.
categorical_columns = ['education', 'marital-status', 'occupation', 'relationship', 'race', 'sex']
for column in categorical_columns:
    encoder = LabelEncoder().fit(pd.concat([df[column], df_test[column]]))
    df[column] = encoder.transform(df[column])
    df_test[column] = encoder.transform(df_test[column])
df

In [None]:
# Specify Train data: every column except the target is a feature.
# Selecting by name avoids the previous positional slice `iloc[:, 0:9]`, which stopped one
# column early and silently left out 'hours-per-week' (position 9) from the features.
x_train = df.drop(columns='class-label')
y_train = df['class-label']

# Specify Test data with the same rule.
x_test = df_test.drop(columns='class-label')
y_test = df_test['class-label']


## 3.3 Classification

# 3.3.1 Decision Trees Classification

In [None]:
#Train a DT classifier with gini index

# Baseline decision tree (Gini criterion, fixed seed), scored on the held-out test set.
clf_gini = tree.DecisionTreeClassifier(criterion="gini", random_state=1).fit(x_train, y_train)
y_pred = clf_gini.predict(x_test)
print(classification_report(y_test, y_pred))

# 3.3.1.1 Model parameter tuning 


In [None]:
from sklearn.model_selection import GridSearchCV

# defining parameter range
search_space = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [7, 8, 9],
    'min_samples_split': [4, 5],
    'min_samples_leaf': [4, 5],
    'max_leaf_nodes': [20, 25, 30],
}
# The estimator is seeded so repeated runs of the search select the same model; an
# unseeded tree can break ties between equally good splits differently on every run.
grid = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=1),
    param_grid=search_space,
    scoring=['accuracy', 'precision', 'recall', 'f1_macro'],
    refit='accuracy',   # the model is re-fitted using the best mean accuracy
    cv=10,
    verbose=5,
    error_score='raise',
)

# fitting the model for grid search
grid.fit(x_train, y_train)
best_dts_model = grid.best_estimator_
print ('The best parameters for evaluation are as per following:')
print (grid.best_params_)
print("the accuracy of the best model is  %0.2f" % grid.best_score_)

# 3.3.1.2 Evaluation setup & measures


In [None]:

# 10-fold cross-validated evaluation of the tuned decision tree on the training data.
# Label 1 is treated as the positive class (the same convention the perceptron metrics in
# this notebook use), so sensitivity is the recall on label 1 and specificity is the recall
# on label 0. The two pos_label values were previously swapped.
sensitivity = make_scorer(recall_score, pos_label=1)
specificity = make_scorer(recall_score, pos_label=0)
cv_metrics = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'sensitivity': sensitivity,
    'specificity': specificity,
}
for metric_name, scorer in cv_metrics.items():
    scores = cross_val_score(best_dts_model, x_train, y_train, cv=10, scoring=scorer)
    print("%0.2f %s with a standard deviation of %0.2f" % (scores.mean(), metric_name, scores.std()))

# confusion matrix from out-of-fold hard predictions
y_eval = cross_val_predict(best_dts_model, x_train, y_train, cv=10)
print ('The Confusion Matrix is:')
print (confusion_matrix(y_train, y_eval))

# plot AUC
# roc_curve expects (y_true, y_score); the arguments were previously reversed and fed hard
# 0/1 predictions. Out-of-fold probabilities of class 1 give a proper multi-threshold curve.
y_score = cross_val_predict(best_dts_model, x_train, y_train, cv=10, method='predict_proba')[:, 1]
fpr, tpr, threshold = roc_curve(y_train, y_score)
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

To summarize, we tuned the Decision Tree Classifier using input parameters. We found that "max_depth" and "max_leaf_nodes" are the most effective parameters which increased the accuracy of our model. It is worth mentioning that using "gini" or "entropy" criteria didn't change the accuracy of the mentioned model.

In [None]:
# predict x_test from Evaluated model in the previous section and calculate precision, recall, f1-score and accuracy
# Construct the model based on the tuned parameters
from sklearn import datasets, tree
# Re-build the decision tree with the tuned hyper-parameters and score it on the test set.
# random_state is fixed (same seed as the baseline tree) so the reported numbers are
# reproducible from run to run.
best_dts_model = tree.DecisionTreeClassifier(
    criterion="gini",
    max_depth=9,
    min_samples_split=4,
    min_samples_leaf=4,
    max_leaf_nodes=30,
    random_state=1,
)
best_dts_model = best_dts_model.fit(x_train,y_train)
y_pred = best_dts_model.predict(x_test)
print(classification_report(y_true = y_test, y_pred = y_pred))

By comparing the evaluation measures before and after tuning, we can see that precision, recall, f1-score, and accuracy have been improved by the parameter tuning.

# 3.3.1.3 Model interpretation/visualization


To visualize our model, we used dimensionality reduction.
This process is computationally expensive, so we used only the first 2000 instances of our test data.

In [None]:
from sklearn.manifold import MDS
from matplotlib import pyplot as plt
import sklearn.datasets as dt
import seaborn as sns         
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

# Reduce the dimension of x_test.
# The embedding must be built from the TEST features: the scatter plot colours these points
# with the predictions made for x_test, so embedding x_train (as before) paired every point
# with the label of an unrelated instance.
N_VIS = 2000  # MDS is expensive, so only the first N_VIS test instances are embedded
x_test_num = x_test.to_numpy()
dist_manhattan = manhattan_distances(x_test_num[0:N_VIS])
mds = MDS(dissimilarity='precomputed', random_state=0)
# Get the embeddings
x_test_num_L1 = mds.fit_transform(dist_manhattan)

Model visualization based on best DTs and Dimensionality reduction

In [None]:
# Visualize the DTs: pair each 2-D embedding with the predicted label of that instance.
y_pred_num = np.column_stack((x_test_num_L1, y_pred[0:2000]))
df_vis_dts = pd.DataFrame(data=y_pred_num, columns=['dim_1', 'dim_2', 'class-label'])

sns.set(rc={'figure.figsize': (11.7, 8.27)})

sns.scatterplot(
    data=df_vis_dts,
    x="dim_1",
    y="dim_2",
    hue="class-label",
    size="class-label",
    sizes=(100, 20),
    palette=['dodgerblue', 'red'],
    legend="full",
)

In [None]:
# `os` is not imported anywhere else in the notebook (only `from os import system` is),
# so it is imported here; without it the first line raises NameError.
import os

# Make the Graphviz `dot` executable discoverable, export the tuned tree to DOT,
# render it to PNG with `dot`, and display the image.
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz/bin'
tree.export_graphviz(best_dts_model, out_file='tree.dot')
system("dot -Tpng tree.dot -o tree1.png")
Image("tree1.png")

# 3.3.1.4 Discriminative behaviour

In [None]:
# Predict on the train and test features together and correlate every feature with the
# model's prediction. The tuned tree (best_dts_model) is used so that this section analyses
# the same model that was evaluated above, consistent with the KNN/NB/SVM sections; it
# previously used the untuned baseline clf_gini.
all_features= pd.concat([x_train,x_test])
y_pred = best_dts_model.predict(all_features)
final_df = all_features.assign(target=y_pred)

plt.figure(figsize=(12, 9))
hm = sns.heatmap(final_df.corr(), annot = True, cmap="crest")
hm.set(title = "Correlation matrix Trained model\n")
plt.show()

According to the above Correlation matrix of the model, we can see that there is a positive correlation between "sex" and "target" which means this model can not avoid discrimination against individuals in the "sex" feature. However, this model can avoid discrimination against "race" because the correlation is negligible.

# 3.3.2 KNNs Classification

In [None]:
# Build KNNs model
from sklearn import neighbors
# Baseline k-nearest-neighbours classifier with default settings, scored on the test set.
clf_knn = neighbors.KNeighborsClassifier()
clf_knn = clf_knn.fit(x_train, y_train)
y_pred = clf_knn.predict(x_test)
print(classification_report(y_test, y_pred))

# 3.3.2.1 Model parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn import neighbors
# Stratified 10-fold splitter shared by every candidate in the search.
skf = StratifiedKFold(n_splits=10)

# Candidate neighbourhood sizes, vote weightings and distance metrics.
search_space = {
    'n_neighbors': [3, 5, 10, 15, 20, 50],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}
grid = GridSearchCV(
    estimator=neighbors.KNeighborsClassifier(),
    param_grid=search_space,
    scoring=['accuracy', 'precision', 'recall', 'f1_macro'],
    refit='accuracy',
    cv=skf,
    verbose=1,
    n_jobs=-1,
    error_score='raise',
)

# Run the search and keep the estimator re-fitted with the best accuracy.
grid.fit(x_train, y_train)
best_knn_model = grid.best_estimator_
print ('The best parameters for evaluation are as per following:')
print (grid.best_params_)
print("the accuracy of the best model is  %0.2f" % grid.best_score_)




# 3.3.2.2 Evaluation setup 

In [None]:

# 10-fold cross-validated evaluation of the tuned KNN model on the training data.
# Label 1 is treated as the positive class (the same convention the perceptron metrics in
# this notebook use), so sensitivity is the recall on label 1 and specificity is the recall
# on label 0. The two pos_label values were previously swapped.
sensitivity = make_scorer(recall_score, pos_label=1)
specificity = make_scorer(recall_score, pos_label=0)
cv_metrics = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'sensitivity': sensitivity,
    'specificity': specificity,
}
for metric_name, scorer in cv_metrics.items():
    scores = cross_val_score(best_knn_model, x_train, y_train, cv=10, scoring=scorer)
    print("%0.2f %s with a standard deviation of %0.2f" % (scores.mean(), metric_name, scores.std()))

# confusion matrix from out-of-fold hard predictions
y_eval = cross_val_predict(best_knn_model, x_train, y_train, cv=10)
print ('The Confusion Matrix is:')
print (confusion_matrix(y_train, y_eval))

# plot AUC
# roc_curve expects (y_true, y_score); the arguments were previously reversed and fed hard
# 0/1 predictions. Out-of-fold probabilities of class 1 give a proper multi-threshold curve.
y_score = cross_val_predict(best_knn_model, x_train, y_train, cv=10, method='predict_proba')[:, 1]
fpr, tpr, threshold = roc_curve(y_train, y_score)
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

The parameters which affect our model are: the number of neighbors (k), how the label is predicted in relation to the labels of the k nearest neighbours, and what the distance metric is. For example, according to our parameter tuning above, the best metric for choosing the label is not the majority, but the weighted majority, considering the distances.

In [None]:
# Construct the model based on the tuned parameters
from sklearn import neighbors
# Re-build KNN with the chosen hyper-parameters and score it on the test set.
knn_params = {'metric': 'manhattan', 'n_neighbors': 50, 'weights': 'uniform'}
best_knn_model = neighbors.KNeighborsClassifier(**knn_params).fit(x_train, y_train)
y_pred = best_knn_model.predict(x_test)
print(classification_report(y_test, y_pred))

# 3.3.2.3 Model interpretation/visualization

To visualize our model, we used dimensionality reduction.
This process is computationally expensive, so we used only the first 2000 instances of our test data.

In [None]:
from sklearn.manifold import MDS
from matplotlib import pyplot as plt
import sklearn.datasets as dt
import seaborn as sns         
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

# Reduce the dimension of x_test.
# The embedding must be built from the TEST features: the scatter plot colours these points
# with the predictions made for x_test, so embedding x_train (as before) paired every point
# with the label of an unrelated instance.
N_VIS = 2000  # MDS is expensive, so only the first N_VIS test instances are embedded
x_test_num = x_test.to_numpy()
dist_manhattan = manhattan_distances(x_test_num[0:N_VIS])
mds = MDS(dissimilarity='precomputed', random_state=0)
# Get the embeddings
x_test_num_L1 = mds.fit_transform(dist_manhattan)

Model visualization based on best KNN and Dimensionality reduction

In [None]:
# Visualize the KNN: pair each 2-D embedding with the predicted label of that instance.
y_pred_num = np.column_stack((x_test_num_L1, y_pred[0:2000]))
df_vis_dts = pd.DataFrame(data=y_pred_num, columns=['dim_1', 'dim_2', 'class-label'])

sns.set(rc={'figure.figsize': (11.7, 8.27)})

sns.scatterplot(
    data=df_vis_dts,
    x="dim_1",
    y="dim_2",
    hue="class-label",
    size="class-label",
    sizes=(100, 20),
    palette=['green', 'orange'],
    legend="full",
)

# 3.3.2.4 Discriminative behaviour

In [None]:
# Predict on the train and test features together with the tuned KNN model, then
# correlate every feature with the predicted target.
all_features = pd.concat(objs=[x_train, x_test], axis=0)
y_pred = best_knn_model.predict(all_features)
final_df = all_features.copy()
final_df['target'] = y_pred

correlations = final_df.corr()
plt.figure(figsize=(12, 9))
hm = sns.heatmap(correlations, annot=True, cmap="crest")
hm.set_title("Correlation matrix Trained model\n")
plt.show()

# 3.3.3 NBs Classification

In [None]:
# Build NBs model
from sklearn.naive_bayes import GaussianNB
# Baseline Gaussian naive Bayes classifier with default settings, scored on the test set.
clf_NB = GaussianNB().fit(x_train, y_train)
y_pred = clf_NB.predict(x_test)
print(classification_report(y_test, y_pred))

# 3.3.3.1 Model parameter tuning

In [None]:
# Stratified 10-fold splitter shared by every candidate in the search.
skf = StratifiedKFold(n_splits=10)

# 100 var_smoothing candidates, log-spaced from 1 down to 1e-9.
search_space = {'var_smoothing': np.logspace(0, -9, num=100)}
grid = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=search_space,
    scoring=['accuracy', 'recall', 'f1_macro'],
    refit='accuracy',
    cv=skf,
    verbose=1,
    n_jobs=-1,
    error_score='raise',
)

# Run the search and keep the estimator re-fitted with the best accuracy.
grid.fit(x_train, y_train)
best_NB_model = grid.best_estimator_
print ('The best parameters for evaluation are as per following:')
print (grid.best_params_)
print("the accuracy of the best model is  %0.2f" % grid.best_score_)




# 3.3.3.2 Evaluation setup 

In [None]:
# 10-fold cross-validated evaluation of the tuned naive Bayes model on the training data.
# Label 1 is treated as the positive class (the same convention the perceptron metrics in
# this notebook use), so sensitivity is the recall on label 1 and specificity is the recall
# on label 0. The two pos_label values were previously swapped.
sensitivity = make_scorer(recall_score, pos_label=1)
specificity = make_scorer(recall_score, pos_label=0)
cv_metrics = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'sensitivity': sensitivity,
    'specificity': specificity,
}
for metric_name, scorer in cv_metrics.items():
    scores = cross_val_score(best_NB_model, x_train, y_train, cv=10, scoring=scorer)
    print("%0.2f %s with a standard deviation of %0.2f" % (scores.mean(), metric_name, scores.std()))

# confusion matrix from out-of-fold hard predictions
y_eval = cross_val_predict(best_NB_model, x_train, y_train, cv=10)
print ('The Confusion Matrix is:')
print (confusion_matrix(y_train, y_eval))

# plot AUC
# roc_curve expects (y_true, y_score); the arguments were previously reversed and fed hard
# 0/1 predictions. Out-of-fold probabilities of class 1 give a proper multi-threshold curve.
y_score = cross_val_predict(best_NB_model, x_train, y_train, cv=10, method='predict_proba')[:, 1]
fpr, tpr, threshold = roc_curve(y_train, y_score)
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# predict x_test from Evaluate model in the previous section and calculate precision, recall, f1-score and accuracy
from sklearn.naive_bayes import GaussianNB
# Re-build naive Bayes with the chosen var_smoothing and score it on the test set.
best_NB_model = GaussianNB(var_smoothing=1e-09).fit(x_train, y_train)
y_pred = best_NB_model.predict(x_test)
print(classification_report(y_test, y_pred))

# 3.3.3.3 Model interpretation/visualization

To visualize our model, we used dimensionality reduction.
This process is computationally expensive, so we used only the first 2000 instances of our test data.

In [None]:
from sklearn.manifold import MDS
from matplotlib import pyplot as plt
import sklearn.datasets as dt
import seaborn as sns         
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

# Reduce the dimension of x_test.
# The embedding must be built from the TEST features: the scatter plot colours these points
# with the predictions made for x_test, so embedding x_train (as before) paired every point
# with the label of an unrelated instance.
N_VIS = 2000  # MDS is expensive, so only the first N_VIS test instances are embedded
x_test_num = x_test.to_numpy()
dist_manhattan = manhattan_distances(x_test_num[0:N_VIS])
mds = MDS(dissimilarity='precomputed', random_state=0)
# Get the embeddings
x_test_num_L1 = mds.fit_transform(dist_manhattan)

Model visualization based on best NB and Dimensionality reduction

In [None]:
# Visualize the NB: pair each 2-D embedding with the predicted label of that instance.
y_pred_num = np.column_stack((x_test_num_L1, y_pred[0:2000]))
df_vis_dts = pd.DataFrame(data=y_pred_num, columns=['dim_1', 'dim_2', 'class-label'])

sns.set(rc={'figure.figsize': (11.7, 8.27)})

sns.scatterplot(
    data=df_vis_dts,
    x="dim_1",
    y="dim_2",
    hue="class-label",
    size="class-label",
    sizes=(100, 20),
    palette=['red', 'blue'],
    legend="full",
)

# 3.3.3.4 Discriminative behaviour

In [None]:
# Predict on the train and test features together with the tuned naive Bayes model, then
# correlate every feature with the predicted target.
all_features = pd.concat(objs=[x_train, x_test], axis=0)
y_pred = best_NB_model.predict(all_features)
final_df = all_features.copy()
final_df['target'] = y_pred

correlations = final_df.corr()
plt.figure(figsize=(12, 9))
hm = sns.heatmap(correlations, annot=True, cmap="crest")
hm.set_title("Correlation matrix Trained model\n")
plt.show()

# 3.3.4 Classification SVMs

In [None]:
#Train a SVM classifier
from sklearn.svm import SVC
# Baseline linear-kernel SVM, scored on the test set.
svclassifier = SVC(kernel='linear')
clf_svm = svclassifier.fit(X=x_train, y=y_train)
y_pred = clf_svm.predict(x_test)
print(classification_report(y_test, y_pred))

# 3.3.4.1 Model parameter tuning

In [None]:

# Parameters Tunning and Evaluation
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import pickle


# defining parameter range: one sub-grid per kernel family
svm_linear = {'C': [0.2, 0.3],
              'kernel': ['linear']}
svm_others = {'C': [0.3, 0.5],
              'gamma': [0.001, 0.003],
              'kernel': ['rbf', 'sigmoid']}

svm_poly =  {'C': [0.5, 1], 'gamma': [0.001, 0.003], 'kernel': ['poly'], 'degree': [2, 3]}  # 1, 0.001, 5 acc 0.81

search_space = [svm_poly, svm_linear, svm_others]

grid = GridSearchCV(
    estimator=SVC(),
    param_grid=search_space,
    scoring=['accuracy', 'precision', 'recall', 'f1_macro'],
    refit='accuracy',
    cv=10,
    verbose=5,
    error_score='raise',
)

# fitting the model for grid search
grid.fit(x_train, y_train)
best_svm_model = grid.best_estimator_
print ('The best parameters for evaluation are as per following:')
print (grid.best_params_)
print("the accuracy of the best model is  %0.2f" % grid.best_score_)

# Persist the best model; the context manager guarantees the file handle is closed
# (the previous bare open() call leaked it).
filename_svm = 'best_svm_model.sav'
with open(filename_svm, 'wb') as model_file:
    pickle.dump(best_svm_model, model_file)

# confusion matrix from out-of-fold hard predictions
y_eval_svm = cross_val_predict(best_svm_model, x_train, y_train, cv=10)

print ('The Confusion Matrix is:')
print (confusion_matrix(y_train, y_eval_svm))

# plot AUC
# roc_curve expects (y_true, y_score); the arguments were previously reversed and fed hard
# 0/1 predictions. SVC has no predict_proba unless probability=True, so the out-of-fold
# decision_function values are used as continuous scores.
y_score_svm = cross_val_predict(best_svm_model, x_train, y_train, cv=10, method='decision_function')
fpr, tpr, threshold = roc_curve(y_train, y_score_svm)
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# 3.3.4.2 Evaluation setup 

In [None]:
# 10-fold cross-validated evaluation of the tuned SVM on the training data.
# Label 1 is treated as the positive class (the same convention the perceptron metrics in
# this notebook use), so sensitivity is the recall on label 1 and specificity is the recall
# on label 0. The two pos_label values were previously swapped.
sensitivity = make_scorer(recall_score, pos_label=1)
specificity = make_scorer(recall_score, pos_label=0)
cv_metrics = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'sensitivity': sensitivity,
    'specificity': specificity,
}
for metric_name, scorer in cv_metrics.items():
    scores = cross_val_score(best_svm_model, x_train, y_train, cv=10, scoring=scorer)
    print("%0.2f %s with a standard deviation of %0.2f" % (scores.mean(), metric_name, scores.std()))

# confusion matrix from out-of-fold hard predictions
y_eval = cross_val_predict(best_svm_model, x_train, y_train, cv=10)
print ('The Confusion Matrix is:')
print (confusion_matrix(y_train, y_eval))

# plot AUC
# roc_curve expects (y_true, y_score); the arguments were previously reversed and fed hard
# 0/1 predictions. SVC has no predict_proba unless probability=True, so the out-of-fold
# decision_function values are used as continuous scores.
y_score = cross_val_predict(best_svm_model, x_train, y_train, cv=10, method='decision_function')
fpr, tpr, threshold = roc_curve(y_train, y_score)
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# predict x_test from Evaluate model in the previous section and calculate precision, recall, f1-score and accuracy
from sklearn.svm import SVC
# Re-build the SVM with the chosen polynomial-kernel hyper-parameters and score it on the test set.
svm_params = {'kernel': 'poly', 'C': 1, 'degree': 3, 'gamma': 0.003}
svclassifier = SVC(**svm_params)
best_svm_model = svclassifier.fit(X=x_train, y=y_train)

y_pred = best_svm_model.predict(x_test)
print(classification_report(y_test, y_pred))


# 3.3.4.3 Model interpretation/visualization

To visualize our model, we used dimensionality reduction.
This process is computationally expensive, so we used only the first 2000 instances of our test data.

In [None]:
from sklearn.manifold import MDS
from matplotlib import pyplot as plt
import sklearn.datasets as dt
import seaborn as sns         
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

# Reduce the dimension of x_test.
# The embedding must be built from the TEST features: the scatter plot colours these points
# with the predictions made for x_test, so embedding x_train (as before) paired every point
# with the label of an unrelated instance.
N_VIS = 2000  # MDS is expensive, so only the first N_VIS test instances are embedded
x_test_num = x_test.to_numpy()
dist_manhattan = manhattan_distances(x_test_num[0:N_VIS])
mds = MDS(dissimilarity='precomputed', random_state=0)
# Get the embeddings
x_test_num_L1 = mds.fit_transform(dist_manhattan)

Model visualization based on best SVM and Dimensionality reduction

In [None]:
# Visualize the SVM: pair each 2-D embedding with the predicted label of that instance.
y_pred_num = np.column_stack((x_test_num_L1, y_pred[0:2000]))
df_vis_dts = pd.DataFrame(data=y_pred_num, columns=['dim_1', 'dim_2', 'class-label'])

sns.set(rc={'figure.figsize': (11.7, 8.27)})

sns.scatterplot(
    data=df_vis_dts,
    x="dim_1",
    y="dim_2",
    hue="class-label",
    size="class-label",
    sizes=(100, 20),
    palette=['black', 'red'],
    legend="full",
)

# 3.3.4.4 Discriminative behaviour

In [None]:
# Predict on the train and test features together with the tuned SVM, then
# correlate every feature with the predicted target.
all_features = pd.concat(objs=[x_train, x_test], axis=0)
y_pred = best_svm_model.predict(all_features)
final_df = all_features.copy()
final_df['target'] = y_pred

correlations = final_df.corr()
plt.figure(figsize=(12, 9))
hm = sns.heatmap(correlations, annot=True, cmap="crest")
hm.set_title("Correlation matrix Trained model\n")
plt.show()

# 3.3.5 Perceptron

In [None]:
from sklearn import preprocessing, neighbors
from os import system

# Build standardised feature matrices and label vectors for the perceptron.
# Fixes relative to the previous version of this cell:
#   * y_test is taken from the TEST frame (it was read from the training frame, so test
#     predictions were scored against unrelated labels);
#   * the scaler is fitted on the training features only and re-used for the test
#     features, so both splits share one scale;
#   * the frames are no longer passed through a per-frame LabelEncoder: the categorical
#     columns are already integer-coded by the earlier encoding cell, and re-encoding
#     every column separately per frame turned numeric features into frame-specific ranks.
feature_columns = df.columns.drop('class-label')

df1 = df.copy()
ss = preprocessing.StandardScaler().fit(df1[feature_columns].astype(float))
x_train = ss.transform(df1[feature_columns].astype(float))
y_train = df1['class-label'].to_numpy()

df2 = df_test.copy()
# Indexing with feature_columns also guarantees the same column order as the training matrix.
x_test = ss.transform(df2[feature_columns].astype(float))
y_test = df2['class-label'].to_numpy()

x_train.shape

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim


class AdultDataset(Dataset):
    """Map-style dataset that wraps a NumPy feature matrix and label vector as tensors.

    Subclasses ``torch.utils.data.Dataset`` (already imported in this cell) so the class
    is a proper dataset for ``DataLoader`` instead of a duck-typed plain object.

    Args:
        x: NumPy array of features; its first dimension is the number of samples.
        y: NumPy array of labels, indexed in parallel with ``x``.
    """

    def __init__(self, x, y):
        # torch.from_numpy shares memory with the source arrays (no copy is made).
        self.x = torch.from_numpy(x)
        self.y = torch.from_numpy(y)
        # The dataset length is defined by the number of feature rows.
        self.n = x.shape[0]

    def __len__(self):
        """Return the number of samples."""
        return self.n

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return  self.x[idx], self.y[idx]

# Cast the labels to 64-bit integers before they are wrapped as tensors.
y_train, y_test = y_train.astype(np.int64), y_test.astype(np.int64)

# Wrap both splits and serve them in shuffled mini-batches of 10.
train_data = AdultDataset(x=x_train, y=y_train)
test_data = AdultDataset(x=x_test, y=y_test)
trainset = DataLoader(dataset=train_data, batch_size=10, shuffle=True)
testset = DataLoader(dataset=test_data, batch_size=10, shuffle=True)

In [None]:
#Perceptron
class AdultNet(nn.Module):
    """Feed-forward network with one hidden ReLU layer that outputs log-probabilities."""

    def __init__(self, input_size , hidden_size, num_classes):
        super().__init__()
        # Input -> hidden and hidden -> class-score linear layers.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return per-class log-probabilities for a batch ``x`` (classes along dim 1)."""
        hidden = torch.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return torch.log_softmax(logits, dim=1)


# 10 input features, 10 hidden units, 2 output classes.
model = AdultNet(input_size=10, hidden_size=10, num_classes=2)
print(model)

In [None]:
# Smoke test: push one random 10-feature sample through the untrained network.
X = torch.rand(10).view(-1, 10)
output = model(X)
output

In [None]:
# Adam optimiser over all network parameters with a learning rate of 0.001.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
EPOCHS = 3
for epoch in range(EPOCHS):
    for x, y in trainset:
        # Clear the gradients accumulated by the previous mini-batch.
        model.zero_grad()
        batch_features = x.view(-1, 10).float()
        output = model(batch_features)
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()
    # Loss of the final mini-batch of this epoch.
    print(loss)

In [None]:
# Tally accuracy and the four confusion-matrix cells over the test loader
# (predicted class 1 counts as "positive").
correct = total = 0
true_pos = true_neg = false_pos = false_neg = 0
with torch.no_grad():
    for X, y in testset:
        output = model(X.view(-1, 10).float())
        for row, target in zip(output, y):
            predicted = torch.argmax(row)
            predicted_positive = bool(predicted)
            if predicted == target:
                correct += 1
                if predicted_positive:
                    true_pos += 1
                else:
                    true_neg += 1
            elif predicted_positive:
                false_pos += 1
            else:
                false_neg += 1
            total += 1
print("Accuracy: ", round(correct/total, 3))

In [None]:
#finding best learning rate.
import torch.optim.lr_scheduler as lr_scheduler

def lambda_fct(epoch):
    """Multiplicative learning-rate factor: decays by 5% per epoch."""
    return 0.95**epoch


scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_fct)
print(optimizer.state_dict()['param_groups'][0]['lr'])
for epoch in range(20):
    # Training pass over every mini-batch.
    model.train()
    for x, y in trainset:
        # The optimizer holds all model parameters, so one zero_grad call clears them all.
        optimizer.zero_grad()
        output = model(x.view(-1, 10).float())
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()
    # Accuracy on the test loader after this epoch.
    correct_ = 0
    total_ = 0
    with torch.no_grad():
        for X, y in testset:
            output = model(X.view(-1, 10).float())
            for row, target in zip(output, y):
                if torch.argmax(row) == target:
                    correct_ += 1
                total_ += 1
    print("Accuracy: ", round(correct_/total_, 3))
    # Advance the schedule and report the new learning rate.
    scheduler.step()
    print(optimizer.state_dict()['param_groups'][0]['lr'])

In [None]:
#confusion matrix
# Confusion matrix with labelled rows: columns are the predicted class, rows the actual
# class. Without an index the rows were shown as 0/1, which made the table ambiguous.
con_mat = pd.DataFrame(
    {'positive-pred': [true_pos, false_pos], 'negative-pred': [false_neg, true_neg]},
    index=['positive-actual', 'negative-actual'],
)
con_mat

In [None]:
# Metrics with class 1 as the positive class.
specificity_1 = true_neg/(true_neg+false_pos)
sensitivity_1 = true_pos/(true_pos+false_neg)
precision_1 = true_pos/(true_pos+false_pos)
f1_score_1 = ((precision_1*sensitivity_1*2)/(precision_1+sensitivity_1))

# Metrics with class 0 as the positive class: sensitivity and specificity swap roles.
specificity_0 = true_pos/(true_pos+false_neg)
sensitivity_0 = true_neg/(true_neg+false_pos)
precision_0 = true_neg/(true_neg+false_neg)
f1_score_0 = ((precision_0*sensitivity_0*2)/(precision_0+sensitivity_0))

per_class_metrics = (
    (0, specificity_0, sensitivity_0, precision_0, f1_score_0),
    (1, specificity_1, sensitivity_1, precision_1, f1_score_1),
)
for class_id, spec, sens, prec, f1 in per_class_metrics:
    print("specificity for class %d: " % class_id, spec)
    print('sensitivity for class %d: ' % class_id, sens)
    print('precision for class %d: ' % class_id, prec)
    print('f1_score for class %d: ' % class_id, f1)

# Result

In [None]:
import numpy as np
import pandas as pd

# Hard-coded accuracy of each classifier.
# NOTE(review): these numbers are typed in by hand — verify them against the outputs of the
# evaluation cells. (An unused `acc_result` array with different KNN/NB values, whose
# reshape result was discarded, has been removed to avoid two conflicting sources.)
data = {'Classifier': ['DTs', 'KNN', 'NB', 'svm', 'Perceptron'], 'Accuracy': [0.83, 0.75, 0.76, 0.82, 0.66]}
df_result = pd.DataFrame(data)
df_result

In [None]:
import seaborn as sns         
# Configure the figure size BEFORE drawing: the rc setting only applies to figures created
# afterwards, so setting it after barplot() (as before) had no effect on this plot.
sns.set(rc={'figure.figsize':(7,8.27)})
plot_ac = sns.barplot(data =df_result, x = 'Classifier', y='Accuracy', color='blue')

# Enlarge the tick labels and axis titles for readability.
plot_ac.tick_params(axis='x', labelsize=18)
plot_ac.set_xlabel("Classifier",fontsize=18)
plot_ac.set_ylabel("Accuracy",fontsize=18)


# 3.6) KNN implementation

In [None]:

#kNN
def kNN(data, labels, new, dist_func, k):
    """Classify ``new`` by majority vote among its ``k`` nearest training points.

    Args:
        data: indexable collection of training points.
        labels: labels parallel to ``data`` (binary 0/1 in this notebook).
        new: the point to classify.
        dist_func: callable ``dist_func(point, new)`` returning a distance.
        k: number of neighbours that vote; clamped to ``len(data)``.

    Returns:
        The label with the most votes; on a tie the label that was registered
        first wins (0 before 1).
    """
    # Sort [distance, label] pairs, so equal distances are ordered by label.
    distances = sorted(
        [dist_func(data[idx], new), labels[idx]] for idx in range(len(data))
    )
    assigned_label = {0:0 , 1:0}
    # Vote with each of the k nearest neighbours. The previous code read distances[k] on
    # every iteration, i.e. it counted the (k+1)-th neighbour k times and raised IndexError
    # when k == len(data).
    for idx in range(min(k, len(distances))):
        neighbour_label = distances[idx][1]
        assigned_label[neighbour_label] = assigned_label.get(neighbour_label, 0) + 1
    return max(assigned_label,key = assigned_label.get)

In [None]:
#distance func:
def L1_dist(arr1,arr2):
    """Return the Manhattan (L1) distance: the sum of absolute coordinate differences.

    Iterates over the indices of ``arr1`` and reads the same positions of ``arr2``.
    """
    return sum(abs(arr1[pos] - arr2[pos]) for pos in range(len(arr1)))
def L2_dist(arr1,arr2):
    """Return the Euclidean (L2) distance between two points.

    Inputs are converted with ``np.asarray`` so plain lists/tuples work as well as NumPy
    arrays (subtracting two Python lists would otherwise raise TypeError).
    """
    return np.linalg.norm(np.asarray(arr1) - np.asarray(arr2))

In [None]:
# Score the hand-written kNN (k=100, Euclidean distance) on the first 100 test points.
x, y = x_train, y_train
total = 100
correct = sum(
    1
    for idx in range(total)
    if kNN(x, y, x_test[idx], L2_dist, 100) == y_test[idx]
)
print("Accuracy: ", round(correct/total, 3))