In [None]:
# Installing required libraries
! pip install sklearn
! pip install pandas
! pip install sweetviz
! pip install imbalanced-learn

In [6]:
# Import the libraries
import pandas as pd
import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import sweetviz as sv
from sklearn import metrics
from scipy import stats
import imblearn
from sklearn import preprocessing
from imblearn.under_sampling import CondensedNearestNeighbour

In [7]:
# Fetch the data
data_train = pd.read_csv('Train.csv', sep=',',header=0)
data_eval = pd.read_csv('Eval.csv', sep=',',header=0)
data_train.dropna(how='all',inplace=True)
data_train = data_train[(np.abs(stats.zscore(data_train)) < 5).all(axis=1)]
feat =  data_train.iloc[:,:-1]
label = data_train['class_col']
print(pd.unique(label)) # Binary Classification

[0. 1.]


In [8]:
#Normalization
min_max_scaler = preprocessing.Normalizer()
x_scaled = min_max_scaler.fit_transform(feat.values)

x_scaled

array([[ 3.85893870e-01, -2.17003331e-01,  7.16110992e-01, ...,
         0.00000000e+00,  2.38703664e-01,  4.77407328e-01],
       [-1.66506942e-02,  8.52122884e-01,  0.00000000e+00, ...,
        -2.84040961e-01,  0.00000000e+00,  0.00000000e+00],
       [ 7.58127547e-01,  1.07798948e-01,  0.00000000e+00, ...,
        -2.37157685e-01,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 3.92478379e-02,  8.42800248e-02,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  9.27080272e-01],
       [ 2.00566653e-05, -1.14022133e-03,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 8.48889598e-02,  2.20756232e-01,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [9]:
# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(
    x_scaled, label,test_size =0.30,shuffle=True)

The next step was to do under-sampling to balance the training data to have better representation of both the classes, the technique I am using is condensed nearest neighbours, unfortunately due to the lack of computing power I was not able to complete the test for this run.

In [None]:

# # define the undersampling method
# undersample = CondensedNearestNeighbour(n_neighbors=1)
# # transform the dataset
# X_train, y_train = undersample.fit_resample(X_train, y_train)


In [98]:
feature = pd.DataFrame(x_scaled)

# EDA of the training file
eda_report = sv.analyze(feature)

# Saving results to HTML file
eda_report.show_html('eda_report_norm_sklearn.html')

                                             |          | [  0%]   00:00 -> (? left)

Report eda_report_norm_sklearn.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [40]:
# KNN-classification
KNN = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

KNN_pred = KNN.predict(X_test)
print("KNN model accuracy(in %):",metrics.accuracy_score(y_test, KNN_pred)*100)
print("KNN model f1-score(in %):",metrics.f1_score(y_test, KNN_pred)*100)

KNN model accuracy(in %): 93.47672184202543
KNN model accuracy(in %): 19.07151819322459


In [46]:
# NaiveBayes Classification
gnb = GaussianNB().fit(X_train, y_train) 
gnb_pred = gnb.predict(X_test)
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, gnb_pred)*100)
print("Gaussian Naive Bayes model f1-score(in %):",metrics.f1_score(y_test, gnb_pred)*100)

Gaussian Naive Bayes model accuracy(in %): 90.13923069143377
Gaussian Naive Bayes model f1-score(in %): 35.75664397100813


In [44]:
# Logistic Regression

LR = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(X_train, y_train)
LR_pred = LR.predict(X_test)
print("LR model accuracy(in %):",metrics.accuracy_score(y_test, LR_pred)*100)
print("LR  model f1-score(in %):",metrics.f1_score(y_test, LR_pred)*100)

LR model accuracy(in %): 93.65202440751104
LR  model f1-score(in %): 18.094823836450633


In [None]:
# Support Vector Machine
# SVM = svm.LinearSVC().fit(X_train, y_train)
SVM = svm.SVC(probability=True).fit(X_train, y_train)
SVM_pred = SVM.predict(X_test)
print("SVM model accuracy(in %):",metrics.accuracy_score(y_test, SVM_pred)*100)
print("SVM model f1-score(in %):",metrics.f1_score(y_test, SVM_pred)*100)

In [42]:
#Multi-Layer Perceptron
NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1).fit(X_train, y_train)
NN_pred = NN.predict(X_test)
print("MLP model accuracy(in %):",metrics.accuracy_score(y_test, NN_pred)*100)
print("MLP model f1-score(in %):",metrics.f1_score(y_test, NN_pred)*100)

MLP model accuracy(in %): 93.6250547820517
MLP model f1-score(in %): 21.37214137214137


In [47]:
GNB_prob = gnb.predict_proba(data_eval)

In [48]:
prob = list(GNB_prob[:,-1])
class1_prob = [round(x,3) for x in prob]



In [49]:
output = pd.DataFrame(columns=['row_id', 'yhat'])
output['yhat'] = class1_prob
output['row_id'] = [x for x in range(len(class1_prob))]

In [50]:
output.to_csv('output.csv',index=False)