## Importing libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings
warnings.filterwarnings('ignore')

### Reading dataset

In [3]:
# Extra tokens that should be parsed as NaN on top of pandas' defaults.
missing_values = ["n/a", "na", "--"]

# Both files are read with the same options: semicolon-delimited, same NaN tokens.
read_options = dict(delimiter=';', na_values=missing_values)
train_data = pd.read_csv('training.csv', **read_options)
test_data = pd.read_csv('validation.csv', **read_options)

In [4]:
train_data.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
0,a,1792,0.00054,u,g,c,v,175,f,t,1,t,g,80.0,5,800000.0,t,0,no.
1,b,1692,0.00335,y,p,k,v,29,f,f,0,f,s,200.0,0,2000000.0,,0,no.
2,b,3125,1125.0,u,g,ff,ff,0,f,t,1,f,g,96.0,19,960000.0,t,0,no.
3,a,4817,1335.0,u,g,i,o,335,f,f,0,f,g,0.0,120,0.0,,0,no.
4,b,3233,35.0,u,g,k,v,5,f,f,0,t,g,232.0,0,2320000.0,f,0,no.


In [5]:
test_data.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
0,b,3233,75.0,u,g,e,bb,1585,t,f,0,t,s,420.0,0,4200000.0,,1,no.
1,b,2358,179.0,u,g,c,v,54,f,f,0,t,g,136.0,1,1360000.0,,0,no.
2,b,3642,0.00075,y,p,d,v,585,f,f,0,f,g,240.0,3,2400000.0,,1,no.
3,b,1842,10415.0,y,p,aa,v,125,t,f,0,f,g,120.0,375,1200000.0,,0,no.
4,b,245,13335.0,y,p,aa,v,4,f,f,0,t,g,120.0,475,1200000.0,f,1,no.


In [6]:
train_data.isna().sum()

variable1       39
variable2       39
variable3        0
variable4       64
variable5       64
variable6       66
variable7       66
variable8        0
variable9        0
variable10       0
variable11       0
variable12       0
variable13       0
variable14     100
variable15       0
variable17     100
variable18    2145
variable19       0
classLabel       0
dtype: int64

#### variable18 is missing in more than half of the training rows (2145 of 3700), so it is dropped

In [7]:
# Columns removed from the feature frames: the target itself and the
# mostly-empty variable18.
dropped_columns = ['classLabel', 'variable18']

# Pull the targets out as plain Python lists before the column disappears.
y_train = train_data['classLabel'].tolist()
y_test = test_data['classLabel'].tolist()

train_data = train_data.drop(columns=dropped_columns)
test_data = test_data.drop(columns=dropped_columns)

#### Shuffling the training and test sets (features and labels are shuffled together so they stay aligned)

In [8]:
from sklearn.utils import shuffle

# Fixed seed so that a fresh Restart & Run All reproduces the same row order
# (and therefore the same displayed samples) every time.
SEED = 42

# Features and labels are passed together so each row keeps its own label.
train_data, y_train = shuffle(train_data, y_train, random_state=SEED)
test_data, y_test = shuffle(test_data, y_test, random_state=SEED)

In [9]:
# Binary-encode the labels in place: 'no.' becomes 0, every other value becomes 1.
# NOTE: not idempotent - running this cell a second time maps every label to 1,
# because the already-encoded 0 is no longer equal to 'no.'.
y_train[:] = [0 if label == 'no.' else 1 for label in y_train]
y_test[:] = [0 if label == 'no.' else 1 for label in y_test]

In [10]:
train_data.shape

(3700, 17)

In [11]:
test_data.shape

(200, 17)

## Pre-processing of the data

### Adjust the numeric columns (convert decimal commas to decimal points and cast to float)

In [12]:
def remove_commas(variable, col):
    """Convert values written with a decimal comma into floats.

    Each element is turned into a string, every ',' is replaced by '.',
    and the result is parsed with ``float``.

    Parameters
    ----------
    variable : any
        Not used by the function; kept so existing callers keep working.
    col : iterable
        The values to convert (e.g. a pandas Series or a list).

    Returns
    -------
    list of float
        One converted value per input element, in the same order.

    Raises
    ------
    ValueError
        If an element cannot be parsed as a float after the replacement.
    """
    converted = []
    for raw in col:
        text = str(raw).replace(',', '.')
        converted.append(float(text))
    return converted

In [13]:
# Columns whose values are converted with remove_commas.
variables = ['variable2', 'variable3', 'variable8']

# Apply the same conversion to both frames, one column at a time.
for frame in (train_data, test_data):
    for var in variables:
        frame[var] = remove_commas(var, frame[var])

In [14]:
train_data.head(5)

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable19
480,b,34.17,0.000917,u,g,c,v,4.5,t,t,12,t,g,0.0,221,0.0,1
1408,a,21.5,0.0006,u,g,aa,v,2.5,t,t,3,f,g,80.0,918,800000.0,1
213,b,19.42,0.000725,u,g,m,v,0.04,f,t,1,f,g,100.0,1,1000000.0,0
2785,a,32.17,0.000146,u,g,W,v,1.085,t,t,16,f,g,120.0,2079,1200000.0,1
2251,b,56.0,0.00125,u,g,k,h,8.0,t,f,0,t,g,24.0,2028,240000.0,1


### Replacing missing categorical values with the mode of the feature

In [15]:
categorical = ['variable1', 'variable4', 'variable5', 'variable6', 'variable7', 'variable9', 'variable10',
      'variable12', 'variable13']

for col in categorical:
    # The fill value is learned from the training set only and reused for the
    # test set, so no information from the test data is used.
    train_mode = train_data[col].mode()[0]

    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: the chained in-place form may operate on a temporary
    # copy and leave the frame unchanged (pandas Copy-on-Write).
    train_data[col] = train_data[col].fillna(train_mode)
    test_data[col] = test_data[col].fillna(train_mode)

### Convert String features to one-hot encoded features

In [16]:
col = ['variable1', 'variable4', 'variable5', 'variable6', 'variable7', 'variable9', 'variable10',
      'variable12', 'variable13']

# Stack train (key 0) and test (key 1) so that a single get_dummies call
# produces the same set of dummy columns for both.
combined = pd.concat([train_data, test_data], keys=[0, 1])
temp = pd.get_dummies(combined, columns=col)

# Split the encoded frame back into its two parts using the outer keys.
train_data = temp.xs(0)
test_data = temp.xs(1)

### Replacing the remaining missing values with the training-set median of each feature

In [17]:
# Per-column medians come from the training set and are reused for the test set.
median_train = train_data.median()
train_data, test_data = train_data.fillna(median_train), test_data.fillna(median_train)

In [18]:
train_data.isna().sum().sum()

0

In [19]:
test_data.isna().sum().sum()

0

In [21]:
train_data.shape

(3700, 48)

In [20]:
test_data.shape

(200, 48)

### Scaling all features to the [0, 1] range (the scaler is fitted on the training set only)

In [22]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training data only; the same fitted scaler is then
# used to transform both train_data and test_data in the following cells.
scaler = MinMaxScaler()
scaler.fit(train_data)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [23]:
# Overwrites train_data with its scaled version. Re-running this cell would
# feed the already-scaled values through the scaler a second time.
train_data = scaler.transform(train_data)

In [24]:
# Overwrites test_data with its scaled version, using the scaler fitted on the
# training data. Re-running this cell would scale the values a second time.
test_data = scaler.transform(test_data)

## Classification model training and testing

In [25]:
# Linear discriminant analysis classifier created with its default arguments.
clf = LinearDiscriminantAnalysis()

# Train on the scaled training features and the 0/1-encoded labels.
clf.fit(train_data, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [32]:
# Predictions on the training set itself. y_pred is reassigned further down
# for the test set, so the accuracy cell right below must run before that.
y_pred = clf.predict(train_data)

In [33]:
from sklearn.metrics import accuracy_score

# Share of training labels predicted correctly, expressed in percent.
train_accuracy = accuracy_score(y_train, y_pred) * 100
print("train accuracy :", train_accuracy)

train accuracy : 93.8108108108108


In [26]:
# Predictions on the held-out test set; this replaces the training-set
# predictions previously stored in y_pred.
y_pred = clf.predict(test_data)

In [30]:
# Share of test labels predicted correctly, expressed in percent.
test_accuracy = accuracy_score(y_test, y_pred) * 100
print("test accuracy :", test_accuracy)

test accuracy : 83.0


In [31]:
from sklearn.metrics import f1_score

# F1 score of the test-set predictions, expressed in percent.
test_f1 = f1_score(y_test, y_pred) * 100
print("f1 score :", test_f1)

f1 score : 83.96226415094338
