In [36]:
import pandas as pd
from scipy import stats
import numpy as np
import numpy.ma as ma
from scipy.stats import mstats
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Getting the data from csv file to pandas dataframe
df = pd.read_csv('data.csv', header=None)

# Reading the target variable into y and removing it from original dataframe
y = df[41]
del(df[41])

print(df.head(5))
print(y.head(5))

     0     1     2     3     4     5     6     7   8   9  ...  31  32  33  34  \
0  48.0  32.0  47.0  64.0  34.0  14.0  14.0  15.0  42  61 ...  42  62  33  33   
1  34.0  21.0  82.0  48.0  29.0  11.0  14.0  14.0  34  31 ...  47  48  61  63   
2  45.0  34.0  54.0  65.0  43.0  13.0  11.0   9.0  42  61 ...  42  62  33  33   
3  69.0  57.0  47.0  85.0  47.0   6.0  11.0  10.0  37  32 ...  52  61  62  64   
4  36.0  30.0  50.0  59.0  35.0   6.0  13.0  13.0  37  32 ...  52  61  62  64   

   35  36  37  38  39  40  
0  41  11  13  14   7   6  
1  58   7   5  14  13  12  
2  41  11  13  14   7   6  
3  60   8   6  15  14  13  
4  60   8   6  15  14  13  

[5 rows x 41 columns]
0    0
1    1
2    0
3    1
4    1
Name: 41, dtype: int64


<b> Check for missing values </b>

In [3]:
df.isnull().values.any()

False

In [4]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
count,20854.0,20854.0,20854.0,20854.0,20854.0,20854.0,20854.0,20854.0,20854.0,20854.0,...,20854.0,20854.0,20854.0,20854.0,20854.0,20854.0,20854.0,20854.0,20854.0,20854.0
mean,55.946885,51.186513,58.790715,63.548063,50.059329,17.920492,13.673587,14.056053,36.904766,39.794428,...,45.481538,54.945478,42.496643,38.513139,38.419344,14.767287,13.279179,20.379591,17.112544,15.774911
std,15.846184,17.353144,13.560364,12.541247,16.577858,18.497045,11.288914,12.213878,12.595262,14.124485,...,7.814612,8.086104,19.405069,21.568836,20.189269,19.13173,17.915217,19.506041,16.973474,18.954116
min,6.0,5.0,10.0,15.0,5.0,2.0,2.0,2.0,19.0,18.0,...,16.0,40.0,12.0,11.0,11.0,7.0,5.0,14.0,7.0,6.0
25%,46.0,39.0,51.0,58.0,35.0,7.0,8.0,8.0,28.0,31.0,...,42.0,48.0,20.0,14.0,18.0,7.0,5.0,14.0,7.0,6.0
50%,58.0,53.0,61.0,65.0,53.0,11.0,11.0,11.0,34.0,32.0,...,47.0,61.0,55.0,34.0,41.0,8.0,6.0,14.0,13.0,12.0
75%,68.0,66.0,67.0,72.0,63.0,14.0,14.0,14.0,37.0,59.0,...,47.0,62.0,57.0,60.0,58.0,11.0,13.0,15.0,14.0,13.0
max,84.0,82.0,90.0,85.0,81.0,79.0,72.0,78.0,67.0,61.0,...,63.0,64.0,62.0,64.0,60.0,75.0,69.0,82.0,70.0,75.0


<b>Check if data is Gaussian distribution </b>

In [29]:
z,pval = mstats.normaltest(df)

#Using the confidence value as 95% 
if(pval.all() < 0.05):
    print('Not normal distribution')

Not normal distribution


<b>Why to use Min-Max Scalar?</b>
<br>
<br>
-- As data is not normally distributed we cannot use <b>StandardScalar</b>.<br>
-- <b>Min-Max Scalar</b> works better if the distribution is not Gaussian or if the standard deviation is very small.<br>

In [6]:
scalar = MinMaxScaler()
scalar.fit(df)

  return self.partial_fit(X, y)


MinMaxScaler(copy=True, feature_range=(0, 1))

In [7]:
scaled_data = scalar.transform(df)
scaled_data

array([[0.53846154, 0.35064935, 0.4625    , ..., 0.        , 0.        ,
        0.        ],
       [0.35897436, 0.20779221, 0.9       , ..., 0.        , 0.0952381 ,
        0.08695652],
       [0.5       , 0.37662338, 0.55      , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.64102564, 0.19480519, 0.7125    , ..., 0.        , 0.0952381 ,
        0.08695652],
       [0.53846154, 0.87012987, 0.6875    , ..., 0.01470588, 0.11111111,
        0.10144928],
       [0.75641026, 0.88311688, 0.6875    , ..., 0.01470588, 0.11111111,
        0.10144928]])

<b>Dimensionality Reduction using PCA</b>
<br>
PCA is applied to take the necessary attributes that are representative of the variance of the dataset.

In [8]:
for i in range(1, df.shape[1]+1):
    pca = PCA(n_components=i)
    pca.fit(scaled_data)
    print(sum(pca.explained_variance_ratio_))

0.4740565350106719
0.7079832514318333
0.8175816601205612
0.8655186382785133
0.9046341748038949
0.9249495743049728
0.9413642290762874
0.955211231934536
0.9645962642818509
0.9738063097105232
0.9791376165338971
0.9828728359055369
0.9863682391333966
0.9894399321665592
0.991859622019979
0.9939920558446469
0.9959087742588518
0.997412299246496
0.9981635830033067
0.9987693521850495
0.9991801677019031
0.9995150963622441
0.9997481069037992
0.9998864458476638
0.9999573507143993
0.9999821657957485
0.9999926929921165
0.9999999999999989
0.9999999999999986
0.9999999999999996
0.9999999999999994
0.9999999999999993
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999


Therefore after applying PCA we can see that <b>15 variable are capturing approximately 99% of the variance.</b>
<br>
Hence we are considering 15 features for our analysis. 

In [9]:
pca = PCA(n_components=15)
pca.fit(scaled_data)

PCA(copy=True, iterated_power='auto', n_components=15, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [10]:
x_pca = pca.transform(scaled_data)

In [11]:
scaled_data.shape

(20854, 41)

In [12]:
x_pca.shape

(20854, 15)

Here we are spliting data into training and testing data. 80% of the data is taken for training and 20% for testing.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)

Now we will apply all the algorithms one by one on training and testing data and see the outcome using accuracy as the performance metric.

<b>1. Decision Tree</b>

In [14]:
clf=DecisionTreeClassifier()
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [15]:
predicted= clf.predict(X_test)
print(metrics.accuracy_score(y_test, predicted))

0.5602972908175498


<b>2. Random Forest</b>

In [41]:
clf=RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
predicted= clf.predict(X_test)
print(metrics.accuracy_score(y_test, predicted))

0.5662910573004076


<b>3. K Nearset Neighbour</b>

In [18]:
clf=KNeighborsClassifier()
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [19]:
predicted= clf.predict(X_test)
print(metrics.accuracy_score(y_test, predicted))

0.5348837209302325


<b>4. Naive Bayes Classifier</b>

In [20]:
clf=BernoulliNB()
clf.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [21]:
predicted= clf.predict(X_test)
print(metrics.accuracy_score(y_test, predicted))

0.5648525533445217


<b>5. Support Vector Machines</b>

In [39]:
clf=SVC(gamma='auto')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [27]:
predicted= clf.predict(X_test)
print(metrics.accuracy_score(y_test, predicted))

0.5504675137856629


<b>6. Logistic Regression</b>

In [40]:
logreg = LogisticRegression(solver='lbfgs')
logreg.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [37]:
predicted=logreg.predict(X_test)
print(metrics.accuracy_score(y_test, predicted)) 
print(classification_report(y_test, predicted))

0.6008151522416687
              precision    recall  f1-score   support

           0       0.54      0.58      0.56      1815
           1       0.66      0.61      0.63      2356

   micro avg       0.60      0.60      0.60      4171
   macro avg       0.60      0.60      0.60      4171
weighted avg       0.61      0.60      0.60      4171



<b> Conclusion </b><br>
    As we can see that the accuracy of <b> Logistic Regression </b> is the highest. Hence we should use Logistic Regression for doing predictions on this data set.