In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [2]:
# Getting the data from csv file to pandas dataframe
df = pd.read_csv('data.csv', header=None)

# Reading the target variable into y and removing it from original dataframe
y = df[41]
del(df[41])

print(df.head(5))
print(y.head(5))

     0     1     2     3     4     5     6     7   8   9  ...  31  32  33  34  \
0  48.0  32.0  47.0  64.0  34.0  14.0  14.0  15.0  42  61 ...  42  62  33  33   
1  34.0  21.0  82.0  48.0  29.0  11.0  14.0  14.0  34  31 ...  47  48  61  63   
2  45.0  34.0  54.0  65.0  43.0  13.0  11.0   9.0  42  61 ...  42  62  33  33   
3  69.0  57.0  47.0  85.0  47.0   6.0  11.0  10.0  37  32 ...  52  61  62  64   
4  36.0  30.0  50.0  59.0  35.0   6.0  13.0  13.0  37  32 ...  52  61  62  64   

   35  36  37  38  39  40  
0  41  11  13  14   7   6  
1  58   7   5  14  13  12  
2  41  11  13  14   7   6  
3  60   8   6  15  14  13  
4  60   8   6  15  14  13  

[5 rows x 41 columns]
0    0
1    1
2    0
3    1
4    1
Name: 41, dtype: int64


<b>Why to use StandardScaler?</b>
<br>
The idea behind StandardScaler is that it will transform your data such that its distribution will have a mean value 0 and standard deviation of 1. Given the distribution of the data, each value in the dataset will have the sample mean value subtracted, and then divided by the standard deviation of the whole dataset.

In [3]:
scalar = StandardScaler()
scalar.fit(df)

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [4]:
scaled_data = scalar.transform(df)
scaled_data

  """Entry point for launching an IPython kernel.


array([[-0.50151355, -1.105677  , -0.86951923, ..., -0.32706505,
        -0.59579946, -0.51572682],
       [-1.38502819, -1.73958308,  1.71159423, ..., -0.32706505,
        -0.24229824, -0.19916528],
       [-0.69083811, -0.99042135, -0.35329654, ..., -0.32706505,
        -0.59579946, -0.51572682],
       ...,
       [ 0.00335196, -1.79721091,  0.60540275, ..., -0.32706505,
        -0.24229824, -0.19916528],
       [-0.50151355,  1.19943602,  0.45791055, ..., -0.27579765,
        -0.18338137, -0.14640503],
       [ 0.57132566,  1.25706384,  0.45791055, ..., -0.27579765,
        -0.18338137, -0.14640503]])

<b>Dimentionality Reduction using PCA</b>
<br>
PCA is applied to take the necessary attributes that are representative of the variance of the dataset.

In [5]:
for i in range(1, df.shape[1]+1):
    pca = PCA(n_components=i)
    pca.fit(scaled_data)
    print(sum(pca.explained_variance_ratio_))

0.42869030726780255
0.6080006399028212
0.7125945190109808
0.8157670987420704
0.8528935398187707
0.8891502420410962
0.9099947555546344
0.9304276383836045
0.9473024028465722
0.9607837740498926
0.9709586183856
0.9774314313555001
0.9824788313044726
0.9870264949727627
0.9906833591280451
0.9931964500010292
0.9951985699347266
0.9965630382469157
0.9975918459825293
0.9984839944055472
0.9991677116074416
0.9995706945705655
0.999798165671342
0.999909202713477
0.9999655624238089
0.999982285775475
0.9999936540695805
1.0
0.9999999999999999
0.9999999999999997
0.9999999999999999
1.0000000000000002
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999


Therefore after applying PCA we can see that <b>15 variable are capturing approximately 99% of the variance.</b>
<br>
Hence we are considering 15 features for our analysis. 

In [6]:
pca = PCA(n_components=15)
pca.fit(scaled_data)

PCA(copy=True, iterated_power='auto', n_components=15, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [7]:
x_pca = pca.transform(scaled_data)

In [8]:
scaled_data.shape

(20854, 41)

In [9]:
x_pca.shape

(20854, 15)

Here we are spliting data into training and testing data. 80% of the data is taken for training and 20% for testing.

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)

Now we will apply all the algorithms one by one on training and testing data and see the outcome using accuracy as the performance metric.

<b>1. Decision Tree</b>

In [11]:
clf=DecisionTreeClassifier()
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [12]:
predicted= clf.predict(X_test)
print(metrics.accuracy_score(y_test, predicted))

0.5487892591704627


<b>2. Random Forest</b>

In [13]:
clf=RandomForestClassifier()
clf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
predicted= clf.predict(X_test)
print(metrics.accuracy_score(y_test, predicted))

0.5617357947734356


<b>3. K Nearset Neighbour</b>

In [15]:
clf=KNeighborsClassifier()
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [16]:
predicted= clf.predict(X_test)
print(metrics.accuracy_score(y_test, predicted))

0.5346439702709183


<b>4. Naive Bayes Classifier</b>

In [17]:
clf=BernoulliNB()
clf.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [18]:
predicted= clf.predict(X_test)
print(metrics.accuracy_score(y_test, predicted))

0.5718053224646368


<b>5. Support Vector Machines</b>

In [19]:
clf=SVC()
clf.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [20]:
predicted= clf.predict(X_test)
print(metrics.accuracy_score(y_test, predicted))

0.5646128026852074


<b>6. Logistic Regression</b>

In [21]:
logreg = LogisticRegression()
logreg.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [22]:
predicted=logreg.predict(X_test)
print(metrics.accuracy_score(y_test, predicted))

0.6065691680652122


<b> Conclusion </b>
    As we can see that the accuracy of <b> Logistic Regression </b> is the highest. Hence we should use Logistic Regression for doing predictions on this data set.