In [32]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
data = "./data/Assignment_PA.csv"

In [34]:
df = pd.read_csv(data)
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V25,V26,V27,V28,V29,V30,V31,V32,V33,Class
0,42,50,270900,270944,267,17,44,24220,76,108,...,0.8182,-0.2913,0.5822,1,0,0,0,0,0,1
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,0.7931,-0.1756,0.2984,1,0,0,0,0,0,1
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,0.6667,-0.1228,0.215,1,0,0,0,0,0,1
3,853,860,369370,369415,176,13,45,18996,99,126,...,0.8444,-0.1568,0.5212,1,0,0,0,0,0,1
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,0.9338,-0.1992,1.0,1,0,0,0,0,0,1


In [35]:
features = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31',
       'V32', 'V33']
X = df[features]
y = df["Class"]
X.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33
0,42,50,270900,270944,267,17,44,24220,76,108,...,1.6435,0.8182,-0.2913,0.5822,1,0,0,0,0,0
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,1.4624,0.7931,-0.1756,0.2984,1,0,0,0,0,0
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,1.2553,0.6667,-0.1228,0.215,1,0,0,0,0,0
3,853,860,369370,369415,176,13,45,18996,99,126,...,1.6532,0.8444,-0.1568,0.5212,1,0,0,0,0,0
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,2.4099,0.9338,-0.1992,1.0,1,0,0,0,0,0


In [36]:
# Get Zero Variance Features
sel = VarianceThreshold(threshold=0)
sel.fit_transform(X)
sel.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [37]:
X.columns[sel.get_support()]

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31',
       'V32', 'V33'],
      dtype='object')

In [38]:
# Remove Columns which are zero variance
constant_columns = [column for column in X.columns
                    if column not in X.columns[sel.get_support()]]

# Note: In this dataset there isn't anything with zero variance so above operation is useless
print(len(constant_columns))


0


In [39]:
sel = VarianceThreshold(threshold=0.05)
sel.fit_transform(X)
sel.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True, False, False,  True])

In [40]:
X.columns[sel.get_support()]

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V17', 'V19', 'V20', 'V21', 'V22', 'V23',
       'V24', 'V25', 'V27', 'V28', 'V29', 'V30', 'V33'],
      dtype='object')

In [41]:
# Remove Columns which are zero variance
constant_columns = [column for column in X.columns
                    if column not in X.columns[sel.get_support()]]

# Note: Some columns are removed
print(len(constant_columns))

5


In [11]:
# Create training dataset
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

In [12]:
assignment_model = DecisionTreeRegressor(random_state=1)

In [13]:
train_X_1 = train_X.drop(constant_columns,axis=1)
val_X_1 = val_X.drop(constant_columns, axis=1)

In [14]:
assignment_model.fit(train_X_1, train_y)

DecisionTreeRegressor(random_state=1)

In [15]:
val_predictions = assignment_model.predict(val_X_1)

In [16]:
print(val_predictions)
print(val_y)

[1. 1. 1. 1. 1. 1. 1. 2. 1. 2. 1. 2. 2. 2. 1. 2. 1. 1. 1. 1. 1. 1. 2. 2.
 2. 1. 1. 1. 2. 2. 1. 2. 1. 1. 1. 2. 1. 1. 2. 1. 2. 1. 1. 1. 2. 2. 1. 2.
 1. 2. 1. 1. 1. 1. 2. 1. 1. 2. 2. 1. 1. 1. 2. 1. 1. 1. 1. 1. 1. 1. 1. 2.
 1. 1. 2. 1. 1. 1. 2. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 1. 2. 1. 1. 2.
 1. 1. 1. 1. 2. 1. 1. 1. 1. 2. 1. 1. 2. 2. 1. 1. 1. 1. 1. 1. 1. 1. 2. 1.
 1. 2. 2. 1. 1. 1. 1. 2. 2. 1. 1. 1. 2. 1. 1. 1. 1. 2. 2. 2. 2. 2. 1. 1.
 2. 1. 1. 1. 2. 2. 2. 1. 1. 2. 2. 1. 1. 1. 2. 2. 2. 2. 2. 1. 2. 1. 1. 2.
 1. 1. 2. 1. 1. 2. 1. 1. 2. 1. 1. 1. 1. 1. 1. 1. 1. 2. 2. 2. 1. 1. 1. 1.
 2. 2. 1. 1. 1. 1. 1. 2. 1. 1. 2. 1. 2. 1. 1. 1. 1. 2. 1. 1. 1. 2. 1. 1.
 1. 1. 1. 2. 2. 1. 1. 1. 1. 1. 2. 1. 1. 2. 1. 1. 2. 2. 2. 2. 1. 1. 1. 1.
 2. 2. 1. 1. 1. 1. 1. 1. 1. 1. 2. 1. 1. 1. 1. 2. 1. 1. 1. 1. 1. 1. 2. 2.
 1. 1. 2. 1. 1. 1. 1. 1. 1. 1. 2. 1. 2. 2. 2. 1. 1. 2. 1. 1. 1. 1. 1. 1.
 1. 2. 2. 2. 1. 1. 1. 2. 2. 1. 1. 1. 1. 1. 1. 1. 2. 1. 1. 1. 2. 2. 1. 2.
 2. 2. 1. 1. 2. 1. 1. 2. 1. 1. 1. 2. 1. 2. 1. 1. 1.

In [17]:
mean_absolute_error(val_y, val_predictions)

0.01440329218106996

In [18]:
assignment_model.feature_importances_

array([0.00947864, 0.03729292, 0.00991444, 0.00201085, 0.00141771,
       0.        , 0.0022622 , 0.00150813, 0.02138261, 0.00281518,
       0.00721131, 0.        , 0.        , 0.01117137, 0.02987354,
       0.        , 0.00669453, 0.        , 0.        , 0.10420434,
       0.        , 0.00025049, 0.00265934, 0.        , 0.20330514,
       0.17627374, 0.23026248, 0.14001103])

In [19]:
# Final Feature set after evaluati
final_features = ['V28', 'V29', 'V30', 'V31', 'V32', 'V33']
X = df[final_features]
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

train_X.head()
val_X

Unnamed: 0,V28,V29,V30,V31,V32,V33
768,0,0,0,1,0,0
704,0,0,1,0,0,0
726,0,0,1,0,0,0
215,0,1,0,0,0,0
1158,0,0,0,0,0,1
...,...,...,...,...,...,...
1638,0,0,0,0,0,0
667,0,0,1,0,0,0
981,0,0,0,0,0,1
1038,0,0,0,0,0,1


In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC# Spot Check Algorithms
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process.kernels import RBF

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [21]:
# LinearDiscriminantAnalysis
model = LinearDiscriminantAnalysis()
model.fit(train_X, train_y)
predictions = model.predict(val_X)
# Evaluate predictions
print(accuracy_score(val_y, predictions))
print(confusion_matrix(val_y, predictions))
print(classification_report(val_y, predictions))

0.6666666666666666
[[324   0]
 [162   0]]
              precision    recall  f1-score   support

           1       0.67      1.00      0.80       324
           2       0.00      0.00      0.00       162

    accuracy                           0.67       486
   macro avg       0.33      0.50      0.40       486
weighted avg       0.44      0.67      0.53       486



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# LogisticRegression
model = LogisticRegression(solver='liblinear', multi_class='ovr')
model.fit(train_X, train_y)
predictions = model.predict(val_X)
# Evaluate predictions
print(accuracy_score(val_y, predictions))
print(confusion_matrix(val_y, predictions))
print(classification_report(val_y, predictions))

1.0
[[324   0]
 [  0 162]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       324
           2       1.00      1.00      1.00       162

    accuracy                           1.00       486
   macro avg       1.00      1.00      1.00       486
weighted avg       1.00      1.00      1.00       486



In [23]:
# DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(train_X, train_y)
predictions = model.predict(val_X)
# Evaluate predictions
print(accuracy_score(val_y, predictions))
print(confusion_matrix(val_y, predictions))
print(classification_report(val_y, predictions))

1.0
[[324   0]
 [  0 162]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       324
           2       1.00      1.00      1.00       162

    accuracy                           1.00       486
   macro avg       1.00      1.00      1.00       486
weighted avg       1.00      1.00      1.00       486



In [24]:
#3.KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(train_X, train_y)
predictions = model.predict(val_X)
# Evaluate predictions
print(accuracy_score(val_y, predictions))
print(confusion_matrix(val_y, predictions))
print(classification_report(val_y, predictions))

1.0
[[324   0]
 [  0 162]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       324
           2       1.00      1.00      1.00       162

    accuracy                           1.00       486
   macro avg       1.00      1.00      1.00       486
weighted avg       1.00      1.00      1.00       486



In [25]:
#5.GaussianNB
model = GaussianNB()
model.fit(train_X, train_y)
predictions = model.predict(val_X)
# Evaluate predictions
print(accuracy_score(val_y, predictions))
print(confusion_matrix(val_y, predictions))
print(classification_report(val_y, predictions))

1.0
[[324   0]
 [  0 162]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       324
           2       1.00      1.00      1.00       162

    accuracy                           1.00       486
   macro avg       1.00      1.00      1.00       486
weighted avg       1.00      1.00      1.00       486



In [26]:
#6.SVC
model = SVC(gamma='auto')
model.fit(train_X, train_y)
predictions = model.predict(val_X)
# Evaluate predictions
print(accuracy_score(val_y, predictions))
print(confusion_matrix(val_y, predictions))
print(classification_report(val_y, predictions))

1.0
[[324   0]
 [  0 162]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       324
           2       1.00      1.00      1.00       162

    accuracy                           1.00       486
   macro avg       1.00      1.00      1.00       486
weighted avg       1.00      1.00      1.00       486



In [27]:
#7.BaggingClassifier
model = BaggingClassifier()
model.fit(train_X, train_y)
predictions = model.predict(val_X)
# Evaluate predictions
print(accuracy_score(val_y, predictions))
print(confusion_matrix(val_y, predictions))
print(classification_report(val_y, predictions))

1.0
[[324   0]
 [  0 162]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       324
           2       1.00      1.00      1.00       162

    accuracy                           1.00       486
   macro avg       1.00      1.00      1.00       486
weighted avg       1.00      1.00      1.00       486



In [29]:
#8.SGDClassifier
model = SGDClassifier()
model.fit(train_X, train_y)
predictions = model.predict(val_X)
# Evaluate predictions
print(accuracy_score(val_y, predictions))
print(confusion_matrix(val_y, predictions))
print(classification_report(val_y, predictions))

1.0
[[324   0]
 [  0 162]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       324
           2       1.00      1.00      1.00       162

    accuracy                           1.00       486
   macro avg       1.00      1.00      1.00       486
weighted avg       1.00      1.00      1.00       486



In [30]:
#9.MLPClassifier
model = MLPClassifier(alpha=1, max_iter=1000)
model.fit(train_X, train_y)
predictions = model.predict(val_X)
# Evaluate predictions
print(accuracy_score(val_y, predictions))
print(confusion_matrix(val_y, predictions))
print(classification_report(val_y, predictions))

1.0
[[324   0]
 [  0 162]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       324
           2       1.00      1.00      1.00       162

    accuracy                           1.00       486
   macro avg       1.00      1.00      1.00       486
weighted avg       1.00      1.00      1.00       486



In [31]:
#10.GaussianProcessClassifier
model = GaussianProcessClassifier(1.0 * RBF(1.0))
model.fit(train_X, train_y)
predictions = model.predict(val_X)
# Evaluate predictions
print(accuracy_score(val_y, predictions))
print(confusion_matrix(val_y, predictions))
print(classification_report(val_y, predictions))



1.0
[[324   0]
 [  0 162]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       324
           2       1.00      1.00      1.00       162

    accuracy                           1.00       486
   macro avg       1.00      1.00      1.00       486
weighted avg       1.00      1.00      1.00       486

