### Importing Libraries

In [7]:
import os
from scipy.io import arff
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  
import warnings
import sklearn
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

### Loading files

In [8]:
df = arff.loadarff(r"C:\Users\CHIRAG\Desktop\sem_6\mtl782\dataset\\3year.arff")
data = pd.DataFrame(df[0])

In [9]:
print(data.isnull().any(axis = 1).sum())
print(len(data))
print(data.shape)

5618
10503
(10503, 65)


In [10]:
def format_data(df):
    for i in range(64):
        name = "Attr" + str(i+1)
        df[name] = pd.to_numeric(df[name], errors='coerce', downcast = "float")
    df['class'] = df['class'].astype('int')
    return df

### Preprocessing : Replacing NA with Mean, Scaling  

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [12]:
def replace_na_mean(df):
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp_data = imp.fit_transform(df)
    return pd.DataFrame(imp_data, index=df.index, columns=df.columns)
def scaling(df):
    scaler = MinMaxScaler()
    df[:-1] = scaler.fit_transform(df[:-1])
    return df

In [13]:
data = replace_na_mean(data)
data = scaling(data)

### X,y

In [14]:
data0 = data.loc[data['class'] == 0]
data1 = data.loc[data['class'] == 1]

In [15]:
X,y = data.iloc[:,:-1], data.iloc[:,-1]
X0,y0 = data0.iloc[:,:-1], data0.iloc[:,-1]
X1,y1 = data1.iloc[:,:-1], data1.iloc[:,-1]

### PCA

In [16]:
from sklearn.decomposition import PCA
def apply_pca(df, n_comp):
    pca = PCA(n_components= n_comp)
    principalComponents = pca.fit_transform(df)
    return pd.DataFrame(principalComponents)
X = apply_pca(X,20)

### Various Classification Models

In [17]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [22]:
def try_all_classifiers(X,y, classifiers, sampling  = None):
    accuracy = [0]*len(classifiers)
    f1 = [0]*len(classifiers)
    precision = [0]*len(classifiers)
    recall = [0]*len(classifiers)
    i = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state=1, shuffle = True)
    if sampling == "SMOTE":
        smote = SMOTE()
        X_train, y_train = smote.fit_resample(X_train, y_train)
        print("SMOTE")
    if sampling == "RUS":
        rus = RandomUnderSampler()
        X_train, y_train = rus.fit_resample(X_train, y_train)
        print("RUS")
    
    for i in range(len(classifiers)):
        classif = classifiers[i]
        classif.fit(X_train, y_train)
        y_pred = classif.predict(X_test)
        
        accuracy[i] = classif.score(X_test, y_test)
        f1[i] = metrics.f1_score(y_test, y_pred, labels=np.unique(y_pred))
        precision[i] = metrics.precision_score(y_test, y_pred)
        recall[i] = metrics.recall_score(y_test, y_pred)
    print("Done")
    return accuracy,f1,precision,recall

In [19]:
classifiers_names = ["Logistic Regression", "LDA", "KNN 5", "KNN 10", "GNB", "DT", "SVM", "RFC", "XGB"]
classifiers = [LogisticRegression(), LinearDiscriminantAnalysis(),KNeighborsClassifier(n_neighbors=5), KNeighborsClassifier(n_neighbors=10),GaussianNB(),DecisionTreeClassifier(),SVC(),RandomForestClassifier(),XGBClassifier(use_label_encoder=False)]

In [20]:
accuracy,f1,precision,recall = try_all_classifiers(X,y, classifiers)

Done


### Sampling

In [23]:
accuracy_sm,f1_sm,precision_sm,recall_sm = try_all_classifiers(X,y, classifiers, sampling = "SMOTE")

SMOTE
Done


In [24]:
accuracy_rus,f1_rus,precision_rus,recall_rus = try_all_classifiers(X,y, classifiers, sampling = "RUS")

RUS
Done


### Ensemble Classifiers

In [25]:
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle = True)
model = BaggingClassifier(base_estimator= DecisionTreeClassifier(), n_estimators=20, random_state=7)
results = model_selection.cross_val_score(model, X, y, cv=kfold)
print(results.mean())

0.9550602147614518


In [26]:
from sklearn.ensemble import AdaBoostClassifier
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle = True)
model = AdaBoostClassifier(n_estimators=20, random_state= seed)
results = model_selection.cross_val_score(model, X, y, cv=kfold)
print(results.mean())

0.9515379457206288


In [27]:
from sklearn.ensemble import VotingClassifier
seed = 6
kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle = True)
# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = SVC()
estimators.append(('svm', model3))
model4 = KNeighborsClassifier(n_neighbors=10)
estimators.append(('knn', model4))
model5 = GaussianNB()
estimators.append(('gnb', model5))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, y, cv=kfold)
print(results.mean())

0.9527757691087853


In [46]:
accuracy_table = np.vstack((classifiers_names, accuracy, accuracy_sm, accuracy_rus))
display(accuracy_table)

array([['Logistic Regression', 'LDA', 'KNN 5', 'KNN 10', 'GNB', 'DT',
        'SVM', 'RFC', 'XGB'],
       ['0.9574738178356077', '0.9552523008568709', '0.9552523008568709',
        '0.9574738178356077', '0.9552523008568709', '0.9165344335131704',
        '0.9574738178356077', '0.9565217391304348', '0.9562043795620438'],
       ['0.6172643605204697', '0.6648682957791178', '0.7483338622659473',
        '0.7258013329101872', '0.9562043795620438', '0.8540145985401459',
        '0.9574738178356077', '0.9247857822913361', '0.902253252935576'],
       ['0.5483973341796256', '0.6667724531894637', '0.5963186290066645',
        '0.6724849254205014', '0.952078705172961', '0.6385274516026658',
        '0.9574738178356077', '0.6940653760710885', '0.6969216121866074']],
      dtype='<U32')

In [47]:
f1_table = np.vstack((classifiers_names, f1, f1_sm, f1_rus))
display(f1_table)

array([['Logistic Regression', 'LDA', 'KNN 5', 'KNN 10', 'GNB', 'DT',
        'SVM', 'RFC', 'XGB'],
       ['0.0', '0.0', '0.013986013986013986', '0.0',
        '0.013986013986013986', '0.1488673139158576', '0.0',
        '0.16969696969696968', '0.21590909090909088'],
       ['0.10267857142857142', '0.11557788944723618',
        '0.09371428571428571', '0.09243697478991598', '0.0',
        '0.1726618705035971', '0.0', '0.23300970873786406',
        '0.20207253886010365'],
       ['0.08252740167633785', '0.12353923205342236',
        '0.09272467902995722', '0.10104529616724738',
        '0.013071895424836602', '0.12720306513409962', '0.0',
        '0.15734265734265734', '0.15561450044208666']], dtype='<U32')

In [48]:
precision_table = np.vstack((classifiers_names,precision, precision_sm, precision_rus))
display(precision_table)

array([['Logistic Regression', 'LDA', 'KNN 5', 'KNN 10', 'GNB', 'DT',
        'SVM', 'RFC', 'XGB'],
       ['0.0', '0.0', '0.1111111111111111', '0.0', '0.1111111111111111',
        '0.13142857142857142', '0.0', '0.45161290322580644',
        '0.4523809523809524'],
       ['0.05702479338842975', '0.06509433962264151',
        '0.0553306342780027', '0.05378973105134474', '0.0',
        '0.11374407582938388', '0.0', '0.2057142857142857',
        '0.15476190476190477'],
       ['0.04516584333098095', '0.06954887218045112',
        '0.051261829652996846', '0.05719921104536489',
        '0.05263157894736842', '0.07087959009393681', '0.0',
        '0.0891089108910891', '0.08826479438314945']], dtype='<U32')

In [49]:
recall_table = np.vstack((classifiers_names,recall, recall_sm, recall_rus))
display(recall_table)

array([['Logistic Regression', 'LDA', 'KNN 5', 'KNN 10', 'GNB', 'DT',
        'SVM', 'RFC', 'XGB'],
       ['0.0', '0.0', '0.007462686567164179', '0.0',
        '0.007462686567164179', '0.17164179104477612', '0.0',
        '0.1044776119402985', '0.1417910447761194'],
       ['0.5149253731343284', '0.5149253731343284',
        '0.30597014925373134', '0.3283582089552239', '0.0',
        '0.3582089552238806', '0.0', '0.26865671641791045',
        '0.291044776119403'],
       ['0.47761194029850745', '0.5522388059701493',
        '0.48507462686567165', '0.43283582089552236',
        '0.007462686567164179', '0.6194029850746269', '0.0',
        '0.6716417910447762', '0.6567164179104478']], dtype='<U32')