In [1]:
import numpy as np
import pandas as pd
import enum

In [2]:
#Feature Scoring Method:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

In [3]:
#Model Imports
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

  from numpy.core.umath_tests import inner1d


In [4]:
from sklearn.model_selection import KFold

In [5]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [6]:
#Data Imbalence Resolver
from imblearn.over_sampling import SMOTE

In [7]:
from sklearn import preprocessing

In [8]:
#Math Formulas
import math

In [9]:
import matplotlib.pyplot as plt

In [114]:
#Models to Run
clf1 = LogisticRegression(solver = 'liblinear')
clf2 = Perceptron(tol=1e-3, random_state=42)
clf3 = GaussianNB()
clf4 = LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=1e-05, verbose=3)
clf5 = RandomForestClassifier(n_estimators='warn', criterion='gini', max_depth=None, 
                          min_samples_split=2, min_samples_leaf=1, 
                          min_weight_fraction_leaf=0.0, max_features='auto', 
                          max_leaf_nodes=None, min_impurity_decrease=0.0, 
                          min_impurity_split=None, bootstrap=True, oob_score=False, 
                          n_jobs=None, random_state=None, verbose=0, warm_start=False, 
                          class_weight=None)
clf6 = KNeighborsClassifier(n_neighbors=5)

In [117]:
def run_indices(X_train, y_train, clf, indices):
    list_of_acc = []
    for i in range (5):
        print(i)
        list_of_acc.append(filter_features(X_train, y_train, indices[i], clf))
    return list_of_acc

In [123]:
def filter_features (X_train, y_train, index, clf):
    sm = SMOTE(random_state = 42, ratio = 1)
    kf = KFold(n_splits=5, random_state=42, shuffle=False)
    list_of_avg_f1 =[]
    list_of_std = []
    
    for i in range(len(index)):
        sum_of_acc = 0
        avg_f1 = 0
        accur_list = []
        print("This is iteration:", i+1, "out of", len(index))
        index_train = X_train.iloc[:,index[0:i+1]]
        for train_index, test_index in kf.split(index_train):
            #print("TRAIN:", train_index, "TEST:", test_index)
            k_X_train, k_X_test = index_train.iloc[train_index], index_train.iloc[test_index]
            k_y_train, k_y_test = y_train.iloc[train_index], y_train.iloc[test_index]
            k_X_train, k_y_train = sm.fit_sample(k_X_train, k_y_train)
            clf.fit(k_X_train,k_y_train)
            k_pred_y = clf.predict(k_X_test)
            accur_list.append(f1_score(y_true = k_y_test, y_pred = k_pred_y, average = 'macro'))
        avg_f1 = sum(accur_list)/len(accur_list)
        print(avg_f1)
        for i in range(len(accur_list)):
            sum_of_acc = sum_of_acc +(accur_list[0] - avg_f1)**2
        std = math.sqrt((sum_of_acc)/len(accur_list))
        print("The average accuracy is: ",avg_f1, "and the std is: ", std)
        list_of_std.append(std)
        list_of_avg_f1.append(avg_f1)
        
    return list_of_avg_f1, list_of_std

def get_scores(score_func, X, Y, X_train):
    k_best = SelectKBest(score_func = score_func, k = X.shape[1])
    fit = k_best.fit(X,Y)
    scores, std = pd.Series(index = X_train.columns, data = fit.scores_)
    return scores, std

In [91]:
#Put this in global
sm = SMOTE(random_state = 42, ratio = 1)
kf = KFold(n_splits=5, random_state=42, shuffle=False)

In [92]:
train = pd.read_csv('Data/Train_set.csv', index_col='Unnamed: 0')
test = pd.read_csv('Data/Test_set.csv', index_col='Unnamed: 0')

In [93]:
X_train = train.iloc[:,:89]
y_train = train['y']

In [94]:
scores = pd.read_csv('Data/Scores.csv', index_col = 'Unnamed: 0')

In [96]:
indices = []
indices.append(scores['corr_Rank'].sort_values().index)
indices.append(scores['chi2_Rank'].sort_values().index)
indices.append(scores['MI_Rank'].sort_values().index)
indices.append(scores['rand_for_imp_Rank'].sort_values().index)


In [None]:
logit_acc, logit_std = run_indices(X_train, y_train, clf1, indices)

0
This is iteration: 1 out of 89




0.4158180674954492
The average accuracy is:  0.4158180674954492 and the std is:  0.001682602655962817
This is iteration: 2 out of 89




0.45628989183679414
The average accuracy is:  0.45628989183679414 and the std is:  0.0008956893401474985
This is iteration: 3 out of 89




0.4645436003540576
The average accuracy is:  0.4645436003540576 and the std is:  0.0006658544468693961
This is iteration: 4 out of 89




0.5289928254901989
The average accuracy is:  0.5289928254901989 and the std is:  0.00046364888956451633
This is iteration: 5 out of 89




0.5253347486842026
The average accuracy is:  0.5253347486842026 and the std is:  0.0001551739040606126
This is iteration: 6 out of 89




0.5191125899576784
The average accuracy is:  0.5191125899576784 and the std is:  0.001740162466849382
This is iteration: 7 out of 89




0.5177284223112353
The average accuracy is:  0.5177284223112353 and the std is:  5.60900665660391e-05
This is iteration: 8 out of 89




0.5171740379651418
The average accuracy is:  0.5171740379651418 and the std is:  5.935695923209306e-05
This is iteration: 9 out of 89




0.517227030820655
The average accuracy is:  0.517227030820655 and the std is:  0.0005126007980739145
This is iteration: 10 out of 89




0.5161044602522553
The average accuracy is:  0.5161044602522553 and the std is:  3.2435295532762254e-05
This is iteration: 11 out of 89




0.5157819049233999
The average accuracy is:  0.5157819049233999 and the std is:  0.00031614653223543954
This is iteration: 12 out of 89




0.5137162759893462
The average accuracy is:  0.5137162759893462 and the std is:  0.002680230406545303
This is iteration: 13 out of 89




0.5128275975802596
The average accuracy is:  0.5128275975802596 and the std is:  7.428725555314042e-05
This is iteration: 14 out of 89




0.5044393219402938
The average accuracy is:  0.5044393219402938 and the std is:  0.002652851537095713
This is iteration: 15 out of 89




0.5053600190174172
The average accuracy is:  0.5053600190174172 and the std is:  0.0030878071046744893
This is iteration: 16 out of 89




0.5055850233316258
The average accuracy is:  0.5055850233316258 and the std is:  0.0038026285555629036
This is iteration: 17 out of 89




0.5050343712837136
The average accuracy is:  0.5050343712837136 and the std is:  0.0039906352952062685
This is iteration: 18 out of 89




0.5055094903000927
The average accuracy is:  0.5055094903000927 and the std is:  0.004407623770229296
This is iteration: 19 out of 89




0.516060877236349
The average accuracy is:  0.516060877236349 and the std is:  0.007188405259986874
This is iteration: 20 out of 89




0.5152496997874245
The average accuracy is:  0.5152496997874245 and the std is:  0.007183722440819995
This is iteration: 21 out of 89




0.5152267429649686
The average accuracy is:  0.5152267429649686 and the std is:  0.00734447843978614
This is iteration: 22 out of 89




0.5156046722938418
The average accuracy is:  0.5156046722938418 and the std is:  0.00624289806340228
This is iteration: 23 out of 89




0.5159814112088708
The average accuracy is:  0.5159814112088708 and the std is:  0.006501937180289574
This is iteration: 24 out of 89




0.5147751454058749
The average accuracy is:  0.5147751454058749 and the std is:  0.007931449876455665
This is iteration: 25 out of 89




0.5137303119669732
The average accuracy is:  0.5137303119669732 and the std is:  0.008796528384769187
This is iteration: 26 out of 89




0.5124920893736093
The average accuracy is:  0.5124920893736093 and the std is:  0.007623464052488372
This is iteration: 27 out of 89




0.5128976865243201
The average accuracy is:  0.5128976865243201 and the std is:  0.006277170167270807
This is iteration: 28 out of 89




0.511944073339017
The average accuracy is:  0.511944073339017 and the std is:  0.005561422174068187
This is iteration: 29 out of 89




0.5120537225495578
The average accuracy is:  0.5120537225495578 and the std is:  0.00545098422832091
This is iteration: 30 out of 89




0.512155902032549
The average accuracy is:  0.512155902032549 and the std is:  0.00571009266915623
This is iteration: 31 out of 89




0.5117971692324375
The average accuracy is:  0.5117971692324375 and the std is:  0.006113173197224819
This is iteration: 32 out of 89




0.5118770877772848
The average accuracy is:  0.5118770877772848 and the std is:  0.005939842545877094
This is iteration: 33 out of 89




0.5119042234991087
The average accuracy is:  0.5119042234991087 and the std is:  0.006588591389107168
This is iteration: 34 out of 89




0.5117750159362633
The average accuracy is:  0.5117750159362633 and the std is:  0.004605552172509464
This is iteration: 35 out of 89




0.511710948170055
The average accuracy is:  0.511710948170055 and the std is:  0.005565037287601493
This is iteration: 36 out of 89




0.5110330497004819
The average accuracy is:  0.5110330497004819 and the std is:  0.005659377392514764
This is iteration: 37 out of 89




0.5107017713418864
The average accuracy is:  0.5107017713418864 and the std is:  0.0050429435015468815
This is iteration: 38 out of 89




0.5117196551976171
The average accuracy is:  0.5117196551976171 and the std is:  0.004285326966350311
This is iteration: 39 out of 89




0.511681307838433
The average accuracy is:  0.511681307838433 and the std is:  0.005365024038996857
This is iteration: 40 out of 89




0.5113114602155903
The average accuracy is:  0.5113114602155903 and the std is:  0.005325363847943687
This is iteration: 41 out of 89




0.5116690829505638
The average accuracy is:  0.5116690829505638 and the std is:  0.0053290195786138694
This is iteration: 42 out of 89




0.5122069485573204
The average accuracy is:  0.5122069485573204 and the std is:  0.0055526127347717
This is iteration: 43 out of 89




0.5118921399931134
The average accuracy is:  0.5118921399931134 and the std is:  0.0050535488221746805
This is iteration: 44 out of 89




0.5114837992599784
The average accuracy is:  0.5114837992599784 and the std is:  0.00466855094486196
This is iteration: 45 out of 89




0.5116378444989956
The average accuracy is:  0.5116378444989956 and the std is:  0.005680854843859384
This is iteration: 46 out of 89




0.5118964323549333
The average accuracy is:  0.5118964323549333 and the std is:  0.005242898240000127
This is iteration: 47 out of 89




0.5124154281145094
The average accuracy is:  0.5124154281145094 and the std is:  0.005220750991171141
This is iteration: 48 out of 89




0.5126478668406593
The average accuracy is:  0.5126478668406593 and the std is:  0.005393891985706389
This is iteration: 49 out of 89




0.5126919383566907
The average accuracy is:  0.5126919383566907 and the std is:  0.004992535669035348
This is iteration: 50 out of 89




0.5133659216477382
The average accuracy is:  0.5133659216477382 and the std is:  0.004855471260317268
This is iteration: 51 out of 89




0.5136056723747895
The average accuracy is:  0.5136056723747895 and the std is:  0.005287363885650098
This is iteration: 52 out of 89




0.5133018923120783
The average accuracy is:  0.5133018923120783 and the std is:  0.005052490882036076
This is iteration: 53 out of 89




0.5130953777944753
The average accuracy is:  0.5130953777944753 and the std is:  0.004399753597513545
This is iteration: 54 out of 89




0.5128267278636736
The average accuracy is:  0.5128267278636736 and the std is:  0.00440163638815394
This is iteration: 55 out of 89




0.5130321305890689
The average accuracy is:  0.5130321305890689 and the std is:  0.004509708331548845
This is iteration: 56 out of 89




0.5128007005562327
The average accuracy is:  0.5128007005562327 and the std is:  0.005148003183755745
This is iteration: 57 out of 89




0.5127489869722095
The average accuracy is:  0.5127489869722095 and the std is:  0.00506545248939827
This is iteration: 58 out of 89




0.5124444474287849
The average accuracy is:  0.5124444474287849 and the std is:  0.004421262929303227
This is iteration: 59 out of 89




0.5123796265565558
The average accuracy is:  0.5123796265565558 and the std is:  0.004889543567297094
This is iteration: 60 out of 89




0.5120228201486527
The average accuracy is:  0.5120228201486527 and the std is:  0.005070061071110189
This is iteration: 61 out of 89




0.5120575059833329
The average accuracy is:  0.5120575059833329 and the std is:  0.005481227917410458
This is iteration: 62 out of 89




0.5121810479188067
The average accuracy is:  0.5121810479188067 and the std is:  0.005082950718679813
This is iteration: 63 out of 89




0.5122348823903198
The average accuracy is:  0.5122348823903198 and the std is:  0.004761549903255613
This is iteration: 64 out of 89




0.5121274383739244
The average accuracy is:  0.5121274383739244 and the std is:  0.004464237480686695
This is iteration: 65 out of 89




0.5119494946849226
The average accuracy is:  0.5119494946849226 and the std is:  0.004598676703092219
This is iteration: 66 out of 89




0.5120689304132261
The average accuracy is:  0.5120689304132261 and the std is:  0.005249768929628917
This is iteration: 67 out of 89




0.5119363068170837
The average accuracy is:  0.5119363068170837 and the std is:  0.005337223671741498
This is iteration: 68 out of 89




0.5119086431098117
The average accuracy is:  0.5119086431098117 and the std is:  0.005006182961765804
This is iteration: 69 out of 89




0.5118553154387029
The average accuracy is:  0.5118553154387029 and the std is:  0.004655706115133218
This is iteration: 70 out of 89




0.5118282841132814
The average accuracy is:  0.5118282841132814 and the std is:  0.004771847641081517
This is iteration: 71 out of 89




0.5117936519322661
The average accuracy is:  0.5117936519322661 and the std is:  0.005029667227277601
This is iteration: 72 out of 89




0.5117755374231121
The average accuracy is:  0.5117755374231121 and the std is:  0.005047781736431611
This is iteration: 73 out of 89




0.5118114032231024
The average accuracy is:  0.5118114032231024 and the std is:  0.005102191893530783
This is iteration: 74 out of 89




0.511757721212892
The average accuracy is:  0.511757721212892 and the std is:  0.004841227287923022
This is iteration: 75 out of 89




0.5117391649944809
The average accuracy is:  0.5117391649944809 and the std is:  0.004680546928099272
This is iteration: 76 out of 89




In [None]:
pd.DataFrame(dict(logit_acc = logit_acc, logit_std = logit_std))

In [None]:
percept_acc, percept_std = run_indices(X_train, y_train, clf2, indices)

In [None]:
pd.DataFrame(dict(percept_acc = percept_acc, percept_std = percept_std))

In [None]:
NB_acc, NB_std = run_indices(X_train, y_train, clf3, indices)

In [None]:
pd.DataFrame(dict(NB_acc = NB_acc, NB_std = NB_std))

In [None]:
c45_acc, c45_std = run_indices(X_train, y_train, clf4, indices)

In [None]:
pd.DataFrame(dict(c45_acc = c45_acc, c45_std = c45_std))

In [None]:
svm_acc = run_indices(X_train, y_train, clf5, indices)

In [None]:
pd.DataFrame(svm_acc)

In [None]:
KNN_acc = run_indices(X_train, y_train, clf6, indices)

In [None]:
pd.DataFrame(KNN_acc)