In [115]:
import scipy.io as sio
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC #using linearSVC instead of SVC(kernel = 'linear') to improve runtime.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Load Datasets

### 1. Adult Dataset
- Census data, used to predict if a person make over 50k a year
- Includes attributes such as age, gender, race, employment etc.
- http://archive.ics.uci.edu/ml/datasets/Adult
- { <=50k = 0, >50k =1}

In [8]:
adult_df = pd.read_csv('adult.data', header=None, na_values="?" )

adult_df = adult_df.drop(adult_df[adult_df.isnull().any(axis=1)].index, axis=0) #dropping rows w/ null vals

#label encoding to change categorical data, used link posted on Piazza as help
encoder = LabelEncoder()
adult_df = adult_df.apply(encoder.fit_transform)


#5000 data points to assist with computation time)
adult_df = adult_df.iloc[:5000]


#normalizing data, used to improve SVM performanc, still not sure why it's poorer than others.
scaler = StandardScaler()
adult = pd.DataFrame((scaler.fit_transform(adult_df)).astype('int'), columns = adult_df.columns)

adult.head(10)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0,2,-1,0,1,0,-1,0,0,0,0,0,0,0,0
1,0,1,-1,0,1,0,0,0,0,0,0,0,-2,0,0
2,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0
3,1,0,0,-2,-1,0,0,0,-1,0,0,0,0,0,0
4,0,0,1,0,1,0,0,2,-1,-1,0,0,0,-3,0
5,0,0,1,0,1,0,0,2,0,-1,0,0,0,0,0
6,0,0,0,-1,-2,0,0,0,-1,-1,0,0,-2,-1,0
7,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
8,0,0,-1,0,1,0,0,0,0,-1,4,0,0,0,1
9,0,0,0,0,1,0,0,0,0,0,3,0,0,0,1


In [9]:
print(adult.shape)

(5000, 15)


### 2. Iris
- The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other.
- Problem is to predict type of iris based on features
- http://archive.ics.uci.edu/ml/datasets/Iris

In [61]:
#used from HW2

iris = datasets.load_iris()
df_iris = pd.DataFrame(iris.data, columns=['0', '1', '2', '3'])
df_iris['target'] = iris.target.tolist()
print(df_iris.shape)

(150, 5)


In [62]:
#getting nonseparable targets

df_iris = df_iris[df_iris['target'] != 0]
print(df_iris.shape)
df_iris.head()

(100, 5)


Unnamed: 0,0,1,2,3,target
50,7.0,3.2,4.7,1.4,1
51,6.4,3.2,4.5,1.5,1
52,6.9,3.1,4.9,1.5,1
53,5.5,2.3,4.0,1.3,1
54,6.5,2.8,4.6,1.5,1


### 3. Letter Recognition
- http://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/
- Database of character image features; try to identify the letter
- letters A-M will be positive, and N-Z negative, to make it more balanced, so problem will be to predict if A-M or if N-Z
- Identify black-and-white rectangular pixel displays as one of the 26 capital letters in the alphabet. Character images were based on 20 different fonts and each letter was randomly distorted to produce 20,000 unique stimuli. Each stimulus was converted into 16 primitive numerical attributes which were scaled to fit values from 0 through 15.

In [73]:
letter_df = pd.read_csv('letter-recognition.data', header=None, na_values="?" )

letter_df = letter_df.drop(letter_df[letter_df.isnull().any(axis=1)].index, axis=0) #dropping rows w/ null vals

letter_df['labels'] = [ord(x)%32 for x in letter_df[0]] #converting Letters to Ints 1 - 27

letter_df['labels'] = np.where(letter_df['labels']>=13, 1, 0) # making binary labels for classification

letter_df = letter_df.drop(letter_df.columns[0], axis = 1)


labels = letter_df['labels'] #store the y labels row since we don't want to normalize those


scaler = StandardScaler()
#adult = (scaler.fit_transform(adult_df)).astype('int')
letter = pd.DataFrame((scaler.fit_transform(letter_df)).astype('int'), columns = letter_df.columns)

letter['labels'] = labels

#5000 data points(to assist with computation time)
letter = letter.iloc[:5000]



letter.head(10)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,labels
0,-1,0,-1,0,-1,0,2,-1,0,0,1,0,-1,0,-1,0,1
1,0,1,-1,0,0,1,-1,0,0,1,-1,0,0,0,0,1,0
2,0,1,0,1,1,1,0,0,0,0,-1,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,-1,0,0,1,1,0,0,1
4,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,1,0
5,0,1,0,1,0,0,0,0,1,-1,0,0,-1,0,2,0,1
6,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
7,-1,-1,-1,-1,-1,0,-2,0,-1,0,-1,0,0,-1,0,0,0
8,-1,-1,0,0,0,1,0,0,0,1,0,0,0,-1,-1,0,0
9,3,2,3,1,1,3,-2,0,-1,1,-2,0,2,-4,-1,0,1


In [40]:
letter.shape

(5000, 17)

### 4. Mushroom
- http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/
- mushrooms described in terms of physical characteristics such as shape and color
- Problem is to predit if given mushroom is poisonous or edible
- col 0 is labels, p = posion, e = edible
- convert to p = 1, e = 0

In [41]:
mush_df = pd.read_csv('agaricus-lepiota.data', header=None, na_values="?" )

mush_df['labels'] = mush_df[0] #storing labels bc they were being encoded strangely by LabelEncoder

mush_df = mush_df.drop(mush_df.columns[0], axis = 1)

mush_df = mush_df.drop(mush_df[mush_df.isnull().any(axis=1)].index, axis=0) #dropping rows w/ null vals

#mush_df.dtypes, checking datatypes

encoder = LabelEncoder() #encoding data
mush = mush_df.apply(encoder.fit_transform)

mush = mush.iloc[:5000]

mush.head(10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,14,15,16,17,18,19,20,21,22,labels
0,5,2,4,1,6,1,0,1,2,0,...,5,5,0,0,1,3,1,3,5,1
1,5,2,7,1,0,1,0,0,2,0,...,5,5,0,0,1,3,2,2,1,0
2,0,2,6,1,3,1,0,0,3,0,...,5,5,0,0,1,3,2,2,3,0
3,5,3,6,1,6,1,0,1,3,0,...,5,5,0,0,1,3,1,3,5,1
4,5,2,3,0,5,1,1,0,2,1,...,5,5,0,0,1,0,2,0,1,0
5,5,3,7,1,0,1,0,0,3,0,...,5,5,0,0,1,3,1,2,1,0
6,0,2,6,1,0,1,0,0,0,0,...,5,5,0,0,1,3,1,2,3,0
7,0,3,6,1,3,1,0,0,3,0,...,5,5,0,0,1,3,2,3,3,0
8,5,3,6,1,6,1,0,1,4,0,...,5,5,0,0,1,3,1,4,1,1
9,0,2,7,1,0,1,0,0,0,0,...,5,5,0,0,1,3,1,3,3,0


### 5. Parkinson's
- Original dataset used voice measurements to discriminate between healhy patients and those with Parkinson's.
- Predict according to "status" column which is set to 0 for healthy and 1 for PD.
- The name col will be dropped

In [42]:
#import and clean data
park_df = pd.read_csv('parkinsons.data') 
park_df = park_df.drop(columns=['name']) #dropping id column

park_df['labels'] = park_df['status']
park_df = park_df.drop(columns=['status'])

scaler = StandardScaler()
park = pd.DataFrame((scaler.fit_transform(park_df)).astype('int'), columns = park_df.columns) #normalizing data

park.head(10)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE,labels
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,1,0,1,0,1,1,...,1,0,0,0,1,1,1,0,1,0
2,0,0,0,0,1,0,1,0,1,1,...,1,0,0,0,1,1,1,0,1,0
3,0,0,0,0,1,0,1,0,1,1,...,1,0,0,0,1,1,1,0,1,0
4,0,0,0,1,1,1,2,1,1,1,...,1,0,0,0,1,1,0,0,2,0
5,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,1,0,0,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1,0,0
7,-1,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
8,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
9,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0


## Classifier Functions

In [43]:
def results(clf, xTest, yTest):
    
    print ("accuracy score: " + str(clf.score(xTest, yTest)))
    return clf.score(xTest, yTest)

### 1 Linear SVM

In [44]:
def svm(x_train, y_train):
    
    
    #scaling = preprocessing.RobustScaler()
    #x_train = scaling.fit_transform(x_train)
    
    clf = LinearSVC(loss="hinge") #hinge loss improved performance
    param_grid = {'C': [10e-7, 10e-6, 10e-5, 10**-4,10e-3, 10**-2, 10**-1, 1]}# Different C to try.
    clas = GridSearchCV(clf, param_grid = param_grid, cv = 5, return_train_score=True, n_jobs=-1)#identify the best C
    
    clas.fit(x_train, y_train) #fit the classifier w/ train data
    
    print("C" + str(clas.best_estimator_.C))
    
    return clas

### 2 Decision Tree

In [45]:
def decision_tree(x_train, y_train):
    clf = DecisionTreeClassifier()
    param_grid = {'max_depth':[None, 2, 4, 8],}
    clas = GridSearchCV(estimator = clf, param_grid = param_grid, cv = 5, return_train_score=True )

    clas.fit(x_train, y_train)
    
    return clas

### 3. KNN

In [46]:
def kNN(x_train, y_train):
    clf = KNeighborsClassifier()
    
    param_grid = {'n_neighbors': [1,2,3,4,5,6,7]} #tried using 26 large k, spaced out. but acc was very loww every time
    clas = GridSearchCV(estimator = clf, param_grid = param_grid, cv = 5, return_train_score=True )

    clas.fit(x_train, y_train)
    
    print("K" + str(clas.best_estimator_.n_neighbors))
    
    return clas


### 4. Random Forest

In [105]:
def forest(x_train, y_train):
    clf = RandomForestClassifier(n_estimators=1024, n_jobs=-1, criterion='entropy')
    
    #max_depth = {'max_depth':[None, 0, 4, 8, 16]}
    
    #max features converted to float because dataset entries are float types
    #based on sklearn documentation: 
    #https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    
    param_grid = {
        'max_features': [1,2,4,6,8,12]} 
    
    clas = GridSearchCV(clf, param_grid, cv=5, return_train_score=True)
    clas.fit(x_train, y_train)
    
    return clas
    

### 5. Logistic Regression

In [48]:
def log(x_train, y_train):
    
    clf = LogisticRegression(solver='liblinear', multi_class='ovr')
    
    param_grid = {'C' : [0.001, 0.01, 0.1, 1, 10]}
    
    clas = GridSearchCV(estimator = clf, param_grid = param_grid, cv = 5, return_train_score=True )

    clas.fit(x_train, y_train)
    
    return clas


## Classification Methods

#### Functions to Shuffle, Partition, Cross-Validate, and Test Datasets over Classifiers for 3 Trials and 3 Partitions

In [49]:
#function to partition datasets
def partition(train_size, length, Dataset): 

    X = Dataset.iloc[:, :length]    # First column to second last column: Features (numerical values) 
    Y = Dataset.iloc[:, length]
    X_train_val = X.iloc[:int(train_size * len(X)), :] # Get features from train + val set.        
    X_test = X.iloc[int(train_size * len(X)):, :] # Get features from test set.     
    Y_train_val = Y.iloc[:int(train_size * len(X))] # Get labels from train + val set.
    Y_test = Y.iloc[int(train_size * len(X)):]# Get labels from test set.  
        
    return X_train_val, X_test, Y_train_val, Y_test
        
        
        

In [56]:
def classifier_single_results(clas, data, train_size, df_length):

    test = 0
    train = 0
    val = 0
        
    X_train_val, X_test, Y_train_val, Y_test = partition(train_size, df_length, data) #getting partitions
    
    clf = clas(X_train_val, Y_train_val) #training on classifier

    test = (results(clf, X_test, Y_test))
    train = (clf.cv_results_['mean_train_score'])
    val = (clf.cv_results_['mean_test_score'])

    #print((clf.cv_results_['mean_train_score']))
    #print((clf.cv_results_['mean_test_score']))
    
    return test, train, val

In [51]:
def shuffle(dataset):
    D1 = dataset.sample(frac=1)
    D2 = dataset.sample(frac=1)
    D3 = dataset.sample(frac=1)
    return D1, D2, D3


In [57]:
def get_results(datset_list, df_length, classifiers, train_size, DF):
    
    #to help store test results, easier for final comparison
    i=0
    if train_size == 0.2:
        j = 0
    if train_size == 0.5:
        j = 1
    if train_size == 0.8:
        j = 2
    for clas in classifiers: #loop thru each classifier over single dataset
        
        
        
        print("train size is: " + str(int(train_size * 100)))
        print(clas.__name__)

        train = []
        val = []
        test = []

        for d in range(len(dataset_list)): # three trials, each w/ differently shuffled data, but same partitions


            test1, train1, val1 = classifier_single_results(clas, dataset_list[d], train_size, df_length)
            

            test.append(test1)
            train.append(train1)
            val.append(val1)

            
        train = max(np.array(train).mean(axis=0))
        val = max(np.array(val).mean(axis=0))
        test = np.mean(test)

        print("avg train acc : " + str(train))
        print("avg val acc: " + str(val))
    
        
        print("avg test acc: " + str(test))


        DF.iloc[j, i] = test

        print('\n')
        i = 1+i
    return DF


### Running and Testing Classifiers

#### Iris

In [63]:
classifiers = [kNN, decision_tree, svm, log, forest] 

partition_size = [0.2, 0.5, 0.8]

cols = ['knn', 'tree', 'svm', 'log', 'forest']
index = [20,50,80]

iris_avg= pd.DataFrame(index=index, columns=cols)


dataset_list = shuffle(df_iris) #shuffling data 3 times, unified across partitions and classifiers

get_results(dataset_list, 4, classifiers, partition_size[0], iris_avg)
get_results(dataset_list, 4, classifiers, partition_size[1], iris_avg)
get_results(dataset_list, 4, classifiers, partition_size[2], iris_avg)

train size is: 20
kNN
K1
accuracy score: 0.9375




K7
accuracy score: 0.9
K1
accuracy score: 0.9625
avg train acc : 1.0
avg val acc: 0.9166666666666666
avg test acc: 0.9333333333333332


train size is: 20
decision_tree




accuracy score: 0.925
accuracy score: 0.875
accuracy score: 0.925
avg train acc : 1.0
avg val acc: 0.9500000000000001
avg test acc: 0.9083333333333333


train size is: 20
svm




C1
accuracy score: 0.95
C1
accuracy score: 0.9625




C1
accuracy score: 0.95
avg train acc : 0.9621568627450982
avg val acc: 0.9499999999999998
avg test acc: 0.9541666666666666


train size is: 20
log
accuracy score: 0.9




accuracy score: 0.9375
accuracy score: 0.925
avg train acc : 0.9705228758169935
avg val acc: 0.9499999999999998
avg test acc: 0.9208333333333334


train size is: 20
forest




accuracy score: 0.925




accuracy score: 0.9125
accuracy score: 0.925
avg train acc : 1.0
avg val acc: 0.9333333333333332
avg test acc: 0.9208333333333334


train size is: 50
kNN
K1
accuracy score: 0.88
K1
accuracy score: 0.92
K1
accuracy score: 0.94
avg train acc : 1.0
avg val acc: 0.98
avg test acc: 0.9133333333333334


train size is: 50
decision_tree




accuracy score: 0.96
accuracy score: 0.92
accuracy score: 0.96
avg train acc : 1.0
avg val acc: 0.9066666666666667
avg test acc: 0.9466666666666667


train size is: 50
svm




C1
accuracy score: 0.94
C0.1
accuracy score: 0.96




C1
accuracy score: 0.96
avg train acc : 0.9832885136543673
avg val acc: 0.9733333333333333
avg test acc: 0.9533333333333333


train size is: 50
log
accuracy score: 0.94
accuracy score: 0.94
accuracy score: 0.92
avg train acc : 0.9866218469877005
avg val acc: 0.9733333333333333
avg test acc: 0.9333333333333332


train size is: 50
forest




accuracy score: 0.94
accuracy score: 0.86




accuracy score: 0.96
avg train acc : 1.0
avg val acc: 0.9466666666666667
avg test acc: 0.9199999999999999


train size is: 80
kNN
K5
accuracy score: 1.0




K3
accuracy score: 0.9
K7
accuracy score: 0.9
avg train acc : 1.0
avg val acc: 0.9583333333333334
avg test acc: 0.9333333333333332


train size is: 80
decision_tree




accuracy score: 0.95
accuracy score: 0.9
accuracy score: 0.9
avg train acc : 1.0
avg val acc: 0.9333333333333332
avg test acc: 0.9166666666666666


train size is: 80
svm




C0.1
accuracy score: 1.0
C0.1
accuracy score: 0.95




C1
accuracy score: 0.9
avg train acc : 0.9770477207977208
avg val acc: 0.9708333333333333
avg test acc: 0.9500000000000001


train size is: 80
log
accuracy score: 1.0
accuracy score: 0.95




accuracy score: 0.85
avg train acc : 0.9801732295482296
avg val acc: 0.9666666666666668
avg test acc: 0.9333333333333332


train size is: 80
forest




accuracy score: 1.0
accuracy score: 0.9
accuracy score: 0.9
avg train acc : 1.0
avg val acc: 0.9333333333333332
avg test acc: 0.9333333333333332




Unnamed: 0,knn,tree,svm,log,forest
20,0.933333,0.908333,0.954167,0.920833,0.920833
50,0.913333,0.946667,0.953333,0.933333,0.92
80,0.933333,0.916667,0.95,0.933333,0.933333


this comparison on the small dataset fits nicely with results of the paper. Decision tree performing worst, while svm models performing best and KNN is really not that bad.

#### Letter

In [74]:
classifiers = [kNN, decision_tree, svm, log] 

partition_size = [0.2, 0.5, 0.8]

cols = ['knn', 'tree',  'svm', 'log']
index = [20,50,80]

letter_avg= pd.DataFrame(index=index, columns=cols)
#accuracy_scores.iloc[0, 4] = 80

dataset_list = shuffle(letter) #shuffling data 3 times, unified across partitions and classifiers

In [75]:
get_results(dataset_list, 16, classifiers, partition_size[0], letter_avg)
get_results(dataset_list, 16, classifiers, partition_size[1], letter_avg)
get_results(dataset_list, 16, classifiers, partition_size[2], letter_avg)

print(letter_avg)

train size is: 20
kNN
K5
accuracy score: 0.759
K1
accuracy score: 0.77075
K1
accuracy score: 0.777
avg train acc : 0.9614155053367271
avg val acc: 0.774
avg test acc: 0.7689166666666667


train size is: 20
decision_tree
accuracy score: 0.766
accuracy score: 0.77625
accuracy score: 0.77775
avg train acc : 0.9671656117171018
avg val acc: 0.7633333333333333
avg test acc: 0.7733333333333334


train size is: 20
svm




C1
accuracy score: 0.70975
C0.1
accuracy score: 0.7105




C1
accuracy score: 0.71675
avg train acc : 0.7243345746113145
avg val acc: 0.7113333333333333
avg test acc: 0.7123333333333334


train size is: 20
log
accuracy score: 0.70125
accuracy score: 0.7005
accuracy score: 0.70025
avg train acc : 0.7164165481768983
avg val acc: 0.706
avg test acc: 0.7006666666666668


train size is: 50
kNN
K5
accuracy score: 0.8104
K5
accuracy score: 0.804
K3
accuracy score: 0.802
avg train acc : 0.9437670590334314
avg val acc: 0.8052
avg test acc: 0.8054666666666668


train size is: 50
decision_tree
accuracy score: 0.7996
accuracy score: 0.8048
accuracy score: 0.7972
avg train acc : 0.9528000603333484
avg val acc: 0.8081333333333333
avg test acc: 0.8005333333333334


train size is: 50
svm
C0.1
accuracy score: 0.7064




C1
accuracy score: 0.7148




C1
accuracy score: 0.6972
avg train acc : 0.7218992292831405
avg val acc: 0.7146666666666667
avg test acc: 0.7061333333333334


train size is: 50
log
accuracy score: 0.7036
accuracy score: 0.7072
accuracy score: 0.704
avg train acc : 0.715934078641853
avg val acc: 0.7096
avg test acc: 0.7049333333333333


train size is: 80
kNN
K7
accuracy score: 0.836
K7
accuracy score: 0.804
K7
accuracy score: 0.812
avg train acc : 0.9338332052795285
avg val acc: 0.816
avg test acc: 0.8173333333333334


train size is: 80
decision_tree
accuracy score: 0.836
accuracy score: 0.826
accuracy score: 0.804
avg train acc : 0.944145653928612
avg val acc: 0.8116666666666666
avg test acc: 0.8220000000000001


train size is: 80
svm




C1
accuracy score: 0.695




C1
accuracy score: 0.739




C1
accuracy score: 0.718
avg train acc : 0.7199999752502418
avg val acc: 0.7156666666666668
avg test acc: 0.7173333333333334


train size is: 80
log
accuracy score: 0.687
accuracy score: 0.727
accuracy score: 0.703
avg train acc : 0.7142502222269912
avg val acc: 0.7105
avg test acc: 0.7056666666666667


         knn      tree       svm       log
20  0.768917  0.773333  0.712333  0.700667
50  0.805467  0.800533  0.706133  0.704933
80  0.817333     0.822  0.717333  0.705667


#### Adult 

In [97]:
classifiers = [kNN, decision_tree, svm, log] 

partition_size = [0.2, 0.5, 0.8]

cols = ['knn', 'tree', 'svm', 'log']
index = [20,50,80]

adult_avg= pd.DataFrame(index=index, columns=cols)
#accuracy_scores.iloc[0, 4] = 80

dataset_list = shuffle(adult) #shuffling data 3 times, unified across partitions and classifiers


In [98]:
get_results(dataset_list, 14, classifiers, partition_size[0], adult_avg)
get_results(dataset_list, 14, classifiers, partition_size[1], adult_avg)
get_results(dataset_list, 14, classifiers, partition_size[2], adult_avg)

print(adult_avg)

train size is: 20
kNN
K6
accuracy score: 0.80925
K6
accuracy score: 0.8165
K7
accuracy score: 0.80175
avg train acc : 0.9292484958309831
avg val acc: 0.8069999999999999
avg test acc: 0.8091666666666667


train size is: 20
decision_tree
accuracy score: 0.8105
accuracy score: 0.796
accuracy score: 0.805
avg train acc : 0.9424993390614672
avg val acc: 0.8046666666666668
avg test acc: 0.8038333333333334


train size is: 20
svm




C1
accuracy score: 0.8055
C0.1
accuracy score: 0.80775




C0.1
accuracy score: 0.8015
avg train acc : 0.8057514423199619
avg val acc: 0.799
avg test acc: 0.8049166666666666


train size is: 20
log
accuracy score: 0.809
accuracy score: 0.803
accuracy score: 0.805
avg train acc : 0.8179168662763535
avg val acc: 0.8080000000000002
avg test acc: 0.8056666666666668


train size is: 50
kNN
K6
accuracy score: 0.8084
K6
accuracy score: 0.8072
K6
accuracy score: 0.804
avg train acc : 0.9101998017749504
avg val acc: 0.8118666666666666
avg test acc: 0.8065333333333333


train size is: 50
decision_tree
accuracy score: 0.8028
accuracy score: 0.8056
accuracy score: 0.81
avg train acc : 0.9265335710250594
avg val acc: 0.8138666666666667
avg test acc: 0.8061333333333334


train size is: 50
svm
C0.01
accuracy score: 0.7952
C0.01
accuracy score: 0.8052
C0.01
accuracy score: 0.8032
avg train acc : 0.8076339179251462
avg val acc: 0.8073333333333332
avg test acc: 0.8012


train size is: 50
log
accuracy score: 0.7984
accuracy score: 0.8088
accuracy score: 0.8076
a

In [99]:
adult_avg

Unnamed: 0,knn,tree,svm,log
20,0.809167,0.803833,0.804917,0.805667
50,0.806533,0.806133,0.8012,0.804933
80,0.811,0.816667,0.809333,0.806333


#### Mushroom 

In [100]:
classifiers = [kNN, decision_tree, svm, log] 

partition_size = [0.2, 0.5, 0.8]

cols = ['knn', 'tree', 'svm', 'log']
index = [20,50,80]

mush_avg= pd.DataFrame(index=index, columns=cols)
#accuracy_scores.iloc[0, 4] = 80

dataset_list = shuffle(mush) #shuffling data 3 times, unified across partitions and classifiers

In [101]:
get_results(dataset_list, 22, classifiers, partition_size[0], mush_avg)
get_results(dataset_list, 22, classifiers, partition_size[1], mush_avg)
get_results(dataset_list, 22, classifiers, partition_size[2], mush_avg)

print(mush_avg)

train size is: 20
kNN
K1
accuracy score: 0.99675
K2
accuracy score: 0.99675
K1
accuracy score: 0.99475
avg train acc : 1.0
avg val acc: 0.9963333333333333
avg test acc: 0.9960833333333333


train size is: 20
decision_tree
accuracy score: 0.99875
accuracy score: 0.99975
accuracy score: 0.9995
avg train acc : 1.0
avg val acc: 0.9980000000000001
avg test acc: 0.9993333333333334


train size is: 20
svm




C1
accuracy score: 0.995




C1
accuracy score: 0.99775




C1
accuracy score: 0.995
avg train acc : 0.9987502070315735
avg val acc: 0.9980000000000001
avg test acc: 0.9959166666666667


train size is: 20
log
accuracy score: 0.997
accuracy score: 0.9985
accuracy score: 0.99525
avg train acc : 0.999333540885741
avg val acc: 0.9966666666666667
avg test acc: 0.9969166666666668


train size is: 50
kNN
K2
accuracy score: 0.9972
K1
accuracy score: 0.9992
K1
accuracy score: 0.9984
avg train acc : 1.0
avg val acc: 0.9977333333333332
avg test acc: 0.9982666666666665


train size is: 50
decision_tree
accuracy score: 1.0
accuracy score: 1.0
accuracy score: 1.0
avg train acc : 1.0
avg val acc: 0.9996
avg test acc: 1.0


train size is: 50
svm




C1
accuracy score: 0.9984




C1
accuracy score: 0.9992




C1
accuracy score: 0.9972
avg train acc : 0.9976666663833335
avg val acc: 0.9969333333333333
avg test acc: 0.9982666666666665


train size is: 50
log
accuracy score: 0.9984
accuracy score: 0.9988
accuracy score: 0.9976
avg train acc : 0.9982999830916626
avg val acc: 0.9978666666666666
avg test acc: 0.9982666666666665


train size is: 80
kNN
K1
accuracy score: 1.0
K1
accuracy score: 1.0
K1
accuracy score: 1.0
avg train acc : 1.0
avg val acc: 0.9985833333333334
avg test acc: 1.0


train size is: 80
decision_tree
accuracy score: 1.0
accuracy score: 0.999
accuracy score: 1.0
avg train acc : 1.0
avg val acc: 1.0
avg test acc: 0.9996666666666667


train size is: 80
svm
C1
accuracy score: 0.999




C1
accuracy score: 1.0




C1
accuracy score: 0.998
avg train acc : 0.9980416665934245
avg val acc: 0.9977499999999999
avg test acc: 0.999


train size is: 80
log
accuracy score: 1.0
accuracy score: 0.999
accuracy score: 0.998
avg train acc : 0.9984374999389649
avg val acc: 0.9981666666666666
avg test acc: 0.999


         knn      tree       svm       log
20  0.996083  0.999333  0.995917  0.996917
50  0.998267         1  0.998267  0.998267
80         1  0.999667     0.999     0.999


#### Parkinson's

In [85]:
classifiers = [kNN, decision_tree, svm, log] 

partition_size = [0.2, 0.5, 0.8]

cols = ['knn', 'tree', 'svm', 'log']
index = [20,50,80]

park_avg= pd.DataFrame(index=index, columns=cols)

dataset_list = shuffle(park) #shuffling data 3 times, unified across partitions and classifiers

In [87]:
get_results(dataset_list, 22, classifiers, partition_size[0], park_avg)
get_results(dataset_list, 22, classifiers, partition_size[1], park_avg)
get_results(dataset_list, 22, classifiers, partition_size[2], park_avg)

train size is: 20
kNN
K5
accuracy score: 0.7243589743589743




K5
accuracy score: 0.8461538461538461
K1
accuracy score: 0.8076923076923077
avg train acc : 0.9850089605734768
avg val acc: 0.8376068376068376
avg test acc: 0.7927350427350427


train size is: 20
decision_tree




accuracy score: 0.8076923076923077
accuracy score: 0.8141025641025641
accuracy score: 0.8589743589743589
avg train acc : 0.9850089605734768
avg val acc: 0.7777777777777778
avg test acc: 0.826923076923077


train size is: 20
svm




C0.1
accuracy score: 0.7243589743589743
C0.1
accuracy score: 0.8461538461538461




C0.0001
accuracy score: 0.8525641025641025
avg train acc : 0.925179211469534
avg val acc: 0.8205128205128206
avg test acc: 0.8076923076923076


train size is: 20
log
accuracy score: 0.7948717948717948
accuracy score: 0.8525641025641025




accuracy score: 0.8525641025641025
avg train acc : 0.940165770609319
avg val acc: 0.8205128205128206
avg test acc: 0.8333333333333334


train size is: 50
kNN




K3
accuracy score: 0.8367346938775511




K1
accuracy score: 0.7040816326530612




K3
accuracy score: 0.826530612244898
avg train acc : 0.9587164741395254
avg val acc: 0.8213058419243987
avg test acc: 0.7891156462585034


train size is: 50
decision_tree
accuracy score: 0.8367346938775511
accuracy score: 0.826530612244898




accuracy score: 0.9081632653061225
avg train acc : 0.971637763376604
avg val acc: 0.8144329896907218
avg test acc: 0.8571428571428572


train size is: 50
svm




C0.1
accuracy score: 0.826530612244898
C0.1
accuracy score: 0.8469387755102041




C0.01
accuracy score: 0.8469387755102041
avg train acc : 0.8917680254455737
avg val acc: 0.8350515463917526
avg test acc: 0.8401360544217688


train size is: 50
log
accuracy score: 0.8061224489795918
accuracy score: 0.8469387755102041




accuracy score: 0.8877551020408163
avg train acc : 0.9046337924419202
avg val acc: 0.8281786941580757
avg test acc: 0.8469387755102041


train size is: 80
kNN




K3
accuracy score: 0.8205128205128205




K7
accuracy score: 0.8974358974358975
K7
accuracy score: 0.8717948717948718
avg train acc : 0.9556682710360128
avg val acc: 0.8354700854700855
avg test acc: 0.8632478632478633


train size is: 80
decision_tree




accuracy score: 0.8205128205128205
accuracy score: 0.8974358974358975
accuracy score: 0.9230769230769231
avg train acc : 0.9674275473630312
avg val acc: 0.8611111111111112
avg test acc: 0.8803418803418804


train size is: 80
svm




C0.1
accuracy score: 0.8205128205128205
C0.1
accuracy score: 0.8974358974358975




C0.01
accuracy score: 0.8461538461538461
avg train acc : 0.8760633555214201
avg val acc: 0.8461538461538461
avg test acc: 0.8547008547008548


train size is: 80
log
accuracy score: 0.8461538461538461




accuracy score: 0.9230769230769231
accuracy score: 0.8461538461538461
avg train acc : 0.8776804915514593
avg val acc: 0.8461538461538461
avg test acc: 0.8717948717948718






Unnamed: 0,knn,tree,svm,log
20,0.792735,0.826923,0.807692,0.833333
50,0.789116,0.857143,0.840136,0.846939
80,0.863248,0.880342,0.854701,0.871795


##### Running Random Forest Classifier separately. It was taking a very long time to run

In [104]:
#'max_features': [1,2,4,6,8,12,16,20] tested
#mush

partition_size = [0.2, 0.5, 0.8]
cols = ['forest']
index = [20,50,80]

mush_avg_rf= pd.DataFrame(index=index, columns=cols)

dataset_list = shuffle(mush) 
get_results(dataset_list, 22, [forest], partition_size[0], mush_avg_rf)
get_results(dataset_list, 22, [forest], partition_size[1], mush_avg_rf)
get_results(dataset_list, 22, [forest], partition_size[2], mush_avg_rf)

print(mush_avg_rf)

train size is: 20
forest
accuracy score: 0.99975
accuracy score: 1.0
accuracy score: 0.999
avg train acc : 1.0
avg val acc: 0.9993333333333334
avg test acc: 0.9995833333333334


train size is: 50
forest
accuracy score: 0.9996
accuracy score: 1.0
accuracy score: 0.9996
avg train acc : 1.0
avg val acc: 0.9996
avg test acc: 0.9997333333333334


train size is: 80
forest
accuracy score: 1.0
accuracy score: 1.0
accuracy score: 0.999
avg train acc : 1.0
avg val acc: 0.9999166666666667
avg test acc: 0.9996666666666667


      forest
20  0.999583
50  0.999733
80  0.999667


In [91]:
#'max_features': [1,2,4,6,8,12,16,20] tested
#park

partition_size = [0.2, 0.5, 0.8]
cols = ['forest']
index = [20,50,80]

park_avg_rf= pd.DataFrame(index=index, columns=cols)

dataset_list = shuffle(park) 
get_results(dataset_list, 22, [forest], partition_size[0], park_avg_rf)
get_results(dataset_list, 22, [forest], partition_size[1], park_avg_rf)
get_results(dataset_list, 22, [forest], partition_size[2], park_avg_rf)

print(park_avg_rf)

train size is: 20
forest




accuracy score: 0.8269230769230769




accuracy score: 0.8397435897435898




accuracy score: 0.8397435897435898
avg train acc : 0.966057347670251
avg val acc: 0.8376068376068376
avg test acc: 0.8354700854700855


train size is: 50
forest




accuracy score: 0.8367346938775511




accuracy score: 0.826530612244898




accuracy score: 0.8979591836734694
avg train acc : 0.9665434314001935
avg val acc: 0.8591065292096219
avg test acc: 0.8537414965986395


train size is: 80
forest




accuracy score: 0.7948717948717948




accuracy score: 0.7692307692307693




accuracy score: 0.8974358974358975
avg train acc : 0.9717069807134323
avg val acc: 0.8782051282051282
avg test acc: 0.8205128205128206


      forest
20   0.83547
50  0.853741
80  0.820513


In [94]:
#'max_features': [1,2,4,6,8,12] tested
# adult

partition_size = [0.2, 0.5, 0.8]
cols = ['forest']
index = [20,50,80]

adult_avg_rf= pd.DataFrame(index=index, columns=cols)

dataset_list = shuffle(adult) 
get_results(dataset_list, 14, [forest], partition_size[0], adult_avg_rf)
get_results(dataset_list, 14, [forest], partition_size[1], adult_avg_rf)
get_results(dataset_list, 14, [forest], partition_size[2], adult_avg_rf)

print(adult_avg_rf)

train size is: 20
forest
accuracy score: 0.81425
accuracy score: 0.80125
accuracy score: 0.7915
avg train acc : 0.9383325498685675
avg val acc: 0.805
avg test acc: 0.8023333333333333


train size is: 50
forest
accuracy score: 0.8088
accuracy score: 0.8064
accuracy score: 0.8036
avg train acc : 0.9248998114249529
avg val acc: 0.8177333333333333
avg test acc: 0.8062666666666667


train size is: 80
forest
accuracy score: 0.811
accuracy score: 0.819
accuracy score: 0.811
avg train acc : 0.9127916425923641
avg val acc: 0.8164166666666667
avg test acc: 0.8136666666666666


      forest
20  0.802333
50  0.806267
80  0.813667


In [106]:
#'max_features': [1,2,4,6,8,12] tested
# letter

partition_size = [0.2, 0.5, 0.8]
cols = ['forest']
index = [20,50,80]

letter_avg_rf= pd.DataFrame(index=index, columns=cols)

dataset_list = shuffle(letter) 
get_results(dataset_list, 16, [forest], partition_size[0], letter_avg_rf)
get_results(dataset_list, 16, [forest], partition_size[1], letter_avg_rf)
get_results(dataset_list, 16, [forest], partition_size[2], letter_avg_rf)

print(letter_avg_rf)

train size is: 20
forest
accuracy score: 0.80375
accuracy score: 0.80475
accuracy score: 0.808
avg train acc : 0.9648337263026973
avg val acc: 0.8063333333333333
avg test acc: 0.8055


train size is: 50
forest
accuracy score: 0.842
accuracy score: 0.8276
accuracy score: 0.83
avg train acc : 0.9485663598999233
avg val acc: 0.8349333333333333
avg test acc: 0.8332


train size is: 80
forest
accuracy score: 0.864
accuracy score: 0.849
accuracy score: 0.839
avg train acc : 0.9428958912007707
avg val acc: 0.84075
avg test acc: 0.8506666666666667


      forest
20    0.8055
50    0.8332
80  0.850667


In [107]:
letter_avg['rf'] = letter_avg_rf
mush_avg['rf'] = mush_avg_rf
adult_avg['rf'] = adult_avg_rf
park_avg['rf'] = park_avg_rf

In [152]:
print('iris','\n',iris_avg)
print('adult', '\n', adult_avg)
print('letter', '\n', letter_avg)
print('mush', '\n', mush_avg)
print('park', '\n', park_avg)

iris 
            0         1         2         3         4
20  0.933333  0.908333  0.954167  0.920833  0.920833
50  0.913333  0.946667  0.953333  0.933333      0.92
80  0.933333  0.916667      0.95  0.933333  0.933333
adult 
          knn      tree       svm       log        rf
20  0.809167  0.803833  0.804917  0.805667  0.802333
50  0.806533  0.806133    0.8012  0.804933  0.806267
80     0.811  0.816667  0.809333  0.806333  0.813667
letter 
          knn      tree       svm       log        rf
20  0.768917  0.773333  0.712333  0.700667    0.8055
50  0.805467  0.800533  0.706133  0.704933    0.8332
80  0.817333     0.822  0.717333  0.705667  0.850667
mush 
          knn      tree       svm       log        rf
20  0.996083  0.999333  0.995917  0.996917  0.999583
50  0.998267         1  0.998267  0.998267  0.999733
80         1  0.999667     0.999     0.999  0.999667
park 
          knn      tree       svm       log        rf
20  0.792735  0.826923  0.807692  0.833333   0.83547
50  0.78