In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier 
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

import nltk
# import xlsxwriter
import openpyxl
import os.path
from os.path import exists


In [2]:
df_data = pd.read_csv("../data/train.csv")
df_data.head()

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72


In [3]:
df_data.describe()

Unnamed: 0,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
count,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0
mean,0.027643,0.772599,0.903955,0.861582,0.009988,0.067797,0.129338,0.049637,0.166263,0.172014,0.061239,0.07042,38.039044
std,0.163957,0.419175,0.294668,0.345356,0.099444,0.251409,0.335591,0.217204,0.372335,0.377411,0.23978,0.255866,20.59199
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0
50%,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0
75%,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0


In [4]:
def scale10(n):
    val = 0
    if n <=10: 
        val = 1
    elif n >10 and n<=20:
        val = 2
    elif n >20 and n <=30:
        val = 3
    elif n>30 and n <=40:
        val = 4
    elif n > 40 and n<=50:
        val = 5
    elif n>50 and n<=60:
        val = 6
    elif n>60 and n<=70:
        val = 7
    elif n>70 and n<=80:
        val = 8
    elif n>80 and n<=90:
        val = 9
    else: 
        val = 10
    return(val)

In [5]:
def scale5(n):
    val = 0
    if n <=20: 
        val = 1
    elif n >20 and n<=40:
        val = 2
    elif n >40 and n <=60:
        val = 3
    elif n>60 and n <=80:
        val = 4
    else: val = 5

    return(val)

In [6]:
def scale4(n):
    val = 0
    if n <=25: 
        val = 1
    elif n >25 and n<=50:
        val = 2
    elif n >50 and n <=75:
        val = 3
    else: 
        val = 4

    return(val)

In [7]:
def scale3(n):
    val = 0
    if n <=34: 
        val = 1
    elif n >34 and n<=68:
        val = 2
    else:
        val = 3

    return(val)

In [8]:
def scale2(n):
    val = 0
    if n <=50: 
        val = 1
    else:
        val = 2

    return(val)

In [9]:
df_data["Scale10"] = df_data.apply(lambda row: scale10(row.Pawpularity), axis=1)

In [10]:
df_data["Scale5"] = df_data.apply(lambda row: scale5(row.Pawpularity), axis=1)

In [11]:
df_data["Scale4"] = df_data.apply(lambda row: scale4(row.Pawpularity), axis=1)

In [12]:
df_data["Scale3"] = df_data.apply(lambda row: scale3(row.Pawpularity), axis=1)

In [13]:
df_data["Scale2"] = df_data.apply(lambda row: scale2(row.Pawpularity), axis=1)

In [14]:
df_data.head()

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,Scale10,Scale5,Scale4,Scale3,Scale2
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63,7,4,3,2,2
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42,5,3,2,2,1
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28,3,2,2,1,1
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15,2,1,1,1,1
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72,8,4,3,3,2


### Splitting Data

In [15]:
columnsL = df_data.columns
columnsL

Index(['Id', 'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
       'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur', 'Pawpularity',
       'Scale10', 'Scale5', 'Scale4', 'Scale3', 'Scale2'],
      dtype='object')

In [16]:
#Get X and Y data - shuffle data.
X_cols = ['Subject Focus','Eyes', 'Face','Near','Action','Accessory','Group','Collage','Human','Occlusion','Info','Blur',]
X = np.array(df_data[X_cols])
Y = df_data['Pawpularity'].values[:]

id_image = df_data['Id'].values[:]

Y10 = df_data['Scale10'].values[:]
Y5 = df_data['Scale5'].values[:]
Y4 = df_data['Scale4'].values[:]
Y3 = df_data['Scale3'].values[:]
Y2 = df_data['Scale2'].values[:]


shuffle = np.random.permutation(np.arange(X.shape[0]))
X, Y, id_image = X[shuffle], Y[shuffle], id_image[shuffle]
Y10, Y5, Y4, Y3, Y2 = Y10[shuffle], Y5[shuffle], Y4[shuffle], Y3[shuffle], Y2[shuffle]

In [17]:
# Define sizes for train, development and test data (0.5, 0.2, 0.3)
per_train = 0.5
per_dev = 0.2

num_images = len(Y)
train_size = int(round(num_images * per_train,0))
dev_size = int(round(num_images * per_dev,0))

In [18]:
# Split data based on defined sizes
test_data, test_labels, id_test = X[train_size+dev_size:], Y[train_size+dev_size:], id_image[train_size+dev_size:]
test_y10 = Y10[train_size+dev_size:]
test_y5 = Y5[train_size+dev_size:]
test_y4 = Y4[train_size+dev_size:]
test_y3 = Y3[train_size+dev_size:]
test_y2 = Y2[train_size+dev_size:]

dev_data, dev_labels, id_dev = X[train_size:train_size+dev_size], Y[train_size:train_size+dev_size], id_image[train_size:train_size+dev_size]
dev_y10 = Y10[train_size:train_size+dev_size]
dev_y5 = Y5[train_size:train_size+dev_size]
dev_y4 = Y4[train_size:train_size+dev_size]
dev_y3 = Y3[train_size:train_size+dev_size]
dev_y2 = Y2[train_size:train_size+dev_size]

train_data, train_labels, id_train = X[:train_size], Y[:train_size], id_image[:train_size]
train_y10 =  Y10[:train_size]
train_y5 =  Y5[:train_size]
train_y4 =  Y4[:train_size]
train_y3 =  Y3[:train_size]
train_y2 =  Y2[:train_size]

print(num_images)
print(train_data.shape, train_labels.shape, id_train.shape)
print(dev_data.shape, dev_labels.shape, id_dev.shape)
print(test_data.shape, test_labels.shape, id_test.shape)
print(test_y10.shape, dev_y10.shape, train_y10.shape)
print(test_y5.shape, dev_y5.shape, train_y5.shape)
print(test_y4.shape, dev_y4.shape, train_y4.shape)
print(test_y3.shape, dev_y3.shape, train_y3.shape)
print(test_y2.shape, dev_y2.shape, train_y2.shape)

9912
(4956, 12) (4956,) (4956,)
(1982, 12) (1982,) (1982,)
(2974, 12) (2974,) (2974,)
(2974,) (1982,) (4956,)
(2974,) (1982,) (4956,)
(2974,) (1982,) (4956,)
(2974,) (1982,) (4956,)
(2974,) (1982,) (4956,)


In [19]:
def wrt_excel(file, sheet_name, df):
    if os.path.exists(file):
        with pd.ExcelWriter(file, engine="openpyxl", mode='a') as writer:
            df.to_excel(writer, sheet_name=sheet_name)
    else:
        with pd.ExcelWriter(file, engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=sheet_name)

In [20]:
f1_score = []
rmse = []
acc = []
hamm = []

knn_mod = KNeighborsClassifier(n_neighbors=2, algorithm="auto", weights="uniform", p=1)
knn_mod.fit(train_data, train_labels)
acc.append(knn_mod.score(dev_data, dev_labels))
f1_score.append(metrics.f1_score(dev_labels, knn_mod.predict(dev_data), average="weighted"))
rmse.append(metrics.mean_squared_error(dev_labels, knn_mod.predict(dev_data), squared=False))
hamm.append(metrics.hamming_loss(dev_labels, knn_mod.predict(dev_data)))

### KNN Classifier

In [21]:
def knn_model(train_data, train_labels, dev_data, dev_labels, algorithm, weigth, klist):
    
    f1_score = []
    rmse = []
    acc = []
    hamm = []
    for k in klist:
        knn_mod = KNeighborsClassifier(n_neighbors=k, algorithm=algorithm, weights=weigth, p=1)
        knn_mod.fit(train_data, train_labels)
        acc.append(knn_mod.score(dev_data, dev_labels))
        f1_score.append(metrics.f1_score(dev_labels, knn_mod.predict(dev_data), average="weighted"))
        rmse.append(metrics.mean_squared_error(dev_labels, knn_mod.predict(dev_data), squared=False))
        hamm.append(metrics.hamming_loss(dev_labels, knn_mod.predict(dev_data)))
    return f1_score, rmse, acc, hamm
    
def knn_models(train_data, Y_train, dev_data, Y_dev):
    df_knn = pd.DataFrame()
    klist = [1, 2, 3, 4, 5, 6, 7,8,9, 10, 11, 12, 13, 14, 15]
    algorithm_list = ["auto", "ball_tree", "kd_tree", "brute"]
   
    weights = ["uniform", "distance"]
    df_knn["K"] = klist

    for algorithm in algorithm_list:
        df_knn[algorithm+"_f1"], df_knn[algorithm+"_rmse"], df_knn[algorithm+"_acc"], df_knn[algorithm+"_hamm"]  = knn_model(train_data, Y_train, dev_data, Y_dev, algorithm, weights[1], klist)
        
    print(df_knn)
    print("df_knn")
    return(df_knn)


### NB Classifier

In [22]:
def NB_model(train_data, train_labels, dev_data, dev_labels, alpha_list):
    
    f1_score = []
    rmse = []
    acc = []
    hamm = []
    for alpha in alpha_list:
        NB_mod = BernoulliNB(alpha=alpha)
        NB_mod.fit(train_data, train_labels)
        acc.append(NB_mod.score(dev_data, dev_labels))
        f1_score.append(metrics.f1_score(dev_labels, NB_mod.predict(dev_data), average="weighted"))
        rmse.append(metrics.mean_squared_error(dev_labels, NB_mod.predict(dev_data), squared=False))
        hamm.append(metrics.hamming_loss(dev_labels, NB_mod.predict(dev_data)))
    return f1_score, rmse, acc, hamm

def NB_models(train_data, Y_train, dev_data, Y_dev):
    alpha_list = [1.0e-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]

    df_NB = pd.DataFrame()
    df_NB["Alpha"] = alpha_list
    df_NB["F1_score"], df_NB["RMSE"], df_NB["ACC"], df_NB["HAMM"] = NB_model(train_data, Y_train, dev_data, Y_dev, alpha_list)

    print(df_NB)
    print("df_NB")
    return(df_NB)


### Multinomial NB

In [23]:
def MNB_model(train_data, train_labels, dev_data, dev_labels, alpha_list):
    
    f1_score = []
    rmse = []
    acc = []
    hamm = []
    for alpha in alpha_list:
        MNB_mod = MultinomialNB(alpha=alpha)
        MNB_mod.fit(train_data, train_labels)
        f1_score.append(metrics.f1_score(dev_labels, MNB_mod.predict(dev_data), average="weighted"))
        rmse.append(metrics.mean_squared_error(dev_labels, MNB_mod.predict(dev_data), squared=False))
        acc.append(metrics.accuracy_score(dev_labels, MNB_mod.predict(dev_data)))
        hamm.append(metrics.hamming_loss(dev_labels, MNB_mod.predict(dev_data)))
    return f1_score, rmse, acc, hamm

def MNB_models(train_data, Y_train, dev_data, Y_dev):
    alpha_list = [1.0e-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]

    df_MNB = pd.DataFrame()
    df_MNB["Alpha"] = alpha_list
    df_MNB["F1_score"], df_MNB["RMSE"], df_MNB["ACC"], df_MNB["HAMM"] = MNB_model(train_data, Y_train, 
                                                                                  dev_data, Y_dev, alpha_list)

    print(df_MNB)
    print("df_MNB")
    return(df_MNB)

### Gaussian NB (only classification)

In [24]:
def GNB_model(train_data, train_labels, dev_data, dev_labels, smoothing_list):
    
    f1_score = []
    rmse = []
    acc = []
    hamm = []
    for var_smoothing in smoothing_list:
        GNB_mod = GaussianNB(var_smoothing=var_smoothing)
        GNB_mod.fit(train_data, train_labels)
        f1_score.append(metrics.f1_score(dev_labels, GNB_mod.predict(dev_data), average="weighted"))
        rmse.append(metrics.mean_squared_error(dev_labels, GNB_mod.predict(dev_data), squared=False))
        acc.append(metrics.accuracy_score(dev_labels, GNB_mod.predict(dev_data)))
        hamm.append(metrics.hamming_loss(dev_labels, GNB_mod.predict(dev_data)))
    return f1_score, rmse, acc, hamm

def GNB_models(train_data, Y_train, dev_data, Y_dev):
    smoothing_list = [1.0e-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]

    df_GNB = pd.DataFrame()
    df_GNB["Var Smooth"] = smoothing_list
    df_GNB["F1_score"], df_GNB["RMSE"], df_GNB["ACC"], df_GNB["HAMM"] = GNB_model(train_data, Y_train, dev_data, Y_dev, smoothing_list)

    print(df_GNB)
    print("df_GNB")
    return(df_GNB)

### LogisticRegression
Warning The choice of the algorithm depends on the penalty chosen: Supported penalties by solver:  
- ‘newton-cg’ - [‘l2’, ‘none’]  
- ‘lbfgs’ - [‘l2’, ‘none’]  
- ‘liblinear’ - [‘l1’, ‘l2’]  
- ‘sag’ - [‘l2’, ‘none’]  
- ‘saga’ - [‘elasticnet’, ‘l1’, ‘l2’, ‘none’]

**max_iter was increased to 200, so it would converge**
- max_iter int, default=100
- Maximum number of iterations taken for the solvers to converge.

In [25]:
def LogR_model(train_data, train_labels, dev_data, dev_labels, penalty, solver, c_list):
    
    
    #c_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5]
    
    f1_score = []
    rmse = []
    acc = []
    hamm = []
    for c in c_list:
        logR_mod = LogisticRegression(C=c, solver=solver, multi_class="auto", penalty=penalty, max_iter=200)
        logR_mod.fit(train_data, train_labels)
        acc.append(logR_mod.score(dev_data, dev_labels))
        f1_score.append(metrics.f1_score(dev_labels, logR_mod.predict(dev_data), average="weighted"))
        rmse.append(metrics.mean_squared_error(dev_labels, logR_mod.predict(dev_data), squared=False))
        hamm.append(metrics.hamming_loss(dev_labels, logR_mod.predict(dev_data)))
    return f1_score, rmse, acc, hamm

def LogR_models(train_data, Y_train, dev_data, Y_dev):
    df_logR =pd.DataFrame()
    solver_list = ["liblinear", "newton-cg", "sag", "lbfgs"]
    c_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

    df_logR["C"] = c_list
    for solver in solver_list:
#         df_logR[solver] = LogR_model(train_data, Y_train, dev_data, Y_dev, "l2", solver, c_list)
        df_logR[solver+"_f1"], df_logR[solver+"_rmse"], df_logR[solver+"_acc"], df_logR[solver+"_hamm"]= LogR_model(train_data, Y_train, dev_data, Y_dev, "l2", solver, c_list)

    print(df_logR)
    print("df_LogR")
    return(df_logR)

### Tree (Regression) - 

In [26]:
def DT_model(train_data, train_labels, dev_data, dev_labels, criterion, max_depth_list):
    
    
    #c_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5]
    
    f1_score = []
    rmse=[]
    acc = []
    hamm = []
    for max_depth in max_depth_list:
        dt_model = DecisionTreeClassifier(criterion=criterion, min_samples_split=10, max_depth=max_depth)
#         dt_model = DecisionTreeRegressor(criterion=criterion, min_samples_split=10, max_depth=max_depth)
        dt_model.fit(train_data, train_labels)
        f1_score.append(metrics.f1_score(dev_labels, dt_model.predict(dev_data), average="weighted"))
        rmse.append(metrics.mean_squared_error(dev_labels, dt_model.predict(dev_data), squared=False))
        acc.append(metrics.accuracy_score(dev_labels, dt_model.predict(dev_data)))
        hamm.append(metrics.hamming_loss(dev_labels, dt_model.predict(dev_data)))       
    return f1_score, rmse, acc, hamm

def DT_models(train_data, Y_train, dev_data, Y_dev):
    df_DT =pd.DataFrame()
    criterion_list = ["entropy", "gini"]
    max_depth_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    df_DT["max_depth"] = max_depth_list

    for criterion in criterion_list:
#         df_DT[criterion] = DT_model(train_data, Y_train, dev_data, Y_dev, criterion, max_depth_list)
        df_DT[criterion+"_f1"], df_DT[criterion+"_rmse"], df_DT[criterion+"_acc"], df_DT[criterion+"_hamm"]= DT_model(train_data, 
                                                                                                                      Y_train, dev_data, 
                                                                                                                      Y_dev, criterion, max_depth_list)

    print(df_DT)
    print("df_DT")
    return(df_DT)

### Random Forest (Regression)

In [27]:
def RF_model(train_data, train_labels, dev_data, dev_labels, criterion, n_estimators_list):
        
    f1_score = []
    rmse=[]
    acc = []
    hamm = []
    for n_estimators in n_estimators_list:
        RF_model = RandomForestClassifier(n_estimators=n_estimators,criterion=criterion, min_samples_split=10)
        RF_model.fit(train_data, train_labels)
        f1_score.append(metrics.f1_score(dev_labels, RF_model.predict(dev_data), average="weighted"))
        rmse.append(metrics.mean_squared_error(dev_labels, RF_model.predict(dev_data), squared=False))
        acc.append(metrics.accuracy_score(dev_labels, RF_model.predict(dev_data)))
        hamm.append(metrics.hamming_loss(dev_labels, RF_model.predict(dev_data)))
    return f1_score, rmse, acc, hamm

def RF_models(train_data, Y_train, dev_data, Y_dev):
    df_RF =pd.DataFrame()
    criterion_list = ["entropy", "gini"]
    n_estimators_list = [5, 10, 15, 20, 25, 30]
    df_RF["n_estimators"] = n_estimators_list

    for criterion in criterion_list:
#         df_RF[criterion] = RF_model(train_data, Y_train, dev_data, Y_dev, criterion, n_estimators_list)
        df_RF[criterion+"_f1"], df_RF[criterion+"_rmse"], df_RF[criterion+"_acc"], df_RF[criterion+"_hamm"]= RF_model(train_data, Y_train, dev_data, Y_dev, 
                                                                   criterion, n_estimators_list)

    print(df_RF)
    print("df_RF")
    return(df_RF)

### AdaBoost (Regression)

In [28]:
def AdaB_model(train_data, train_labels, dev_data, dev_labels, algorithm, n_estimators_list):
        
    f1_score = []
    rmse=[]
    acc = []
    hamm = []
    for n_estimators in n_estimators_list:
        AdaB_model = AdaBoostClassifier(n_estimators=n_estimators,algorithm=algorithm, learning_rate=1.2)
        AdaB_model.fit(train_data, train_labels)
        f1_score.append(metrics.f1_score(dev_labels, AdaB_model.predict(dev_data), average="weighted"))
        rmse.append(metrics.mean_squared_error(dev_labels, AdaB_model.predict(dev_data), squared=False))
        acc.append(metrics.accuracy_score(dev_labels, AdaB_model.predict(dev_data)))
        hamm.append(metrics.hamming_loss(dev_labels, AdaB_model.predict(dev_data)))
    return f1_score, rmse, acc, hamm

def AdaB_models(train_data, Y_train, dev_data, Y_dev):
    df_AdaB =pd.DataFrame()
    algorithm_list = ["SAMME", "SAMME.R"]
    n_estimators_list = [5, 10, 15, 20, 25, 30]
    df_AdaB["n_estimators"] = n_estimators_list

    for algorithm in algorithm_list:
        df_AdaB[algorithm+"_f1"], df_AdaB[algorithm+"_rmse"], df_AdaB[algorithm+"_acc"], df_AdaB[algorithm+"_hamm"]= AdaB_model(train_data, Y_train, 
                                                                         dev_data, Y_dev, algorithm, n_estimators_list)

    print(df_AdaB)
    print("df_AdaB")
    return(df_AdaB)

### SVM  

In [29]:
def SVM_model(train_data, train_labels, dev_data, dev_labels, kernel, c_list):
        
    f1_score = []
    rmse = []
    acc = []
    hamm = []
    for c in c_list:
        if kernel == "LinearSVC":
            svm_model = svm.LinearSVC(C=c, max_iter=10000)
        elif kernel == "poly":
            svm_model = svm.SVC(kernel=kernel, C=c, degree=2, gamma=1)
        elif kernel == "rbf":
            svm_model = svm.SVC(kernel=kernel, C=c, gamma=0.7)
        else:
            svm_model = svm.SVC(kernel=kernel, C=c,)
        
        svm_model.fit(train_data, train_labels)
        f1_score.append(metrics.f1_score(dev_labels, svm_model.predict(dev_data), average="weighted"))
        rmse.append(metrics.mean_squared_error(dev_labels, svm_model.predict(dev_data), squared=False))
        acc.append(metrics.accuracy_score(dev_labels, svm_model.predict(dev_data)))
        hamm.append(metrics.hamming_loss(dev_labels, svm_model.predict(dev_data)))
    return f1_score, rmse, acc, hamm

def SVM_models(train_data, Y_train, dev_data):
    df_SVM =pd.DataFrame()
    kernel_list = ["linear", "rbf", "poly", "LinearSVC"]
    c_list = [0.5, 1, 1.5, 2, 2.5, 3, 4, 5, 10, 20]
    df_SVM["C"] = c_list

    for kernel in kernel_list:
        df_SVM[kernel+"_f1"], df_SVM[kernel+"_rmse"], df_SVM[kernel+"_acc"], df_SVM[kernel+"_hamm"] = SVM_model(train_data, Y_train, dev_data, Y_dev, kernel, c_list)
    print(df_SVM)
    print("df_SVM")

    return(df_SVM)

### Neural Network

**hidden_layer_sizestuple, length = n_layers - 2, default=(100,)**  
The ith element represents the number of neurons in the ith hidden layer.

**activation{‘identity’, ‘logistic’, ‘tanh’, ‘relu’}, default=’relu’**  
Activation function for the hidden layer.  

- ‘identity’, no-op activation, useful to implement linear bottleneck, returns f(x) = x  
- ‘logistic’, the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)).  
- ‘tanh’, the hyperbolic tan function, returns f(x) = tanh(x).   
- ‘relu’, the rectified linear unit function, returns f(x) = max(0, x)  

**solver{‘lbfgs’, ‘sgd’, ‘adam’}, default=’adam’**  
The solver for weight optimization.  

- ‘lbfgs’ is an optimizer in the family of quasi-Newton methods.
- ‘sgd’ refers to stochastic gradient descent.
- ‘adam’ refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba

Note: The default solver ‘adam’ works pretty well on relatively large datasets (with thousands of training samples or more) in terms of both training time and validation score. For small datasets, however, ‘lbfgs’ can converge faster and perform better.

**alphafloat, default=0.0001**
L2 penalty (regularization term) parameter.

In [30]:
def NN_model(train_data, train_labels, dev_data, dev_labels, activation, solver_list, alpha_list, layer_list, choice):
        
    f1_score = []
    rmse = []
    acc = []
    hamm = []
    if choice == "A":
        for alpha in alpha_list:
            NN_model = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000, activation=activation, alpha=alpha)
            NN_model.fit(train_data, train_labels)
            f1_score.append(metrics.f1_score(dev_labels, NN_model.predict(dev_data), average="weighted"))
            rmse.append(metrics.mean_squared_error(dev_labels, NN_model.predict(dev_data), squared=False))
            acc.append(metrics.accuracy_score(dev_labels, NN_model.predict(dev_data)))
            hamm.append(metrics.hamming_loss(dev_labels, NN_model.predict(dev_data)))
    elif choice == "L":
        for layer in layer_list:
            NN_model = MLPClassifier(hidden_layer_sizes=layer, max_iter=1000, activation=activation)
            NN_model.fit(train_data, train_labels)
            f1_score.append(metrics.f1_score(dev_labels, NN_model.predict(dev_data), average="weighted"))
            rmse.append(metrics.mean_squared_error(dev_labels, NN_model.predict(dev_data), squared=False))
            acc.append(metrics.accuracy_score(dev_labels, NN_model.predict(dev_data)))
            hamm.append(metrics.hamming_loss(dev_labels, NN_model.predict(dev_data)))
    else:
        for solver in solver_list:
            NN_model = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=8000, activation=activation, solver=solver)
            NN_model.fit(train_data, train_labels)
            f1_score.append(metrics.f1_score(dev_labels, NN_model.predict(dev_data), average="weighted"))
            rmse.append(metrics.mean_squared_error(dev_labels, NN_model.predict(dev_data), squared=False))
            acc.append(metrics.accuracy_score(dev_labels, NN_model.predict(dev_data)))
            hamm.append(metrics.hamming_loss(dev_labels, NN_model.predict(dev_data)))
    
    return f1_score, rmse, acc, hamm

#Note: Changing Alpha is not creating any variation in the f1_score.  Try first with L and then with S
def NN_models(train_data, Y_train, dev_data, Y_dev, choice):
    
    df_NN =pd.DataFrame()
    activation_list = ["identity", "logistic", "tanh", "relu"]
    layer_list = [(10,10,10), (5,5,5), (3,3,3), (20, 20, 20)]
    solver_list = ["lbfgs", "sgd", "adam"]
    alpha_list = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.5]

    for activation in activation_list:
        df_NN[activation+'_f1'], df_NN[activation+'_rmse'], df_NN[activation+'_acc'], df_NN[activation+'_hamm'] = NN_model(train_data, Y_train, dev_data, Y_dev, activation, solver_list, alpha_list, layer_list, choice)

    if choice == "A":
        df_NN["Alpha"] = alpha_list
    elif choice == "L":
        df_NN["Layers"] = layer_list
    else:
        df_NN["Solver"] = solver_list
    
    print(df_NN)
    print("df_NN")
    return(df_NN)

In [31]:
def assign_y(scale):
    if scale == 2:
        Y_train = train_y2
        Y_dev = dev_y2
    elif scale == 3:
        Y_train = train_y3
        Y_dev = dev_y3
    elif scale == 4:
        Y_train = train_y4
        Y_dev = dev_y4
    elif scale == 5:
        Y_train = train_y5
        Y_dev = dev_y5
    elif scale == 10:
        Y_train = train_y10
        Y_dev = dev_y10
    elif scale == 100:
        Y_train = train_labels
        Y_dev = dev_labels
    else:
        Y_train = train_labels
        Y_dev = dev_labels
    return(Y_train, Y_dev)        

In [32]:
def print_confusion_matrix(Y_dev, Prediction, title):
    cfm = confusion_matrix(Y_dev,Prediction)
    if np.unique(Y_dev).max() > 5:
        size = 6
    else: 
        size = np.unique(Y_dev).max()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(cfm, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(cfm.shape[0]):
        for j in range(cfm.shape[1]):
            ax.text(x=j, y=i,s=cfm[i, j], va='center', ha='center', size='xx-large')
    ax.set_title(title)

In [33]:
scale_list = [2, 5, 10, 100]
file_name = "model_summary_baseline_final.xlsx"
for scale in scale_list:
    print(scale)
    Y_train, Y_dev = assign_y(scale)
    df_knn = knn_models(train_data, Y_train, dev_data, Y_dev)
    df_NB = NB_models(train_data, Y_train, dev_data, Y_dev)
    df_MNB = MNB_models(train_data, Y_train, dev_data, Y_dev)
    df_GNB = GNB_models(train_data, Y_train, dev_data, Y_dev)
    df_logR = LogR_models(train_data, Y_train, dev_data, Y_dev)
    df_DT = DT_models(train_data, Y_train, dev_data, Y_dev)
    df_RF = RF_models(train_data, Y_train, dev_data, Y_dev)
    df_AdaB = AdaB_models(train_data, Y_train, dev_data, Y_dev)
    df_SVM = SVM_models(train_data, Y_train, dev_data)
    df_NN1 = NN_models(train_data, Y_train, dev_data, Y_dev, "L")
    df_NN2 = NN_models(train_data, Y_train, dev_data, Y_dev, "S")

    wrt_excel(file_name, "knn-"+str(scale), df_knn)
    wrt_excel(file_name, "NB-"+str(scale), df_NB)    
    wrt_excel(file_name, "MNB-"+str(scale), df_MNB)
    wrt_excel(file_name, "GNB-"+str(scale), df_GNB)
    wrt_excel(file_name, "logR-"+str(scale), df_logR)
    wrt_excel(file_name, "DT-"+str(scale), df_DT)
    wrt_excel(file_name, "RF-"+str(scale), df_RF)
    wrt_excel(file_name, "AdaB-"+str(scale), df_AdaB)
    wrt_excel(file_name, "SVM-"+str(scale), df_SVM)
    wrt_excel(file_name, "NN-"+str(scale)+"L", df_NN1)
    wrt_excel(file_name, "NN-"+str(scale)+"S", df_NN2)

2
     K   auto_f1  auto_rmse  auto_acc  auto_hamm  ball_tree_f1  \
0    1  0.577796   0.680194  0.537336   0.462664      0.579138   
1    2  0.694264   0.470095  0.779011   0.220989      0.694534   
2    3  0.702511   0.478076  0.771443   0.228557      0.702511   
3    4  0.688322   0.469558  0.779516   0.220484      0.688322   
4    5  0.693183   0.472236  0.776993   0.223007      0.693183   
5    6  0.686968   0.468482  0.780525   0.219475      0.686968   
6    7  0.692004   0.471167  0.778002   0.221998      0.692004   
7    8  0.687217   0.467943  0.781029   0.218971      0.687217   
8    9  0.687217   0.467943  0.781029   0.218971      0.687217   
9   10  0.687217   0.467943  0.781029   0.218971      0.687217   
10  11  0.687217   0.467943  0.781029   0.218971      0.687217   
11  12  0.687217   0.467943  0.781029   0.218971      0.687217   
12  13  0.687217   0.467943  0.781029   0.218971      0.687217   
13  14  0.686968   0.468482  0.780525   0.219475      0.687217   
14  15  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


   identity_f1  identity_rmse  identity_acc  identity_hamm  logistic_f1  \
0     0.349133       1.109540      0.513623       0.486377      0.34801   
1     0.348010       1.109768      0.513118       0.486882      0.34801   
2     0.348010       1.109768      0.513118       0.486882      0.34801   

   logistic_rmse  logistic_acc  logistic_hamm   tanh_f1  tanh_rmse  tanh_acc  \
0       1.109768      0.513118       0.486882  0.364318   1.134942  0.498991   
1       1.109768      0.513118       0.486882  0.348010   1.109768  0.513118   
2       1.109768      0.513118       0.486882  0.348675   1.109995  0.512614   

   tanh_hamm   relu_f1  relu_rmse  relu_acc  relu_hamm Solver  
0   0.501009  0.367375   1.134942  0.493441   0.506559  lbfgs  
1   0.486882  0.348010   1.109768  0.513118   0.486882    sgd  
2   0.487386  0.357846   1.117695  0.507064   0.492936   adam  
df_NN
10
     K   auto_f1  auto_rmse  auto_acc  auto_hamm  ball_tree_f1  \
0    1  0.134211   4.309600  0.133199   0.86680

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


     C  liblinear_f1  liblinear_rmse  liblinear_acc  liblinear_hamm  \
0    1      0.151792        2.424646       0.273966        0.726034   
1    2      0.152754        2.424542       0.274470        0.725530   
2    3      0.154275        2.424334       0.275479        0.724521   
3    4      0.157491        2.418396       0.274975        0.725025   
4    5      0.157491        2.418396       0.274975        0.725025   
5    6      0.157491        2.418396       0.274975        0.725025   
6    7      0.157491        2.418396       0.274975        0.725025   
7    8      0.157491        2.418396       0.274975        0.725025   
8    9      0.157491        2.418396       0.274975        0.725025   
9   10      0.157491        2.418396       0.274975        0.725025   
10  11      0.157491        2.418396       0.274975        0.725025   
11  12      0.157491        2.418396       0.274975        0.725025   
12  13      0.157491        2.418396       0.274975        0.725025   
13  14

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


   identity_f1  identity_rmse  identity_acc  identity_hamm  logistic_f1  \
0     0.152798       2.424542      0.274470       0.725530     0.115517   
1     0.132149       2.444644      0.276488       0.723512     0.115517   
2     0.154325       2.427870      0.275479       0.724521     0.115517   

   logistic_rmse  logistic_acc  logistic_hamm   tanh_f1  tanh_rmse  tanh_acc  \
0        2.46069      0.270938       0.729062  0.179276   2.441237  0.266398   
1        2.46069      0.270938       0.729062  0.115517   2.460690  0.270938   
2        2.46069      0.270938       0.729062  0.158893   2.418917  0.274975   

   tanh_hamm   relu_f1  relu_rmse  relu_acc  relu_hamm Solver  
0   0.733602  0.146361   2.443716  0.274975   0.725025  lbfgs  
1   0.729062  0.115270   2.460895  0.269929   0.730071    sgd  
2   0.725025  0.158062   2.428493  0.272957   0.727043   adam  
df_NN
100
     K   auto_f1  auto_rmse  auto_acc  auto_hamm  ball_tree_f1  \
0    1  0.014051  45.461486  0.027750   0.9722

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

     C  liblinear_f1  liblinear_rmse  liblinear_acc  liblinear_hamm  \
0    1      0.011284       23.122071       0.027750        0.972250   
1    2      0.012571       23.132837       0.028254        0.971746   
2    3      0.012364       23.146443       0.027750        0.972250   
3    4      0.012619       23.147053       0.028254        0.971746   
4    5      0.011732       23.150203       0.027750        0.972250   
5    6      0.011732       23.150203       0.027750        0.972250   
6    7      0.011732       23.150203       0.027750        0.972250   
7    8      0.011732       23.152164       0.027750        0.972250   
8    9      0.011772       23.201961       0.027750        0.972250   
9   10      0.011772       23.209092       0.027750        0.972250   
10  11      0.011784       23.230615       0.027750        0.972250   
11  12      0.011790       23.251889       0.027750        0.972250   
12  13      0.012385       23.290102       0.028254        0.971746   
13  14



   identity_f1  identity_rmse  identity_acc  identity_hamm  logistic_f1  \
0     0.009865      23.387945      0.026236       0.973764     0.001499   
1     0.009772      23.358987      0.027750       0.972250     0.001499   
2     0.005837      22.630048      0.029768       0.970232     0.001499   
3     0.012301      23.436874      0.027245       0.972755     0.001499   

   logistic_rmse  logistic_acc  logistic_hamm   tanh_f1  tanh_rmse  tanh_acc  \
0      22.486065       0.02775        0.97225  0.013939  25.409472  0.031282   
1      22.486065       0.02775        0.97225  0.009228  23.277642  0.036831   
2      22.486065       0.02775        0.97225  0.001500  22.486603  0.027750   
3      22.486065       0.02775        0.97225  0.010805  25.393552  0.024723   

   tanh_hamm   relu_f1  relu_rmse  relu_acc  relu_hamm        Layers  
0   0.968718  0.010676  25.641261  0.025732   0.974268  (10, 10, 10)  
1   0.963169  0.010299  29.190188  0.029768   0.970232     (5, 5, 5)  
2   0.9722

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


   identity_f1  identity_rmse  identity_acc  identity_hamm  logistic_f1  \
0     0.013483      24.005854      0.027750       0.972250     0.001499   
1     0.005177      22.914475      0.025732       0.974268     0.001499   
2     0.012558      23.367701      0.029768       0.970232     0.001499   

   logistic_rmse  logistic_acc  logistic_hamm   tanh_f1  tanh_rmse  tanh_acc  \
0      22.486065       0.02775        0.97225  0.011073  24.041001  0.025227   
1      22.486065       0.02775        0.97225  0.002495  22.523169  0.027750   
2      22.486065       0.02775        0.97225  0.009488  24.092123  0.027245   

   tanh_hamm   relu_f1  relu_rmse  relu_acc  relu_hamm Solver  
0   0.974773  0.008000  24.128231  0.025732   0.974268  lbfgs  
1   0.972250  0.004519  22.980589  0.032795   0.967205    sgd  
2   0.972755  0.010828  23.947755  0.025732   0.974268   adam  
df_NN


## PCA 7 components > IN ADDITION TO ABOVE

In [34]:
components_pca = 7
random_state = 0

pca = PCA(n_components=components_pca, random_state=random_state)
pca.fit(train_data)

# overwriting train_data and dev_data to be the pca object - should do this in a cleaner way to preserve it but giving this a shot
train_data1 = pca.transform(train_data)
dev_data1 = pca.transform(dev_data)


In [36]:
scale_list = [2, 5, 10, 100]
file_name = "model_summary_w_pca_7_final.xlsx"
for scale in scale_list:
    print(scale)
    Y_train, Y_dev = assign_y(scale)
    df_knn = knn_models(train_data1, Y_train, dev_data1, Y_dev)
#     df_NB = NB_models(train_data1, Y_train, dev_data1, Y_dev)
#     df_MNB = MNB_models(train_data1, Y_train, dev_data1, Y_dev)
#     df_GNB = GNB_models(train_data1, Y_train, dev_data1, Y_dev)
    df_logR = LogR_models(train_data1, Y_train, dev_data1, Y_dev)
    df_DT = DT_models(train_data1, Y_train, dev_data1, Y_dev)
    df_RF = RF_models(train_data1, Y_train, dev_data1, Y_dev)
    df_AdaB = AdaB_models(train_data1, Y_train, dev_data1, Y_dev)
    df_SVM = SVM_models(train_data1, Y_train, dev_data1)
    df_NN1 = NN_models(train_data1, Y_train, dev_data1, Y_dev, "L")
    df_NN2 = NN_models(train_data1, Y_train, dev_data1, Y_dev, "S")

    wrt_excel(file_name, "knn-"+str(scale), df_knn)
#     wrt_excel(file_name, "NB-"+str(scale), df_NB)    
#     wrt_excel(file_name, "MNB-"+str(scale), df_MNB)
#     wrt_excel(file_name, "GNB-"+str(scale), df_GNB)
    wrt_excel(file_name, "logR-"+str(scale), df_logR)
    wrt_excel(file_name, "DT-"+str(scale), df_DT)
    wrt_excel(file_name, "RF-"+str(scale), df_RF)
    wrt_excel(file_name, "AdaB-"+str(scale), df_AdaB)
    wrt_excel(file_name, "SVM-"+str(scale), df_SVM)
    wrt_excel(file_name, "NN-"+str(scale)+"L", df_NN1)
    wrt_excel(file_name, "NN-"+str(scale)+"S", df_NN2)

2
     K   auto_f1  auto_rmse  auto_acc  auto_hamm  ball_tree_f1  \
0    1  0.675140   0.538151  0.710394   0.289606      0.675140   
1    2  0.689018   0.475431  0.773966   0.226034      0.689018   
2    3  0.677462   0.530122  0.718971   0.281029      0.677462   
3    4  0.688942   0.473836  0.775479   0.224521      0.688942   
4    5  0.678516   0.530122  0.718971   0.281029      0.678516   
5    6  0.689465   0.472770  0.776488   0.223512      0.689465   
6    7  0.681432   0.521486  0.728052   0.271948      0.681432   
7    8  0.681608   0.515648  0.734107   0.265893      0.681608   
8    9  0.681608   0.515648  0.734107   0.265893      0.681608   
9   10  0.686968   0.468482  0.780525   0.219475      0.686968   
10  11  0.686895   0.470631  0.778507   0.221493      0.686895   
11  12  0.686895   0.470631  0.778507   0.221493      0.686895   
12  13  0.686895   0.470631  0.778507   0.221493      0.686895   
13  14  0.686968   0.468482  0.780525   0.219475      0.686968   
14  15  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


   identity_f1  identity_rmse  identity_acc  identity_hamm  logistic_f1  \
0     0.688462        0.46524      0.783552       0.216448     0.688462   
1     0.688462        0.46524      0.783552       0.216448     0.688462   
2     0.688462        0.46524      0.783552       0.216448     0.688462   

   logistic_rmse  logistic_acc  logistic_hamm   tanh_f1  tanh_rmse  tanh_acc  \
0        0.46524      0.783552       0.216448  0.683095   0.478604  0.770938   
1        0.46524      0.783552       0.216448  0.688462   0.465240  0.783552   
2        0.46524      0.783552       0.216448  0.688462   0.465240  0.783552   

   tanh_hamm   relu_f1  relu_rmse  relu_acc  relu_hamm Solver  
0   0.229062  0.689102   0.471702  0.777497   0.222503  lbfgs  
1   0.216448  0.688462   0.465240  0.783552   0.216448    sgd  
2   0.216448  0.688462   0.465240  0.783552   0.216448   adam  
df_NN
5
     K   auto_f1  auto_rmse  auto_acc  auto_hamm  ball_tree_f1  \
0    1  0.270729   1.476182  0.268416   0.731584

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


   identity_f1  identity_rmse  identity_acc  identity_hamm  logistic_f1  \
0      0.34801       1.109768      0.513118       0.486882      0.34801   
1      0.34801       1.109768      0.513118       0.486882      0.34801   
2      0.34801       1.109768      0.513118       0.486882      0.34801   

   logistic_rmse  logistic_acc  logistic_hamm   tanh_f1  tanh_rmse  tanh_acc  \
0       1.109768      0.513118       0.486882  0.368512   1.130711  0.497982   
1       1.109768      0.513118       0.486882  0.348010   1.109768  0.513118   
2       1.109768      0.513118       0.486882  0.348010   1.109768  0.513118   

   tanh_hamm   relu_f1  relu_rmse  relu_acc  relu_hamm Solver  
0   0.502018  0.363289   1.117921  0.505550   0.494450  lbfgs  
1   0.486882  0.348010   1.109768  0.513118   0.486882    sgd  
2   0.486882  0.348904   1.111585  0.512109   0.487891   adam  
df_NN
10
     K   auto_f1  auto_rmse  auto_acc  auto_hamm  ball_tree_f1  \
0    1  0.133975   2.852053  0.145308   0.85469

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


   identity_f1  identity_rmse  identity_acc  identity_hamm  logistic_f1  \
0     0.147220       2.437410      0.282038       0.717962     0.115517   
1     0.163001       2.421002      0.276488       0.723512     0.115517   
2     0.138433       2.445470      0.278002       0.721998     0.115517   

   logistic_rmse  logistic_acc  logistic_hamm   tanh_f1  tanh_rmse  tanh_acc  \
0        2.46069      0.270938       0.729062  0.176324   2.426726  0.270938   
1        2.46069      0.270938       0.729062  0.121283   2.460177  0.271948   
2        2.46069      0.270938       0.729062  0.129934   2.454839  0.273461   

   tanh_hamm   relu_f1  relu_rmse  relu_acc  relu_hamm Solver  
0   0.729062  0.162256   2.440513  0.275479   0.724521  lbfgs  
1   0.728052  0.115517   2.460690  0.270938   0.729062    sgd  
2   0.726539  0.141374   2.435754  0.278507   0.721493   adam  
df_NN
100
     K   auto_f1  auto_rmse  auto_acc  auto_hamm  ball_tree_f1  \
0    1  0.006895  28.706191  0.013623   0.9863



   identity_f1  identity_rmse  identity_acc  identity_hamm  logistic_f1  \
0     0.010703      23.197448      0.029768       0.970232     0.001499   
1     0.009062      22.730932      0.032795       0.967205     0.001499   
2     0.007768      23.125868      0.031282       0.968718     0.001499   
3     0.010010      23.336189      0.027750       0.972250     0.001499   

   logistic_rmse  logistic_acc  logistic_hamm   tanh_f1  tanh_rmse  tanh_acc  \
0      22.486065       0.02775        0.97225  0.015107  23.885786  0.033300   
1      22.486065       0.02775        0.97225  0.008215  23.110329  0.028254   
2      22.486065       0.02775        0.97225  0.001484  22.485021  0.027245   
3      22.486065       0.02775        0.97225  0.015420  23.837365  0.029768   

   tanh_hamm   relu_f1  relu_rmse  relu_acc  relu_hamm        Layers  
0   0.966700  0.013179  24.303922  0.026741   0.973259  (10, 10, 10)  
1   0.971746  0.004566  26.338756  0.027750   0.972250     (5, 5, 5)  
2   0.9727

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


   identity_f1  identity_rmse  identity_acc  identity_hamm  logistic_f1  \
0     0.010893      23.282107      0.029263       0.970737     0.001499   
1     0.006128      22.725970      0.030777       0.969223     0.001499   
2     0.011045      23.493832      0.028759       0.971241     0.001499   

   logistic_rmse  logistic_acc  logistic_hamm   tanh_f1  tanh_rmse  tanh_acc  \
0      22.486065       0.02775        0.97225  0.016292  27.202292  0.031786   
1      22.486065       0.02775        0.97225  0.002781  22.508279  0.027750   
2      22.486065       0.02775        0.97225  0.010065  22.935196  0.027750   

   tanh_hamm   relu_f1  relu_rmse  relu_acc  relu_hamm Solver  
0   0.968214  0.014501  24.048954  0.033300   0.966700  lbfgs  
1   0.972250  0.004179  22.775082  0.024218   0.975782    sgd  
2   0.972250  0.012277  23.253224  0.030272   0.969728   adam  
df_NN


## PCA 8 components 

In [37]:
components_pca = 8
random_state = 0

pca = PCA(n_components=components_pca, random_state=random_state)
pca.fit(train_data)

# overwriting train_data and dev_data to be the pca object - should do this in a cleaner way to preserve it but giving this a shot
train_data2 = pca.transform(train_data)
dev_data2 = pca.transform(dev_data)

In [38]:
scale_list = [2, 5, 10, 100]
file_name = "model_summary_w_pca_8_final.xlsx"
for scale in scale_list:
    print(scale)
    Y_train, Y_dev = assign_y(scale)
    df_knn = knn_models(train_data2, Y_train, dev_data2, Y_dev)
#     df_NB = NB_models(train_data1, Y_train, dev_data1, Y_dev)
#     df_MNB = MNB_models(train_data1, Y_train, dev_data1, Y_dev)
#     df_GNB = GNB_models(train_data1, Y_train, dev_data1, Y_dev)
    df_logR = LogR_models(train_data2, Y_train, dev_data2, Y_dev)
    df_DT = DT_models(train_data2, Y_train, dev_data2, Y_dev)
    df_RF = RF_models(train_data2, Y_train, dev_data2, Y_dev)
    df_AdaB = AdaB_models(train_data2, Y_train, dev_data2, Y_dev)
    df_SVM = SVM_models(train_data2, Y_train, dev_data2)
    df_NN1 = NN_models(train_data2, Y_train, dev_data2, Y_dev, "L")
    df_NN2 = NN_models(train_data2, Y_train, dev_data2, Y_dev, "S")

    wrt_excel(file_name, "knn-"+str(scale), df_knn)
#     wrt_excel(file_name, "NB-"+str(scale), df_NB)    
#     wrt_excel(file_name, "MNB-"+str(scale), df_MNB)
#     wrt_excel(file_name, "GNB-"+str(scale), df_GNB)
    wrt_excel(file_name, "logR-"+str(scale), df_logR)
    wrt_excel(file_name, "DT-"+str(scale), df_DT)
    wrt_excel(file_name, "RF-"+str(scale), df_RF)
    wrt_excel(file_name, "AdaB-"+str(scale), df_AdaB)
    wrt_excel(file_name, "SVM-"+str(scale), df_SVM)
    wrt_excel(file_name, "NN-"+str(scale)+"L", df_NN1)
    wrt_excel(file_name, "NN-"+str(scale)+"S", df_NN2)

2
     K   auto_f1  auto_rmse  auto_acc  auto_hamm  ball_tree_f1  \
0    1  0.690061   0.501763  0.748234   0.251766      0.690061   
1    2  0.688064   0.473836  0.775479   0.224521      0.688064   
2    3  0.689092   0.483324  0.766398   0.233602      0.689092   
3    4  0.688323   0.473303  0.775984   0.224016      0.688323   
4    5  0.690468   0.480708  0.768920   0.231080      0.690468   
5    6  0.688583   0.472770  0.776488   0.223512      0.688583   
6    7  0.689283   0.474900  0.774470   0.225530      0.689283   
7    8  0.686968   0.468482  0.780525   0.219475      0.686968   
8    9  0.686968   0.468482  0.780525   0.219475      0.686968   
9   10  0.686968   0.468482  0.780525   0.219475      0.686968   
10  11  0.686895   0.470631  0.778507   0.221493      0.686895   
11  12  0.686895   0.470631  0.778507   0.221493      0.686895   
12  13  0.686895   0.470631  0.778507   0.221493      0.686895   
13  14  0.686968   0.468482  0.780525   0.219475      0.686968   
14  15  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


   identity_f1  identity_rmse  identity_acc  identity_hamm  logistic_f1  \
0     0.688462        0.46524      0.783552       0.216448     0.688462   
1     0.688462        0.46524      0.783552       0.216448     0.688462   
2     0.688462        0.46524      0.783552       0.216448     0.688462   

   logistic_rmse  logistic_acc  logistic_hamm   tanh_f1  tanh_rmse  tanh_acc  \
0        0.46524      0.783552       0.216448  0.690075   0.473303  0.775984   
1        0.46524      0.783552       0.216448  0.688462   0.465240  0.783552   
2        0.46524      0.783552       0.216448  0.688462   0.465240  0.783552   

   tanh_hamm   relu_f1  relu_rmse  relu_acc  relu_hamm Solver  
0   0.224016  0.688831   0.468482  0.780525   0.219475  lbfgs  
1   0.216448  0.688462   0.465240  0.783552   0.216448    sgd  
2   0.216448  0.688462   0.465240  0.783552   0.216448   adam  
df_NN
5
     K   auto_f1  auto_rmse  auto_acc  auto_hamm  ball_tree_f1  \
0    1  0.251117   1.540571  0.244198   0.755802

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


   identity_f1  identity_rmse  identity_acc  identity_hamm  logistic_f1  \
0      0.34801       1.109768      0.513118       0.486882      0.34801   
1      0.34801       1.109768      0.513118       0.486882      0.34801   
2      0.34801       1.109768      0.513118       0.486882      0.34801   

   logistic_rmse  logistic_acc  logistic_hamm   tanh_f1  tanh_rmse  tanh_acc  \
0       1.109768      0.513118       0.486882  0.368252   1.143578  0.496973   
1       1.109768      0.513118       0.486882  0.348010   1.109768  0.513118   
2       1.109768      0.513118       0.486882  0.348010   1.109768  0.513118   

   tanh_hamm   relu_f1  relu_rmse  relu_acc  relu_hamm Solver  
0   0.503027  0.358418   1.121976  0.496973   0.503027  lbfgs  
1   0.486882  0.348010   1.109768  0.513118   0.486882    sgd  
2   0.486882  0.358176   1.116114  0.505550   0.494450   adam  
df_NN
10
     K   auto_f1  auto_rmse  auto_acc  auto_hamm  ball_tree_f1  \
0    1  0.135484   3.285905  0.126135   0.87386

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


   identity_f1  identity_rmse  identity_acc  identity_hamm  logistic_f1  \
0     0.146558       2.432437      0.279516       0.720484     0.115517   
1     0.141368       2.438031      0.272957       0.727043     0.115517   
2     0.149864       2.431504      0.281534       0.718466     0.115517   

   logistic_rmse  logistic_acc  logistic_hamm   tanh_f1  tanh_rmse  tanh_acc  \
0        2.46069      0.270938       0.729062  0.158169   2.455559  0.271443   
1        2.46069      0.270938       0.729062  0.137690   2.446501  0.278002   
2        2.46069      0.270938       0.729062  0.160526   2.412338  0.274470   

   tanh_hamm   relu_f1  relu_rmse  relu_acc  relu_hamm Solver  
0   0.728557  0.146864   2.448872  0.269425   0.730575  lbfgs  
1   0.721998  0.117613   2.458126  0.271443   0.728557    sgd  
2   0.725530  0.145685   2.424022  0.276993   0.723007   adam  
df_NN
100
     K   auto_f1  auto_rmse  auto_acc  auto_hamm  ball_tree_f1  \
0    1  0.008701  33.652547  0.011100   0.9889

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


   identity_f1  identity_rmse  identity_acc  identity_hamm  logistic_f1  \
0     0.011267      23.231842      0.026741       0.973259     0.001499   
1     0.008578      23.053857      0.031282       0.968718     0.001499   
2     0.011167      23.138812      0.027245       0.972755     0.001499   

   logistic_rmse  logistic_acc  logistic_hamm   tanh_f1  tanh_rmse  tanh_acc  \
0      22.486065       0.02775        0.97225  0.012704  23.316830  0.026741   
1      22.486065       0.02775        0.97225  0.003401  22.491225  0.029263   
2      22.486065       0.02775        0.97225  0.010745  22.972772  0.024723   

   tanh_hamm   relu_f1  relu_rmse  relu_acc  relu_hamm Solver  
0   0.973259  0.009688  25.098444  0.026236   0.973764  lbfgs  
1   0.970737  0.004869  22.892578  0.030272   0.969728    sgd  
2   0.975277  0.012027  23.196415  0.027750   0.972250   adam  
df_NN
