# <b><font color="#3c38a8">Quiz \#1 - KNN and Naive-Bayes</font></b>

Import Packages

In [None]:
# General
import pandas as pd
import numpy as np
import random

# Make the random data
from sklearn.datasets import make_classification

# Naive-Bayes
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB

#KNN
from sklearn.preprocessing import StandardScaler  
from sklearn.neighbors import KNeighborsClassifier

# F1 score
from sklearn.metrics import f1_score

# Visualization
import seaborn as sns
from matplotlib import pyplot as plt

### <font color="#3c38a8">Create Functions</font>

One-Hot-Encoder

In [None]:
def encoding_func(df):
    """One-hot encode a table of categorical features.

    Parameters
    ----------
    df : DataFrame or 2-D array
        The categorical columns to encode.

    Returns
    -------
    A dense array with one indicator column per (feature, category) pair.
    """
    encoder = preprocessing.OneHotEncoder()
    # learn the categories and encode in a single pass, then densify the
    # sparse result so it can be combined with other arrays
    return encoder.fit_transform(df).toarray()

Naive-Bayes

In [None]:
def NB(onehotlabels, y):
    """Fit a Complement Naive-Bayes classifier and return its F1 score.

    Parameters
    ----------
    onehotlabels : one-hot encoded feature array
    y : the true labels

    Returns
    -------
    The F1 score of the predictions made on ``onehotlabels`` itself, i.e.
    the same data the classifier was fitted on.
    """
    # ComplementNB is used on purpose: MultinomialNB() produced y_pred values
    # of 0 only, and no cause could be found for that behaviour.
    model = ComplementNB()
    model.fit(onehotlabels, y)

    # predict on the training features and score against the true labels
    predictions = model.predict(onehotlabels)
    return f1_score(y, predictions)

KNN

In [None]:
def knn(k, features, y):
    """Fit a k-nearest-neighbours classifier and return its F1 score.

    Parameters
    ----------
    k : number of neighbours
    features : feature array used for both fitting and prediction
    y : the true labels

    Returns
    -------
    The F1 score of the predictions made on ``features`` itself.
    """
    # kd-tree search with p=2 (Minkowski power parameter)
    model = KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree', p=2)
    model.fit(features, y)

    # predict on the same features and score against the true labels
    predictions = model.predict(features)
    return f1_score(y, predictions)

### <font color="#3c38a8">Create Randomized Data</font>

Defining data

In [None]:
# Generate the synthetic classification data set (features X, labels y).
X, y = make_classification(
    # size and reproducibility
    n_samples=10000,
    random_state=528552,
    # feature make-up: 10 columns, of which 4 informative and 2 redundant
    n_features=10,
    n_informative=4,
    n_redundant=2,
    # class settings
    weights=[0.65, 0.35],
    class_sep=0.65,
    flip_y=0.15,
)

In [None]:
# Build the labels 'cont1' ... 'cont10', one per continuous column
col_list = [f"cont{n}" for n in range(1, 11)]

# Check the list
col_list

In [None]:
# label frame: a single column named 'y'
df2 = pd.DataFrame(y, columns=['y'])

# feature frame: X with col_list as the column names
df1 = pd.DataFrame(X, columns=col_list)

# place the label column first, followed by the feature columns
data = pd.concat([df2, df1], axis='columns')

# check the dataframe
data

Create Categorical Variables

In [None]:
# Plot the distribution of cont2; this feature is binned into cat1 further down.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot/displot replace it) — confirm against the installed version.
sns.distplot(data['cont2'])

In [None]:
# Plot the distribution of cont4; this feature is binned into cat2 further down.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot/displot replace it) — confirm against the installed version.
sns.distplot(data['cont4'])

In [None]:
# Plot the distribution of cont5; this feature is binned into cat3 further down.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot/displot replace it) — confirm against the installed version.
sns.distplot(data['cont5'])

In [None]:
# Plot the distribution of cont7; this feature is binned into cat4 further down.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot/displot replace it) — confirm against the installed version.
sns.distplot(data['cont7'])

In [None]:
# Plot the distribution of cont10; this feature is binned into cat5 further down.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot/displot replace it) — confirm against the installed version.
sns.distplot(data['cont10'])

In [None]:
# Create 5 categorical features by binning continuous features at cut points
# chosen from their distributions.
# The outer bin edges are open-ended (-inf / +inf) rather than -10 / 10:
# pd.cut returns NaN for any value outside the given edges, and a NaN category
# would silently corrupt the one-hot encoding downstream.
_lo, _hi = float("-inf"), float("inf")
data["cat1"] = pd.cut(data['cont2'], [_lo, -0.7, _hi], labels=['A', 'B'])
data["cat2"] = pd.cut(data['cont4'], [_lo, 0, _hi], labels=['A', 'B'])
data["cat3"] = pd.cut(data['cont5'], [_lo, -0.7, 0.5, _hi], labels=['A', 'B', 'C'])
data["cat4"] = pd.cut(data['cont7'], [_lo, 0.2, _hi], labels=['A', 'B'])
data["cat5"] = pd.cut(data['cont10'], [_lo, -0.7, 0.7, _hi], labels=['A', 'B', 'C'])

In [None]:
# Report how many rows fall into each class of the 5 categorical features
for n in range(1, 6):
    column = f"cat{n}"
    print(f"Categorical Variable {n}")
    print(data[column].value_counts())
    print("-------------------------------------")

Add Standardized Continuous Variables

In [None]:
# Pull out the ten raw continuous columns (cont1 through cont10) by label
cont_var = data.loc[:, 'cont1':'cont10']
cont_var

In [None]:
# Standardize the selected features: fit the scaler and transform in one step
scaler = StandardScaler()
X_scaled = scaler.fit_transform(cont_var)

# check to ensure everything looks good
print(X_scaled.shape)
print(X_scaled)

In [None]:
# Column names 'scaled_cont1' ... 'scaled_cont10' for the standardized features
scaled_cols = [f"scaled_cont{n}" for n in range(1, 11)]

# Wrap the scaled array in a dataframe using those names
scaled_df = pd.DataFrame(X_scaled, columns=scaled_cols)

# Add the scaled columns to the right of the main dataframe.
# NOTE: running this cell more than once adds the scaled columns again.
data = pd.concat([data, scaled_df], axis='columns')

In [None]:
# Sanity check: print the column names, then the first 25 rows
for preview in (data.columns, data.head(25)):
    print(preview)

### <font color="#3c38a8">Run Naive-Bayes</font>

In [None]:
# Names of the five categorical features (cat1 ... cat5).
# They are listed by name rather than sliced by column position, so the
# selection cannot silently pick up other columns if the layout of `data` changes.
cat_features = [f"cat{n}" for n in range(1, 6)]
cat_features

Run Loops for the Hyperparameters

In [None]:
# Empty results table for Naive-Bayes, with the columns of the excel template
nb_template_columns = [
    'Your Name',
    'Random State',
    'Class Separator',
    'flip_y',
    'Class weight',
    'Algorithm',
    'k-Neighbors',
    'Type_Features',
    'Number_features',
    'Number of Models',
    'Best F1 Score',
]
nb_df = pd.DataFrame(columns=nb_template_columns)
nb_df

In [None]:
# create a y variable for the actual labels
y = data["y"]

# Number of random feature subsets ("models") tried for each subset size.
# Presumably chosen to match the number of distinct combinations of the 5
# categorical features (C(5, i)) — confirm against the assignment.
models_per_size = {1: 5, 2: 10, 3: 10, 4: 5, 5: 1}

# NOTE(review): random.sample is not seeded anywhere in this notebook, so the
# chosen subsets (and possibly the best scores) differ between runs.
nb_rows = []
for i in range(1, 6):
    n_models = models_per_size[i]

    # F1 scores of every model tried with i features
    score = []
    for _ in range(n_models):
        # select i random categorical features as an array
        features = data[random.sample(cat_features, i)].values

        # one-hot encode them and score a Naive-Bayes model on the result
        onehotlabels = encoding_func(features)
        score.append(NB(onehotlabels, y))

    # one template row per subset size, keeping only the best F1 score
    nb_rows.append(pd.DataFrame(
        [['Name', 528552, 0.65, 0.15, '0.65, 0.35', 'NB', 'NA', 'Categorical', i, n_models, max(score)]],
        columns=list(nb_df.columns)))

# DataFrame.append was removed in pandas 2.0, so the rows are added with a
# single pd.concat; like append, this keeps each row's original index.
nb_df = pd.concat([nb_df] + nb_rows)

In [None]:
# display the Naive-Bayes results table
nb_df

### <font color="#3c38a8">Run KNN (Continuous)</font>

Select Features

In [None]:
# Names of the ten scaled numeric features (scaled_cont1 ... scaled_cont10).
# They are listed by name rather than sliced by column position, so the
# selection cannot silently pick up other columns if the layout of `data` changes.
num_features = [f"scaled_cont{n}" for n in range(1, 11)]
num_features

Run Loops for the Hyperparameters

In [None]:
# Empty results table for KNN on continuous features, with the columns of the excel template
knn1_template_columns = [
    'Your Name',
    'Random State',
    'Class Separator',
    'flip_y',
    'Class weight',
    'Algorithm',
    'k-Neighbors',
    'Type_Features',
    'Number_features',
    'Number of Models',
    'Best F1 Score',
]
knn_df1 = pd.DataFrame(columns=knn1_template_columns)
knn_df1

In [None]:
# create a y variable for the actual labels
y = data["y"]

# NOTE(review): random.sample is not seeded anywhere in this notebook, so the
# chosen subsets (and possibly the best scores) differ between runs.
knn_rows = []

# Loop through each k value for # of neighbors (3, 5, 7, 9)
for k in range(3, 10, 2):
    # loop through each number of features as i (3 to 7)
    for i in range(3, 8):
        # F1 scores of 100 models, each on i randomly chosen scaled features
        score = [
            knn(k, data[random.sample(num_features, i)].values, y)
            for _ in range(100)
        ]

        # one template row per (k, i) pair, keeping only the best F1 score
        knn_rows.append(pd.DataFrame(
            [['Name', 528552, 0.65, 0.15, '0.65, 0.35', 'KNN', k, 'Continuous', i, 100, max(score)]],
            columns=list(knn_df1.columns)))

# DataFrame.append was removed in pandas 2.0, so the rows are added with a
# single pd.concat; like append, this keeps each row's original index.
knn_df1 = pd.concat([knn_df1] + knn_rows)

In [None]:
# display the KNN (continuous features) results table
knn_df1

### <font color="#3c38a8">Run KNN (Continuous + Categorical)</font>

Select Features

In [None]:
# Candidate KNN features: every column except the label and the raw
# (unscaled) continuous columns cont1 ... cont10
excluded_cols = ['y'] + [f"cont{n}" for n in range(1, 11)]
all_knn_features = data.drop(excluded_cols, axis=1).columns.tolist()
all_knn_features

Run Loops for the Hyperparameters

In [None]:
# Empty results table for KNN on mixed features, with the columns of the excel template
knn2_template_columns = [
    'Your Name',
    'Random State',
    'Class Separator',
    'flip_y',
    'Class weight',
    'Algorithm',
    'k-Neighbors',
    'Type_Features',
    'Number_features',
    'Number of Models',
    'Best F1 Score',
]
knn_df2 = pd.DataFrame(columns=knn2_template_columns)
knn_df2

In [None]:
# create a y variable for the actual labels
y = data["y"]

# NOTE(review): random.sample is not seeded anywhere in this notebook, so the
# chosen subsets (and possibly the best scores) differ between runs.
knn_rows = []

# Loop through each k value for # of neighbors (3, 5, 7, 9)
for k in range(3, 10, 2):
    # loop through each number of features as i (3 to 7)
    for i in range(3, 8):
        # F1 scores of the 100 models tried for this (k, i) pair
        score = []
        for _ in range(100):
            # select i random features, then split them by type via the
            # 'cat' marker in the column name (sampling order is preserved)
            chosen = random.sample(all_knn_features, i)
            cat_cols = [c for c in chosen if 'cat' in c]
            cont_cols = [c for c in chosen if 'cat' not in c]

            if not cat_cols:
                # only continuous features: use them as they are
                work_data = data[cont_cols].values
            elif not cont_cols:
                # only categorical features: one-hot encode them
                work_data = encoding_func(data[cat_cols])
            else:
                # both kinds: continuous columns first, encoded categoricals after
                work_data = np.concatenate(
                    (data[cont_cols].values, encoding_func(data[cat_cols])), axis=1)

            # run the knn function and keep the f1 score it returns
            score.append(knn(k, work_data, y))

        # one template row per (k, i) pair, keeping only the best F1 score
        knn_rows.append(pd.DataFrame(
            [['Name', 528552, 0.65, 0.15, '0.65, 0.35', 'KNN', k, 'Cont + Categorical', i, 100, max(score)]],
            columns=list(knn_df2.columns)))

# DataFrame.append was removed in pandas 2.0, so the rows are added with a
# single pd.concat; like append, this keeps each row's original index.
knn_df2 = pd.concat([knn_df2] + knn_rows)

In [None]:
# display the KNN (continuous + categorical features) results table
knn_df2

### <font color="#3c38a8">Combine Everything</font>

Join KNN Dataframes

In [None]:
# Stack the continuous-only and the mixed-feature KNN results into one table
knn_tables = [knn_df1, knn_df2]
final_knn_df = pd.concat(knn_tables)

In [None]:
# Order the rows like the template: k ascending, feature type descending,
# number of features ascending
sort_keys = ['k-Neighbors', 'Type_Features', 'Number_features']
sort_ascending = [True, False, True]
final_knn_df2 = final_knn_df.sort_values(by=sort_keys, ascending=sort_ascending)
final_knn_df2

Join the Naive-Bayes Dataframe

In [None]:
# Put the Naive-Bayes rows underneath the sorted KNN rows
result_tables = [final_knn_df2, nb_df]
final = pd.concat(result_tables)
final

Export the .csv

In [None]:
# change the working directory (IPython %cd magic) before exporting;
# the target path is an empty string here
%cd ""

In [None]:
# Write the combined results table to a .csv file, leaving out the index column
output_file = "Quiz1.csv"
final.to_csv(output_file, index=False)