# <font color="#3c38a8">Quiz 2 - Apple Strategy Game Reviews</font>

<font color="#3c38a8">11/09/2019</font>

In [None]:
# load in the required libraries

import pandas as pd
import numpy as np
import random
import scipy as sp
import datetime

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
from sklearn.manifold import TSNE

from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.neighbors import KNeighborsClassifier  
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import ADASYN

In [None]:
# change the working directory
# NOTE(review): the target path is an empty string here. The read_csv calls later in the
# notebook use bare file names, so this presumably has to point at the folder that holds
# the CSV files before the notebook can run -- confirm.
%cd ""

### <font color="#3c38a8">Step 1: Data Cleaning and Dimension Reduction</font>

###### Create Response Categories

In [None]:
# Read the ratings table from its CSV file (the file is a CSV, not an Excel workbook).
df_ratings = pd.read_csv(filepath_or_buffer="apple1000new.csv")

# Show what was loaded.
df_ratings

In [None]:
# Narrow the ratings table to the columns at positions 0 and 2
# (per the original note: the game name and its rating).
kept_positions = [0, 2]
df_ratings = df_ratings.iloc[:, kept_positions]

df_ratings

In [None]:
# Bucket the numeric rating into three ordered categories.
# pd.cut uses right-closed bins by default, so the mapping is
#   (0, 3.1] -> 'Poor',  (3.1, 4.1] -> 'Good',  (4.1, 5.1] -> 'Great'
# and any rating that is missing or falls outside (0, 5.1] becomes NaN.
rating_bins = [0, 3.1, 4.1, 5.1]
rating_labels = ['Poor', 'Good', 'Great']

# assign() returns a new frame instead of writing a column into the column
# subset taken from the loaded table, which avoids pandas' SettingWithCopyWarning.
df_ratings = df_ratings.assign(
    rating_cat=pd.cut(df_ratings['Average User Rating'], bins=rating_bins, labels=rating_labels)
)

df_ratings

###### Examination of Feature Space

In [None]:
# Read the feature table and discard the 'Unnamed: 0' column
# (described in the original note as an unnecessary index column).
df_raw = pd.read_csv("Apple1000games.csv").drop(columns=['Unnamed: 0'])

df_raw

In [None]:
# Summarise every numeric column and list the columns by their maximum, smallest first.
# A column whose maximum is 0 holds nothing but zeros -- presumably a word that never
# occurs in any game -- and could be treated as 'missing'.
column_summary = df_raw.describe().transpose()
column_summary.sort_values(by='max')

In [None]:
# Attach the rating category to the feature table. The two tables are matched on their
# row index, which assumes the word-count file lists the games in the same order as the
# ratings file.
df_raw = df_raw.assign(rating_cat=df_ratings["rating_cat"])
df_raw

In [None]:
# Keep only the games that have a rating category, then renumber the rows from 0
# (the previous index is discarded rather than kept as a column).
has_rating = df_raw['rating_cat'].notnull()
df1 = df_raw[has_rating].reset_index(drop=True)
df1

In [None]:
# Drop every column whose name contains a backslash. The Python literal '\\\\' is the
# two-character regex \\, which matches one literal backslash.
backslash_columns = list(df1.filter(regex='\\\\'))
kept_columns = df1.columns.drop(backslash_columns)
df1 = df1[kept_columns]
df1

###### Standardize Feature Space

In [None]:
# Every column except the final one (the rating category) is a feature.
feature_list = df1.columns[:-1].tolist()

# Show the last feature name to confirm the rating column was left out.
feature_list[-1]

In [None]:
# Standardise the feature columns; the scaler is fitted on the whole table.
feature_frame = df1[feature_list]
scaler = StandardScaler().fit(feature_frame)
df1_scaled = scaler.transform(feature_frame)

# Shape of a single scaled row (the one at position 1), i.e. the number of features.
df1_scaled[1].shape

###### Dimension Reduction - PCA

In [None]:
# get the start time
print("Start Time:", datetime.datetime.now().time())

# Run regular PCA on the standardized data. A float n_components between 0 and 1 asks
# scikit-learn to keep as many components as are needed to explain that share of the
# variance (here 90%).
pca = PCA(0.90)
pca.fit(df1_scaled)
pca_components = pca.transform(df1_scaled)

# One name per retained component: PC1, PC2, ...
# The count is read from the array's shape rather than from the length of the row at
# position 1, so it does not depend on that particular row existing.
cols = ["PC" + str(i + 1) for i in range(pca_components.shape[1])]

# get the end time
print("End Time:", datetime.datetime.now().time())

# put these PCs in a df
df_pca = pd.DataFrame(pca_components, columns=cols)
df_pca

###### Dimension Reduction - Sparse PCA

In [None]:
# get the start time
print("Start Time:", datetime.datetime.now().time())

# Run sparse PCA on the standardized data for 20 components.
# random_state is fixed (the same seed, 20, used for the classifiers and splits in this
# notebook) so that re-running the notebook reproduces the same components.
s_pca = SparsePCA(n_components=20, random_state=20)
s_pca.fit(df1_scaled)
s_pca_components = s_pca.transform(df1_scaled)

# One name per component: S_PC1 ... S_PC20. The count comes from the transformed array,
# so it stays correct if n_components above is changed.
cols = ["S_PC" + str(i + 1) for i in range(s_pca_components.shape[1])]

# get the end time
print("End Time:", datetime.datetime.now().time())

# put these PCs in a df
df_s_pca = pd.DataFrame(s_pca_components, columns=cols)
df_s_pca

###### Dimension Reduction - t-SNE

In [None]:
# Visual scan of t-SNE settings: every learning rate is combined with every perplexity,
# and each resulting 3-D embedding is drawn as a scatter plot.

# create a list of various perplexities
list_of_perplex = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

# create a list of various learning rates
list_of_learn = [200, 500, 1000]

# the embedding always has three dimensions, so the column names are built once
cols = ["tSNE" + str(k) for k in range(1, 4)]

# loop through all of the learning rates
for learn_rate in list_of_learn:

    # loop through all the perplexities
    for perplexity in list_of_perplex:

        # Run t-SNE on the standardized data for 3 dimensions. random_state is fixed so
        # the plots are reproduced on a re-run and differ only because of the two
        # settings being compared.
        tsne_components = TSNE(n_components=3, learning_rate=learn_rate, perplexity=perplexity,
                               random_state=20).fit_transform(df1_scaled)

        # put the embedding in a df
        df_tsne = pd.DataFrame(tsne_components, columns=cols)

        print("Learning Rate:", str(learn_rate))
        print("Perplexity:", str(perplexity))

        # create a 3-D scatter plot
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(df_tsne['tSNE1'], df_tsne['tSNE2'], df_tsne['tSNE3'], c='skyblue', s=60)
        ax.view_init(30, 185) # adjust view angle
        plt.show()

        # release the figure so 30 open figures do not accumulate in memory
        plt.close(fig)

        print("##############################################################")

In [None]:
# based on the previous investigation of the visualizations of learning rate and perplexity, I chose 500 and 50

# get the start time
print("Start Time:", datetime.datetime.now().time())

# Run t-SNE on the standardized data for 3 dimensions. random_state is fixed so the
# embedding -- and every score computed from it later -- is reproducible.
tsne_components = TSNE(n_components=3, perplexity=50, learning_rate=500,
                       random_state=20).fit_transform(df1_scaled)

# One name per embedding dimension: tSNE1, tSNE2, tSNE3.
cols = ["tSNE" + str(j + 1) for j in range(tsne_components.shape[1])]

# get the end time
print("End Time:", datetime.datetime.now().time())

# put the embedding in a df
df_tsne = pd.DataFrame(tsne_components, columns=cols)
df_tsne

###### Combine Dimension Reduction Columns

In [None]:
# Put the three reduced feature sets side by side, in this order:
# t-SNE columns, then Sparse PCA columns, then PCA columns.
reduced_sets = [df_tsne, df_s_pca, df_pca]
df_dr = pd.concat(reduced_sets, axis='columns')

# Append the rating category as the last column (matched on the row index).
df_dr = df_dr.assign(rating_cat=df1['rating_cat'])

df_dr

In [None]:
# Count the rows whose rating did not carry over; anything above zero means the row
# indexes of the reduced features and of the ratings no longer line up.
missing = df_dr['rating_cat'].isnull().sum()
missing

### <font color="#3c38a8">Step 2: Classification</font>

###### Define Model Functions

In [None]:
# define the KNN function

def knn(X_train, X_test, y_train, y_test):
    """Score k-nearest-neighbour classifiers for several values of k.

    For each k in 5, 7, 9, 11, 13, 15 a KNeighborsClassifier (kd-tree search,
    Euclidean distance) is fitted on the training data and scored on the test
    data. The start and end wall-clock times are printed.

    Parameters
    ----------
    X_train, X_test
        Feature tables passed to ``fit`` and ``predict`` respectively.
    y_train, y_test
        Class labels for the rows of ``X_train`` and ``X_test``.

    Returns
    -------
    pandas.DataFrame
        One row per k, laid out like the results template: placeholder name,
        the value 20 in 'Random State', 'KNN', k, 'NA' for the parameters that
        do not apply, then the macro F1 score and the accuracy. Rows are
        indexed 0..n-1.
    """

    # get the start time
    print("Start Time:", datetime.datetime.now().time())

    # list all of the neighbor parameters
    k_list = [5, 7, 9, 11, 13, 15]

    # column layout of the excel template
    result_columns = ['Your Name',
                      'Random State',
                      'Algorithm',
                      'k-Neighbors',
                      'n_estimators',
                      'learning_rate',
                      'subsample',
                      'max_depth',
                      'Best F1 Score',
                      'Best Accuracy Score']

    # Rows are collected in a list and turned into a DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by row
    # copies it on every iteration.
    rows = []

    # loop through all of the neighbor parameters
    for k in k_list:

        # Create the classifier based on k neighbors and fit it to the data along with correct labels
        clf = KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree', metric="euclidean", p=2)
        clf.fit(X_train, y_train)

        # create a prediction based on test set
        y_pred = clf.predict(X_test)

        # find the F1 score (macro does not take into account class imbalances -> doesn't appear to be
        # any in this data)
        f_one = f1_score(y_test, y_pred, average='macro')

        # find the accuracy score
        acc = accuracy_score(y_test, y_pred)

        rows.append(['Name', 20, 'KNN', k, 'NA', 'NA', 'NA', 'NA', f_one, acc])

    # get the end time
    print("End Time:", datetime.datetime.now().time())

    # return the df
    return pd.DataFrame(rows, columns=result_columns)

In [None]:
# define the Random Forest function

def random_forest(X_train, X_test, y_train, y_test):
    """Score random forest classifiers for several forest sizes.

    For each n_estimators in 100, 500 a RandomForestClassifier
    (min_samples_split=5, min_samples_leaf=5, random_state=20) is fitted on the
    training data and scored on the test data. The start and end wall-clock
    times are printed.

    Parameters
    ----------
    X_train, X_test
        Feature tables passed to ``fit`` and ``predict`` respectively.
    y_train, y_test
        Class labels for the rows of ``X_train`` and ``X_test``.

    Returns
    -------
    pandas.DataFrame
        One row per forest size, laid out like the results template:
        placeholder name, random state 20, 'RF', n_estimators, 'NA' for the
        parameters that do not apply, then the macro F1 score and the
        accuracy. Rows are indexed 0..n-1.
    """

    # get the start time
    print("Start Time:", datetime.datetime.now().time())

    # create a list for the estimator parameters
    estimators_list = [100, 500]

    # column layout of the excel template
    result_columns = ['Your Name',
                      'Random State',
                      'Algorithm',
                      'k-Neighbors',
                      'n_estimators',
                      'learning_rate',
                      'subsample',
                      'max_depth',
                      'Best F1 Score',
                      'Best Accuracy Score']

    # Rows are collected in a list and turned into a DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by row
    # copies it on every iteration.
    rows = []

    # loop through all of the estimator parameters
    for n_trees in estimators_list:

        # Create the classifier based on n_trees estimators and fit it to the data along with correct labels
        clf = RandomForestClassifier(n_estimators=n_trees, min_samples_split=5, min_samples_leaf=5,
                                     random_state=20)
        clf.fit(X_train, y_train)

        # create a prediction based on test set
        y_pred = clf.predict(X_test)

        # find the F1 score (macro does not take into account class imbalances -> doesn't appear to be
        # any in this data)
        f_one = f1_score(y_test, y_pred, average='macro')

        # find the accuracy score
        acc = accuracy_score(y_test, y_pred)

        rows.append(['Name', 20, 'RF', 'NA', n_trees, 'NA', 'NA', 'NA', f_one, acc])

    # get the end time
    print("End Time:", datetime.datetime.now().time())

    # return the df
    return pd.DataFrame(rows, columns=result_columns)

In [None]:
# define the Gradient Boosting function

def gradient_boosting(X_train, X_test, y_train, y_test):
    """Score gradient boosting classifiers over a small parameter grid.

    Every combination of learning_rate (0.1, 0.01, 0.001), n_estimators
    (100, 500) and subsample (0.6, 0.8, 1) is used to fit a
    GradientBoostingClassifier (min_samples_split=5, min_samples_leaf=5,
    random_state=20) on the training data, which is then scored on the test
    data. The start and end wall-clock times are printed.

    Parameters
    ----------
    X_train, X_test
        Feature tables passed to ``fit`` and ``predict`` respectively.
    y_train, y_test
        Class labels for the rows of ``X_train`` and ``X_test``.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination (18 in total, learning rate varying
        slowest and subsample fastest), laid out like the results template,
        with the macro F1 score and the accuracy in the last two columns.
        Rows are indexed 0..n-1.
    """

    # get the start time
    print("Start Time:", datetime.datetime.now().time())

    # create a list for the learning rate parameters
    learning_rate_list = [0.1, 0.01, 0.001]

    # create a list for the estimator parameters
    estimators_list = [100, 500]

    # create a list for the subsample parameters
    subsample_list = [0.6, 0.8, 1]

    # column layout of the excel template
    result_columns = ['Your Name',
                      'Random State',
                      'Algorithm',
                      'k-Neighbors',
                      'n_estimators',
                      'learning_rate',
                      'subsample',
                      'max_depth',
                      'Best F1 Score',
                      'Best Accuracy Score']

    # Rows are collected in a list and turned into a DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by row
    # copies it on every iteration.
    rows = []

    # loop through all of the three parameters using a nested for loop system to get every combination
    for rate in learning_rate_list:
        for n_trees in estimators_list:
            for fraction in subsample_list:

                # Create the classifier based on the three parameters from the loops
                clf = GradientBoostingClassifier(learning_rate=rate, n_estimators=n_trees,
                                                 subsample=fraction, min_samples_split=5,
                                                 min_samples_leaf=5, random_state=20)
                clf.fit(X_train, y_train)

                # create a prediction based on test set
                y_pred = clf.predict(X_test)

                # find the F1 score (macro does not take into account class imbalances -> doesn't appear to be
                # any in this data)
                f_one = f1_score(y_test, y_pred, average='macro')

                # find the accuracy score
                acc = accuracy_score(y_test, y_pred)

                rows.append(['Name', 20, 'GB', 'NA', n_trees, rate, fraction, 'NA', f_one, acc])

    # get the end time
    print("End Time:", datetime.datetime.now().time())

    # return the df
    return pd.DataFrame(rows, columns=result_columns)

In [None]:
# define the XGBoost function

def xgboost(X_train, X_test, y_train, y_test):
    """Score XGBoost classifiers over a small parameter grid.

    Every combination of learning_rate (0.1, 0.01, 0.001), n_estimators
    (100, 500), subsample (0.6, 0.8, 1) and max_depth (5, 7, 9) is used to fit
    an XGBClassifier (random_state=20) on the training data, which is then
    scored on the test data. The start and end wall-clock times are printed.

    The training labels are converted to integer codes before fitting and the
    predictions are converted back to the original labels before scoring, so
    the function accepts string labels such as 'Poor' / 'Good' / 'Great'.

    Parameters
    ----------
    X_train, X_test
        Feature tables passed to ``fit`` and ``predict`` respectively.
    y_train, y_test
        Class labels for the rows of ``X_train`` and ``X_test``.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination (54 in total, learning rate varying
        slowest and max_depth fastest), laid out like the results template,
        with the macro F1 score and the accuracy in the last two columns.
        Rows are indexed 0..n-1.
    """

    # get the start time
    print("Start Time:", datetime.datetime.now().time())

    # create a list for the learning rate parameters
    learning_rate_list = [0.1, 0.01, 0.001]

    # create a list for the estimator parameters
    estimators_list = [100, 500]

    # create a list for the subsample parameters
    subsample_list = [0.6, 0.8, 1]

    # create a list for the max depth parameters
    max_depth_list = [5, 7, 9]

    # column layout of the excel template
    result_columns = ['Your Name',
                      'Random State',
                      'Algorithm',
                      'k-Neighbors',
                      'n_estimators',
                      'learning_rate',
                      'subsample',
                      'max_depth',
                      'Best F1 Score',
                      'Best Accuracy Score']

    # Recent XGBoost releases expect class labels to be the integers 0..n_classes-1 and
    # reject string labels, so the labels are encoded once here and decoded after predict.
    label_encoder = preprocessing.LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)

    # Rows are collected in a list and turned into a DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by row
    # copies it on every iteration.
    rows = []

    # loop through all of the four parameters using a nested for loop system to get every combination
    for rate in learning_rate_list:
        for n_trees in estimators_list:
            for fraction in subsample_list:
                for depth in max_depth_list:

                    # Create the classifier based on the four parameters from the loops
                    clf = XGBClassifier(learning_rate=rate, n_estimators=n_trees, subsample=fraction,
                                        max_depth=depth, random_state=20)
                    clf.fit(X_train, y_train_encoded)

                    # create a prediction based on test set, mapped back to the original labels
                    y_pred = label_encoder.inverse_transform(clf.predict(X_test))

                    # find the F1 score (macro does not take into account class imbalances -> doesn't appear to be
                    # any in this data)
                    f_one = f1_score(y_test, y_pred, average='macro')

                    # find the accuracy score
                    acc = accuracy_score(y_test, y_pred)

                    rows.append(['Name', 20, 'XGBoost', 'NA', n_trees, rate, fraction, depth, f_one, acc])

    # get the end time
    print("End Time:", datetime.datetime.now().time())

    # return the df
    return pd.DataFrame(rows, columns=result_columns)

###### Create Test/Train

In [None]:
# Build one 80/20 train/test split per dimension-reduction technique.
# Every split uses the same response and the same seed.
# The feature slices are positional and rely on df_dr's column layout:
# columns 0-2 are t-SNE, 3-22 are Sparse PCA, 23 onward are PCA, and the rating is last.
Y = df_dr["rating_cat"]
split_options = {"test_size": 0.2, "random_state": 20}

# PCA: everything from column 23 up to, but not including, the rating column
X = df_dr.iloc[:, 23:-1]
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X, Y, **split_options)

# Sparse PCA: the 20 columns at positions 3..22
X = df_dr.iloc[:, 3:23]
X_train_s_pca, X_test_s_pca, y_train_s_pca, y_test_s_pca = train_test_split(X, Y, **split_options)

# t-SNE: the first three columns
X = df_dr.iloc[:, 0:3]
X_train_tsne, X_test_tsne, y_train_tsne, y_test_tsne = train_test_split(X, Y, **split_options)

###### KNN

In [None]:
# KNN on each reduced feature set. Each result table is printed and followed by a dashed rule.

# t-SNE features
df_knn_tsne = knn(X_train_tsne, X_test_tsne, y_train_tsne, y_test_tsne)
print(df_knn_tsne, "--------------------------------------------------------------------------------------", sep="\n")

# Sparse PCA features
df_knn_s_pca = knn(X_train_s_pca, X_test_s_pca, y_train_s_pca, y_test_s_pca)
print(df_knn_s_pca, "--------------------------------------------------------------------------------------", sep="\n")

# PCA features
df_knn_pca = knn(X_train_pca, X_test_pca, y_train_pca, y_test_pca)
print(df_knn_pca, "--------------------------------------------------------------------------------------", sep="\n")

###### Random Forest

In [None]:
# Random Forest on each reduced feature set. Each result table is printed and followed by a dashed rule.

# t-SNE features
df_rf_tsne = random_forest(X_train_tsne, X_test_tsne, y_train_tsne, y_test_tsne)
print(df_rf_tsne, "--------------------------------------------------------------------------------------", sep="\n")

# Sparse PCA features
df_rf_s_pca = random_forest(X_train_s_pca, X_test_s_pca, y_train_s_pca, y_test_s_pca)
print(df_rf_s_pca, "--------------------------------------------------------------------------------------", sep="\n")

# PCA features
df_rf_pca = random_forest(X_train_pca, X_test_pca, y_train_pca, y_test_pca)
print(df_rf_pca, "--------------------------------------------------------------------------------------", sep="\n")

###### Gradient Boosting

In [None]:
# Gradient Boosting on each reduced feature set. Each result table is printed and followed by a dashed rule.

# t-SNE features
df_gb_tsne = gradient_boosting(X_train_tsne, X_test_tsne, y_train_tsne, y_test_tsne)
print(df_gb_tsne, "--------------------------------------------------------------------------------------", sep="\n")

# Sparse PCA features
df_gb_s_pca = gradient_boosting(X_train_s_pca, X_test_s_pca, y_train_s_pca, y_test_s_pca)
print(df_gb_s_pca, "--------------------------------------------------------------------------------------", sep="\n")

# PCA features
df_gb_pca = gradient_boosting(X_train_pca, X_test_pca, y_train_pca, y_test_pca)
print(df_gb_pca, "--------------------------------------------------------------------------------------", sep="\n")

###### XGBoost

In [None]:
# XGBoost on each reduced feature set. Each result table is printed and followed by a dashed rule.

# t-SNE features
df_xgb_tsne = xgboost(X_train_tsne, X_test_tsne, y_train_tsne, y_test_tsne)
print(df_xgb_tsne, "--------------------------------------------------------------------------------------", sep="\n")

# Sparse PCA features
df_xgb_s_pca = xgboost(X_train_s_pca, X_test_s_pca, y_train_s_pca, y_test_s_pca)
print(df_xgb_s_pca, "--------------------------------------------------------------------------------------", sep="\n")

# PCA features
df_xgb_pca = xgboost(X_train_pca, X_test_pca, y_train_pca, y_test_pca)
print(df_xgb_pca, "--------------------------------------------------------------------------------------", sep="\n")

###### Combine the Outputs

In [None]:
# Stack the KNN, Random Forest, Gradient Boosting and XGBoost result tables into one
# table per dimension-reduction technique, printing each followed by a dashed rule.

# t-SNE
tsne_tables = [df_knn_tsne, df_rf_tsne, df_gb_tsne, df_xgb_tsne]
df_tsne_final = pd.concat(tsne_tables)
print(df_tsne_final, "--------------------------------------------------------------------------------------", sep="\n")

# Sparse PCA
s_pca_tables = [df_knn_s_pca, df_rf_s_pca, df_gb_s_pca, df_xgb_s_pca]
df_s_pca_final = pd.concat(s_pca_tables)
print(df_s_pca_final, "--------------------------------------------------------------------------------------", sep="\n")

# PCA
pca_tables = [df_knn_pca, df_rf_pca, df_gb_pca, df_xgb_pca]
df_pca_final = pd.concat(pca_tables)
print(df_pca_final, "--------------------------------------------------------------------------------------", sep="\n")

######################################################################################################################

In [None]:
# Write each combined result table to its own CSV file, leaving out the row index.
exports = (
    (df_tsne_final, "Quiz2(t-SNE).csv"),
    (df_s_pca_final, "Quiz2(SparsePCA).csv"),
    (df_pca_final, "Quiz2(PCA).csv"),
)
for table, csv_name in exports:
    table.to_csv(csv_name, index=False)

### <font color="#3c38a8">Step 2.5: Using ADASYN to Expand Data, Then Classification</font>

###### Define ADASYN Model and Function

In [None]:
# ADASYN oversampler shared by ada_func: seeded for reproducibility, with the sampling
# strategy set to resample all classes.
ada = ADASYN(sampling_strategy='all', random_state=20)


def ada_func(X, y):
    """Resample (X, y) with the shared ADASYN model and report the class sizes.

    Prints how many 'Poor', 'Good' and 'Great' labels the resampled response
    contains, then returns the resampled features and labels as a pair.
    """
    X_res, y_res = ada.fit_resample(X, y)

    # Note kept from the original: attempting to run ADASYN a second time to increase
    # the data further resulted in an error.

    # Report the size of each class after resampling.
    class_reports = (
        ('Poor', "The number of 'Poor' values is:"),
        ('Good', "The number of 'Good' values is:"),
        ('Great', "The number of 'Great' values is:"),
    )
    for label, message in class_reports:
        print(message, np.count_nonzero(y_res == label))

    return X_res, y_res

###### Run ADASYN and Split the Data by Dimension Reduction Technique 

In [None]:
# Pull the ADASYN inputs out of df_dr by column position.

# response: the last column
y_ada = df_dr.iloc[:, -1]

# t-SNE features: the first three columns
X_tsne_ada = df_dr.iloc[:, :3]

# Sparse PCA features: columns 3 through 22
X_s_pca_ada = df_dr.iloc[:, 3:23]

# PCA features: column 23 up to, but not including, the last column
X_pca_ada = df_dr.iloc[:, 23:-1]

In [None]:
# Resample each of the three reduced feature sets with ADASYN. A heading is printed
# before each run and a dashed rule between runs.
resampled = {}
adasyn_inputs = (
    ("t-SNE", X_tsne_ada),
    ("Sparse PCA", X_s_pca_ada),
    ("PCA", X_pca_ada),
)
for heading, feature_set in adasyn_inputs:
    # the rule separates consecutive runs, so it is skipped before the first one
    if resampled:
        print("------------------------------------------------")
    print(heading)
    resampled[heading] = ada_func(feature_set, y_ada)

X_res_tsne, y_res_tsne = resampled["t-SNE"]
X_res_s_pca, y_res_s_pca = resampled["Sparse PCA"]
X_res_pca, y_res_pca = resampled["PCA"]

##### Test/Train Split

In [None]:
# Split each ADASYN-resampled data set 80/20 into train and test, always with the same seed.
# NOTE(review): the split is taken from data that has already been resampled, so rows
# generated by ADASYN can end up in the test portion -- confirm this is intended.
ada_split_options = {"test_size": 0.2, "random_state": 20}

# t-SNE
X_train_tsne_ada, X_test_tsne_ada, y_train_tsne_ada, y_test_tsne_ada = train_test_split(
    X_res_tsne, y_res_tsne, **ada_split_options)

# Sparse PCA
X_train_s_pca_ada, X_test_s_pca_ada, y_train_s_pca_ada, y_test_s_pca_ada = train_test_split(
    X_res_s_pca, y_res_s_pca, **ada_split_options)

# PCA
X_train_pca_ada, X_test_pca_ada, y_train_pca_ada, y_test_pca_ada = train_test_split(
    X_res_pca, y_res_pca, **ada_split_options)

##### KNN

In [None]:
# KNN on each ADASYN-resampled feature set. Each result table is printed and followed by a dashed rule.

# t-SNE features
df_knn_tsne_ada = knn(X_train_tsne_ada, X_test_tsne_ada, y_train_tsne_ada, y_test_tsne_ada)
print(df_knn_tsne_ada, "--------------------------------------------------------------------------------------", sep="\n")

# Sparse PCA features
df_knn_s_pca_ada = knn(X_train_s_pca_ada, X_test_s_pca_ada, y_train_s_pca_ada, y_test_s_pca_ada)
print(df_knn_s_pca_ada, "--------------------------------------------------------------------------------------", sep="\n")

# PCA features
df_knn_pca_ada = knn(X_train_pca_ada, X_test_pca_ada, y_train_pca_ada, y_test_pca_ada)
print(df_knn_pca_ada, "--------------------------------------------------------------------------------------", sep="\n")

##### Random Forest

In [None]:
# Random Forest on each ADASYN-resampled feature set. Each result table is printed and followed by a dashed rule.

# t-SNE features
df_rf_tsne_ada = random_forest(X_train_tsne_ada, X_test_tsne_ada, y_train_tsne_ada, y_test_tsne_ada)
print(df_rf_tsne_ada, "--------------------------------------------------------------------------------------", sep="\n")

# Sparse PCA features
df_rf_s_pca_ada = random_forest(X_train_s_pca_ada, X_test_s_pca_ada, y_train_s_pca_ada, y_test_s_pca_ada)
print(df_rf_s_pca_ada, "--------------------------------------------------------------------------------------", sep="\n")

# PCA features
df_rf_pca_ada = random_forest(X_train_pca_ada, X_test_pca_ada, y_train_pca_ada, y_test_pca_ada)
print(df_rf_pca_ada, "--------------------------------------------------------------------------------------", sep="\n")

##### Gradient Boosting

In [None]:
# Gradient Boosting on each ADASYN-resampled feature set. Each result table is printed and followed by a dashed rule.

# t-SNE features
df_gb_tsne_ada = gradient_boosting(X_train_tsne_ada, X_test_tsne_ada, y_train_tsne_ada, y_test_tsne_ada)
print(df_gb_tsne_ada, "--------------------------------------------------------------------------------------", sep="\n")

# Sparse PCA features
df_gb_s_pca_ada = gradient_boosting(X_train_s_pca_ada, X_test_s_pca_ada, y_train_s_pca_ada, y_test_s_pca_ada)
print(df_gb_s_pca_ada, "--------------------------------------------------------------------------------------", sep="\n")

# PCA features
df_gb_pca_ada = gradient_boosting(X_train_pca_ada, X_test_pca_ada, y_train_pca_ada, y_test_pca_ada)
print(df_gb_pca_ada, "--------------------------------------------------------------------------------------", sep="\n")

##### XGBoost

In [None]:
# XGBoost on each ADASYN-resampled feature set. Each result table is printed and followed by a dashed rule.

# t-SNE features
df_xgb_tsne_ada = xgboost(X_train_tsne_ada, X_test_tsne_ada, y_train_tsne_ada, y_test_tsne_ada)
print(df_xgb_tsne_ada, "--------------------------------------------------------------------------------------", sep="\n")

# Sparse PCA features
df_xgb_s_pca_ada = xgboost(X_train_s_pca_ada, X_test_s_pca_ada, y_train_s_pca_ada, y_test_s_pca_ada)
print(df_xgb_s_pca_ada, "--------------------------------------------------------------------------------------", sep="\n")

# PCA features
df_xgb_pca_ada = xgboost(X_train_pca_ada, X_test_pca_ada, y_train_pca_ada, y_test_pca_ada)
print(df_xgb_pca_ada, "--------------------------------------------------------------------------------------", sep="\n")

##### Combine the Outputs

In [None]:
# Stack the KNN, Random Forest, Gradient Boosting and XGBoost result tables obtained on
# the ADASYN-resampled data into one table per dimension-reduction technique, printing
# each followed by a dashed rule.

# t-SNE
tsne_tables_ada = [df_knn_tsne_ada, df_rf_tsne_ada, df_gb_tsne_ada, df_xgb_tsne_ada]
df_tsne_final_ada = pd.concat(tsne_tables_ada)
print(df_tsne_final_ada, "--------------------------------------------------------------------------------------", sep="\n")

# Sparse PCA
s_pca_tables_ada = [df_knn_s_pca_ada, df_rf_s_pca_ada, df_gb_s_pca_ada, df_xgb_s_pca_ada]
df_s_pca_final_ada = pd.concat(s_pca_tables_ada)
print(df_s_pca_final_ada, "--------------------------------------------------------------------------------------", sep="\n")

# PCA
pca_tables_ada = [df_knn_pca_ada, df_rf_pca_ada, df_gb_pca_ada, df_xgb_pca_ada]
df_pca_final_ada = pd.concat(pca_tables_ada)
print(df_pca_final_ada, "--------------------------------------------------------------------------------------", sep="\n")

######################################################################################################################

In [None]:
# Write each combined ADASYN result table to its own CSV file, leaving out the row index.
exports_ada = (
    (df_tsne_final_ada, "Quiz2(t-SNE)-WithADASYN.csv"),
    (df_s_pca_final_ada, "Quiz2(SparsePCA)-WithADASYN.csv"),
    (df_pca_final_ada, "Quiz2(PCA)-WithADASYN.csv"),
)
for table, csv_name in exports_ada:
    table.to_csv(csv_name, index=False)

### <font color="#3c38a8">Extra Examination of Data</font>

###### Using Different Number of Columns for KNN

In [None]:
# for the Sparse PCA columns

# Run KNN on the first 2, 3, ..., 20 Sparse PCA components and collect the accuracy and
# F1 score of every k for each component count.
#
# Fixes relative to the earlier version of this cell:
#   * the F1 column used the same "<n>_PCs_Accuracy" name as the accuracy column, so the
#     F1 values silently overwrote the accuracy values;
#   * the label was one ahead of the data (a single component was reported as "2_PCs",
#     and the full set of 20 was never run);
#   * the loop reused the names of the main Sparse PCA split and KNN result, replacing them.

# the response is the same for every run
Y = df_dr["rating_cat"]

# position of the first Sparse PCA column in df_dr (after the three t-SNE columns)
first_s_pca_col = 3

# number of Sparse PCA columns in df_dr
n_s_pca_cols = 20

# score columns, keyed by column name, in the order they are produced
sweep_scores = {}

# loop from the first two components up to all of them
for n_pcs in range(2, n_s_pca_cols + 1):

    # take the first n_pcs Sparse PCA columns
    X = df_dr.iloc[:, first_s_pca_col:first_s_pca_col + n_pcs]

    # perform the split, using 20% for test data
    X_train_sweep, X_test_sweep, y_train_sweep, y_test_sweep = train_test_split(
        X, Y, test_size=0.2, random_state=20)

    # run the knn function on this subset of components
    df_knn_sweep = knn(X_train_sweep, X_test_sweep, y_train_sweep, y_test_sweep)

    # one accuracy column and one F1 column per component count
    sweep_scores[str(n_pcs) + "_PCs_Accuracy"] = df_knn_sweep["Best Accuracy Score"].tolist()
    sweep_scores[str(n_pcs) + "_PCs_F1"] = df_knn_sweep["Best F1 Score"].tolist()

# one row per k tried by knn(), in the order knn() reports them
knn_range_df_s_pca = pd.DataFrame(sweep_scores)

knn_range_df_s_pca

In [None]:
# for the PCA columns: run KNN on the first 2 PCs, then the first 3, ... up to all of them

# the PCA columns start at position 23 and run up to, but not including, the last column of df_dr
# (the same block the notebook selects as "all pca columns": iloc[:, 23:-1])
pca_start = 23
n_pca_cols = len(df_dr.columns) - 1 - pca_start

# initialize a blank df; each iteration adds one accuracy column and one F1 column
knn_range_df_pca = pd.DataFrame()

# the response is the same for every number of PCs, so select it once
Y = df_dr["rating_cat"]

# count = number of leading PCs used as features (2 up to and including all of them)
for count in range(2, n_pca_cols + 1):

    # take exactly `count` columns so the column labels below match the features actually used
    X = df_dr.iloc[:, pca_start:pca_start + count]

    # perform the split, using 20% for test data
    X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X, Y, test_size=0.2, random_state=20)

    # run the knn function for the PCA data
    df_knn_pca = knn(X_train_pca, X_test_pca, y_train_pca, y_test_pca)

    # define distinct names for the accuracy and F1 score columns using the number of PCs;
    # giving both the same name would make the F1 values overwrite the accuracy values
    acc_name = str(count) + "_PCs_Accuracy"
    f1_name = str(count) + "_PCs_F1"

    # create columns in the df initialized earlier that correspond to the accuracy and F1 scores calculated
    knn_range_df_pca[acc_name] = df_knn_pca["Best Accuracy Score"]
    knn_range_df_pca[f1_name] = df_knn_pca["Best F1 Score"]

knn_range_df_pca

It does not appear that the number of columns makes any difference in the scoring of the Sparse PCA and PCA data sets. However, this check was done in a sequential manner (always adding the next PC in order); perhaps using a random assortment of a random number of PCs would yield higher scores.

###### Remove Test/Train Split

In [None]:
# Y_all = the response values, shared by every feature set below
Y_all = df_dr["rating_cat"]

# feature blocks, selected by column position in df_dr
X_tsne = df_dr.iloc[:, :3]       # all t-SNE columns (positions 0-2)
X_s_pca = df_dr.iloc[:, 3:23]    # all sparse pca columns (positions 3-22)
X_pca = df_dr.iloc[:, 23:-1]     # all pca columns (position 23 up to, but excluding, the last column)

In [None]:
# KNN with no hold-out set: each call passes the full feature block and the full
# response in both the train and the test positions of knn()

# t-SNE features
df_knn_tsne_all = knn(X_tsne, X_tsne, Y_all, Y_all)
print(df_knn_tsne_all, "--------------------------------------------------------------------------------------", sep="\n")

# Sparse PCA features
df_knn_s_pca_all = knn(X_s_pca, X_s_pca, Y_all, Y_all)
print(df_knn_s_pca_all, "--------------------------------------------------------------------------------------", sep="\n")

# PCA features
df_knn_pca_all = knn(X_pca, X_pca, Y_all, Y_all)
print(df_knn_pca_all, "--------------------------------------------------------------------------------------", sep="\n")

In [None]:
# Random Forest with no hold-out set: the same features and response are passed in both
# the train and the test positions. Each result must be computed from the feature block
# its variable is named after (t-SNE, Sparse PCA, PCA), not from the PCA block every time.

# run the Random Forest function for the t-SNE data
df_rf_tsne_all = random_forest(X_tsne, X_tsne, Y_all, Y_all)
print(df_rf_tsne_all)
print("--------------------------------------------------------------------------------------")

######################################################################################################################

# run the Random Forest function for the Sparse PCA data
df_rf_s_pca_all = random_forest(X_s_pca, X_s_pca, Y_all, Y_all)
print(df_rf_s_pca_all)
print("--------------------------------------------------------------------------------------")

######################################################################################################################

# run the Random Forest function for the PCA data
df_rf_pca_all = random_forest(X_pca, X_pca, Y_all, Y_all)
print(df_rf_pca_all)
print("--------------------------------------------------------------------------------------")

In [None]:
# Gradient Boosting with no hold-out set: the same features and response are passed in both
# the train and the test positions. Each result must be computed from the feature block
# its variable is named after (t-SNE, Sparse PCA, PCA), not from the PCA block every time.

# run the Gradient Boosting function for the t-SNE data
df_gb_tsne_all = gradient_boosting(X_tsne, X_tsne, Y_all, Y_all)
print(df_gb_tsne_all)
print("--------------------------------------------------------------------------------------")

######################################################################################################################

# run the Gradient Boosting function for the Sparse PCA data
df_gb_s_pca_all = gradient_boosting(X_s_pca, X_s_pca, Y_all, Y_all)
print(df_gb_s_pca_all)
print("--------------------------------------------------------------------------------------")

######################################################################################################################

# run the Gradient Boosting function for the PCA data
df_gb_pca_all = gradient_boosting(X_pca, X_pca, Y_all, Y_all)
print(df_gb_pca_all)
print("--------------------------------------------------------------------------------------")

In [None]:
# XGBoost with no hold-out set: the same features and response are passed in both
# the train and the test positions. Each result must be computed from the feature block
# its variable is named after (t-SNE, Sparse PCA, PCA), not from the PCA block every time.

# run the XGBoost function for the t-SNE data
df_xgb_tsne_all = xgboost(X_tsne, X_tsne, Y_all, Y_all)
print(df_xgb_tsne_all)
print("--------------------------------------------------------------------------------------")

######################################################################################################################

# run the XGBoost function for the Sparse PCA data
df_xgb_s_pca_all = xgboost(X_s_pca, X_s_pca, Y_all, Y_all)
print(df_xgb_s_pca_all)
print("--------------------------------------------------------------------------------------")

######################################################################################################################

# run the XGBoost function for the PCA data
df_xgb_pca_all = xgboost(X_pca, X_pca, Y_all, Y_all)
print(df_xgb_pca_all)
print("--------------------------------------------------------------------------------------")

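The conclusion I can draw from this experiment is that using all of the data (no test/train split) results in better scores for F1 and accuracy. Note, however, that in this setup the models are scored on the same rows they were trained on, so these scores measure how well the models fit the training data rather than how well they generalize; they are therefore not directly comparable to the held-out test scores from the earlier sections.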