In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split

# Feature selection
from mlxtend.feature_selection import ExhaustiveFeatureSelector

# Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Seed passed as random_state to the train/test split and to every classifier
# below that accepts one, so that repeated runs give the same results
RANDOM_STATE = 0

In [2]:
# Create the dataframe from the given dataset.
# The CSV path is relative, so it is resolved against the working directory
# the notebook is run from.
dataLocation = "../data/ACME-HappinessSurvey2020.csv"
df = pd.read_csv(dataLocation)

In [3]:
# Get descriptive statistics (count, mean, std, min, quartiles, max) for each
# column to get a general idea of the dataset.
df.describe()

Unnamed: 0,Y,X1,X2,X3,X4,X5,X6
count,126.0,126.0,126.0,126.0,126.0,126.0,126.0
mean,0.547619,4.333333,2.531746,3.309524,3.746032,3.650794,4.253968
std,0.499714,0.8,1.114892,1.02344,0.875776,1.147641,0.809311
min,0.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,4.0,2.0,3.0,3.0,3.0,4.0
50%,1.0,5.0,3.0,3.0,4.0,4.0,4.0
75%,1.0,5.0,3.0,4.0,4.0,4.0,5.0
max,1.0,5.0,5.0,5.0,5.0,5.0,5.0


In [4]:
# Check whether any columns in the dataset contain null values

In [5]:
# Number of missing values in each column (isna is the same check as isnull)
df.isna().sum()

Y     0
X1    0
X2    0
X3    0
X4    0
X5    0
X6    0
dtype: int64

In [6]:
# The dataset does not contain any null values, so we can proceed with the next steps.
# This is a classification problem with a single dependent variable "Y" and six independent variables.

In [7]:
# Examine the pairwise correlations between all columns: the target Y as well
# as the predictor variables (df still contains Y at this point)
df.corr()

Unnamed: 0,Y,X1,X2,X3,X4,X5,X6
Y,1.0,0.28016,-0.024274,0.150838,0.064415,0.224522,0.167669
X1,0.28016,1.0,0.059797,0.283358,0.087541,0.432772,0.411873
X2,-0.024274,0.059797,1.0,0.184129,0.114838,0.039996,-0.062205
X3,0.150838,0.283358,0.184129,1.0,0.302618,0.358397,0.20375
X4,0.064415,0.087541,0.114838,0.302618,1.0,0.293115,0.215888
X5,0.224522,0.432772,0.039996,0.358397,0.293115,1.0,0.320195
X6,0.167669,0.411873,-0.062205,0.20375,0.215888,0.320195,1.0


In [8]:
# Build the feature matrix and the target, then divide the data into training
# data and test data (80:20).
# Y is the target; X2 and X4 are also excluded from the predictors.
# NOTE(review): presumably X2 and X4 are dropped because they show the weakest
# correlation with Y -- confirm.
X = df.loc[:, ~df.columns.isin(['Y', 'X2', 'X4'])]
y = df['Y']

# A fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE
)

In [9]:
# Candidate models to compare. Each entry pairs a display name (printed next to
# the accuracy figures) with an unfitted scikit-learn estimator. Estimators
# that accept a random_state are seeded with RANDOM_STATE for reproducibility.
classifiers = [
    {'name': 'Naive Bayes',
     'classifier': GaussianNB()},
    {'name': 'Logistic Regression',
     'classifier': LogisticRegression(random_state=RANDOM_STATE)},
    {'name': 'K-Nearest Neighbours',
     'classifier': KNeighborsClassifier()},
    {'name': 'Decision Tree',
     'classifier': DecisionTreeClassifier(random_state=RANDOM_STATE)},
    # NOTE(review): the large max_iter is presumably there so the optimiser
    # can converge -- confirm.
    {'name': 'Neural Net',
     'classifier': MLPClassifier(random_state=RANDOM_STATE, max_iter=10000)},
    {'name': 'Support Vector Machines',
     'classifier': SVC(random_state=RANDOM_STATE)},
    {'name': 'Random Forest',
     'classifier': RandomForestClassifier(random_state=RANDOM_STATE)},
    # This is scikit-learn's GradientBoostingClassifier, not the XGBoost
    # library, so it is labelled as gradient boosting rather than 'XGBoost'.
    {'name': 'Gradient Boosting',
     'classifier': GradientBoostingClassifier(random_state=RANDOM_STATE)},
]

In [10]:
def _accuracy_pct(y_true, y_pred):
    """Return the percentage of predictions equal to the true labels, rounded to 2 decimals."""
    n_correct = (y_true == y_pred).sum()
    return round(100 * n_correct / len(y_pred), 2)


# Fit every candidate on the training data and report its accuracy on both the
# held-out test data and the training data it was fitted on.
for entry in classifiers:
    model = entry['classifier'].fit(X_train, y_train)

    test_acc = _accuracy_pct(y_test, model.predict(X_test))
    train_acc = _accuracy_pct(y_train, model.predict(X_train))

    print(f"({entry['name']}) \nTest Accuracy: {test_acc}%; Train Accuracy: {train_acc}%\n")

(Naive Bayes) 
Test Accuracy: 65.38%; Train Accuracy: 55.0%

(Logistic Regression) 
Test Accuracy: 61.54%; Train Accuracy: 57.0%

(K-Nearest Neighbours) 
Test Accuracy: 57.69%; Train Accuracy: 81.0%

(Decision Tree) 
Test Accuracy: 61.54%; Train Accuracy: 88.0%

(Neural Net) 
Test Accuracy: 53.85%; Train Accuracy: 85.0%

(Support Vector Machines) 
Test Accuracy: 53.85%; Train Accuracy: 80.0%

(Random Forest) 
Test Accuracy: 61.54%; Train Accuracy: 88.0%

(XGBoost) 
Test Accuracy: 61.54%; Train Accuracy: 86.0%



## Questions
1. What criterion should be used to select the best classifier?
2. The measured performance changes depending on the randomness in the train/test split. How should the performance be measured reliably? By averaging over multiple runs?