In [14]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split

# Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Create the dataframe from the given dataset
dataLocation = "../data/ACME-HappinessSurvey2020.csv"
df = pd.read_csv(dataLocation)

In [3]:
# Get descriptiove statistics of the entire dataset to get the general idea about the data set.
df.describe()

Unnamed: 0,Y,X1,X2,X3,X4,X5,X6
count,126.0,126.0,126.0,126.0,126.0,126.0,126.0
mean,0.547619,4.333333,2.531746,3.309524,3.746032,3.650794,4.253968
std,0.499714,0.8,1.114892,1.02344,0.875776,1.147641,0.809311
min,0.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,4.0,2.0,3.0,3.0,3.0,4.0
50%,1.0,5.0,3.0,3.0,4.0,4.0,4.0
75%,1.0,5.0,3.0,4.0,4.0,4.0,5.0
max,1.0,5.0,5.0,5.0,5.0,5.0,5.0


In [4]:
# CHeck if any columns in the dataset contain null values

In [5]:
df.isnull().sum()

Y     0
X1    0
X2    0
X3    0
X4    0
X5    0
X6    0
dtype: int64

In [6]:
# The data set does not contain any null values. Now proceed with further steps
# This is a classification problem with single dependent variable "Y" and six independent variables

In [7]:
# Divide the data into training data and test data (80:20)
X = df.loc[:, df.columns!='Y']
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20)

In [8]:
classifier = GaussianNB()
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
totalTestPoints = len(y_test)
correctlyPredicted = (y_test == y_pred).sum()

print("Naive Bayes:")
print(f"Testing Accuracy: {round(100*correctlyPredicted/totalTestPoints,2)}%") # TP+TN/Total

Naive Bayes:
Testing Accuracy: 61.54%


In [9]:
classifier = LogisticRegression()
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
totalTestPoints = len(y_test)
correctlyPredicted = (y_test == y_pred).sum()

print("Logistic Regression:")
print(f"Testing Accuracy: {round(100*correctlyPredicted/totalTestPoints,2)}%") # TP+TN/Total

Logistic Regression:
Testing Accuracy: 61.54%


In [10]:
classifier = KNeighborsClassifier(n_neighbors=3)
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
totalTestPoints = len(y_test)
correctlyPredicted = (y_test == y_pred).sum()

print("K-Nearest Neighbours:")
print(f"Testing Accuracy: {round(100*correctlyPredicted/totalTestPoints,2)}%") # TP+TN/Total

K-Nearest Neighbours:
Testing Accuracy: 53.85%


In [11]:
classifier = DecisionTreeClassifier(random_state=0, max_depth=4)
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
totalTestPoints = len(y_test)
correctlyPredicted = (y_test == y_pred).sum()

print("Decision Tree:")
print(f"Testing Accuracy: {round(100*correctlyPredicted/totalTestPoints,2)}%") # TP+TN/Total

Decision Tree:
Testing Accuracy: 46.15%


In [12]:
classifier = MLPClassifier(alpha=1, max_iter=10000)
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
totalTestPoints = len(y_test)
correctlyPredicted = (y_test == y_pred).sum()

print("Neural Net:")
print(f"Testing Accuracy: {round(100*correctlyPredicted/totalTestPoints,2)}%") # TP+TN/Total

Neural Net:
Testing Accuracy: 42.31%


In [13]:
classifier = SVC(gamma=2, C=1)
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
totalTestPoints = len(y_test)
correctlyPredicted = (y_test == y_pred).sum()

print("Support Vector Machine:")
print(f"Testing Accuracy: {round(100*correctlyPredicted/totalTestPoints,2)}%") # TP+TN/Total

Support Vector Machine:
Testing Accuracy: 42.31%


In [27]:
classifier = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=0)
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
totalTestPoints = len(y_test)
correctlyPredicted = (y_test == y_pred).sum()

print("Random Forest:")
print(f"Testing Accuracy: {round(100*correctlyPredicted/totalTestPoints,2)}%") # TP+TN/Total

Random Forest:
Testing Accuracy: 50.0%


## Questions
1. Classifier selection criterion
2. Depending on test/train split, the performance changes. How to measure the performance? Multiple runs?