# One of the fastest-growing startups is in the logistics and delivery domain. They work with several partners and make on-demand deliveries to their customers. During the COVID-19 pandemic, they are facing several different challenges, and every day they are trying to address these challenges.

They want to predict what makes their customers happy or unhappy using six features, so they can take necessary actions.

Attributes X1 to X6 hold the response to each survey question, with values from 1 to 5, where a smaller number indicates less agreement and a larger number indicates more agreement with the statement.

Y = target attribute (Y) with values indicating 0 (unhappy) and 1 (happy) customers
X1 = my order was delivered on time
X2 = contents of my order was as I expected
X3 = I ordered everything I wanted to order
X4 = I paid a good price for my order
X5 = I am satisfied with my courier
X6 = the app makes ordering easy for me

In [179]:
# IMPORT LIBRARIES
import os
import csv
import time 
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV,cross_val_score,train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.feature_selection import SelectKBest,chi2,RFE
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.svm import SVR

In [180]:
# Read the ACME happiness-survey CSV into a DataFrame and print it as a quick sanity check.
data = pd.read_csv(r'ACME-HappinessSurvey2020.csv')
print(data)

     Y  X1  X2  X3  X4  X5  X6
0    0   3   3   3   4   2   4
1    0   3   2   3   5   4   3
2    1   5   3   3   3   3   5
3    0   5   4   3   3   3   5
4    0   5   4   3   3   3   5
..  ..  ..  ..  ..  ..  ..  ..
121  1   5   2   3   4   4   3
122  1   5   2   3   4   2   5
123  1   5   3   3   4   4   5
124  0   4   3   3   4   4   5
125  0   5   3   2   5   5   5

[126 rows x 7 columns]


In [181]:
# Checking the data
# NOTE(review): `data` comes straight from pd.read_csv, so wrapping it in pd.DataFrame
# looks redundant — `df` is kept because the later cells read from it.
df = pd.DataFrame(data)
# Last expression of the cell: displays summary statistics for every column.
df.describe()

Unnamed: 0,Y,X1,X2,X3,X4,X5,X6
count,126.0,126.0,126.0,126.0,126.0,126.0,126.0
mean,0.55,4.33,2.53,3.31,3.75,3.65,4.25
std,0.5,0.8,1.11,1.02,0.88,1.15,0.81
min,0.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,4.0,2.0,3.0,3.0,3.0,4.0
50%,1.0,5.0,3.0,3.0,4.0,4.0,4.0
75%,1.0,5.0,3.0,4.0,4.0,4.0,5.0
max,1.0,5.0,5.0,5.0,5.0,5.0,5.0


In [182]:
# Split the frame into predictors and target, then hold out 30% of the rows for validation.
x = df.loc[:, 'X1':]   # every column from X1 onward
x_new = x              # raw (un-normalized) features; normalize(x) is the unused alternative
y = df['Y']            # target column (0 = unhappy, 1 = happy)
X_train, X_test, y_train, y_test = train_test_split(
    x_new,
    y,
    test_size=0.30,
    random_state=42,  # fixed seed so the split is reproducible
)


In [183]:
#  LAZY PREDICT TO CHECK THE BEST ALGORITHM FOR THE DATA
# predictions=True: fit() below is unpacked into a score table AND a predictions object.
classi = LazyClassifier(predictions=True)
# Run the LazyClassifier sweep on the train/validation split.
# start_time_1 / end_time_1 bracket the call so the elapsed wall-clock time can be derived
# as end_time_1 - start_time_1 (not computed in this cell).
start_time_1=time.time()
models_c,predictions_c=classi.fit(X_train, X_test, y_train, y_test)
end_time_1=time.time()

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 46.62it/s]


In [184]:
# SHOW BEST MODELS
# Displays the per-model score table returned by classi.fit in the previous cell.
models_c

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
QuadraticDiscriminantAnalysis,0.68,0.69,0.69,0.68,0.01
XGBClassifier,0.63,0.64,0.64,0.63,0.05
RandomForestClassifier,0.63,0.64,0.64,0.63,0.12
ExtraTreeClassifier,0.63,0.64,0.64,0.63,0.01
BaggingClassifier,0.63,0.64,0.64,0.63,0.02
AdaBoostClassifier,0.63,0.63,0.63,0.63,0.07
LabelSpreading,0.61,0.61,0.61,0.6,0.01
LabelPropagation,0.61,0.61,0.61,0.6,0.01
ExtraTreesClassifier,0.61,0.61,0.61,0.6,0.09
Perceptron,0.61,0.61,0.61,0.6,0.01


In [185]:
# Feature selection: recursive feature elimination (RFE) driven by a linear-kernel SVR,
# removing one feature per step until four remain.
estimator = SVR(kernel="linear")
selector = RFE(estimator, n_features_to_select=4, step=1).fit(X_train, y_train)
# Rank 1 marks a retained feature (selector.support_ is the matching boolean mask).
selector.ranking_

array([1, 1, 1, 2, 3, 1])

In [186]:
# Keep only the features that RFE ranked 1 (i.e. the selected ones). A boolean column
# mask replaces the manual index loop + pd.concat and yields the same columns in the
# same order.
rank = selector.ranking_
X_best_features = x_new.loc[:, rank == 1]
# Re-split using only the selected features. test_size and random_state match the first
# split, so the same rows land in the train and validation partitions.
X_train, X_test, y_train, y_test = train_test_split(X_best_features, y, test_size=0.30, random_state=42)


# RESULTS

In [187]:

# First model: Quadratic Discriminant Analysis on the selected features.
# (A StandardScaler + QDA Pipeline is a possible variant; it is not used here.)
clf = QuadraticDiscriminantAnalysis()
# Printed under the label 'Training Score', but these are 5-fold cross-validated
# macro-recall values computed on the training split.
qda_cv_scores = cross_val_score(clf, X_train, y_train, scoring='recall_macro', cv=5)
print('Training Score:', qda_cv_scores)
# Fit on the whole training split, then report the estimator's default score
# (mean accuracy for sklearn classifiers) on the held-out split.
clf.fit(X_train, y_train)
qda_holdout_score = clf.score(X_test, y_test)
print('Validation Score:', qda_holdout_score)



Training Score: [0.4        0.6875     0.49350649 0.75714286 0.42142857]
Validation Score: 0.631578947368421


In [188]:
# Second model: scikit-learn GradientBoostingClassifier (not XGBClassifier),
# configured with 100 depth-1 trees and a learning rate of 1.0.
clf1 = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
# Printed under the label 'Training Score', but these are 5-fold cross-validated
# macro-recall values computed on the training split.
gb_cv_scores = cross_val_score(clf1, X_train, y_train, scoring='recall_macro', cv=5)
print('Training Score:', gb_cv_scores)

# Fit on the whole training split, then report the estimator's default score
# (mean accuracy for sklearn classifiers) on the held-out split.
clf1.fit(X_train, y_train)
gb_holdout_score = clf1.score(X_test, y_test)
print('Validation Score:', gb_holdout_score)


Training Score: [0.3375     0.55       0.43506494 0.49285714 0.61428571]
Validation Score: 0.5526315789473685


In [189]:
# Third model: RandomForestClassifier with trees limited to depth 2 and a fixed seed.
clf2 = RandomForestClassifier(max_depth=2, random_state=0)
# Printed under the label 'Training Score', but these are 5-fold cross-validated
# macro-recall values computed on the training split.
rf_cv_scores = cross_val_score(clf2, X_train, y_train, scoring='recall_macro', cv=5)
print('Training Score:', rf_cv_scores)

# Fit on the whole training split, then report the estimator's default score
# (mean accuracy for sklearn classifiers) on the held-out split.
clf2.fit(X_train, y_train)
rf_holdout_score = clf2.score(X_test, y_test)
print('Validation Score:', rf_holdout_score)



Training Score: [0.45       0.6375     0.5974026  0.71428571 0.66428571]
Validation Score: 0.5
