# Empathy Detection

In [85]:
import pandas as pd
from numpy import *
from statistics import *
from math import *
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn import svm, tree, linear_model
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
pd.options.mode.chained_assignment = None  

data = pd.read_csv("responses.csv",delimiter=",")

# Preprocessing

In [86]:
# Encoding strings
encoding = {"Smoking": {"never smoked": 1, "tried smoking": 2, "former smoker": 3, "current smoker": 4},
            "Alcohol": {"never": 1, "social drinker": 2, "drink a lot": 3},
            "Punctuality": {"i am often early": 1, "i am often early": 2, "i am often running late": 3, "i am always on time": 4},
            "Lying": {"never": 1, "sometimes": 2, "only to avoid hurting someone": 3, "everytime it suits me": 4},
            "Internet usage": {"no time at all": 1, "less than an hour a day": 2, "few hours a day": 3, "most of the day": 4},
            "Gender": {"male": 1, "female": 2},
            "Left - right handed": {"right handed": 1, "left handed": 2},
            "Education": {"primary school": 1, "currently a primary school pupil": 1, "secondary school": 2, "college/bachelor degree": 3, "masters degree": 4, "doctorate degree": 5},
            "Only child": {"no": 1, "yes": 2},
            "Village - town": {"village": 1, "city": 2},
            "House - block of flats": {"block of flats": 1, "house/bungalow": 2}}
data.replace(encoding, inplace=True)

# Removing rows with class label undefined
data = data[pd.notnull(data['Empathy'])]

# Filling the missing values with rounded mean for the column
data.fillna(round(data.mean()), inplace = True)

# Encoding class to zeros and ones
data['Class'] = 42
data['Class'].loc[data['Empathy'] == 1] = 0
data['Class'].loc[data['Empathy'] == 2] = 0
data['Class'].loc[data['Empathy'] == 3] = 0
data['Class'].loc[data['Empathy'] == 4] = 1
data['Class'].loc[data['Empathy'] == 5] = 1
data.drop(['Empathy'], axis=1, inplace = True)

**Normalizing data using 2 ways and see which normalizing is best**

1- Normalized Between zero and one by max of each column

2- Catagorical from 1 to 5

In [87]:
# Splitting the data into x and y
x = data.values[:,0:149]
y = data.values[:,149]

# Normalizing the data between 0 and 1 by MaxScaler
scaler = MaxAbsScaler() 
xNormalized = scaler.fit_transform(x)

# Normalizing the data between 1 and 5, rounded to nearest interger.
relacs = MinMaxScaler(feature_range=(1, 5)) 
data['Number of siblings'] = relacs.fit_transform(((data['Number of siblings'] - 0)/(10)).values.reshape(-1, 1)).round()
data['Height'] = relacs.fit_transform(((data['Height'] - 60)/(205-60)).values.reshape(-1, 1)).round()
data['Weight'] = relacs.fit_transform(((data['Weight'] - 40)/(165-40)).values.reshape(-1, 1)).round()
data['Age'] = relacs.fit_transform(((data['Age'] - 15)/(30-15)).values.reshape(-1, 1)).round()

xCatagorical = data.values[:,0:149]

 ** Spliting data into train, test and validate **

In [100]:
xOrignalTrain, xOrignalTest, xNormalizedTrain, xNormalizedTest, xCatagoricalTrain, xCatagoricalTest,yTrain, yTest = train_test_split(
    x, xNormalized, xCatagorical, y, test_size=0.2, random_state=69)

# Classifer Selection and Tuning

I am testing alot of classifers to see which classifer does best on which data. I have a rough idea what classifer is best for which data type but let see.

In [89]:
# Testing Different Models on Different Datasets.
print("All results will be generated using 10 fold CV")
print("----------------------------------------------")


print("Linear Seperator")
print('-----------------')

# Perceptron
PERC = Perceptron(tol=1e-3, random_state=0)
print("Perceptron - Orignal Data", "Accuracy = ", 100*mean(cross_val_score(PERC, xOrignalTrain, yTrain, cv=10)))
print("Perceptron - Normalized Data", "Accuracy = ", 100*mean(cross_val_score(PERC, xNormalizedTrain, yTrain, cv=10)))
print("Perceptron - Catagorical Data", "Accuracy = ", 100*mean(cross_val_score(PERC, xCatagoricalTrain, yTrain, cv=10)))

# SVM - Linear
SVC = LinearSVC(random_state=0)
print("SVM Linear - Orignal Data", "Accuracy = ", 100*mean(cross_val_score(SVC, xOrignalTrain, yTrain, cv=10)))
print("SVM Linear - Normalized Data", "Accuracy = ", 100*mean(cross_val_score(SVC, xNormalizedTrain, yTrain, cv=10)))
print("SVM Linear - Catagorical Data", "Accuracy = ", 100*mean(cross_val_score(SVC, xCatagoricalTrain, yTrain, cv=10)))


print("Non-Linear Seperator")
print('--------------------')

# Decision Tree. Max depth has been optimized by trying multiple depths.
DT = tree.DecisionTreeClassifier(max_depth=10)
print("Decision Tree - Orignal Data", "Accuracy = ", 100*mean(cross_val_score(DT, xOrignalTrain, yTrain, cv=10)))
print("Decision Tree - Normalized Data", "Accuracy = ", 100*mean(cross_val_score(DT, xNormalizedTrain, yTrain, cv=10)))
print("Decision Tree - Catagorigal Data", "Accuracy = ", 100*mean(cross_val_score(DT, xCatagoricalTrain, yTrain, cv=10)))

# KNN. Differnt Neighbours were tried to optimized n
KNN = KNeighborsClassifier(n_neighbors=5)
print("KNN - Orignal Data", "Accuracy = ", 100*mean(cross_val_score(KNN, xOrignalTrain, yTrain, cv=10)))
print("KNN - Normalized Data", "Accuracy = ", 100*mean(cross_val_score(KNN, xNormalizedTrain, yTrain, cv=10)))
print("KNN - Catagorical Data", "Accuracy = ", 100*mean(cross_val_score(KNN, xCatagoricalTrain, yTrain, cv=10)))


# SVM RBF
SVM = svm.SVC(random_state=0)
print("SVM RBF - Orignal Data", "Accuracy = ", 100*mean(cross_val_score(SVM, xOrignalTrain, yTrain, cv=10)))
print("SVM RBF - Normalized Data", "Accuracy = ", 100*mean(cross_val_score(SVM, xNormalizedTrain, yTrain, cv=10)))
print("SVM RBF - Catagorical Data","Accuracy = ", 100*mean(cross_val_score(SVM, xCatagoricalTrain, yTrain, cv=10)))



print("Probabistic Model")
print('-----------------')

# Naive Bayes
NB = MultinomialNB()
print("NB - Original Data", "Accuracy = ", 100*mean(cross_val_score(NB, xOrignalTrain, yTrain, cv=10)))
print("NB - Catagorical Data", "Accuracy = ", 100*mean(cross_val_score(NB, xCatagoricalTrain, yTrain, cv=10)))

print("Ensemble Method")
print('---------------')

# Random Forests. n_estimators maximied by trying different values and max depth is same thats used for DT
RF = RandomForestClassifier(n_estimators=30, max_depth=10, random_state=0)
print("Random Forests- Orignal Data", "Accuracy = ", 100*mean(cross_val_score(RF, xOrignalTrain, yTrain, cv=10)))
print("Random Forests- Normalized Data", "Accuracy = ", 100*mean(cross_val_score(RF, xNormalizedTrain, yTrain, cv=10)))
print("Random Forests- Catagorical Data", "Accuracy = ", 100*mean(cross_val_score(RF, xCatagoricalTrain, yTrain, cv=10)))

All results will be generated using 10 fold CV
----------------------------------------------
Linear Seperator
-----------------
Perceptron - Orignal Data Accuracy =  57.3313017659
Perceptron - Normalized Data Accuracy =  69.0315088295
Perceptron - Catagorical Data Accuracy =  68.397171433
SVM Linear - Orignal Data Accuracy =  63.9305360213
SVM Linear - Normalized Data Accuracy =  70.6644202219
SVM Linear - Catagorical Data Accuracy =  66.033091108
Non-Linear Seperator
--------------------
Decision Tree - Orignal Data Accuracy =  63.5670807939
Decision Tree - Normalized Data Accuracy =  64.3264963276
Decision Tree - Catagorigal Data Accuracy =  65.6891897171
KNN - Orignal Data Accuracy =  64.5668854509
KNN - Normalized Data Accuracy =  65.4279574934
KNN - Catagorical Data Accuracy =  64.9342084701
SVM RBF - Orignal Data Accuracy =  70.647171433
SVM RBF - Normalized Data Accuracy =  67.7860603219
SVM RBF - Catagorical Data Accuracy =  73.5131075168
Probabistic Model
-----------------
NB

**Inference**

1) Catagorical Data does the best with most classifer

2) SVM RBF(My proposed technique), Random Forests and Naive Bayes gives the best results using validation

# Training

In [101]:
# Defining some models to compare with

# NB
NB = MultinomialNB()
NB.fit(xCatagoricalTrain, yTrain)

#KNN
KNN = KNeighborsClassifier(n_neighbors = 3)
KNN.fit(xCatagoricalTrain, yTrain)

# RF
RF = RandomForestClassifier(n_estimators=30, max_depth=10, random_state=0)
RF.fit(xCatagoricalTrain, yTrain)

#SVM
SVM = svm.SVC(random_state=0, gamma = 'auto')
SVM.fit(xCatagoricalTrain, yTrain)

print("Models Trained")

Models Trained


# Testing

In [103]:
print("Baseline accuracies")
print("		Random:", "Accuracy:",(100*accuracy_score((random.randint(2, size=len(yTest))) , yTest)), "Recall:", 100*recall_score(yTest, (random.randint(2, size=len(yTest)))), "Precision:", 100*precision_score(yTest, (random.randint(2, size=len(yTest)))))
print("		Mode:", "Accuracy:",(100*accuracy_score(ones(len(yTest)) , yTest)), "Recall:", 100*recall_score(yTest, ones(len(yTest))), "Precision:", 100*precision_score(yTest, ones(len(yTest))))
print("		KNN:", "Accuracy:", (100*KNN.score(xCatagoricalTest, yTest)), "Recall:",  100*recall_score(yTest,KNN.predict(xCatagoricalTest)), "Precision:", 100*precision_score(yTest,KNN.predict(xCatagoricalTest)))
print("Now some good classifers")
print("		Naive bayes:", "Accuracy:", (100*NB.score(xCatagoricalTest, yTest)), "Recall:", 100*recall_score(yTest,NB.predict(xCatagoricalTest)), "Precision:", 100*precision_score(yTest,NB.predict(xCatagoricalTest)))
print("		Random forest:", "Accuracy:", (100*RF.score(xCatagoricalTest, yTest)),  "Recall:", 100*recall_score(yTest,RF.predict(xCatagoricalTest)), "Precision:", 100*precision_score(yTest,RF.predict(xCatagoricalTest)))


print("Now by best classifer")
print("		SVM RBF:", "Accuracy:", (100*SVM.score(xCatagoricalTest, yTest)),  "Recall:", 100*recall_score(yTest,SVM.predict(xCatagoricalTest)), "Precision:", 100*precision_score(yTest,SVM.predict(xCatagoricalTest)))


Baseline accuracies
		Random: Accuracy: 54.2288557214 Recall: 49.2307692308 Precision: 59.2592592593
		Mode: Accuracy: 64.6766169154 Recall: 100.0 Precision: 64.6766169154
		KNN: Accuracy: 62.1890547264 Recall: 60.7692307692 Precision: 75.9615384615
Now some good classifers
		Naive bayes: Accuracy: 72.1393034826 Recall: 80.7692307692 Precision: 77.2058823529
		Random forest: Accuracy: 70.1492537313 Recall: 94.6153846154 Precision: 69.8863636364
Now by best classifer
		SVM RBF: Accuracy: 73.631840796 Recall: 96.1538461538 Precision: 72.2543352601


I am also using recall and precision as an evaluation critera given the nature of task.
High recall would mean correctly classifying given a person was empathic. Precision would mean how many were empathic from our classification of empathic. Assuming that this model is used for classifying empathic person to be recruited. We dont want to misclassifying an empathic person, However even if we misclassify a non-empathic person, the person could be later screened in other stages of recuitment. Hence our classification critera is high recall and high accuracy overall.