# Advisory report
___
<pre>Teamname    : Submission Impossible 💥  
Group nr    : 32 - Company C  
Students    : {Andrei Dragomir, Ece Doganer, Márk Kerekes, Ariana Vargas Pastor}  
Student nrs : {2669304,         2552855,     2696796,      2710153}  </pre>
___

#### Structure of the project:
1. Data Exploration:
- Data visualisation;
- Comparisons of our company's hiree distributions as opposed to those of the other companies;
- Hiree descriptive data distributions (based on gender, age, nationality and sports) compared to the distributions of all applicants for company C;
- Hiree indicative data distributions compared to the distributions of all applicants for company C;
- Data processing and cleaning.

2. Modelling:
- Model M1: Neural Network with single hidden layer and no drop-out

- Model M2: Predictive model based on any indicators

- Model M3: Model based only on the given descriptors (age, nationality, gender and sports)
   
    
`IMPORTANT DECISIONS: Ratio of training/test data; Model selection; Hyperparameter optimization`
    
3. Evaluation and advice:
- Use _accuracy_ to test predictive models
- Analyse one of our models (suggesting M2)
    - test different feature combinations that result in the best accuracy rate
- Provide advice for the HR department
    - Should the model be used?
    - How should the model be used?
    - Which evaluations and calibrations will be needed in the future?
    - Discuss potential risks imposed by the usage of this model



## 0. Importing the data set and libraries required

In terms of data cleaning, we have checked for null values and observed that there are no missing entries after loading the dataset.

In [None]:
#Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn; seaborn.set()
import statistics
from sklearn import model_selection

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from turtle import title
from enum import unique

from sklearn.metrics import precision_recall_fscore_support
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from tensorflow import keras
from keras.models import Sequential
from keras.callbacks import Callback
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Load the recruitment dataset and report whether it contains any missing (NaN) value
recruitmentData = pd.read_csv(r'recruitmentdataset-2022-1.3.csv')
hasMissingValues = recruitmentData.isnull().values.any()
print(hasMissingValues)

# print(recruitmentData)

## 1. Data Exploration

### 1.1 Data visualisation of the general population

In [None]:
# Violin + box plots of age for the whole dataset, the company C candidates
# and the company C hires
df = pd.read_csv(r'recruitmentdataset-2022-1.3.csv')

# Data: every subgroup is filtered with a mask built on the frame it selects
# from, so mask and data always share the same index
candidates = df[df['company'] == 'C']
general_pop_age = df['age']
candidates_age = candidates['age']
employees_age = candidates.loc[candidates['decision'] == True, 'age']

fig, axes = plt.subplots(nrows=1, ncols=3)
panels = (
    (general_pop_age, 'Population age', 'Population'),
    (candidates_age, 'Candidates age', 'Candidates'),
    (employees_age, 'Employees age', 'Employees'),
)
for ax, (ages, panel_title, tick_label) in zip(axes, panels):
    ax.violinplot(ages, showmedians=True)
    ax.boxplot(ages)
    ax.set_title(panel_title)
    ax.set_xticks([1])
    ax.set_xticklabels([tick_label])

# Figure-level title: plt.title() acts on the current (last) axes and would
# replace the 'Employees age' panel title
fig.suptitle('Age comparison')
plt.show()

In [None]:
# Gender comparison: whole dataset vs. company C candidates vs. company C hires
df = pd.read_csv(r'recruitmentdataset-2022-1.3.csv')

genders = ['male', 'female', 'other']

# Head count per gender in each of the three groups
population_counts = df['gender'].value_counts()
candidates = df[df['company'] == 'C']
hired = candidates[candidates['decision'] == True]

y_population = [population_counts[gender] for gender in genders]
z_candidates = [len(candidates[candidates['gender'] == gender]) for gender in genders]
g_hired = [len(hired[hired['gender'] == gender]) for gender in genders]

print("male candidates: ", z_candidates[0])
print("male employees: ", g_hired[0])

# Figure: one group of bars per gender, one bar per population group
n = 3
ind = np.arange(n)
width = 0.25
x = ['Male', 'Female', 'Other']

series = ((y_population, 'y'), (z_candidates, 'g'), (g_hired, 'b'))
bars = tuple(
    plt.bar(ind + position * width, counts, width, color=colour)
    for position, (counts, colour) in enumerate(series)
)

plt.xlabel("Gender")
plt.ylabel("Total number")
plt.title("Comparison of gender count")

plt.xticks(ind + width, x)
plt.legend(bars, ('Population', 'Candidates', 'Employees'))
plt.show()

In [None]:
# Nationality comparison: whole dataset vs. company C candidates vs. company C hires
df = pd.read_csv(r'recruitmentdataset-2022-1.3.csv')

nationalities = ['Dutch', 'German', 'Belgian']

# Data: head count per nationality in each group. reindex() yields 0 instead
# of raising a KeyError when a nationality does not occur in a group.
can = df[df['company'] == 'C']
emp = can[can['decision'] == True]

pop_dutch, pop_german, pop_belgian = (
    df['nationality'].value_counts().reindex(nationalities, fill_value=0))
can_dutch, can_german, can_belgian = (
    can['nationality'].value_counts().reindex(nationalities, fill_value=0))
emp_dutch, emp_german, emp_belgian = (
    emp['nationality'].value_counts().reindex(nationalities, fill_value=0))

print(can_dutch, emp_dutch, can_german, emp_german, can_belgian, emp_belgian)

# Figure: one group of bars per nationality, one bar per population group
n = len(nationalities)
ind = np.arange(n)
width = 0.25
x = ['Population', 'Candidates', 'Employees']

# The population bars must show the nationality counts; the gender counts of
# the previous cell (y_population) do not belong in this chart
pop_nationality = [pop_dutch, pop_german, pop_belgian]
bar1 = plt.bar(ind, pop_nationality, width, color='y')
z_candidates = [can_dutch, can_german, can_belgian]
bar2 = plt.bar(ind + width, z_candidates, width, color='g')
g_hired = [emp_dutch, emp_german, emp_belgian]
bar3 = plt.bar(ind + width * 2, g_hired, width, color='b')

plt.ylabel("Total number")
plt.title("Comparison of nationality")

plt.xticks(ind + width, nationalities)
plt.legend((bar1, bar2, bar3), x)
plt.show()

In [None]:
# Share of hired vs. not hired company C candidates, one pie chart per nationality.
# Uses the per-nationality candidate/hire counts computed in the nationality comparison cell.
not_hired_dutch = can_dutch - emp_dutch
not_hired_german = can_german - emp_german
not_hired_belgian = can_belgian - emp_belgian

fig, (ax1, ax2, ax3) = plt.subplots(3, 1)

# Each axis gets the sizes AND labels of the same nationality
pies = (
    (ax1, (not_hired_dutch, emp_dutch), ('Not hired Dutch', 'Hired Dutch')),
    (ax2, (not_hired_german, emp_german), ("Not hired German", "Hired German")),
    (ax3, (not_hired_belgian, emp_belgian), ("Not hired Belgian", "Hired Belgian")),
)
for ax, pie_sizes, pie_labels in pies:
    ax.pie(pie_sizes, labels=pie_labels)

plt.show()

In [None]:
# One pie chart per sport. Every chart is restricted to the people practising
# that sport and splits them into three disjoint groups: hired by company C,
# rejected by company C, and not an applicant of company C.
sports = recruitmentData['sport'].unique()
print(sports)

# One subplot per sport actually present in the data
fig, axs = plt.subplots(len(sports), 1)
fig.set_figwidth(20)
fig.set_figheight(15)
axs = np.atleast_1d(axs).ravel()

labels = 'hired candidates', 'not hired candidates', 'population who didn`t apply'
for idx, sport in enumerate(sports):
    practitioners = recruitmentData[recruitmentData['sport'] == sport]
    applicants = practitioners[practitioners['company'] == 'C']

    emp_sport = int((applicants['decision'] == True).sum())
    can_sport = len(applicants) - emp_sport
    rest_sport = len(practitioners) - len(applicants)

    ratios = emp_sport, can_sport, rest_sport
    axs[idx].pie(ratios, labels=labels)
    axs[idx].set_title(sport)

plt.show()

In [None]:
# Two pie charts: people with and without debate club experience. Each chart
# splits its subgroup into three disjoint slices (hired by company C, rejected
# by company C, not an applicant of company C), so the slices add up to the
# size of the subgroup.
labels = 'hired candidates', 'not hired candidates', 'population who didn`t apply'

for has_debate, chart_title in ((True, 'Debate experience'), (False, 'No debate experience')):
    subgroup = recruitmentData[recruitmentData['ind-debateclub'] == has_debate]
    applicants = subgroup[subgroup['company'] == 'C']

    emp_debate = int((applicants['decision'] == True).sum())
    can_debate = len(applicants) - emp_debate
    rest_debate = len(subgroup) - len(applicants)

    ratios = emp_debate, can_debate, rest_debate
    plt.pie(ratios, labels=labels)
    plt.title(chart_title)

    plt.show()

In [None]:
# Professional international experience: whole dataset vs. company C
# candidates vs. company C hires
df = pd.read_csv(r'recruitmentdataset-2022-1.3.csv')

# Data: the three groups and, for each, the count of people with / without experience
can = df[df['company'] == 'C']
emp = can[can['decision'] == True]
experience_counts = [group['ind-international_exp'].value_counts() for group in (df, can, emp)]

has_exp = [counts[True] for counts in experience_counts]
no_exp = [counts[False] for counts in experience_counts]

# Figure: one pair of bars per group
n = 3
ind = np.arange(n)
width = 0.20
x = ['Population', 'Candidates', 'Employees']
y = ['professional international experience', 'no professional international experience']

bars = tuple(
    plt.bar(ind + position * width, counts, width, color=colour)
    for position, (counts, colour) in enumerate(((has_exp, 'y'), (no_exp, 'g')))
)

plt.ylabel("Total number of people")
plt.title("Comparison of professional international experience")

plt.xticks(ind + width, x)
plt.legend(bars, y)
plt.show()

In [None]:
# Number of additional languages known: whole dataset vs. company C
# candidates vs. company C hires
df = pd.read_csv(r'recruitmentdataset-2022-1.3.csv')

language_levels = [0, 1, 2, 3]
bar_colours = ['y', 'g', 'b', 'r']

# Data: head count per language level in each group. reindex() yields 0
# instead of raising a KeyError when a level does not occur in a group.
can = df[df['company'] == 'C']
emp = can[can['decision'] == True]
level_counts = [
    group['ind-languages'].value_counts().reindex(language_levels, fill_value=0)
    for group in (df, can, emp)
]

# Figure: one group of bars per population group, one bar per language level
n = 3
ind = np.arange(n)
width = 0.20
x = ['Population', 'Candidates', 'Employees']
y = ['0 additional languages', '1 additional language', '2 additional languages', '3 additional languages']

bars = []
for position, (level, colour) in enumerate(zip(language_levels, bar_colours)):
    heights = [counts.loc[level] for counts in level_counts]
    bars.append(plt.bar(ind + width * position, heights, width, color=colour))

plt.ylabel("Total number of people")
plt.title("Comparison of amount of additional languages known")

# Centre every tick label under its group of bars
plt.xticks(ind + width * (len(language_levels) - 1) / 2, x)
plt.legend(tuple(bars), y)
plt.show()


In [None]:
# Two pie charts: people with and without an exact (scientific) study
# background. Each chart splits its subgroup into three disjoint slices
# (hired by company C, rejected by company C, not an applicant of company C),
# so the slices add up to the size of the subgroup.
recruitmentData = pd.read_csv(r'recruitmentdataset-2022-1.3.csv')

labels = 'hired candidates', 'not hired candidates', 'population who didn`t apply'

for has_exact_study, chart_title in ((True, 'Scientific background'), (False, 'No scientific background')):
    subgroup = recruitmentData[recruitmentData['ind-exact_study'] == has_exact_study]
    applicants = subgroup[subgroup['company'] == 'C']

    emp_exact = int((applicants['decision'] == True).sum())
    can_exact = len(applicants) - emp_exact
    rest_exact = len(subgroup) - len(applicants)

    ratios = emp_exact, can_exact, rest_exact
    plt.pie(ratios, labels=labels)
    plt.title(chart_title)

    plt.show()

In [None]:
# One pie chart per degree level. Each chart is restricted to the people
# holding that degree and splits them into three disjoint slices (hired by
# company C, rejected by company C, not an applicant of company C), so the
# slices add up to the size of the subgroup.
recruitmentData = pd.read_csv(r'recruitmentdataset-2022-1.3.csv')

labels = 'hired candidates', 'not hired candidates', 'population who didn`t apply'
degree_charts = (
    ('bachelor', 'Bachelor background'),
    ('phd', 'PhD background'),
    ('master', 'Master background'),
)

for degree, chart_title in degree_charts:
    subgroup = recruitmentData[recruitmentData['ind-degree'] == degree]
    applicants = subgroup[subgroup['company'] == 'C']

    emp = int((applicants['decision'] == True).sum())
    can = len(applicants) - emp
    rest = len(subgroup) - len(applicants)

    ratios = emp, can, rest
    plt.pie(ratios, labels=labels)
    plt.title(chart_title)

    plt.show()

In [None]:
# Two pie charts: people with and without programming experience. Each chart
# splits its subgroup into three disjoint slices (hired by company C, rejected
# by company C, not an applicant of company C), so the slices add up to the
# size of the subgroup.
recruitmentData = pd.read_csv(r'recruitmentdataset-2022-1.3.csv')

labels = 'hired candidates', 'not hired candidates', 'population who didn`t apply'

for has_programming, chart_title in ((True, 'Programming experience'), (False, 'No programming experience')):
    subgroup = recruitmentData[recruitmentData['ind-programming_exp'] == has_programming]
    applicants = subgroup[subgroup['company'] == 'C']

    emp_prog = int((applicants['decision'] == True).sum())
    can_prog = len(applicants) - emp_prog
    rest_prog = len(subgroup) - len(applicants)

    ratios = emp_prog, can_prog, rest_prog
    plt.pie(ratios, labels=labels)
    plt.title(chart_title)

    plt.show()

In [None]:
# Two pie charts: people with and without entrepreneurial experience. Each
# chart splits its subgroup into three disjoint slices (hired by company C,
# rejected by company C, not an applicant of company C), so the slices add up
# to the size of the subgroup.
recruitmentData = pd.read_csv(r'recruitmentdataset-2022-1.3.csv')

labels = 'hired candidates', 'not hired candidates', 'population who didn`t apply'

for has_entrepreneur_exp, chart_title in ((True, 'Entrepreneur experience'), (False, 'No entrepreneur experience')):
    subgroup = recruitmentData[recruitmentData['ind-entrepeneur_exp'] == has_entrepreneur_exp]
    applicants = subgroup[subgroup['company'] == 'C']

    emp_entr = int((applicants['decision'] == True).sum())
    can_entr = len(applicants) - emp_entr
    rest_entr = len(subgroup) - len(applicants)

    ratios = emp_entr, can_entr, rest_entr
    plt.pie(ratios, labels=labels)
    plt.title(chart_title)

    plt.show()

### 1.2 Data cleaning and preparation as well as evaluating highest correlating parameters

In this section we will evaluate the features that we have visualized in the plotting above, make some assumptions and test them in terms of data meaningfulness.
These assumptions will be used when building a model, in the hope of achieving a fair discrete alternative to our categorical data.

In [None]:
# Focus the dataset on our company and keep the descriptors, the indicators
# and the target column
dataSet = recruitmentData.query("company == 'C'")
dataSetC = pd.DataFrame(dataSet, columns=[
    'age',
    'gender',
    'nationality',
    'sport',
    'ind-university_grade',
    'ind-debateclub',
    'ind-programming_exp',
    'ind-international_exp',
    'ind-entrepeneur_exp',
    'ind-languages',
    'ind-exact_study',
    'ind-degree',
    'decision'
])

labelEncoder = LabelEncoder()

# Convert the following numerical columns from integer to float
conversions = {
    'ind-languages' : float,
    'ind-university_grade' : float
}
dataSetC = dataSetC.astype(conversions)

# Transform age into age groups bounded by the min, the quartiles and the max
# of the age distribution of the whole dataset
bins = recruitmentData['age'].describe().loc[['min', '25%', '50%', '75%', 'max']].tolist()
# The bins are left-closed (right=False), so an age equal to the maximum would
# fall outside the last bin, become NaN and be removed by dropna() below.
# Opening the last bin upwards keeps the oldest candidates in the last group.
bins[-1] = np.inf
labels = ['21-24','24-26','26-28','28-32']
dataSetC['ageGroup'] = pd.cut(dataSetC['age'], bins=bins, labels=labels, right=False)
dataSetC = dataSetC.drop('age', axis=1)
dataSetC = dataSetC.dropna()

# Label Encoder conversion of the target and the boolean indicators
for column in ['decision',
               'ind-debateclub',
               'ind-entrepeneur_exp',
               'ind-exact_study',
               'ind-programming_exp',
               'ind-international_exp']:
    dataSetC[column] = labelEncoder.fit_transform(dataSetC[column])

# One Hot Encoding conversion for the remaining categorical columns
# (gender, nationality, sport, degree and age group)
dataSetC = pd.get_dummies(dataSetC)

# Scale our data; the target is put back unscaled afterwards
tmp = dataSetC['decision']
scaler = StandardScaler()
dataSetCNew = pd.DataFrame(scaler.fit_transform(dataSetC), columns=dataSetC.columns)
dataSetCNew['decision'] = tmp.values

print(dataSetCNew)

# Evaluating correlations in order to potentially find good combinations of features
corr = dataSetCNew.corr()
plt.figure(figsize=(20,20))
seaborn.heatmap(corr, mask=np.zeros_like(corr), cmap=seaborn.diverging_palette(220, 10, as_cmap=True),
            square=True)
plt.show()

# Features ordered by their correlation with the hiring decision
ranking = corr['decision'].sort_values()
ranking.name = "Ranking of the predictive power of indicators"
print(ranking)


## 2. Models

In [None]:
# Shared results table: one row of metrics per model, filled by the model
# cells below and used for the final comparison plot
resultColumns = ['model', 'fit_time', 'score_time',
                 'test_accuracy', 'test_precision', 'test_recall', 'test_f1',
                 'features']
modelResults = pd.DataFrame(columns=resultColumns)

### 2.1 Model 1: Neural Network with single hidden layer and no drop-out

In [None]:
# Model M1: cross-validate a single-hidden-layer neural network on several
# sets of indicator features and collect the averaged metrics per feature set
from scikeras.wrappers import KerasClassifier
from keras.callbacks import CSVLogger
from sklearn.metrics import get_scorer_names

#Setting hyperparameters and evaluation outputs
seed = 7
splits = 10
epochs = 50
# 'precision' scores the positive class, in line with 'recall'/'f1' and with
# model M3; the micro average would only repeat the accuracy on a binary target
scores = ['accuracy', 'precision', 'recall', 'f1']

models = []

# First feature set is hand-picked; the others are random draws of indicator
# columns, seeded so that a re-run evaluates the same feature sets
indicatorColumns = dataSetCNew.filter(like='ind')
inputChoices = [pd.DataFrame(dataSetCNew, columns=['ind-languages', 'ind-degree_phd', 'ind-degree_master', 'ind-international_exp']),
                indicatorColumns.sample(n=4, axis='columns', random_state=seed),
                indicatorColumns.sample(n=3, axis='columns', random_state=seed + 1),
                indicatorColumns.sample(n=5, axis='columns', random_state=seed + 2)]
target = dataSetC['decision']

# Clear epoch data for new run if it's been ran before
with open("epochAnalysisM1.csv", "w"):
    pass

#Training model for each input variation
resultRows = []
for features in inputChoices:

    kfold = model_selection.KFold(n_splits=splits, random_state=seed, shuffle=True)
    # Appends the per-epoch training metrics of every fit to one shared log file
    dataLogger = CSVLogger('epochAnalysisM1.csv', separator=",", append=True)

    model = Sequential()
    #input layer
    model.add(Dense(6, kernel_initializer='uniform', activation = 'relu', input_dim = features.columns.size))
    #output layer
    model.add(Dense(1, kernel_initializer='uniform', activation = 'sigmoid'))

    #run model M1
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    keras_clf = KerasClassifier(model = model, optimizer="adam", epochs=epochs, verbose=0, callbacks=[dataLogger])

    cv_results = model_selection.cross_validate(keras_clf, features, target, cv=kfold, scoring=scores)

    #Saving the fold-averaged results of this feature set for plotting
    resultRows.append({'model': 'M1',
                       'fit_time' : cv_results['fit_time'].mean(),
                       'score_time' : cv_results['score_time'].mean(),
                       'test_accuracy' : cv_results['test_accuracy'].mean(),
                       'test_precision' : cv_results['test_precision'].mean(),
                       'test_recall' : cv_results['test_recall'].mean(),
                       'test_f1' : cv_results['test_f1'].mean(),
                       'features' : ', '.join(features.columns)})

# DataFrame.append was removed in pandas 2.0: build the table from the rows
results = pd.DataFrame(resultRows, columns=['model','fit_time','score_time','test_accuracy', 'test_precision', 'test_recall', 'test_f1', 'features'])

# Only the first (hand-picked) feature set represents M1 in the model comparison
modelResults = pd.concat([modelResults, results.iloc[[0]]], ignore_index=True)
print(results)

#### Plotting model outputs

In [None]:
#Setting up figure for result plotting
fig, axs = plt.subplots(2,2)
axs = axs.ravel()
fig.suptitle('Training metric changes (over epochs) of M1 based on different feature choices')
fig.set_figwidth(20)
fig.set_figheight(10)


#Plotting epoch accuracy and loss
epochData = pd.read_csv (r'epochAnalysisM1.csv')
# Every feature set contributes `epochs` log rows for each of the `splits` folds
rowsPerFeatureSet = epochs * splits

for index, featureSet in enumerate(inputChoices):
    #Select the logs only of one set of inputs. iloc is end-exclusive; a label
    #slice with .loc would also include the first log row of the next feature set
    sectionOfInterest = epochData.iloc[index * rowsPerFeatureSet:(index + 1) * rowsPerFeatureSet]

    #Average accuracy and loss over the folds, per epoch
    averagedFrame = sectionOfInterest.groupby('epoch')[['accuracy', 'loss']].mean()

    axs[index].set_title('Feature set: %s' % ', '.join(featureSet.columns))
    axs[index].plot(averagedFrame['accuracy'], label='M1 feature set %i accuracy' % (index + 1))
    axs[index].plot(averagedFrame['loss'], label='M1 feature set %i loss' % (index + 1))
    axs[index].legend()

plt.show()

#Plotting results from each model input
metricColumns = results.drop(['model', 'fit_time', 'score_time', 'features'], axis=1)
XResultsAxis = np.arange(len(metricColumns.columns))
# After transposing, every column holds the metrics of one feature set
interestData = metricColumns.transpose()
barWidth = 1 / (len(interestData.columns) * 2)

fig, axs = plt.subplots(1,1)
fig.set_figwidth(10)
fig.set_figheight(5)

for idx in range(len(results)):
    axs.bar(XResultsAxis + idx * barWidth, interestData.iloc[:, idx].values, width=barWidth, label='Feature set %i' % (idx + 1))

axs.set_xticks(XResultsAxis)
axs.set_xticklabels(metricColumns.columns)
axs.set_xlabel('Metrics')
axs.set_ylabel('Results')
axs.set_title('Metric result comparison for M1 with different feature sets')
axs.legend()
plt.show()

print(results['features'])

### 2.2 Model 2: Decision Tree Classifier

In [None]:
# Model M2: decision tree on a fixed set of indicator features, evaluated on a
# 20% hold-out split
features_to_model = [ 'ind-degree_master', 'ind-international_exp',
                      'ind-exact_study', 'ind-debateclub' , 'ind-languages' ]
dataInScope = pd.DataFrame(dataSetCNew, columns=features_to_model)

x = pd.DataFrame(dataInScope.values)
y = dataSetC['decision'].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

# Fixed random_state so that the fitted tree (and its scores) are reproducible
clf = tree.DecisionTreeClassifier(random_state = 0)
decision_tree = clf.fit(x_train, y_train)

#visualize tree
plt.subplots(figsize=(30, 20))
tree.plot_tree(decision_tree, filled=True, rounded=True)
plt.savefig("decision_tree.png")

#importance of features
importances = clf.feature_importances_
forest_importances = pd.Series(importances, index = features_to_model)
fig, ax = plt.subplots()
forest_importances.plot.bar()
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

#accuracy, precision, recall and F1 on the hold-out set
y_pred = clf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score : {0:0.4f}'. format(accuracy))
# The true labels go first; swapping the arguments swaps precision and recall
precision_recall = precision_recall_fscore_support(y_test, y_pred, average= 'macro')
print('Precision score (macro):', precision_recall[0])
print('Recall score (macro):', precision_recall[1])
f1 = f1_score(y_test, y_pred, average='macro')
print('F1 score (macro):', f1)

#Saving dataFrame of results for plotting (timings are not measured for M2)
new_row = {'model': 'M2',
           'fit_time' : 0,
           'score_time' : 0,
           'test_accuracy' : accuracy,
           'test_precision' : precision_recall[0],
           'test_recall' : precision_recall[1],
           'test_f1' : f1,
           'features' : ', '.join(features_to_model)}

# DataFrame.append was removed in pandas 2.0: concatenate a one-row frame instead
modelResults = pd.concat([modelResults, pd.DataFrame([new_row])], ignore_index=True)
print(modelResults)

#### Plotting model outputs

### 2.3 Model 3: Neural Network with single hidden layer and no drop-out on descriptors

In [None]:
# Model M3: cross-validate a single-hidden-layer neural network that only sees
# the descriptors (sport, gender, nationality and age group)
from scikeras.wrappers import KerasClassifier
from keras.callbacks import CSVLogger

models = []

descriptorColumns = ['sport_Tennis',
                     'sport_Running',
                     'sport_Swimming',
                     'sport_Chess',
                     'sport_Football',
                     'sport_Golf',
                     'sport_Cricket',
                     'sport_Rugby',
                     'gender_female',
                     'gender_other',
                     'gender_male',
                     'nationality_German',
                     'nationality_Dutch',
                     'nationality_Belgian',
                     'ageGroup_21-24',
                     'ageGroup_24-26',
                     'ageGroup_26-28',
                     'ageGroup_28-32']
inputChoices = [pd.DataFrame(dataSetCNew, columns=descriptorColumns)]

target = dataSetC['decision']

#Setting hyperparameters and evaluation outputs
seed = 7
splits = 10
epochs = 50
scores = ['accuracy', 'precision', 'recall', 'f1']

# Clear epoch data for new run if it's been ran before
# NOTE: the log file name says 'M2' although this cell trains model M3
with open("epochAnalysisM2.csv", "w"):
    pass

#Training model for each input variation
resultRows = []
for idx, features in enumerate(inputChoices):

    kfold = model_selection.KFold(n_splits=splits, random_state=seed, shuffle=True)
    # Appends the per-epoch training metrics of every fit to one shared log file
    dataLogger = CSVLogger('epochAnalysisM2.csv', separator=",", append=True)

    model = Sequential()
    #input layer
    model.add(Dense(6, kernel_initializer='uniform', activation = 'relu', input_dim = features.columns.size))
    #output layer
    model.add(Dense(1, kernel_initializer='uniform', activation = 'sigmoid'))

    #run model M3
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'],run_eagerly=True)
    keras_clf = KerasClassifier(model = model, optimizer="adam", epochs=epochs, verbose=0, callbacks=[dataLogger])
    cv_results = model_selection.cross_validate(keras_clf, features, target, cv=kfold, scoring=scores)

    #Saving the fold-averaged results for plotting
    resultRows.append({'model': 'M3',
                       'fit_time' : cv_results['fit_time'].mean(),
                       'score_time' : cv_results['score_time'].mean(),
                       'test_accuracy' : cv_results['test_accuracy'].mean(),
                       'test_precision' : cv_results['test_precision'].mean(),
                       'test_recall' : cv_results['test_recall'].mean(),
                       'test_f1' : cv_results['test_f1'].mean(),
                       'features' : ', '.join(features.columns)})

# DataFrame.append was removed in pandas 2.0: build the table from the rows
results = pd.DataFrame(resultRows, columns=['model','fit_time','score_time','test_accuracy', 'test_precision', 'test_recall', 'test_f1', 'features'])

modelResults = pd.concat([modelResults, results], ignore_index=True)
print(results)

#### Plotting of Model 3 metrics using all descriptive data

In [None]:
# Bar chart of the cross-validated metrics of model M3 (all descriptors)
metricColumns = results.drop(['model', 'fit_time', 'score_time', 'features'], axis=1)
XResultsAxis = np.arange(len(metricColumns.columns))
# After transposing, every column holds the metrics of one row of `results`
interestData = metricColumns.transpose()
barWidth = 1 / (len(interestData.columns) * 2)

fig, axs = plt.subplots(1,1)
fig.set_figwidth(20)
fig.set_figheight(10)
# M3 is trained on a single feature set, so only the first result row is
# plotted (selected explicitly instead of via a leftover loop variable)
axs.bar(XResultsAxis, interestData.iloc[:, 0].values, width=barWidth, label='Model 3')
axs.set_xticks(XResultsAxis)
axs.set_xticklabels(metricColumns.columns)
axs.set_xlabel('Metrics')
axs.set_ylabel('Results')
axs.set_title('Metric result comparison for M3 using all descriptive features')
axs.legend()
plt.show()

print(results['features'])

#### Plotting of Model 3 metrics for different subgroups

In [None]:
# Evaluate model M3 on demographic subgroups: up to 12 distinct, randomly
# chosen combinations of one age group, one nationality and one gender
import itertools
import random

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#Options for each descriptor
ageColumns = ['ageGroup_21-24', 'ageGroup_24-26', 'ageGroup_26-28', 'ageGroup_28-32']
nationalityColumns = ['nationality_German', 'nationality_Dutch', 'nationality_Belgian']
genderColumns = ['gender_female', 'gender_other', 'gender_male']
sportColumns = ['sport_Tennis', 'sport_Running', 'sport_Swimming', 'sport_Chess',
                'sport_Football', 'sport_Golf', 'sport_Cricket', 'sport_Rugby']

# Same columns, in the same order, as the input M3 was trained on
dataSet = pd.DataFrame(dataSetCNew, columns=sportColumns + genderColumns + nationalityColumns + ageColumns)

#Visit all age/nationality/gender combinations in random order, so that no
#subgroup is evaluated twice and the search always terminates
maxSubgroups = 12
subgroups = list(itertools.product(ageColumns, nationalityColumns, genderColumns))
random.shuffle(subgroups)

metricRows = []
for choices in subgroups:
    if len(metricRows) == maxSubgroups:
        break

    # The one-hot columns were standardised, so membership is tested with "> 0"
    query = "`%s` > 0 and `%s` > 0 and `%s` > 0" % choices
    tmpInput = dataSet.query(query)
    print(len(tmpInput))
    #If the queried combination has no inputs, we skip it
    if len(tmpInput) == 0:
        continue

    y_test = dataSetCNew.query(query)['decision']
    # NOTE(review): `model` is the Sequential instance created in the M3 training
    # cell. scikit-learn's cross_validate works on clones of the estimator, so
    # confirm that this instance actually carries trained weights.
    probabilities = np.asarray(model.predict(tmpInput)).ravel()
    # Threshold the sigmoid output once and reuse it for every metric
    y_pred = np.where(probabilities < 0.5, 0, 1)

    #Take all scores from the prediction; zero_division=0 keeps the score at 0
    #(without a warning) when a subgroup has no positive predictions or labels
    metricRows.append({'model': 'M3',
                       'test_accuracy' : accuracy_score(y_test.values, y_pred),
                       'test_precision' : precision_score(y_test.values, y_pred, zero_division=0),
                       'test_recall' : recall_score(y_test.values, y_pred, zero_division=0),
                       'test_f1' : f1_score(y_test.values, y_pred, zero_division=0),
                       'features' : ', '.join(choices)})

# DataFrame.append was removed in pandas 2.0: build the table from the rows
metrics = pd.DataFrame(metricRows, columns=['model', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1', 'features'])

#Plotting results for each subgroup
metricColumns = metrics.drop(['model', 'features'], axis=1)
XResultsAxis = np.arange(len(metricColumns.columns))
# After transposing, every column holds the metrics of one subgroup
interestData = metricColumns.transpose()
barWidth = 1 / (max(len(interestData.columns), 1) * 2)

fig, axs = plt.subplots(1,1)
fig.set_figwidth(20)
fig.set_figheight(10)

for idx, subgroupLabel in enumerate(metrics['features']):
    axs.bar(XResultsAxis + idx * barWidth, interestData.iloc[:, idx].values, width=barWidth, label=subgroupLabel)

axs.set_xticks(XResultsAxis)
# Tick labels come from this cell's own metric table
axs.set_xticklabels(metricColumns.columns)
axs.set_xlabel('Metrics')
axs.set_ylabel('Results')
axs.set_title('Metric result comparison for M3 across demographic subgroups')
axs.legend()
plt.show()

# print(metrics)

	

In [None]:
#Plotting the metrics of every model stored in modelResults side by side
metricColumns = modelResults.drop(['model', 'fit_time', 'score_time', 'features'], axis=1)
XResultsAxis = np.arange(len(metricColumns.columns))
# After transposing, every column holds the metrics of one stored model row
interestData = metricColumns.transpose()
barWidth = 1 / (len(interestData.columns) * 2)

fig, axs = plt.subplots(1,1)
fig.set_figwidth(10)
fig.set_figheight(5)

# Label each bar series with the model name stored in its row, so the legend
# stays correct even if modelResults holds rows in a different order
for idx, modelName in enumerate(modelResults['model']):
    axs.bar(XResultsAxis + idx * barWidth, interestData.iloc[:, idx].values, width=barWidth, label=modelName)

axs.set_xticks(XResultsAxis)
# Tick labels come from the table that is actually plotted
axs.set_xticklabels(metricColumns.columns)
axs.set_xlabel('Metrics')
axs.set_ylabel('Results')
axs.set_title('Metric result comparison for all models')
axs.legend()
plt.show()