# Predicting Student Academic Performance using artificial neural networks

In [None]:
import pandas as pd    # for data analysis
import numpy as np     # a library for array functions
import matplotlib.pyplot as plt # for making plots
import seaborn as sns  # a statistical plotting package built on top of matplotlib

#import machine learning modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing

#import feature importance modules
# !pip install yellowbrick #uncomment if yellowbrick module not installed
from yellowbrick.model_selection import FeatureImportances
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the xAPI-Edu-Data CSV into a DataFrame and preview its first rows.
data = pd.read_csv('../input/xAPI-Edu-Data/xAPI-Edu-Data.csv')
data.head()

In [None]:
# Preview the last rows of the dataset.
data.tail()

In [None]:
# Dimensions of the dataset as (rows, columns).
data.shape

In [None]:
# Print a summary of the DataFrame's columns (names, non-null counts, dtypes).
data.info()

## Renaming feature names
For readability and consistent casing, we will rename the following columns:  
NationalITy ➡ Nationality  
VisITedResources ➡ VisitedResources  
ParentschoolSatisfaction ➡ ParentSchoolSatisfaction  
raisedhands ➡ raisedHands  

In [None]:
# Map each inconsistently cased column name to its cleaned-up spelling,
# then apply the mapping to `data` in place.
column_renames = {
    'NationalITy': 'Nationality',
    'VisITedResources': 'VisitedResources',
    'ParentschoolSatisfaction': 'ParentSchoolSatisfaction',
    'raisedhands': 'raisedHands',
}
data.rename(columns=column_renames, inplace=True)

In [None]:
# Preview the first rows again to check the renamed columns.
data.head()

### Minor Tweaks here and there:

I will be dropping the `PlaceofBirth` column as it is the same as the `Nationality` 

In [None]:
# Remove the PlaceofBirth column from `data` in place.
data.drop(columns=['PlaceofBirth'], inplace=True)

In [None]:
# Print the remaining column names as a Python list.
print(data.columns.tolist())

'Kuwait' is represented as 'KW' in Nationality. I'll change it with this cell below:

In [None]:
# Normalize the abbreviation 'KW' to 'Kuwait' in the Nationality column.
# The result is assigned back to the column rather than calling an inplace
# method on the `data['Nationality']` selection: under pandas Copy-on-Write the
# chained inplace call acts on a temporary object and leaves `data` unchanged
# (and pandas 2.x emits a warning for it).
data['Nationality'] = data['Nationality'].replace({'KW': 'Kuwait'})

# Features in the data

|Feature  | Description |
|:--------|:------------|
|Gender | student's gender (nominal: 'Male' or 'Female’) |
|Nationality | student's nationality (nominal:’ Kuwait’,’ Lebanon’,’ Egypt’,’ SaudiArabia’,’ USA’, ’Jordan’,’Venezuela’,’ Iran’,’ Tunis’,’ Morocco’,’ Syria’,’ Palestine’,’ Iraq’,’ Lybia’)|
|Place of birth | student's Place of birth (nominal:’ Kuwait’,’ Lebanon’,’ Egypt’,’ SaudiArabia’,’ USA’,’ Jordan’,’Venezuela’,’ Iran’,’ Tunis’,’ Morocco’,’ Syria’,’ Palestine’,’ Iraq’,’ Lybia’)|
|Educational Stages | educational level student belongs (nominal:‘lowerlevel’, ’MiddleSchool’, ’HighSchool’)|
|Grade Levels | grade student belongs (nominal: ‘G-01’, ‘G-02’, ‘G-03’, ‘G-04’, ‘G-05’, ‘G-06’, ‘G-07’, ‘G-08’, ‘G-09’, ‘G-10’, ‘G-11’, ‘G-12 ‘)|
|Section ID | classroom student belongs (nominal:’A’,’B’,’C’)|
|Topic | course topic (nominal:’ English’,’ Spanish’, ‘French’,’ Arabic’,’ IT’,’ Math’,’ Chemistry’, ‘Biology’, ‘Science’,’ History’,’ Quran’,’ Geology’)|
|Semester | school year semester (nominal:’ First’,’ Second’)|
|Relation | Parent responsible for student (nominal:’Mum’,’Father’)|
|Raised hand | how many times the student raises his/her hand on classroom (numeric:0-100)|
|Visited resources | how many times the student visits a course content (numeric:0-100)|
|Viewing announcements | how many times the student checks the new announcements (numeric:0-100)|
|Discussion groups | how many times the student participate on discussion groups (numeric:0-100)|
|Parent Answering Survey | parent answered the surveys which are provided from school or not ( nominal: ’Yes’,’No’)|
|Parent School Satisfaction | the degree of parent satisfaction with the school (nominal: 'Good', 'Bad')|
|Student Absence Days | the number of absence days for each student (nominal: above-7, under-7)|

In [None]:
# Count plot of the target classes, with each bar labelled by its share of all rows.
ax = sns.countplot(data=data, x='Class', order=['L', 'M', 'H'])
total_rows = len(data)
for bar in ax.patches:
    bar_height = bar.get_height()
    share_label = '{:.2f}%'.format((bar_height * 100) / total_rows)
    # Place the label slightly right of the bar's left edge and just above its top.
    label_position = (bar.get_x() + 0.24, bar_height + 2)
    ax.annotate(share_label, label_position)
plt.show()

In [None]:
# Two stacked panels: overall gender counts, then the same counts split by class.
fig, axarr = plt.subplots(nrows=2, figsize=(10, 10))
overall_ax, by_class_ax = axarr
gender_order = ['M', 'F']
sns.countplot(data=data, x='gender', order=gender_order, ax=overall_ax)
sns.countplot(
    data=data,
    x='gender',
    hue='Class',
    order=gender_order,
    hue_order=['L', 'M', 'H'],
    ax=by_class_ax,
)
plt.show()

In [None]:
# Top panel: counts per nationality; bottom panel: the same counts broken down by class.
fig, axarr = plt.subplots(nrows=2, figsize=(10, 10))
totals_ax, by_class_ax = axarr
sns.countplot(data=data, x='Nationality', ax=totals_ax)
sns.countplot(
    data=data,
    x='Nationality',
    hue='Class',
    hue_order=['L', 'M', 'H'],
    ax=by_class_ax,
)
plt.show()

In [None]:
# Counts per educational stage (top) and per stage split by class (bottom).
fig, axarr = plt.subplots(nrows=2, figsize=(10, 10))
totals_ax, by_class_ax = axarr
sns.countplot(data=data, x='StageID', ax=totals_ax)
sns.countplot(
    data=data,
    x='StageID',
    hue='Class',
    hue_order=['L', 'M', 'H'],
    ax=by_class_ax,
)
plt.show()

In [None]:
# Counts per grade (top) and per grade split by class (bottom).
# Both panels share the same explicit grade ordering along the x-axis.
fig, axarr = plt.subplots(nrows=2, figsize=(10, 10))
grade_order = ['G-02', 'G-04', 'G-05', 'G-06', 'G-07', 'G-08', 'G-09', 'G-10', 'G-11', 'G-12']
shared_args = dict(x='GradeID', data=data, order=grade_order)
sns.countplot(ax=axarr[0], **shared_args)
sns.countplot(hue='Class', hue_order=['L', 'M', 'H'], ax=axarr[1], **shared_args)
plt.show()

In [None]:
# Counts per classroom section (top) and per section split by class (bottom).
fig, axarr = plt.subplots(nrows=2, figsize=(10, 10))
section_order = ['A', 'B', 'C']
shared_args = dict(x='SectionID', data=data, order=section_order)
sns.countplot(ax=axarr[0], **shared_args)
sns.countplot(hue='Class', hue_order=['L', 'M', 'H'], ax=axarr[1], **shared_args)
plt.show()

In [None]:
# Counts per course topic (top) and per topic split by class (bottom).
fig, axarr = plt.subplots(nrows=2, figsize=(10, 10))
totals_ax, by_class_ax = axarr
sns.countplot(data=data, x='Topic', ax=totals_ax)
sns.countplot(
    data=data,
    x='Topic',
    hue='Class',
    hue_order=['L', 'M', 'H'],
    ax=by_class_ax,
)
plt.show()

In [None]:
# Counts per semester (top) and per semester split by class (bottom).
fig, axarr = plt.subplots(nrows=2, figsize=(10, 10))
totals_ax, by_class_ax = axarr
sns.countplot(data=data, x='Semester', ax=totals_ax)
sns.countplot(
    data=data,
    x='Semester',
    hue='Class',
    hue_order=['L', 'M', 'H'],
    ax=by_class_ax,
)
plt.show()

In [None]:
# Counts per Relation value (top) and per Relation value split by class (bottom).
fig, axarr = plt.subplots(nrows=2, figsize=(10, 10))
totals_ax, by_class_ax = axarr
sns.countplot(data=data, x='Relation', ax=totals_ax)
sns.countplot(
    data=data,
    x='Relation',
    hue='Class',
    hue_order=['L', 'M', 'H'],
    ax=by_class_ax,
)
plt.show()

### Visualizing the continuous data in the numeric features

In [None]:
# Pairwise scatter plots coloured by class, with KDE curves on the diagonal
# and a distinct marker shape for each class.
sns.pairplot(
    data,
    hue="Class",
    hue_order=['L', 'M', 'H'],
    markers=["o", "s", "D"],
    diag_kind="kde",
)
plt.show()

In [None]:
# Median of each numeric feature per course topic.
# numeric_only=True restricts the aggregation to numeric columns; without it,
# pandas >= 2.0 also attempts the median on the text columns and raises a TypeError.
data.groupby('Topic').median(numeric_only=True)

#### Here we can see part of the likely reason why all of the geology students pass: they have far higher median numerical values than most other courses.

In [None]:
# Median of each numeric feature per grade.
# numeric_only=True restricts the aggregation to numeric columns; without it,
# pandas >= 2.0 also attempts the median on the text columns and raises a TypeError.
data.groupby('GradeID').median(numeric_only=True)

#### Looking at the medians again, we can see part of the likely reason why the 5th and 9th grade students performed as they did.

In [None]:
# Counts of ParentAnsweringSurvey values (top) and the same counts split by class (bottom).
fig, axarr = plt.subplots(nrows=2, figsize=(10, 10))
answer_order = ['Yes', 'No']
shared_args = dict(x='ParentAnsweringSurvey', data=data, order=answer_order)
sns.countplot(ax=axarr[0], **shared_args)
sns.countplot(hue='Class', hue_order=['L', 'M', 'H'], ax=axarr[1], **shared_args)
plt.show()

In [None]:
# Counts of ParentSchoolSatisfaction values (top) and the same counts split by class (bottom).
fig, axarr = plt.subplots(nrows=2, figsize=(10, 10))
satisfaction_order = ['Good', 'Bad']
shared_args = dict(x='ParentSchoolSatisfaction', data=data, order=satisfaction_order)
sns.countplot(ax=axarr[0], **shared_args)
sns.countplot(hue='Class', hue_order=['L', 'M', 'H'], ax=axarr[1], **shared_args)
plt.show()

In [None]:
# Counts of StudentAbsenceDays categories (top) and the same counts split by class (bottom).
fig, axarr = plt.subplots(nrows=2, figsize=(10, 10))
absence_order = ['Under-7', 'Above-7']
shared_args = dict(x='StudentAbsenceDays', data=data, order=absence_order)
sns.countplot(ax=axarr[0], **shared_args)
sns.countplot(hue='Class', hue_order=['L', 'M', 'H'], ax=axarr[1], **shared_args)
plt.show()

## Preprocessing the Data
Data preprocessing consists of the steps that were taken to prepare our data for the artificial neural network.

In [None]:
# Separate the target column ('Class') from the feature columns.
y = data['Class']
X = data.drop(columns=['Class'])

In [None]:
# Preview the first rows of the feature frame before encoding.
X.head()

In [None]:
# One-hot encode the non-numeric feature columns of X into indicator columns.
X = pd.get_dummies(X)

In [None]:
# Report the dimensions of the encoded feature frame.
shape = X.shape
n_rows, n_columns = shape
print(f'Number of rows: {n_rows}\nNumber of columns: {n_columns}')

In [None]:
# Preview the first rows of the feature frame after one-hot encoding.
X.head()

We'll be setting the `random_state` to 42 for reproducibility

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

## Model Definition

In [None]:
# Multi-layer perceptron classifier with a fixed seed and an iteration cap of 1000.
model = MLPClassifier(random_state=42, max_iter=1000)

In [None]:
# Train the classifier on the training split.
model.fit(train_X, train_y)

In [None]:
# Predict a class for every row of the held-out test split.
pred_y = model.predict(test_X)

In [None]:
# Display the predicted classes.
pred_y

## Evaluating the model

In [None]:
# Confusion matrix of true test labels against the model's predictions.
cm = confusion_matrix(y_true=test_y, y_pred=pred_y)
cm

In [None]:
# Draw the confusion matrix as an annotated, labelled heatmap.
# confusion_matrix() orders rows/columns by the sorted labels found in the true
# and predicted values, so the tick labels are rebuilt the same way; without them
# the axes only show 0/1/2 and the cells carry no counts.
class_labels = sorted(set(test_y) | set(pred_y))
heatmap_ax = sns.heatmap(
    cm,
    cmap='icefire',
    annot=True,  # write the count inside every cell
    fmt='d',     # counts are integers
    xticklabels=class_labels,
    yticklabels=class_labels,
)
# Rows of a scikit-learn confusion matrix are true classes, columns are predictions.
heatmap_ax.set_xlabel('Predicted class')
heatmap_ax.set_ylabel('True class')
plt.show()

In [None]:
# Print the per-class metrics report for the test predictions.
print(classification_report(y_true=test_y, y_pred=pred_y))

### Feature Importance
We'll be using the yellowbrick library to plot the various feature importances

In [None]:
# Second features/target split taken from the un-encoded `data` frame.
# NOTE(review): no visible code uses new_X / new_y after the split below -- confirm they are needed.
new_y = data['Class']
new_X = data.drop(columns=['Class'])

In [None]:
# 75/25 split of the un-encoded frame, using the same seed as the main split.
# NOTE(review): the visualizer below is fitted on train_X / train_y, not on these
# variables -- confirm whether this split is still needed.
X_train, X_test, y_train, y_test = train_test_split(
    new_X,
    new_y,
    random_state=42,
    test_size=0.25,
)

In [None]:
# Estimate feature importances with a random forest and plot them with yellowbrick.
# The forest is seeded with random_state=42, the same seed used for the
# train/test split and the MLP, so the importance ranking is reproducible between runs.
fi_model = RandomForestClassifier(random_state=42)
plt.figure(figsize=(25, 20), dpi=400)
viz = FeatureImportances(fi_model)  # create a visualizer using the random forest estimator
viz.fit(train_X, train_y)  # fit on the one-hot encoded training split
# viz.show(outpath='feature importances.png')  # alternative: write the figure to a file
viz.show();