In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Getting Started

Goal of the project: to identify students who might need early intervention, i.e. to predict whether a student will pass or fail.

The dataset: Let's see the columns of the dataset.

* school:  "student's school",
* sex: "student's sex",
* age: "student's age"
* address: "student's home address type"
* famsize: "family size"
* Pstatus: "parent's cohabitation status"
* Medu: "mother's education"
* Fedu: "father's education"
* Mjob: "mother's job"
* Fjob: "father's job"
* reason: "reason to choose this school"
* guardian: "student's guardian"
* traveltime: "home to school travel time"
* studytime: "weekly study time"
* failures: "number of past class failures"
* schoolsup: "extra educational support"
* famsup: "family educational support"
* paid: "extra paid classes within the course subject"
* activities: "extra-curricular activities"
* nursery: "attended nursery school"
* higher: "wants to take higher education"
* internet: "Internet access at home"
* romantic: "with a romantic relationship"
* famrel: "quality of family relationships"
* freetime: "free time after school"
* goout: "going out with friends"
* Dalc: "workday alcohol consumption"
* Walc: "weekend alcohol consumption"
* health: "current health status"
* absences: "number of school absences"
* passed: "did the student pass the final exam"

So we have a total of 30 features to predict whether a student will pass or fail, and the target variable is "passed". Now it's time to explore the data.


# Data Exploration

In [None]:
# importing librararies
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, make_scorer

%load_ext autoreload
%autoreload 2

%matplotlib inline
plt.style.use('ggplot')

In [None]:
# Read the student dataset from the Kaggle input directory and preview its first rows.
student_data = pd.read_csv('../input/student-data.csv')
display(student_data.head())

## Finding out the following
* Total number of students
* Total number of students who passed and failed
* Total number of features
* Graduation rate of the class

In [None]:
# Basic statistics of the raw dataset.
n_students = student_data.shape[0]

# Count each class once; .get(..., 0) keeps the cell working if a class is absent.
passed_counts = student_data['passed'].value_counts()
stud_passed = passed_counts.get('yes', 0)
stud_failed = passed_counts.get('no', 0)

# 'passed' is the target column, so it must not be counted as a feature.
n_features = student_data.shape[1] - 1

grad_rate = ( stud_passed / float(stud_passed + stud_failed) ) * 100
print("Number of students: ", n_students)
print("Number of students passed: ", stud_passed)
print("Number of students failed: ", stud_failed)
print("Total number of features: ", n_features)
print("Graduation rate: ", round(grad_rate, 2))

# Prepare the data

Divide our dataset into features and target; for us the target column is 'passed'.
1. Split the dataset into features and target
2. Convert non-numeric columns to numeric
3. Split the whole dataset into training and testing sets


In [None]:
# Separate the target column from the predictor columns.
y_target = student_data['passed']
X_feature = student_data.drop(columns='passed')

features = X_feature.columns.tolist()
target = student_data.columns[-1]  # last column of the raw table

print("Features are: ")
for position, feature in enumerate(features, start=1):
    print(position, " ", feature)

print("\nTarget is: ", target)

## Preprocessing features

As we can see, there are several non-numeric features, and some of them are categorical variables. To handle the categorical variables we will use pandas `get_dummies()` to create dummy variables, and we will convert the binary (yes/no) columns to 0/1.

In [None]:
def preprocess_features(X_feature):
    """
    Convert binary yes/no features into 1/0 and the remaining non-numeric
    (categorical) features into dummy variables.

    :param X_feature: dataframe of raw feature columns
    :return: new dataframe sharing the index of ``X_feature``; numeric columns
             are copied unchanged, yes/no columns become a single 1/0 column and
             every other text column is expanded into ``<column>_<value>``
             indicator columns holding 0/1 integers
    """
    # initialize the new dataframe i.e output
    output_df = pd.DataFrame(index=X_feature.index)

    # iterate through each feature column
    # (``items`` replaces ``iteritems``, which was removed in pandas 2.0)
    for col, col_data in X_feature.items():
        is_text = (pd.api.types.is_object_dtype(col_data)
                   or pd.api.types.is_string_dtype(col_data))
        if is_text:
            # Decide from the values themselves instead of relying on
            # ``replace`` silently changing the column dtype.
            if col_data.dropna().isin(['yes', 'no']).all():
                col_data = col_data.map({'yes': 1, 'no': 0})
            else:
                # dtype=int keeps the indicators 0/1 on every pandas version
                col_data = pd.get_dummies(col_data, prefix=col, dtype=int)

        output_df = output_df.join(col_data)

    return output_df
# Encode the feature table and report how many columns the encoding produced.
X_feature = preprocess_features(X_feature)
encoded_columns = list(X_feature.columns)
print("Number of columns after precprocessing: {}, \n{}".format(len(encoded_columns), encoded_columns))

## Splitting data into train and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set. The split is stratified on the target
# and uses a fixed random_state so that re-running the cell gives the same split.
X_train, X_test, y_train, y_test = train_test_split(X_feature, y_target, test_size = 0.25, 
                                                    stratify=y_target, random_state=42)
print("Training set size: {}".format(len(X_train)))
print("Test set size: {}".format(len(X_test)))

# Exploratory Data Analysis

Now we have a separate test set, and we are not going to touch it until we have found a good model to test. Until then, let's explore what the training data has to offer.

In [None]:
def plot_categorical(df, count_col='passed', plots_per_row=3):
    """
    Draw one count plot per categorical (object dtype) column of ``df``,
    with the bars split by ``count_col``.

    :param df: dataframe holding the categorical columns and ``count_col``
    :param count_col: column used as the ``hue`` of every count plot
    :param plots_per_row: number of plots placed side by side in one figure
    """
    # The hue column itself is skipped: counting the target split by the
    # target carries no information.
    cat_cols = [col for col in df.select_dtypes(include=['object']).columns
                if col != count_col]

    for i, col in enumerate(cat_cols):
        plot_index = i % plots_per_row

        if plot_index == 0:
            # squeeze=False keeps ``axes`` indexable even when plots_per_row == 1
            f, axes = plt.subplots(1, plots_per_row, figsize=(15, 5), squeeze=False)
            axes = axes[0]
            sns.despine(left=True)
        sns.countplot(x=col, hue=count_col, data=df, ax=axes[plot_index])

    # Hide the empty axes left over in the last, partially filled row.
    leftover = len(cat_cols) % plots_per_row
    if leftover:
        for unused_ax in axes[leftover:]:
            unused_ax.set_visible(False)
        
        

In [None]:
# Explore the training rows only. The split above is shuffled and stratified,
# so the training rows are NOT simply the first 296 rows of the table; select
# them through the index labels that train_test_split kept on X_train.
df = student_data.loc[X_train.index]
plot_categorical(df)

A nice visualization to understand how these variables vary with our target class labels.
* Look at the internet feature: the distribution is not balanced at all. Among students who have an internet connection, about twice as many passed as failed. Among students without an internet connection, the number who passed is also larger than the number who failed. Both cases point to the same conclusion, so this feature alone leads nowhere.
* Take the higher education feature: students who want to pursue higher education have a higher chance of passing the exam, compared to a negligible count of those who want to pursue higher education but somehow failed.
* Have a look at the paid feature, which tells us whether a student subscribed to extra paid classes. In the graph, whether or not a student took paid classes, the number of students passing is higher in both cases. So it barely determines whether a student will pass or fail, because the passing and failing counts are almost the same in both cases.

A lot of interesting information like this can be obtained from these graphs. The problem is that we can't really determine which feature leads where. A good predictive model can give us that answer. Before that, let's have a look at the numeric variables.

In [None]:
def plot_numeric(df, col_t='passed', plots_per_row=2):
    """
    Draw, for every non-object column of ``df``, the kernel density estimate
    of the rows labelled 'yes' and of the rows labelled 'no' in ``col_t``.

    :param df: dataframe holding the numeric columns and ``col_t``
    :param col_t: target column whose 'yes'/'no' values split each distribution
    :param plots_per_row: number of plots placed side by side in one figure
    """
    df_num = df.select_dtypes(exclude=object)

    if not (col_t in df_num.columns):
        df_num = df_num.join(df[col_t])

    # separating dataframe into pass and fail (done once, not per column)
    df_num_yes = df_num.loc[df_num[col_t] == 'yes']
    df_num_no = df_num.loc[df_num[col_t] == 'no']

    # Drop the target before enumerating so the axis index has no gaps,
    # wherever the target column happens to sit.
    feature_cols = [col for col in df_num.columns if col != col_t]

    for i, col in enumerate(feature_cols):
        plot_idx = i % plots_per_row

        if plot_idx == 0:
            # squeeze=False keeps ``axes`` indexable even when plots_per_row == 1
            f, axes = plt.subplots(1, plots_per_row, figsize=(15, 5), squeeze=False)
            axes = axes[0]
            sns.despine(left=True)

        ax = axes[plot_idx]
        yes_label = '{0} - passed'.format(col)
        no_label = '{0} - Failed'.format(col)

        # NOTE(review): newer seaborn releases rename ``shade`` to ``fill`` -
        # confirm against the installed version before switching.
        sns.kdeplot(df_num_yes[col], ax=ax, shade=True, label=yes_label)
        sns.kdeplot(df_num_no[col], ax=ax, shade=True, label=no_label)

        ax.set_title('Distribution of "{0}" \nfactored by "{1}"\nFeature:"{2}"  Target:"{3}"'
                     .format(col, col_t, col, col_t))
        ax.set(xlabel=col)
        # Request the legend explicitly so the two labels are always shown.
        ax.legend()

    # Hide the empty axes left over in the last, partially filled row.
    leftover = len(feature_cols) % plots_per_row
    if leftover:
        for unused_ax in axes[leftover:]:
            unused_ax.set_visible(False)

In [None]:
# Density plots of every numeric column of `df`, split by the 'passed' target.
plot_numeric(df)

* Young students have a high probability of passing the exam, while the graph shows that students above the age of 20 have a lower probability of failure.
* A student whose mother has a higher education has a greater chance of passing, whereas a student whose mother has only a primary education has a greater chance of failing the exam.
* Is travel time a key factor? Yes, but in this case it doesn't say much. Students who travel a shorter distance have a higher probability of passing, but the distribution of failed students is also concentrated at short travel times - not as high as for passed students, but still noticeable.

We have explored both the categorical and the numerical distributions; now let's build models to test these observations.

# Training and Evaluating models

In [None]:
# Training a model
import time

def train_classifier(clf, X_train, y_train, verbose=True):
    """
    Trains a classifier that is passed in to this function

    :param clf: sklearn model object
    :param X_train: feature dataframe object
    :param y_train: target variables
    :param verbose: flag to print training information
    :return: tuple ``(clf, train_time)`` - the classifier after ``fit`` and the
             wall-clock seconds spent inside ``fit``
    """
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    train_time = end - start

    if verbose:
        print("Trained: {}".format(clf.__class__.__name__))
        # The placeholder was missing, so the measured time was never printed.
        print("Training time (secs): {}".format(train_time))

    return clf, train_time

In [None]:
# predictions on provided training and testing data set
def predict_labels(clf, features, target, verbose=True):
    """
    Predict labels for ``features`` and score them against ``target``.

    :param clf: sklearn model object
    :param features: feature set values
    :param target: target values; its ``.values`` attribute is what gets scored
    :param verbose: flag to print the classifier name and the prediction time
    :return: tuple ``(f1_measure, prediction_time)``; the F1 score treats
             'yes' as the positive label
    """
    started_at = time.time()
    predicted = clf.predict(features)
    prediction_time = time.time() - started_at

    if verbose:
        model_name = clf.__class__.__name__
        print("predicted labels using {}".format(model_name))
        print("Prediction time: {}".format(prediction_time))

    return f1_score(target.values, predicted, pos_label='yes'), prediction_time

In [None]:
def train_evaluate(clf, X_train, y_train, X_test, y_test, verbose=True):
    """
    Fit a classifier, then score it on the training and on the test data.

    :param clf: sklearn classifier object
    :param X_train: training feature dataset
    :param y_train: training target set
    :param X_test: test feature set
    :param y_test: test labels
    :param verbose: forwarded to train_classifier and predict_labels; the two
                    F1 scores are printed regardless of this flag
    :return: tuple ``(clf, f1_scores, timings)`` where ``f1_scores`` has the keys
             'F1_train'/'F1_test' and ``timings`` has the keys 'Training_time',
             'prediction_train_time' and 'prediction_test_time'
    """
    clf, train_time = train_classifier(clf, X_train, y_train, verbose=verbose)

    # Score the training data first, then the test data.
    (train_f1, train_pred_time), (test_f1, test_pred_time) = [
        predict_labels(clf, X, y, verbose=verbose)
        for X, y in ((X_train, y_train), (X_test, y_test))
    ]

    for message, score in (("F1 score on training set: {}", train_f1),
                           ("F1 score on test set: {}", test_f1)):
        print(message.format(score))

    f1_scores = {'F1_train': train_f1, 'F1_test': test_f1}
    timings = {"Training_time": train_time,
               "prediction_train_time": train_pred_time,
               "prediction_test_time": test_pred_time}

    return clf, f1_scores, timings

In [None]:
def subset_train_predict(clf, X_train, y_train, X_test, y_test, subset_sizes, verbose=False):
    """
    Train and evaluate ``clf`` on growing prefixes of the training data.

    :param clf: sklearn classifier object; it is re-fitted for every subset size
    :param X_train: training feature dataset
    :param y_train: training target set
    :param X_test: test feature set (always used in full)
    :param y_test: test labels (always used in full)
    :param subset_sizes: iterable of training-set sizes to try
    :param verbose: forwarded to train_evaluate
    :return: dataframe with one row per size, holding 'training_size' plus the
             F1 scores and timings returned by train_evaluate
    """
    df_rows = []

    for i in subset_sizes:
        row = {'training_size': i}

        # First ``i`` training samples; a size larger than the training set
        # simply yields all rows.
        X_train_subset = X_train[:i]
        y_train_subset = y_train[:i]

        # Fit on the SUBSET. Passing the full training set here made every row
        # of the result describe the same experiment.
        clf, f1_scores, timings = train_evaluate(clf, X_train_subset, y_train_subset,
                                                 X_test, y_test, verbose=verbose)

        row.update(f1_scores)
        row.update(timings)

        df_rows.append(row)

    return pd.DataFrame(df_rows)

In [None]:
def plot_computation_time(df):
    """
    Plot training and prediction times against the training-set size.

    :param df: dataframe with the columns 'training_size', 'Training_time',
               'prediction_train_time' and 'prediction_test_time'
    """
    # Tick positions come from the table being plotted, not from a notebook
    # global, so the plot always matches its own data.
    sizes = list(df.training_size)

    fig, ax = plt.subplots(figsize=(12, 9))
    _ = ax.plot(df.training_size, df.Training_time, label="Training time")
    _ = ax.plot(df.training_size, df.prediction_train_time, label='Prediction time - Train')
    _ = ax.plot(df.training_size, df.prediction_test_time, label='Predtiction time - Test')
    # matplotlib spells this location with a space, not a hyphen
    _ = ax.legend(loc='upper left')
    ax.set_xticks(sizes)
    ax.set_xticklabels(sizes, rotation=45)
    ax.set_xlabel('Training set size')
    ax.set_ylabel('seconds')
    ax.set_title('Training/Prediction times')
    plt.show()

In [None]:
def plot_f1_scores(df):
    """
    Plot the training and test F1 scores against the training-set size.

    :param df: dataframe with the columns 'training_size', 'F1_train' and 'F1_test'
    """
    # Tick positions come from the table being plotted, not from a notebook
    # global, so the plot always matches its own data.
    sizes = list(df.training_size)

    fig, ax = plt.subplots(figsize=(12, 9))
    _ = ax.plot(df.training_size, df.F1_train, label='F1 scores - Training')
    _ = ax.plot(df.training_size, df.F1_test, label='F1 scores - Testing')
    # matplotlib spells this location with a space, not a hyphen
    _ = ax.legend(loc='upper left')
    ax.set_xticks(sizes)
    ax.set_xticklabels(sizes, rotation=45)
    ax.set_xlabel('Training set size')
    ax.set_ylabel('F1 score')
    ax.set_title('F1 scores of each training samples')
    plt.show()

## Decision Tree

In [None]:
# Training-set sizes to try: 100, 110, ..., 300.
subset_sizes = range(100, 301, 10)
# Despite its name, `tree_model` here is the results table (one row per size)
# returned by subset_train_predict, not a fitted classifier.
tree_model = subset_train_predict(DecisionTreeClassifier(), X_train, y_train,
                                 X_test, y_test, subset_sizes=subset_sizes)
tree_model

In [None]:
def plot_tree(tree_model, feature_names, out_path="tree.dot"):
    """
    Export a decision tree to a Graphviz ``.dot`` file.

    The function only writes the file; rendering it to an image is a
    separate step.

    :param tree_model: decision tree estimator handed to ``export_graphviz``
    :param feature_names: names of the feature columns, in training order
    :param out_path: path of the ``.dot`` file to write (default "tree.dot")
    """
    with open(out_path, 'w') as f:
        export_graphviz(tree_model, out_file=f,
                       impurity=False,
                       rounded=True,
                       filled=True,
                       leaves_parallel=False,
                       # a plain list is accepted by every scikit-learn version,
                       # whereas a pandas Index is not
                       feature_names=list(feature_names),
                       # presumably matches the sorted class labels 'no'/'yes' - verify
                       class_names=['fail', 'pass'])

In [None]:
# Keep the row at position 20 of the results table as a one-row dataframe.
tree_stats = tree_model[20:21]
# Training/prediction times of the decision tree for each training-set size.
plot_computation_time(tree_model)

In [None]:
# Training vs. test F1 score of the decision tree for each training-set size.
plot_f1_scores(tree_model)

In [None]:
# Fit a default decision tree on the full training split.
# NOTE: this rebinds `tree_model` from the results table to the classifier
# returned by train_evaluate; the F1 scores and timings are discarded.
tree_model, _, _ = train_evaluate(DecisionTreeClassifier(),
                                 X_train, y_train,
                                 X_test, y_test,
                                 verbose=False)

In [None]:
# Export the fitted tree to "tree.dot" using the training column names.
plot_tree(tree_model, X_train.columns)

In [None]:
%%bash

# Render tree.dot to tree.png with the Graphviz "dot" command.
dot -Tpng tree.dot -o tree.png

In [None]:
from IPython.display import Image
# Show the rendered tree image inline.
Image(filename='tree.png', width=920, height=1280)

# Random Forest

In [None]:
# Training-set sizes to try: 100, 110, ..., 300.
subset_sizes = range(100, 301, 10)

# `rf_model` here is the results table returned by subset_train_predict
# (one row per training-set size), not a fitted classifier.
rf_model = subset_train_predict(RandomForestClassifier(),
                                         X_train, y_train,
                                         X_test, y_test,
                                         subset_sizes=subset_sizes)
rf_model

In [None]:
# Training/prediction times of the random forest for each training-set size.
plot_computation_time(rf_model)

In [None]:
# Training vs. test F1 score of the random forest for each training-set size.
plot_f1_scores(rf_model)

In [None]:
# Fit a default random forest on the full training split.
# NOTE: this rebinds `rf_model` from the results table to the classifier
# returned by train_evaluate; the F1 scores and timings are discarded.
rf_model, _, _ = train_evaluate(RandomForestClassifier(), X_train, y_train,
                               X_test, y_test, verbose=False)

In [None]:
# One row per training column, holding the forest's feature_importances_ value;
# the column names are also used as the row index so they label the bar plot.
rf_importances = pd.DataFrame({'Feature': X_train.columns, 
                               'Importance': rf_model.feature_importances_}, 
                             index=X_train.columns)

In [None]:
# Bar chart of the importances, largest first.
rf_importances.sort_values(by='Importance',
                           ascending=False).plot(kind='bar', 
                                                 figsize=(16, 10),
                                                 title='Feature Importance')
plt.show()

If we look at the feature bars, two obvious features are absences and failures. After all, if we had to form a hypothesis to predict a student's result in the next exam, we would first ask these questions:
1. How many days was the student present in class?
2. How were their results in past exams - did they pass or fail the last couple of exams?

# Logistic Regression

In [None]:
# Logistic regression is a linear model, so all features are brought to the same scale first.
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)

# Wrap the scaled arrays back into dataframes. Keeping the original row index
# (not only the column names) keeps the scaled features aligned with
# y_train / y_test, which still carry the original row labels.
X_train_scaled = pd.DataFrame(scaler.transform(X_train),
                              columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                             columns=X_test.columns, index=X_test.index)

In [None]:
# Training-set sizes to try: 100, 110, ..., 300.
subset_sizes = range(100, 301, 10)

# Use the standardized features: they were built for this linear model, and the
# L1 model later in this section is trained on them too.
# `lr_model` is the results table with one row per training-set size.
lr_model = subset_train_predict(LogisticRegression(),
                               X_train_scaled, y_train,
                               X_test_scaled, y_test,
                               subset_sizes=subset_sizes)
lr_model

In [None]:
# Training/prediction times of logistic regression for each training-set size.
plot_computation_time(lr_model)

In [None]:
# Training vs. test F1 score of logistic regression for each training-set size.
plot_f1_scores(lr_model)

[ISSUE]: That's really strange: the F1 score on the test data is 0.75 for every training-subset size, from start to end. How could this be possible?


In [None]:
# L1-regularized logistic regression on the standardized features.
# The solver is named explicitly because the L1 penalty is only supported by
# some solvers ('liblinear' is one of them) and not by the current default.
lr_models, _, _ = train_evaluate(LogisticRegression(penalty='l1', solver='liblinear'),
                               X_train_scaled, y_train,
                               X_test_scaled, y_test,
                               verbose=False)

In [None]:
# Absolute value of each coefficient of the L1 model, one row per feature.
lr_coeffs = pd.DataFrame({'Feature': X_train.columns, 
                          'Coefficient': np.abs(lr_models.coef_[0])}, 
                             index=X_train.columns)

# Bar chart of the absolute coefficients, largest first.
lr_coeffs.sort_values(by='Coefficient', 
                      ascending=False).plot(kind='bar',
                                            figsize=(16, 10),
                                            color='#cd7058',
                                            title='Logistic Regression Coefficients with L1 penalty')
plt.show()

In [None]:
# Same table with the signed coefficients, to see which features push the
# prediction up and which push it down.
lr_fe_coeffs = pd.DataFrame({'Feature': X_train.columns, 
                          'Coefficient': lr_models.coef_[0]}, 
                             index=X_train.columns)

# Bar chart from the most positive to the most negative coefficient.
lr_fe_coeffs.sort_values(by='Coefficient', 
                      ascending=False).plot(kind='bar',
                                            figsize=(16, 10),
                                            color='#cd7058',
                                            title='Logistic Regression Coefficients with L1 penalty')
plt.show()

So `failures` actually has a negative effect on the prediction.

In [None]:
# Features with the largest (most positive) coefficients.
lr_fe_coeffs.sort_values(by='Coefficient', ascending=False).head()

In [None]:
# Features with the smallest (most negative) coefficients: the tail of the descending sort.
lr_fe_coeffs.sort_values(by='Coefficient', ascending=False).tail()

# Optimizing Models

## Decision Tree Fine Tuned

In [None]:
# Scorer for the grid searches: F1 with 'yes' as the positive label,
# the same metric and positive label used by predict_labels.
f1_scorer = make_scorer(f1_score, pos_label='yes')


In [None]:
# Hyper-parameter grid for the decision tree.
# 'auto' was dropped from max_features: for a classification tree it meant the
# same as 'sqrt' (already in the list) and recent scikit-learn releases reject it.
tree_param_grid = {
    'criterion':['gini', 'entropy'],
    'max_depth': [2, 3, 5, 7, 9],
    'min_samples_split':[2, 10, 20, 30, 40],
    'min_samples_leaf':[1, 2, 5, 10],
    'max_features':[None, 'sqrt', 1, 2, 5, 10]
}

# 5-fold cross-validated search over the grid, scored with f1_scorer.
tree_grid = GridSearchCV(DecisionTreeClassifier(), param_grid=tree_param_grid,
                        scoring=f1_scorer, cv=5, n_jobs=-1, verbose=True)

In [None]:
# Run the grid search on the training split only.
tree_grid.fit(X_train, y_train)

In [None]:
# best_score_ is the search's best cross-validated score, not a test-set score.
print('Best cross-validated tuned F1 score for Decision Tree: {}'.format(tree_grid.best_score_))

In [None]:
# Parameter combination selected by the grid search.
print("Choosen parameters are: {}".format(tree_grid.best_params_))

In [None]:
# Export the best tree found by the search; this overwrites the earlier "tree.dot".
plot_tree(tree_grid.best_estimator_, X_train.columns)

In [None]:
%%bash

# Render tree.dot to tree.png with the Graphviz "dot" command.
dot -Tpng tree.dot -o tree.png

In [None]:
from IPython.display import Image
# Show the rendered image of the tuned tree inline.
Image(filename='tree.png', width=920, height=1280)

In [None]:
# Train a fresh decision tree with the selected parameters on the training
# split and score it on both the training and the held-out test split.
dt_final, f1_dt_final, timing_dt_final = train_evaluate(DecisionTreeClassifier(**tree_grid.best_params_),
                                          X_train, y_train,
                                          X_test, y_test,
                                          verbose=False)

# One-row summary: F1 scores joined with the timings.
tree_tuned_stats = pd.DataFrame([f1_dt_final]).join(pd.DataFrame([timing_dt_final]))
tree_tuned_stats

## Random Forest Fine Tuned

In [None]:
# Hyper-parameter grid for the random forest.
# 'sqrt' replaces 'auto' in max_features: for a forest classifier the two meant
# the same thing, and recent scikit-learn releases reject 'auto'.
rf_params = {
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 3, 5, 10, 20, 30, 40],
    'max_depth': [None, 2, 4, 8, 12],
    'n_estimators': [10, 50, 100, 500]
}


In [None]:
# 5-fold cross-validated search over rf_params, scored with f1_scorer
# and run on all available cores (n_jobs=-1).
rf_grid = GridSearchCV(estimator=RandomForestClassifier(),
                      param_grid=rf_params,
                      scoring=f1_scorer,cv=5,
                      n_jobs=-1, verbose=True)

In [None]:
# Run the grid search on the training split only.
rf_grid.fit(X_train, y_train)

In [None]:
# best_score_ is the search's best cross-validated score, not a test-set score.
print("Best cross validated tuned F1 score for Random Forest {}".format(rf_grid.best_score_))

In [None]:
# Parameter combination selected by the grid search.
print("Best choosen parameters: {}".format(rf_grid.best_params_))

In [None]:
# Feature importances of the best forest found by the search, largest first.
rf_importance_tuned = pd.DataFrame({'Importance': rf_grid.best_estimator_.feature_importances_,
                                   'Feature': X_train.columns}).sort_values(by='Importance',
                                                                            ascending=False)
rf_importance_tuned.head()

In [None]:
# Use the feature names as the index so they label the bars.
# NOTE: this changes rf_importance_tuned in place after the previous cell displayed it.
rf_importance_tuned.index = rf_importance_tuned.Feature.values
rf_importance_tuned.plot(kind='bar', figsize=(16, 10))

In [None]:
# Train a fresh random forest with the selected parameters on the training
# split and score it on both the training and the held-out test split.
rf_final, f1_rf_final, rf_timing_final = train_evaluate(RandomForestClassifier(**rf_grid.best_params_),
                                                       X_train, y_train,
                                                       X_test, y_test,
                                                       verbose=False)

# One-row summary: F1 scores joined with the timings.
rf_tuned_stats = pd.DataFrame([f1_rf_final]).join(pd.DataFrame([rf_timing_final]))
rf_tuned_stats