# Student Intervention System

In [2]:
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import f1_score

student_data = pd.read_csv("student-data.csv")
print ("Student data read successfully!")

Student data read successfully!


## Explore Dataset

In [3]:
n_students = len(student_data)
n_features = len(student_data.iloc[0]) - 1
n_passed = len(student_data[student_data['passed'] == 'yes'])
n_failed = len(student_data[student_data['passed'] == 'no'])
grad_rate = float(n_passed)/n_students * 100

print("Total number of students （number of datapoints): {}".format(n_students))
print("Number of features: {}".format(n_features))
print("Number of students who passed (graduates): {}".format(n_passed))
print("Number of students who failed (non-graduates): {}".format(n_failed))
print("Graduation rate of the class: {:.2f}%".format(grad_rate))

Total number of students （number of datapoints): 395
Number of features: 30
Number of students who passed (graduates): 265
Number of students who failed (non-graduates): 130
Graduation rate of the class: 67.09%


## Data Preparation

In [5]:
feature_cols = list(student_data.columns[:-1])

target_col = student_data.columns[-1] 

print("Feature columns:\n{}".format(feature_cols))
print("\nTarget column: {}".format(target_col))

X_all = student_data[feature_cols]
y_all = student_data[target_col]

print("\nFeature values:")
print(X_all.head())

Feature columns:
['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']

Target column: passed

Feature values:
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher   
1     GP   F   17       U     GT3       T     1     1  at_home     other   
2     GP   F   15       U     LE3       T     1     1  at_home     other   
3     GP   F   15       U     GT3       T     4     2   health  services   
4     GP   F   16       U     GT3       T     3     3    other     other   

    ...    higher internet  romantic  famrel  freetime goout Dalc Walc health  \
0   ...       yes       no        no       4         3     4    1    1      3   
1   ...       

## Feature PreProceesing

In [6]:
def preprocess_features(X):
    output = pd.DataFrame(index = X.index)
    for col, col_data in X.iteritems():
        
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])

        if col_data.dtype == object:
            # Example: 'school' => 'school_GP' and 'school_MS'
            col_data = pd.get_dummies(col_data, prefix = col)  
        
        output = output.join(col_data)
    
    return output

X_all = preprocess_features(X_all)
print("Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns)))

Processed feature columns (48 total features):
['school_GP', 'school_MS', 'sex_F', 'sex_M', 'age', 'address_R', 'address_U', 'famsize_GT3', 'famsize_LE3', 'Pstatus_A', 'Pstatus_T', 'Medu', 'Fedu', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']


## Training and Test Split

In [7]:
from sklearn.cross_validation import train_test_split
from sklearn.utils import shuffle

num_train = 300

num_test = X_all.shape[0] - num_train

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, train_size=num_train, test_size=num_test)

print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 300 samples.
Testing set has 95 samples.




In [8]:
def train_classifier(clf, X_train, y_train):
    
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    print("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    
    start = time()
    y_pred = clf.predict(features)
    end = time()
    
    print("Made predictions in {:.4f} seconds.".format(end - start))
    return f1_score(target.values, y_pred, pos_label='yes')


def train_predict(clf, X_train, y_train, X_test, y_test):
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    train_classifier(clf, X_train, y_train)
    print("F1 score for training set: {:.4f}.".format(predict_labels(clf, X_train, y_train)))
    print("F1 score for test set: {:.4f}.".format(predict_labels(clf, X_test, y_test)))
    print("\n")

## Model Performance

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

clf_A = RandomForestClassifier(random_state=0)
clf_B = GaussianNB()
clf_C = LogisticRegression(random_state=0)

X_train_100 = X_train[:100]
y_train_100 = y_train[:100]

X_train_200 = X_train[:200]
y_train_200 = y_train[:200]

X_train_300 = X_train
y_train_300 = y_train

for clf in [clf_A, clf_B, clf_C]:
    for j in [(X_train_100, y_train_100), (X_train_200, y_train_200), (X_train_300, y_train_300)]:
        train_predict(clf, j[0], j[1], X_test, y_test)

Training a RandomForestClassifier using a training set size of 100. . .
Trained model in 0.0360 seconds
Made predictions in 0.0040 seconds.
F1 score for training set: 0.9925.
Made predictions in 0.0000 seconds.
F1 score for test set: 0.7015.


Training a RandomForestClassifier using a training set size of 200. . .
Trained model in 0.0240 seconds
Made predictions in 0.0000 seconds.
F1 score for training set: 0.9887.
Made predictions in 0.0040 seconds.
F1 score for test set: 0.7344.


Training a RandomForestClassifier using a training set size of 300. . .
Trained model in 0.0240 seconds
Made predictions in 0.0040 seconds.
F1 score for training set: 0.9927.
Made predictions in 0.0040 seconds.
F1 score for test set: 0.7132.


Training a GaussianNB using a training set size of 100. . .
Trained model in 0.0040 seconds
Made predictions in 0.0000 seconds.
F1 score for training set: 0.3953.
Made predictions in 0.0000 seconds.
F1 score for test set: 0.4096.


Training a GaussianNB using a traini

In [10]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier


clf_A = DecisionTreeClassifier(random_state=0)
clf_B = SGDClassifier(random_state=0)
clf_C = SVC(random_state=0)
clf_D = KNeighborsClassifier()

X_train_100 = X_train[:100]
y_train_100 = y_train[:100]

X_train_200 = X_train[:200]
y_train_200 = y_train[:200]

X_train_300 = X_train
y_train_300 = y_train

for clf in [clf_A, clf_B, clf_C, clf_D]:
    for j in [(X_train_100, y_train_100), (X_train_200, y_train_200), (X_train_300, y_train_300)]:
        train_predict(clf, j[0], j[1], X_test, y_test)

Training a DecisionTreeClassifier using a training set size of 100. . .
Trained model in 0.0000 seconds
Made predictions in 0.0040 seconds.
F1 score for training set: 1.0000.
Made predictions in 0.0000 seconds.
F1 score for test set: 0.7377.


Training a DecisionTreeClassifier using a training set size of 200. . .
Trained model in 0.0040 seconds
Made predictions in 0.0000 seconds.
F1 score for training set: 1.0000.
Made predictions in 0.0000 seconds.
F1 score for test set: 0.5983.


Training a DecisionTreeClassifier using a training set size of 300. . .
Trained model in 0.0055 seconds
Made predictions in 0.0000 seconds.
F1 score for training set: 1.0000.
Made predictions in 0.0000 seconds.
F1 score for test set: 0.6116.


Training a SGDClassifier using a training set size of 100. . .
Trained model in 0.0760 seconds
Made predictions in 0.0000 seconds.
F1 score for training set: 0.8121.
Made predictions in 0.0000 seconds.
F1 score for test set: 0.7712.


Training a SGDClassifier using a 



## Model Tuning

In [12]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer

def predict_labels(clf, features, target):
    start = time()
    y_pred = clf.predict(features)
    score = clf.score(features, target.values)
    end = time()
    print("Score: ", score)
    
    print("Made predictions in {:.4f} seconds.".format(end - start))
    return f1_score(target.values, y_pred, pos_label='yes')


parameters = { "penalty":["l2","l1"], 
               "C":[1,10,100,1000],
              }

clf = LogisticRegression()

f1_scorer = make_scorer(f1_score, pos_label='yes')

grid_obj = GridSearchCV(clf, parameters, scoring=f1_scorer)

grid_obj = grid_obj.fit(X_train, y_train)
print(grid_obj)
clf = grid_obj.best_estimator_
print(clf)

print("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))



GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l2', 'l1'], 'C': [1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(f1_score, pos_label=yes), verbose=0)
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Score:  0.77
Made predictions in 0.0000 seconds.
Tuned model has a training F1 score of 0.8463.
Score:  0.6526315789473685
Made predictions in 0.0000 seconds.
Tuned model has a testing F1 sco