## Data retrieval

In [24]:
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')

import pandas as pd

# get data
df = pd.read_csv('student_records.csv')
df

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Henry,A,Y,90,85,Yes
1,John,C,N,85,51,Yes
2,David,F,N,10,17,No
3,Holmes,B,Y,75,71,No
4,Marvin,E,N,20,30,No
5,Simon,A,Y,92,79,Yes
6,Robert,B,Y,60,59,No
7,Trent,C,Y,75,33,No
8,Jokin,B,Y,65,70,Yes
9,Antxon,E,N,28,32,No


## Data preparation

### Feature extraction and engineering

In [25]:
# get features and corresponding outcomes
feature_names = ['OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']
training_features = df[feature_names]

outcome_name = ['Recommend']
outcome_labels = df[outcome_name]

In [26]:
# view features
training_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,90,85
1,C,N,85,51
2,F,N,10,17
3,B,Y,75,71
4,E,N,20,30
5,A,Y,92,79
6,B,Y,60,59
7,C,Y,75,33
8,B,Y,65,70
9,E,N,28,32


In [27]:
# view outcome labels
outcome_labels

Unnamed: 0,Recommend
0,Yes
1,Yes
2,No
3,No
4,No
5,Yes
6,No
7,No
8,Yes
9,No


In [28]:
# list down features based on type
numeric_feature_names = ['ResearchScore', 'ProjectScore']
categoricial_feature_names = ['OverallGrade', 'Obedient']

### Numeric Feature Scaling

In [29]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

# fit scaler on numeric features
ss.fit(training_features[numeric_feature_names])

# scale numeric features now
training_features[numeric_feature_names] = ss.transform(training_features[numeric_feature_names])

# view updated featureset
training_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,1.02554,1.338668
1,C,N,0.855921,0.017929
2,F,N,-1.688357,-1.302811
3,B,Y,0.516684,0.794834
4,E,N,-1.34912,-0.797822
5,A,Y,1.093387,1.105596
6,B,Y,0.007829,0.328691
7,C,Y,0.516684,-0.681286
8,B,Y,0.177447,0.755989
9,E,N,-1.07773,-0.720132


### Engineering Categorical Features

In [30]:
training_features = pd.get_dummies(training_features, columns=categoricial_feature_names)
# view newly engineering features
training_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_B,OverallGrade_C,OverallGrade_E,OverallGrade_F,Obedient_N,Obedient_Y
0,1.02554,1.338668,1,0,0,0,0,0,1
1,0.855921,0.017929,0,0,1,0,0,1,0
2,-1.688357,-1.302811,0,0,0,0,1,1,0
3,0.516684,0.794834,0,1,0,0,0,0,1
4,-1.34912,-0.797822,0,0,0,1,0,1,0
5,1.093387,1.105596,1,0,0,0,0,0,1
6,0.007829,0.328691,0,1,0,0,0,0,1
7,0.516684,-0.681286,0,0,1,0,0,0,1
8,0.177447,0.755989,0,1,0,0,0,0,1
9,-1.07773,-0.720132,0,0,0,1,0,1,0


In [31]:
# get list of new categorical features
categorical_engineered_features = list(set(training_features.columns) - set(numeric_feature_names))
categorical_engineered_features

['Obedient_Y',
 'OverallGrade_F',
 'OverallGrade_E',
 'OverallGrade_C',
 'OverallGrade_B',
 'OverallGrade_A',
 'Obedient_N']

## Modeling

In [44]:
from sklearn.linear_model import LogisticRegression
import numpy as np

from sklearn.cross_validation import train_test_split
X = training_features
y = np.array(outcome_labels['Recommend'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
print('training: ' + str(X_train.shape))
print('test: ' + str(X_test.shape))

training: (10, 9)
test: (3, 9)


## Model Evaluation

Fitting the model on the whole dataset

In [45]:
# fit the model
lr = LogisticRegression() 
model = lr.fit(training_features, y)
# view model parameters
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [46]:
# simple evaluation on training data
pred_labels = model.predict(training_features)
actual_labels = np.array(outcome_labels['Recommend'])

# evaluate model performance
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('Accuracy:', float(accuracy_score(actual_labels, pred_labels))*100, '%')
print('Classification Stats:')
print(classification_report(actual_labels, pred_labels))

Accuracy: 84.61538461538461 %
Classification Stats:
             precision    recall  f1-score   support

         No       0.88      0.88      0.88         8
        Yes       0.80      0.80      0.80         5

avg / total       0.85      0.85      0.85        13



Using training/test datasets

In [47]:
# fit the model
lr = LogisticRegression() 
model = lr.fit(X_train, y_train)
# view model parameters
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [35]:
# simple evaluation on training data
pred_labels = model.predict(X_train)
actual_labels = y_train

# evaluate model performance
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('Accuracy:', float(accuracy_score(actual_labels, pred_labels))*100, '%')
print('Classification Stats:')
print(classification_report(actual_labels, pred_labels))

Accuracy: 80.0 %
Classification Stats:
             precision    recall  f1-score   support

         No       0.83      0.83      0.83         6
        Yes       0.75      0.75      0.75         4

avg / total       0.80      0.80      0.80        10



In [36]:
# evaluation on test data
pred_labels = model.predict(X_test)
actual_labels = y_test

# evaluate model performance
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('Accuracy:', float(accuracy_score(actual_labels, pred_labels))*100, '%')
print('Classification Stats:')
print(classification_report(actual_labels, pred_labels))

Accuracy: 100.0 %
Classification Stats:
             precision    recall  f1-score   support

         No       1.00      1.00      1.00         2
        Yes       1.00      1.00      1.00         1

avg / total       1.00      1.00      1.00         3



## Model Deployment 

In [50]:
# fit the model on the whole dataset before dumping it
lr = LogisticRegression() 
model = lr.fit(training_features, y)

In [51]:
from sklearn.externals import joblib
import os
# save model and scaler to be deployed on your server
joblib.dump(model, r'model.pickle') 
joblib.dump(ss, r'scaler.pickle') 

['scaler.pickle']

## Prediction in Action

In [52]:
# load model and scaler objects
model = joblib.load(r'model.pickle')
scaler = joblib.load(r'scaler.pickle')

In [53]:
## data retrieval
new_data = pd.DataFrame([{'Name': 'Nathan', 'OverallGrade': 'F', 'Obedient': 'N', 'ResearchScore': 30, 'ProjectScore': 20},
                  {'Name': 'Thomas', 'OverallGrade': 'A', 'Obedient': 'Y', 'ResearchScore': 78, 'ProjectScore': 80}])
new_data = new_data[['Name', 'OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']]
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore
0,Nathan,F,N,30,20
1,Thomas,A,Y,78,80


In [54]:
## data preparation
prediction_features = new_data[feature_names]

# scaling
prediction_features[numeric_feature_names] = scaler.transform(prediction_features[numeric_feature_names])

# engineering categorical variables
prediction_features = pd.get_dummies(prediction_features, columns=categoricial_feature_names)

# view feature set
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y
0,-1.009883,-1.186275,0,1,1,0
1,0.618455,1.144442,1,0,0,1


In [55]:
# add missing categorical feature columns
current_categorical_engineered_features = set(prediction_features.columns) - set(numeric_feature_names)
missing_features = set(categorical_engineered_features) - current_categorical_engineered_features
for feature in missing_features:
    # add zeros since feature is absent in these data samples
    prediction_features[feature] = [0] * len(prediction_features) 

# view final feature set
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y,OverallGrade_B,OverallGrade_E,OverallGrade_C
0,-1.009883,-1.186275,0,1,1,0,0,0,0
1,0.618455,1.144442,1,0,0,1,0,0,0


In [56]:
## predict using model
predictions = model.predict(prediction_features)

## display results
new_data['Recommend'] = predictions
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Nathan,F,N,30,20,No
1,Thomas,A,Y,78,80,Yes
