In [1]:
import pandas as pd

In [2]:
# get data
df = pd.read_csv('student_records.csv')

In [3]:
print(df)

     Name OverallGrade Obedient  ResearchScore  ProjectScore Recommend
0   Henry            A        Y             90            85       Yes
1    John            C        N             85            51       Yes
2   David            F        N             10            17        No
3  Holmes            B        Y             75            71        No
4  Marvin            E        N             20            30        No
5   Simon            A        Y             92            79       Yes
6  Robert            B        Y             60            59        No
7   Trent            C        Y             75            33        No


In [4]:
# Data preparation
# Feature extraction and engineering

In [5]:
feature_names = ['OverallGrade','Obedient','ResearchScore','ProjectScore']

In [6]:
training_features = df[feature_names]

In [7]:
outcome_name = ['Recommend']

In [8]:
outcome_lables = df[outcome_name]

In [9]:
# view features
training_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,90,85
1,C,N,85,51
2,F,N,10,17
3,B,Y,75,71
4,E,N,20,30
5,A,Y,92,79
6,B,Y,60,59
7,C,Y,75,33


In [10]:
# view outcome labels
outcome_lables

Unnamed: 0,Recommend
0,Yes
1,Yes
2,No
3,No
4,No
5,Yes
6,No
7,No


In [11]:
# list down features based on type
numerical_features_names = ['ResearchScore','ProjectScore']
categorical_feature_name = ['OverallGrade','Obedient']

In [12]:
# numeric feature scaling
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# fit scaler on numeric features
ss.fit(training_features[numerical_features_names])
training_features[numerical_features_names] = ss.transform(training_features[numerical_features_names])
# view updated featureset
training_features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,0.899583,1.37665
1,C,N,0.730648,-0.091777
2,F,N,-1.80339,-1.560203
3,B,Y,0.392776,0.772004
4,E,N,-1.465519,-0.998746
5,A,Y,0.967158,1.117516
6,B,Y,-0.114032,0.253735
7,C,Y,0.392776,-0.869179


In [13]:
## Engineering Categorical Features

In [14]:
training_features = pd.get_dummies(training_features,columns=categorical_feature_name)
# view newly engineering features
training_features


Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_B,OverallGrade_C,OverallGrade_E,OverallGrade_F,Obedient_N,Obedient_Y
0,0.899583,1.37665,1,0,0,0,0,0,1
1,0.730648,-0.091777,0,0,1,0,0,1,0
2,-1.80339,-1.560203,0,0,0,0,1,1,0
3,0.392776,0.772004,0,1,0,0,0,0,1
4,-1.465519,-0.998746,0,0,0,1,0,1,0
5,0.967158,1.117516,1,0,0,0,0,0,1
6,-0.114032,0.253735,0,1,0,0,0,0,1
7,0.392776,-0.869179,0,0,1,0,0,0,1


In [18]:
# get list of new categorical features
categorical_engineered_features = list(set(training_features.columns) - set(numerical_features_names))
categorical_engineered_features

['Obedient_N',
 'Obedient_Y',
 'OverallGrade_B',
 'OverallGrade_E',
 'OverallGrade_F',
 'OverallGrade_C',
 'OverallGrade_A']

In [19]:
# Modeling

In [20]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# fit the model
lr = LogisticRegression()

In [21]:
model = lr.fit(training_features, np.array(outcome_lables['Recommend']))
# view model parameters
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
# Model evaluation

In [23]:
# simple evaluation on training data
pred_labels = model.predict(training_features)
actual_labels = np.array(outcome_lables['Recommend'])

#evaluate model performance

In [30]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('accuracy: ', float(accuracy_score(actual_labels,pred_labels))*100,'%')
print('Classification Stats:')
print(classification_report(actual_labels, pred_labels))

accuracy:  100.0 %
Classification Stats:
             precision    recall  f1-score   support

         No       1.00      1.00      1.00         5
        Yes       1.00      1.00      1.00         3

avg / total       1.00      1.00      1.00         8



In [31]:
# Model deployment

from sklearn.externals import joblib
import os

#save model to deploy on your server
if not os.path.exists('Model'):
    os.mkdir('Model')
if not os.path.exists('Scaler'):
    os.mkdir('Scaler')

joblib.dump(model, r'Model/model.pickle') 
joblib.dump(ss, r'Scaler/scaler.pickle')

['Scaler/scaler.pickle']

In [32]:
# prediction in action
model = joblib.load(r'Model/model.pickle')
scaler = joblib.load(r'Scaler/Scaler.pickle')

In [33]:
## data retrieval
new_data = pd.DataFrame([{'Name': 'Nathan', 'OverallGrade': 'F', 'Obedient': 'N', 'ResearchScore': 30, 'ProjectScore': 20},
                  {'Name': 'Thomas', 'OverallGrade': 'A', 'Obedient': 'Y', 'ResearchScore': 78, 'ProjectScore': 80}])
new_data = new_data[['Name', 'OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']]
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore
0,Nathan,F,N,30,20
1,Thomas,A,Y,78,80


In [43]:
## data preparation
prediction_features = new_data[feature_names]
# scaling
prediction_features[numerical_features_names] = scaler.transform(prediction_features[numerical_features_names])

# engineering categorical variables
prediction_features = pd.get_dummies(prediction_features, columns=categorical_feature_name)

# view feature set
prediction_features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y
0,-1.127647,-1.430636,0,1,1,0
1,0.494137,1.160705,1,0,0,1


In [45]:
# add missing categorical feature columns
current_categorical_engineered_features = set(prediction_features.columns) - set(numerical_features_names)
missing_features = set(categorical_engineered_features) - current_categorical_engineered_features
for feature in missing_features:
    # add zeros since feature is absent in these data samples
    prediction_features[feature] = [0] * len(prediction_features) 

# view final feature set
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y,OverallGrade_B,OverallGrade_E,OverallGrade_C
0,-1.127647,-1.430636,0,1,1,0,0,0,0
1,0.494137,1.160705,1,0,0,1,0,0,0


In [46]:
## predict using model
predictions = model.predict(prediction_features)

## display results
new_data['Recommend'] = predictions
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Nathan,F,N,30,20,No
1,Thomas,A,Y,78,80,Yes
