In [1]:
# import required libraries
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

In [2]:
# Read the grades table from the CSV file next to the notebook and echo it.
csv_path = 'grades_dataset2.csv'
dataset = pd.read_csv(csv_path)
print(dataset)

   extra_hours  grade attend_class letter_grade
0           40   42.6           No            F
1           40   56.8          Yes            E
2           60   68.0          Yes            D
3           70   60.0           No            D
4           80   69.5           No            C
5           90   88.1          Yes            B
6          100   80.4           No            B
7          110   83.3          Yes            B
8          120   75.8           No            C
9          120   94.7          Yes            A


In [3]:
# Build the arrays handed to the classifier.
#   CV   -> target: the attend_class label of every row, as a 1-D array
#   data -> features: the columns from 'extra_hours' through 'grade'
#           (label slicing with .loc includes both end columns)
n_rows = len(dataset.attend_class)
CV = dataset.attend_class.values.reshape((n_rows, ))
feature_columns = dataset.loc[:, 'extra_hours':'grade']
# Reshaping to (n_rows, 2) also fails loudly if the slice is not exactly two columns wide.
data = feature_columns.values.reshape((n_rows, 2))

In [4]:
# Create a LogisticRegression object.
# The solver is pinned to 'liblinear' -- the solver reported in the fitted
# estimator's recorded repr -- because scikit-learn 0.22 changed the default
# solver to 'lbfgs'. Without the pin, newer installs fit a different model and
# the coefficients and probabilities recorded in this notebook cannot be reproduced.
LogReg = LogisticRegression(solver='liblinear')

In [5]:
# Fit the logistic-regression model on every row of the dataset: `data` holds
# the features and `CV` the attend_class labels. No rows are held out, so the
# same arrays serve as the training set here and as the evaluation set in the
# later scoring cells.
# The call is left as the cell's last bare expression so that the notebook
# displays the fitted estimator together with its hyper-parameters.
LogReg.fit(data, CV)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
# Report the parameters learned by the fitted model: one weight per feature
# column of `data`, plus the intercept term.
coefficients = LogReg.coef_
intercept = LogReg.intercept_
print('Coefficients (m): \n', coefficients)
print('Intercept (b): \n', intercept)

Coefficients (m): 
 [[-0.05989228  0.08042383]]
Intercept (b): 
 [-0.56292288]


In [7]:
# Ask the fitted model for a class label for each row it was trained on.
predicted = LogReg.predict(data)
# Show the labels as a single column (one row per data point) for readability.
predictions_column = predicted[:, np.newaxis]
print("Predictions: \n", predictions_column)

Predictions: 
 [['Yes']
 ['Yes']
 ['Yes']
 ['Yes']
 ['Yes']
 ['Yes']
 ['No']
 ['No']
 ['No']
 ['No']]


In [8]:
# Per-row class probabilities behind the predictions above.
# Columns follow LogReg.classes_; judging by the recorded output the first
# column is 'No' and the second is 'Yes'.
class_probabilities = LogReg.predict_proba(data)
print("Probability of prediction: \n", class_probabilities)

Probability of prediction: 
 [[0.38522207 0.61477793]
 [0.16666366 0.83333634]
 [0.21209205 0.78790795]
 [0.4824953  0.5175047 ]
 [0.44148164 0.55851836]
 [0.24377164 0.75622836]
 [0.52150542 0.47849458]
 [0.61105961 0.38894039]
 [0.8394134  0.1605866 ]
 [0.53341501 0.46658499]]


In [9]:
# Accuracy measured on the very rows the model was fitted on (no hold-out set),
# so this is a training-set score rather than an estimate of generalisation.
training_accuracy = LogReg.score(data, CV)
print("Accuracy score for the model: \n", training_accuracy)

Accuracy score for the model: 
 0.5


In [10]:
# Confusion matrix of actual labels (CV, first argument) against the model's
# predictions. Rows are actual classes and columns are predicted classes,
# both listed in the order given by `label_order`.
label_order = ["Yes", "No"]
confusion = metrics.confusion_matrix(CV, predicted, labels=label_order)
print(confusion)

[[3 2]
 [3 2]]


In [11]:
# 5-fold cross-validation of a fresh, unfitted logistic-regression model.
# solver='liblinear' is set explicitly: it is the solver the recorded results
# were produced with, and scikit-learn >= 0.22 would otherwise default to 'lbfgs'.
model = LogisticRegression(solver='liblinear')
# KFold without shuffling splits the rows into 5 contiguous blocks in file order.
kf = KFold(n_splits=5)
# With no `scoring` argument the classifier's own score (accuracy) is used, so
# every value already lies in [0, 1]. The abs() wrappers that used to surround
# the scores were no-ops and wrongly suggested a negated metric.
scores = cross_val_score(model, data, CV, cv=kf)
print("Accuracy of every fold in 5 fold cross validation: ", scores)
print("Mean of the 5 fold cross-validation: %0.2f" % scores.mean())

Accuracy of every fold in 5 fold cross validation:  [0.5 0.5 0.5 0.5 0.5]
Mean of the 5 fold cross-validation: 0.50


In [12]:
# Score one hypothetical student. The feature order matches `data`:
# extra_hours first (100), then grade (60). The model expects a 2-D array,
# so the single sample is written as one row.
datapoint = np.array([[100, 60]])
predicted_label = LogReg.predict(datapoint)
label_probabilities = LogReg.predict_proba(datapoint)
print("Does he attend class, if he gets 60 after putting 100 hours of effort?\n ",
      predicted_label, label_probabilities)

Does he attend class, if he gets 60 after putting 100 hours of effort?
  ['No'] [[0.84899251 0.15100749]]
