In [1]:
import io
import requests
import pandas as pd
import numpy as np

In [3]:
url="https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ISLR/College.csv"
s=requests.get(url).content
data =pd.read_csv(io.StringIO(s.decode('utf-8')), delimiter = ",")
del data["Unnamed: 0"]
data["Private"] = pd.get_dummies(data["Private"],drop_first=True)
data.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,1,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,1,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,1,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,1,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,1,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [4]:
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split(data, train_size = 0.7)

In [5]:
train, validate, test = np.split(data.sample(frac=1), [int(.5*len(data)), int(.75*len(data))])

In [6]:
scale_train = data_train.iloc[:,1::]
scale_test = data_test.iloc[:,1::]
scale_train.head()

Unnamed: 0,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
370,610,461,189,26,52,685,49,8200,3300,550,1000,63,69,12.0,16,8128,64
738,6502,3539,1372,11,51,7484,1904,7844,4108,400,2000,76,79,15.3,16,6773,52
456,1132,847,302,58,89,1379,214,16200,4200,436,2486,90,90,10.4,14,14329,62
212,346,274,146,51,87,704,63,9900,3670,630,1818,59,59,10.5,14,8095,54
569,8598,4562,1143,56,93,5060,146,6550,4170,600,650,79,84,19.1,25,5716,76


In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(scale_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:
scale_train.iloc[:,:] = scaler.transform(scale_train)
scale_test.iloc[:,:] = scaler.transform(scale_test)
scale_test.head()

Unnamed: 0,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
58,-0.570998,-0.602032,-0.627774,-0.398507,-0.252867,-0.602849,-0.56742,-0.114087,-0.325661,-0.883322,-0.637199,0.268511,-0.558273,-0.212151,-0.287155,-0.080297,-0.436108
518,0.411711,0.641714,0.274119,-0.635646,-0.983223,0.177729,0.564013,0.419884,1.945781,0.624556,-0.491121,0.574046,0.321381,0.063682,-0.612678,0.120232,-0.08485
443,-0.507491,-0.465468,-0.537699,0.846472,0.842666,-0.542324,-0.603338,0.981826,-0.322017,-0.581746,-0.345043,0.268511,-0.219945,-0.813968,-0.043013,0.346449,-1.372796
581,2.916051,3.476523,6.422045,1.32075,1.573022,6.049078,1.554115,-1.326965,-0.852081,0.32298,1.180012,1.062903,0.795041,2.24527,0.526651,-0.200176,0.207866
672,-0.673975,-0.697422,-0.628915,-0.339222,-0.618045,-0.521,-0.261332,-1.69388,-2.210939,0.32298,0.677504,-0.281453,-0.152279,2.370649,-1.589246,-1.11759,-1.314253


In [9]:
from sklearn import linear_model
logreg = linear_model.LogisticRegression()
logreg.fit(scale_train, data_train["Private"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
from sklearn.metrics import confusion_matrix
Private_predicted = logreg.predict(scale_train)
conf = confusion_matrix(data_train["Private"],Private_predicted) 
conf

array([[124,  15],
       [ 12, 392]])

In [11]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(data_train["Private"],Private_predicted) 
acc


0.95027624309392267

In [12]:
from sklearn.metrics import log_loss
loss = log_loss(data_train["Private"],Private_predicted)
loss

1.717419809620703

In [13]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
prec = precision_score(data_train["Private"],Private_predicted, average = None)
prec

array([ 0.91176471,  0.96314496])

In [14]:
from sklearn.metrics import recall_score
rec = recall_score(data_train["Private"],Private_predicted, average = None)
rec

array([ 0.89208633,  0.97029703])

In [17]:
prob_predicted = logreg.predict_proba(scale_train)[:,1]

In [18]:
def conf_matrix(prob_predicted,values, treshold):
    predicted_values = np.where(prob_predicted > treshold, 1, 0)
    matrix = confusion_matrix(values, predicted_values) 
    return matrix

In [19]:
conf_matrix(prob_predicted,data_train["Private"], .5)

array([[124,  15],
       [ 12, 392]])

In [20]:
def sensitivity(prob_predicted,values, treshold):
    matrix = conf_matrix(prob_predicted,values, treshold)
    return matrix[1,1] / (matrix[1,0] + matrix[1,1])

In [21]:
sensitivity(prob_predicted,data_train["Private"], .5)

0.97029702970297027

In [22]:
def precision(prob_predicted,values, treshold):
    matrix = conf_matrix(prob_predicted,values, treshold)
    return matrix[1,1] / (matrix[1,1] + matrix[0,1])

In [23]:
precision(prob_predicted,data_train["Private"], .5)

0.96314496314496312

In [24]:
def specifity(prob_predicted,values, treshold):
    matrix = conf_matrix(prob_predicted,values, treshold)
    return matrix[0,0] / (matrix[0,0] + matrix[0,1])

In [25]:
specifity(prob_predicted,data_train["Private"], .5)

0.8920863309352518

In [26]:
def f_beta(prob_predicted,values, beta, treshold):
    rec = sensitivity(prob_predicted,values, treshold)
    prec = precision(prob_predicted,values, treshold)
    return 2 * prec * rec / (beta*prec + rec)

In [27]:
f_beta(prob_predicted,data_train["Private"],1, .5)

0.96670776818742288

In [28]:
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
f1 = f1_score(data_train["Private"],Private_predicted)
f1

0.96670776818742288

In [30]:
fbeta = fbeta_score(data_train["Private"],Private_predicted, 4)
fbeta

0.96987338087614616

In [31]:
np.median(prob_predicted)

0.96882927962245957

In [32]:
def optimal_cutoff(prob_predicted,values,cost_matrix):
    X = np.linspace(0.0,1.0,101)
    Y = [calculate_cost(prob_predicted,values,cost_matrix, x) for x in X]
    return X[Y.index(max(Y))]
    

In [33]:
def calculate_cost(prob_predicted,values,cost_matrix, treshold):
    conf = conf_matrix(prob_predicted,values, treshold)
    return conf[0,0]*cost_matrix[0,0] + conf[1,0]*cost_matrix[1,0] + conf[0,1]*cost_matrix[0,1] + conf[1,1]*cost_matrix[1,1]

In [34]:
cost_matrix = np.array([[-50, -100],[-100, -50]])
cost_matrix

array([[ -50, -100],
       [-100,  -50]])

In [35]:
calculate_cost(prob_predicted,data_train["Private"],cost_matrix, 0.5)

-28500

In [36]:
 optimal_cutoff(prob_predicted,data_train["Private"],cost_matrix)

0.46000000000000002

In [37]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(logreg, scale_train, data_train["Private"], cv=10)
scores     

array([ 0.92727273,  0.96363636,  0.92727273,  0.92727273,  0.98148148,
        0.92592593,  0.94444444,  0.94444444,  0.92592593,  0.9245283 ])

In [38]:
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
predicted = cross_val_predict(logreg, scale_train, data_train["Private"], cv=10)
metrics.accuracy_score( data_train["Private"],  predicted) 

0.93922651933701662