In [42]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn as sk
import seaborn as sns
import warnings; warnings.simplefilter('ignore')
import matplotlib.pyplot as plt
import mglearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (confusion_matrix,precision_score,recall_score,f1_score,
    roc_curve,roc_auc_score,precision_recall_curve,accuracy_score,classification_report)

In [2]:
data = pd.read_csv("Energi_Viborg_Dandas_data.csv")

#drop columns not needed after asking the company about the meaning of these features

columns_to_be_removed = ['ID', 'mslink', 'XKoordinat','YKoordinat','LedningID','Dobbeltled','EjerKompon','SystemKode','KategoriAf','DatoUdf']
data=data.drop(columns_to_be_removed,axis='columns')

# in the column DatoSaneri is the date of repairing and if there is no date it means it is not repaired

data['DatoSaneri'].fillna(0, inplace=True)

# take only the pipes that are broken(by TV insection) now and the repaired ones

data_with_TVObsAndSaneri = data[data['TVObsKode'].isin([1]) | data['DatoSaneri'] > 0]

In [3]:
#get around 2077 rows with not broken pipes

data_not_broken = data[~data['TVObsKode'].isin([0]) | data['DatoSaneri'] == 0]
data_not_broken = data_not_broken.sample(n=2077) 

In [4]:
frames = [data_with_TVObsAndSaneri, data_not_broken]
  
data_final = pd.concat(frames)
data_final

Unnamed: 0,fra_kote,til_kote,Laengde,Fald,DiameterIn,MaterialeK,anlag_aar,TransportK,Funktionsk,TVObsKode,DatoOprett,DatoOpdate,DatoSaneri
36,34.72,33.48,64.88,19.112207,300.0,1.0,1939.0,1,0,0.0,2010,2014,1997.0
42,39.46,39.16,91.75,3.269755,400.0,1.0,1939.0,1,0,1.0,2010,2014,0.0
43,39.71,39.48,87.69,2.622876,300.0,1.0,1939.0,1,0,1.0,2010,2014,0.0
64,40.55,40.08,52.11,9.019382,250.0,1.0,1945.0,1,0,1.0,2010,2014,0.0
65,40.38,40.55,68.39,-2.485744,250.0,1.0,1945.0,1,0,1.0,2010,2014,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10374,49.85,49.14,60.54,11.727783,200.0,4.0,2008.0,1,0,0.0,2010,2014,0.0
14100,43.68,43.39,15.27,18.991487,240.0,4.0,1964.0,1,0,0.0,2010,2014,0.0
12106,17.61,16.66,99.58,9.540068,200.0,1.0,1976.0,1,0,0.0,2010,2014,0.0
7001,32.31,31.89,38.97,10.777521,250.0,4.0,2000.0,1,0,0.0,2010,2014,0.0


In [5]:
data = data_final

In [6]:
# data_fs= np.where(np.isnan(data_features))
# data_fs
print("Number of rows before removing NaNs: {}".format(data.shape[0]))
data = data.dropna()
print("Number of rows after removing NaNs: {}".format(data.shape[0]))

Number of rows before removing NaNs: 4154
Number of rows after removing NaNs: 4154


In [7]:
#get data copied
datacopy = data


# add  age column

#get current year
from datetime import date
now = date.today().year


def age_df(datacopy):

    if (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri'] > 0) :
        return (now - datacopy['DatoSaneri'])
    elif (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri']== 0):
        return (now - datacopy['anlag_aar'])
    elif (datacopy['TVObsKode'] == 0) and (datacopy['DatoSaneri'] > 0):
        return (now - datacopy['DatoSaneri'])
    elif (datacopy['TVObsKode']== 0) and (datacopy['DatoSaneri']== 0):
        return (now - datacopy['anlag_aar'])

datacopy['Age'] = datacopy.apply(age_df, axis = 1)

In [8]:
# add a column 'PipeStatus'
# 1 as broken and 0 as not broken

def broken_df(datacopy):

    if (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri'] < (datacopy['DatoOpdate'])) and (datacopy['DatoSaneri'] != 0):
        return 1
    elif (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri'] >= (datacopy['DatoOpdate'])) and (datacopy['DatoSaneri'] != 0):
        return 0
    elif (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri']== 0):
        return 1
    elif (datacopy['TVObsKode'] == 0) and (datacopy['DatoSaneri'] > 0):
        return 0
    elif (datacopy['TVObsKode']== 0) and (datacopy['DatoSaneri']== 0):
        return 0

datacopy['PipeStatus'] = datacopy.apply(broken_df, axis = 1)

In [9]:
# datacopy = datacopy.sample(n=22) 
# datacopy

In [10]:
# data_fs= np.where(np.isnan(datacopy))
# data_fs
# row = datacopy.iloc[369] #index=1 => second row
# print(row)

In [11]:
# data_fs= np.where(np.isnan(data_features))
# data_fs
print("Number of rows before removing NaNs: {}".format(datacopy.shape[0]))
datacopy = datacopy.dropna()
print("Number of rows after removing NaNs: {}".format(datacopy.shape[0]))

Number of rows before removing NaNs: 4154
Number of rows after removing NaNs: 4154


In [12]:
#drop columns not needed after adding new features

columns_to_be_removed = ['DatoOprett', 'DatoOpdate']
datacopy=datacopy.drop(columns_to_be_removed,axis='columns')
datacopy[0:-1]

Unnamed: 0,fra_kote,til_kote,Laengde,Fald,DiameterIn,MaterialeK,anlag_aar,TransportK,Funktionsk,TVObsKode,DatoSaneri,Age,PipeStatus
36,34.72,33.48,64.88,19.112207,300.0,1.0,1939.0,1,0,0.0,1997.0,24.0,0
42,39.46,39.16,91.75,3.269755,400.0,1.0,1939.0,1,0,1.0,0.0,82.0,1
43,39.71,39.48,87.69,2.622876,300.0,1.0,1939.0,1,0,1.0,0.0,82.0,1
64,40.55,40.08,52.11,9.019382,250.0,1.0,1945.0,1,0,1.0,0.0,76.0,1
65,40.38,40.55,68.39,-2.485744,250.0,1.0,1945.0,1,0,1.0,0.0,76.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13261,26.76,26.25,106.29,4.798194,250.0,1.0,1943.0,1,0,0.0,0.0,78.0,0
10374,49.85,49.14,60.54,11.727783,200.0,4.0,2008.0,1,0,0.0,0.0,13.0,0
14100,43.68,43.39,15.27,18.991487,240.0,4.0,1964.0,1,0,0.0,0.0,57.0,0
12106,17.61,16.66,99.58,9.540068,200.0,1.0,1976.0,1,0,0.0,0.0,45.0,0


In [13]:
# creating features set and target

columns_to_be_removed = ['PipeStatus']
data_features= datacopy.drop(columns_to_be_removed,axis='columns')
columns_to_be_removed = ['fra_kote','til_kote', 'Laengde','Fald','DiameterIn','MaterialeK','anlag_aar','TransportK','Funktionsk','TVObsKode','DatoSaneri','Age']
data_target=datacopy.drop(columns_to_be_removed,axis='columns')

In [14]:
# data_fs= np.where(np.isnan(data_features))
# data_fs
print("Number of rows before removing NaNs: {}".format(data.shape[0]))
data = data.dropna()
print("Number of rows after removing NaNs: {}".format(data.shape[0]))

Number of rows before removing NaNs: 4154
Number of rows after removing NaNs: 4154


# Tuning hyperparameters with train-test split and grid search cross validation

In [30]:
# Divide the data into training and test
X_train, X_test, y_train, y_test = train_test_split(
    data_features, data_target, stratify=data_target, random_state=42)

In [32]:
#List Hyperparameters that we want to tune by cross validation

C = [100, 10, 1.0, 0.1, 0.01]
penalty = ['l1', 'l2']

#Convert to dictionary
hyperparameters = dict(C = C , penalty= penalty)

#Create new logistic object
logreg = LogisticRegression( solver = 'liblinear')

#Use GridSearch
clf = GridSearchCV(logreg, hyperparameters, cv=5,return_train_score=True, scoring="recall")

#Fit the model
best_model = clf.fit(X_train, y_train)

#Print The value of best Hyperparameters
print('Best C:', best_model.best_estimator_.get_params()['C'])
print('Best penalty:', best_model.best_estimator_.get_params()['penalty'])
    
print("Accuracy on training set: {:.3f}".format(best_model.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(best_model.score(X_test, y_test)))

Best C: 10
Best penalty: l2
Accuracy on training set: 0.995
Accuracy on test set: 0.986


In [33]:
#Create forest Object.
a = best_model.best_estimator_.get_params()['C']
b = best_model.best_estimator_.get_params()['penalty']

logreg = LogisticRegression(C = a, penalty= b, solver = 'liblinear')

#Create x and y variables.
x = data_features
y = data_target

#Split data into training and testing.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42)

#Training the model.
logreg.fit(X_train, y_train)

#Predict test data set.
y_pred =logreg.predict(X_test)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       835
           1       0.96      1.00      0.98       204

    accuracy                           0.99      1039
   macro avg       0.98      0.99      0.99      1039
weighted avg       0.99      0.99      0.99      1039



0.9946107784431139

# Tuning hyperparameters with validation set split

In [34]:
# Divide the data into training, test and validation

X_trainval, X_test, y_trainval, y_test = train_test_split(data_features, data_target, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, random_state=43)
print("Size of training set:{}".format(X_train.shape[0]))
print("Size of validation set:{}".format(X_val.shape[0]))
print("Size of test set:{}".format(X_test.shape[0]))

Size of training set:2336
Size of validation set:779
Size of test set:1039


In [35]:
best_score = 0

for C in [100, 10, 1.0, 0.1, 0.01, 0.001]:
    for penalty in ['l1', 'l2']:
        
        # Learn the model
        logreg = LogisticRegression(penalty=penalty, C=C, solver = 'liblinear')
        logreg.fit(X_train, y_train)
        
        # Evaluate the model
        score = logreg.score(X_val, y_val)
        
        # If improvement, store score and parameter
        if score>best_score:
            best_score = score
            best_C = C
            best_penalty= penalty
                        


# Build a model on the combine training and valiation data
logreg = LogisticRegression(penalty= best_penalty, C =best_C, solver = 'liblinear')
logreg.fit(X_trainval, y_trainval)

print("Best best_C found: {}".format(best_C))
print("Best best_penalty found: {}".format(best_penalty))
print("Best score on validation set: {}".format(best_score))
print("Score on training/validation set: {}".format(logreg.score(X_trainval, y_trainval)))
print("Score on test set: {}".format(logreg.score(X_test, y_test)))

Best best_C found: 100
Best best_penalty found: l1
Best score on validation set: 0.993581514762516
Score on training/validation set: 0.9967897271268058
Score on test set: 0.9980750721847931


In [37]:
#Predict test data set.
y_pred = logreg.predict(X_test)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       835
           1       1.00      0.99      1.00       204

    accuracy                           1.00      1039
   macro avg       1.00      1.00      1.00      1039
weighted avg       1.00      1.00      1.00      1039



0.9950980392156863

# Tuning parameters with cross validation split

In [38]:
# Divide the data into training and test , no validation data

X_trainval, X_test, y_trainval, y_test = train_test_split(data_features, data_target, random_state=42)
print("Size of training set:{}".format(X_train.shape[0]))
print("Size of test set:{}".format(X_test.shape[0]))

Size of training set:2336
Size of test set:1039


In [39]:
best_score = 0

for C in [100, 10, 1.0, 0.1, 0.01, 0.001]:
    for penalty in ['l1', 'l2']:
        # Learn the model 
        logreg = LogisticRegression(penalty=penalty, C=C, solver = 'liblinear')
        
        # Perform cross validation
        scores = cross_val_score(logreg, X_trainval, y_trainval, cv=5)
        
        # Compute the mean score
        score = scores.mean()
        
        
        # If improvement, store score and parameter
        if score>best_score:
            best_score = score
            best_C = C
            best_penalty= penalty

# Build a model on the combine training and valiation data
logreg = LogisticRegression(penalty= best_penalty, C=best_C, solver = 'liblinear')
logreg.fit(X_trainval, y_trainval)

print("Best best_C found: {}".format(best_C))
print("Best best_penalty found: {}".format(best_penalty))
print("Best average score: {}".format(best_score))
print("Score on training/validation set: {}".format(logreg.score(X_trainval, y_trainval)))
print("Score on test set: {}".format(logreg.score(X_test, y_test)))

Best best_C found: 100
Best best_penalty found: l1
Best average score: 0.9961476725521671
Score on training/validation set: 0.9974317817014446
Score on test set: 0.9980750721847931


In [40]:
#Predict test data set.
y_pred = logreg.predict(X_test)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       835
           1       1.00      0.99      1.00       204

    accuracy                           1.00      1039
   macro avg       1.00      1.00      1.00      1039
weighted avg       1.00      1.00      1.00      1039



0.9950980392156863

# Tuning hyperparameters with train-test split and Normalisation

In [43]:
# Divide the data into training and test
X_train, X_test, y_train, y_test = train_test_split(
    data_features, data_target, stratify=data_target, random_state=42)

# preprocessing using 0-1 scaling
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [44]:
#List Hyperparameters that we want to tune by cross validation

C = [100, 10, 1.0, 0.1, 0.01]
penalty = ['l1', 'l2']

#Convert to dictionary
hyperparameters = dict(C = C , penalty= penalty)

#Create new logistic object
logreg = LogisticRegression( solver = 'liblinear')

#Use GridSearch
clf = GridSearchCV(logreg, hyperparameters, cv=5,return_train_score=True, scoring="recall")

#Fit the model
best_model = clf.fit(X_train_scaled, y_train)

#Print The value of best Hyperparameters
print('Best C:', best_model.best_estimator_.get_params()['C'])
print('Best penalty:', best_model.best_estimator_.get_params()['penalty'])
    
print("Accuracy on training set: {:.3f}".format(best_model.score(X_train_scaled, y_train)))
print("Accuracy on test set: {:.3f}".format(best_model.score(X_test_scaled, y_test)))

Best C: 0.1
Best penalty: l1
Accuracy on training set: 1.000
Accuracy on test set: 1.000


In [45]:
#Create forest Object.
a = best_model.best_estimator_.get_params()['C']
b = best_model.best_estimator_.get_params()['penalty']

logreg = LogisticRegression(C = a, penalty= b, solver = 'liblinear')

#Training the model.
logreg.fit(X_train_scaled, y_train)

#Predict test data set.
y_pred =logreg.predict(X_test_scaled)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       818
           1       0.94      1.00      0.97       221

    accuracy                           0.99      1039
   macro avg       0.97      0.99      0.98      1039
weighted avg       0.99      0.99      0.99      1039



0.991442542787286

# Tuning hyperparameters with train-test split and standardization

In [46]:
# Divide the data into training and test
X_train, X_test, y_train, y_test = train_test_split(
    data_features, data_target, stratify=data_target, random_state=42)

# preprocessing using zero mean and unit variance scaling
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [47]:
#List Hyperparameters that we want to tune by cross validation

C = [100, 10, 1.0, 0.1, 0.01]
penalty = ['l1', 'l2']

#Convert to dictionary
hyperparameters = dict(C = C , penalty= penalty)

#Create new logistic object
logreg = LogisticRegression( solver = 'liblinear')

#Use GridSearch
clf = GridSearchCV(logreg, hyperparameters, cv=5,return_train_score=True, scoring="recall")

#Fit the model
best_model = clf.fit(X_train_scaled, y_train)

#Print The value of best Hyperparameters
print('Best C:', best_model.best_estimator_.get_params()['C'])
print('Best penalty:', best_model.best_estimator_.get_params()['penalty'])
    
print("Accuracy on training set: {:.3f}".format(best_model.score(X_train_scaled, y_train)))
print("Accuracy on test set: {:.3f}".format(best_model.score(X_test_scaled, y_test)))

Best C: 0.1
Best penalty: l1
Accuracy on training set: 1.000
Accuracy on test set: 1.000


In [48]:
#Create forest Object.
a = best_model.best_estimator_.get_params()['C']
b = best_model.best_estimator_.get_params()['penalty']

logreg = LogisticRegression(C = a, penalty= b, solver = 'liblinear')

#Training the model.
logreg.fit(X_train_scaled, y_train)

#Predict test data set.
y_pred =logreg.predict(X_test_scaled)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       818
           1       0.94      1.00      0.97       221

    accuracy                           0.99      1039
   macro avg       0.97      0.99      0.98      1039
weighted avg       0.99      0.99      0.99      1039



0.991442542787286

# Tuning hyperparameters with validation set split and Normalisation

In [49]:
# Divide the data into training, test and validation

X_trainval, X_test, y_trainval, y_test = train_test_split(data_features, data_target, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, random_state=43)

# preprocessing using 0-1 scaling
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)
X_trainval_scaled = scaler.transform( X_trainval)

print("Size of training set:{}".format(X_train_scaled.shape[0]))
print("Size of validation set:{}".format(X_val_scaled.shape[0]))
print("Size of test set:{}".format(X_test_scaled.shape[0]))

Size of training set:2336
Size of validation set:779
Size of test set:1039


In [50]:
best_score = 0

for C in [100, 10, 1.0, 0.1, 0.01, 0.001]:
    for penalty in ['l1', 'l2']:
        
        # Learn the model
        logreg = LogisticRegression(penalty=penalty, C=C, solver = 'liblinear')
        logreg.fit(X_train_scaled, y_train)
        
        # Evaluate the model
        score = logreg.score(X_val_scaled, y_val)
        
        # If improvement, store score and parameter
        if score>best_score:
            best_score = score
            best_C = C
            best_penalty= penalty
                        


# Build a model on the combine training and valiation data
logreg = LogisticRegression(penalty= best_penalty, C =best_C, solver = 'liblinear')
logreg.fit(X_trainval_scaled, y_trainval)

print("Best best_C found: {}".format(best_C))
print("Best best_penalty found: {}".format(best_penalty))
print("Best score on validation set: {}".format(best_score))
print("Score on training/validation set: {}".format(logreg.score(X_trainval_scaled, y_trainval)))
print("Score on test set: {}".format(logreg.score(X_test_scaled, y_test)))

Best best_C found: 100
Best best_penalty found: l1
Best score on validation set: 0.993581514762516
Score on training/validation set: 0.9983948635634029
Score on test set: 0.9980750721847931


In [52]:
#Predict test data set.
y_pred = logreg.predict(X_test_scaled)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       835
           1       1.00      1.00      1.00       204

    accuracy                           1.00      1039
   macro avg       1.00      1.00      1.00      1039
weighted avg       1.00      1.00      1.00      1039



0.9969502172126334

# Tuning hyperparameters with validation set split and standardization

In [53]:
# Divide the data into training, test and validation

X_trainval, X_test, y_trainval, y_test = train_test_split(data_features, data_target, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, random_state=43)

# preprocessing using zero mean and unit variance scaling
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)
X_trainval_scaled = scaler.transform( X_trainval)

print("Size of training set:{}".format(X_train_scaled.shape[0]))
print("Size of validation set:{}".format(X_val_scaled.shape[0]))
print("Size of test set:{}".format(X_test_scaled.shape[0]))

Size of training set:2336
Size of validation set:779
Size of test set:1039


In [54]:
best_score = 0

for C in [100, 10, 1.0, 0.1, 0.01, 0.001]:
    for penalty in ['l1', 'l2']:
        
        # Learn the model
        logreg = LogisticRegression(penalty=penalty, C=C, solver = 'liblinear')
        logreg.fit(X_train_scaled, y_train)
        
        # Evaluate the model
        score = logreg.score(X_val_scaled, y_val)
        
        # If improvement, store score and parameter
        if score>best_score:
            best_score = score
            best_C = C
            best_penalty= penalty
                        


# Build a model on the combine training and valiation data
logreg = LogisticRegression(penalty= best_penalty, C =best_C, solver = 'liblinear')
logreg.fit(X_trainval_scaled, y_trainval)

print("Best best_C found: {}".format(best_C))
print("Best best_penalty found: {}".format(best_penalty))
print("Best score on validation set: {}".format(best_score))
print("Score on training/validation set: {}".format(logreg.score(X_trainval_scaled, y_trainval)))
print("Score on test set: {}".format(logreg.score(X_test_scaled, y_test)))

Best best_C found: 100
Best best_penalty found: l1
Best score on validation set: 0.9974326059050064
Score on training/validation set: 0.9990369181380417
Score on test set: 0.9980750721847931


In [55]:
#Predict test data set.
y_pred = logreg.predict(X_test_scaled)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       835
           1       1.00      1.00      1.00       204

    accuracy                           1.00      1039
   macro avg       1.00      1.00      1.00      1039
weighted avg       1.00      1.00      1.00      1039



0.9969502172126334

# Tuning hyperparameters with cross validation split and Normalisation

In [56]:
# Divide the data into training and test , no validation data

X_trainval, X_test, y_trainval, y_test = train_test_split(data_features, data_target, random_state=42)

# preprocessing using 0-1 scaling
scaler = MinMaxScaler()
scaler.fit(X_train)

X_test_scaled = scaler.transform(X_test)
X_trainval_scaled = scaler.transform( X_trainval)


print("Size of training set:{}".format(X_trainval_scaled.shape[0]))
print("Size of test set:{}".format(X_test_scaled.shape[0]))

Size of training set:3115
Size of test set:1039


In [57]:
best_score = 0

for C in [100, 10, 1.0, 0.1, 0.01, 0.001]:
    for penalty in ['l1', 'l2']:
        # Learn the model 
        logreg = LogisticRegression(penalty=penalty, C=C, solver = 'liblinear')
        
        # Perform cross validation
        scores = cross_val_score(logreg, X_trainval_scaled, y_trainval, cv=5)
        
        # Compute the mean score
        score = scores.mean()
        
        
        # If improvement, store score and parameter
        if score>best_score:
            best_score = score
            best_C = C
            best_penalty= penalty

# Build a model on the combine training and valiation data
logreg = LogisticRegression(penalty= best_penalty, C=best_C, solver = 'liblinear')
logreg.fit(X_trainval_scaled, y_trainval)

print("Best best_C found: {}".format(best_C))
print("Best best_penalty found: {}".format(best_penalty))
print("Best average score: {}".format(best_score))
print("Score on training/validation set: {}".format(logreg.score(X_trainval_scaled, y_trainval)))
print("Score on test set: {}".format(logreg.score(X_test_scaled, y_test)))

Best best_C found: 100
Best best_penalty found: l1
Best average score: 0.9967897271268058
Score on training/validation set: 0.9983948635634029
Score on test set: 0.9980750721847931


In [59]:
#Predict test data set.
y_pred = logreg.predict(X_test_scaled)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       835
           1       1.00      1.00      1.00       204

    accuracy                           1.00      1039
   macro avg       1.00      1.00      1.00      1039
weighted avg       1.00      1.00      1.00      1039



0.9969502172126334

# Tuning hyperparameters with cross validation split and standardization

In [60]:
# Divide the data into training and test , no validation data

X_trainval, X_test, y_trainval, y_test = train_test_split(data_features, data_target, random_state=42)

# preprocessing using zero mean and unit variance scaling
scaler = StandardScaler()
scaler.fit(X_train)

X_test_scaled = scaler.transform(X_test)
X_trainval_scaled = scaler.transform( X_trainval)


print("Size of training set:{}".format(X_trainval_scaled.shape[0]))
print("Size of test set:{}".format(X_test_scaled.shape[0]))

Size of training set:3115
Size of test set:1039


In [61]:
best_score = 0

for C in [100, 10, 1.0, 0.1, 0.01, 0.001]:
    for penalty in ['l1', 'l2']:
        # Learn the model 
        logreg = LogisticRegression(penalty=penalty, C=C, solver = 'liblinear')
        
        # Perform cross validation
        scores = cross_val_score(logreg, X_trainval_scaled, y_trainval, cv=5)
        
        # Compute the mean score
        score = scores.mean()
        
        
        # If improvement, store score and parameter
        if score>best_score:
            best_score = score
            best_C = C
            best_penalty= penalty

# Build a model on the combine training and valiation data
logreg = LogisticRegression(penalty= best_penalty, C=best_C, solver = 'liblinear')
logreg.fit(X_trainval_scaled, y_trainval)

print("Best best_C found: {}".format(best_C))
print("Best best_penalty found: {}".format(best_penalty))
print("Best average score: {}".format(best_score))
print("Score on training/validation set: {}".format(logreg.score(X_trainval_scaled, y_trainval)))
print("Score on test set: {}".format(logreg.score(X_test_scaled, y_test)))

Best best_C found: 100
Best best_penalty found: l1
Best average score: 0.997752808988764
Score on training/validation set: 0.9990369181380417
Score on test set: 0.9980750721847931


In [62]:
#Predict test data set.
y_pred = logreg.predict(X_test_scaled)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       835
           1       1.00      1.00      1.00       204

    accuracy                           1.00      1039
   macro avg       1.00      1.00      1.00      1039
weighted avg       1.00      1.00      1.00      1039



0.9969502172126334