In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn as sk
import seaborn as sns
import warnings; warnings.simplefilter('ignore')
import matplotlib.pyplot as plt
import mglearn
from sklearn.tree import DecisionTreeClassifier
import graphviz
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (confusion_matrix,precision_score,recall_score,f1_score,
    roc_curve,roc_auc_score,precision_recall_curve,accuracy_score,classification_report)

In [2]:
data = pd.read_csv("Energi_Viborg_Dandas_data.csv")

#drop columns not needed after asking the company about the meaning of these features

columns_to_be_removed = ['ID', 'mslink', 'XKoordinat','YKoordinat','LedningID','Dobbeltled','EjerKompon','SystemKode','KategoriAf','DatoUdf']
data=data.drop(columns_to_be_removed,axis='columns')

# in the column DatoSaneri is the date of repairing and if there is no date it means it is not repaired

data['DatoSaneri'].fillna(0, inplace=True)

# take only the pipes that are broken(by TV insection) now and the repaired ones

data_with_TVObsAndSaneri = data[data['TVObsKode'].isin([1]) | data['DatoSaneri'] > 0]

In [3]:
#get around 2077 rows with not broken pipes

data_not_broken = data[~data['TVObsKode'].isin([0]) | data['DatoSaneri'] == 0]
data_not_broken = data_not_broken.sample(n=2077) 

In [4]:
frames = [data_with_TVObsAndSaneri, data_not_broken]
  
data_final = pd.concat(frames)
data_final

Unnamed: 0,fra_kote,til_kote,Laengde,Fald,DiameterIn,MaterialeK,anlag_aar,TransportK,Funktionsk,TVObsKode,DatoOprett,DatoOpdate,DatoSaneri
36,34.72,33.480000,64.88,19.112207,300.0,1.0,1939.0,1,0,0.0,2010,2014,1997.0
42,39.46,39.160000,91.75,3.269755,400.0,1.0,1939.0,1,0,1.0,2010,2014,0.0
43,39.71,39.480000,87.69,2.622876,300.0,1.0,1939.0,1,0,1.0,2010,2014,0.0
64,40.55,40.080000,52.11,9.019382,250.0,1.0,1945.0,1,0,1.0,2010,2014,0.0
65,40.38,40.550000,68.39,-2.485744,250.0,1.0,1945.0,1,0,1.0,2010,2014,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8775,31.23,30.882751,32.53,10.674734,400.0,1.0,1998.0,1,0,0.0,2010,2014,0.0
19344,42.33,41.890000,36.29,12.124552,200.0,4.0,2014.0,1,0,0.0,2014,2014,0.0
4469,15.56,14.100000,14.81,98.582039,400.0,1.0,1978.0,1,0,0.0,2010,2014,0.0
21661,52.63,51.930000,41.71,16.782546,200.0,4.0,2016.0,1,0,0.0,2016,2016,0.0


In [5]:
data = data_final

In [6]:
# data_fs= np.where(np.isnan(data_features))
# data_fs
print("Number of rows before removing NaNs: {}".format(data.shape[0]))
data = data.dropna()
print("Number of rows after removing NaNs: {}".format(data.shape[0]))

Number of rows before removing NaNs: 4154
Number of rows after removing NaNs: 4154


In [7]:
#get data copied
datacopy = data


# add  age column

#get current year
from datetime import date
now = date.today().year


def age_df(datacopy):

    if (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri'] > 0) :
        return (now - datacopy['DatoSaneri'])
    elif (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri']== 0):
        return (now - datacopy['anlag_aar'])
    elif (datacopy['TVObsKode'] == 0) and (datacopy['DatoSaneri'] > 0):
        return (now - datacopy['DatoSaneri'])
    elif (datacopy['TVObsKode']== 0) and (datacopy['DatoSaneri']== 0):
        return (now - datacopy['anlag_aar'])

datacopy['Age'] = datacopy.apply(age_df, axis = 1)

In [8]:
# add a column 'PipeStatus'
# 1 as broken and 0 as not broken

def broken_df(datacopy):

    if (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri'] < (datacopy['DatoOpdate'])) and (datacopy['DatoSaneri'] != 0):
        return 1
    elif (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri'] >= (datacopy['DatoOpdate'])) and (datacopy['DatoSaneri'] != 0):
        return 0
    elif (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri']== 0):
        return 1
    elif (datacopy['TVObsKode'] == 0) and (datacopy['DatoSaneri'] > 0):
        return 0
    elif (datacopy['TVObsKode']== 0) and (datacopy['DatoSaneri']== 0):
        return 0

datacopy['PipeStatus'] = datacopy.apply(broken_df, axis = 1)

In [9]:
# datacopy = datacopy.sample(n=22) 
# datacopy

In [10]:
# data_fs= np.where(np.isnan(datacopy))
# data_fs
# row = datacopy.iloc[369] #index=1 => second row
# print(row)

In [11]:
# data_fs= np.where(np.isnan(data_features))
# data_fs
print("Number of rows before removing NaNs: {}".format(datacopy.shape[0]))
datacopy = datacopy.dropna()
print("Number of rows after removing NaNs: {}".format(datacopy.shape[0]))

Number of rows before removing NaNs: 4154
Number of rows after removing NaNs: 4154


In [12]:
#drop columns not needed after adding new features

columns_to_be_removed = ['DatoOprett', 'DatoOpdate']
datacopy=datacopy.drop(columns_to_be_removed,axis='columns')
datacopy[0:-1]

Unnamed: 0,fra_kote,til_kote,Laengde,Fald,DiameterIn,MaterialeK,anlag_aar,TransportK,Funktionsk,TVObsKode,DatoSaneri,Age,PipeStatus
36,34.72,33.480000,64.88,19.112207,300.0,1.0,1939.0,1,0,0.0,1997.0,24.0,0
42,39.46,39.160000,91.75,3.269755,400.0,1.0,1939.0,1,0,1.0,0.0,82.0,1
43,39.71,39.480000,87.69,2.622876,300.0,1.0,1939.0,1,0,1.0,0.0,82.0,1
64,40.55,40.080000,52.11,9.019382,250.0,1.0,1945.0,1,0,1.0,0.0,76.0,1
65,40.38,40.550000,68.39,-2.485744,250.0,1.0,1945.0,1,0,1.0,0.0,76.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15503,39.08,38.030000,64.03,16.398563,200.0,4.0,1997.0,1,0,0.0,0.0,24.0,0
8775,31.23,30.882751,32.53,10.674734,400.0,1.0,1998.0,1,0,0.0,0.0,23.0,0
19344,42.33,41.890000,36.29,12.124552,200.0,4.0,2014.0,1,0,0.0,0.0,7.0,0
4469,15.56,14.100000,14.81,98.582039,400.0,1.0,1978.0,1,0,0.0,0.0,43.0,0


In [13]:
# creating features set and target

columns_to_be_removed = ['PipeStatus']
data_features= datacopy.drop(columns_to_be_removed,axis='columns')
columns_to_be_removed = ['fra_kote','til_kote', 'Laengde','Fald','DiameterIn','MaterialeK','anlag_aar','TransportK','Funktionsk','TVObsKode','DatoSaneri','Age']
data_target=datacopy.drop(columns_to_be_removed,axis='columns')

In [14]:
# data_fs= np.where(np.isnan(data_features))
# data_fs
print("Number of rows before removing NaNs: {}".format(data.shape[0]))
data = data.dropna()
print("Number of rows after removing NaNs: {}".format(data.shape[0]))

Number of rows before removing NaNs: 4154
Number of rows after removing NaNs: 4154


# Tuning parameters with test-set split and grid search cross validation

In [15]:
# Divide the data into training and test
X_train, X_test, y_train, y_test = train_test_split(
    data_features, data_target, stratify=data_target, random_state=42)

In [19]:
#List Hyperparameters that we want to tune by cross validation

max_depth = [5,10,20,30,40,50,60,70,80,90,100]
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
n_estimators = [10, 100, 1000]

#Convert to dictionary
hyperparameters = dict(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators)

#Create new forest object
gbt = GradientBoostingClassifier(random_state=2)

#Use GridSearch
clf = GridSearchCV(gbt, hyperparameters, cv=5,return_train_score=True, scoring="recall")

#Fit the model
best_model = clf.fit(X_train, y_train)

#Print The value of best Hyperparameters
print('Best max_depth:', best_model.best_estimator_.get_params()['max_depth'])
print('Best learning_rate:', best_model.best_estimator_.get_params()['learning_rate'])
print('Best n_estimators:', best_model.best_estimator_.get_params()['n_estimators'])
    
print("Accuracy on training set: {:.3f}".format(best_model.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(best_model.score(X_test, y_test)))

Best max_depth: 5
Best learning_rate: 0.001
Best n_estimators: 1000
Accuracy on training set: 1.000
Accuracy on test set: 1.000


In [21]:
#Create forest Object.
a = best_model.best_estimator_.get_params()['max_depth']
b = best_model.best_estimator_.get_params()['learning_rate']
c = best_model.best_estimator_.get_params()['n_estimators']
gbt = GradientBoostingClassifier(n_estimators= c, max_depth= a, learning_rate= b, random_state=2)

#Create x and y variables.
x = data_features
y = data_target

#Split data into training and testing.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42)

#Training the model.
gbt.fit(X_train, y_train)

#Predict test data set.
y_pred = gbt.predict(X_test)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       835
           1       1.00      1.00      1.00       204

    accuracy                           1.00      1039
   macro avg       1.00      1.00      1.00      1039
weighted avg       1.00      1.00      1.00      1039



0.9975490196078431

# Tuning parameters with validation set split

In [22]:
# Divide the data into training, test and validation

X_trainval, X_test, y_trainval, y_test = train_test_split(data_features, data_target, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, random_state=43)
print("Size of training set:{}".format(X_train.shape[0]))
print("Size of validation set:{}".format(X_val.shape[0]))
print("Size of test set:{}".format(X_test.shape[0]))

Size of training set:2336
Size of validation set:779
Size of test set:1039


In [23]:
best_score = 0

for max_depth in [5,10,20,30,40,50,60,70,80,90,100]:
    for learning_rate in [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]:
        for n_estimators in [10, 100, 1000]:
             # Learn the model 
                    gbt = GradientBoostingClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators,random_state=2)
                    gbt.fit(X_train, y_train)
                        
                        # Evaluate the model
                    score = gbt.score(X_val, y_val)
                        
                         # If improvement, store score and parameter
                    if score>best_score:
                        best_score = score
                        best_max_depth = max_depth
                        best_learning_rate= learning_rate
                        best_n_estimators = n_estimators


# Build a model on the combine training and valiation data
gbt = GradientBoostingClassifier(max_depth= best_max_depth, learning_rate=best_learning_rate, n_estimators=best_n_estimators,random_state=2)
gbt.fit(X_trainval, y_trainval)

print("Best best_max_depth found: {}".format(best_max_depth))
print("Best best_learning_rate found: {}".format(best_learning_rate))
print("Best best_n_estimators found: {}".format(best_n_estimators))
print("Best score on validation set: {}".format(best_score))
print("Score on training/validation set: {}".format(gbt.score(X_trainval, y_trainval)))
print("Score on test set: {}".format(gbt.score(X_test, y_test)))

Best best_max_depth found: 5
Best best_learning_rate found: 0.2
Best best_n_estimators found: 1000
Best score on validation set: 0.9987163029525032
Score on training/validation set: 1.0
Score on test set: 0.9990375360923965


In [24]:
#Predict test data set.
y_pred = gbt.predict(X_test)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       835
           1       1.00      1.00      1.00       204

    accuracy                           1.00      1039
   macro avg       1.00      1.00      1.00      1039
weighted avg       1.00      1.00      1.00      1039



0.9975490196078431

# Tuning parameters with cross validation

In [25]:
# Divide the data into training and test , no validation data

X_trainval, X_test, y_trainval, y_test = train_test_split(data_features, data_target, random_state=42)
print("Size of training set:{}".format(X_train.shape[0]))
print("Size of test set:{}".format(X_test.shape[0]))

Size of training set:2336
Size of test set:1039


In [26]:
best_score = 0

for max_depth in [5,10,20,30,40,50,60,70,80,90,100]:
    for learning_rate in [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]:
        for n_estimators in [10, 100, 1000]:
            # Learn the model 
            gbt = GradientBoostingClassifier(max_depth=max_depth,learning_rate=learning_rate, n_estimators=n_estimators,random_state=2)
        
            # Perform cross validation
            scores = cross_val_score(gbt, X_trainval, y_trainval, cv=5)
        
            # Compute the mean score
            score = scores.mean()
        
        
           # If improvement, store score and parameter
            if score>best_score:
                best_score = score
                best_max_depth = max_depth
                best_learning_rate= learning_rate
                best_n_estimators = n_estimators

# Build a model on the combine training and valiation data
gbt = GradientBoostingClassifier(max_depth= best_max_depth, learning_rate=best_learning_rate, n_estimators=best_n_estimators,random_state=2)
gbt.fit(X_trainval, y_trainval)

print("Best best_max_depth found: {}".format(best_max_depth))
print("Best best_learning_rate found: {}".format(best_learning_rate))
print("Best best_n_estimators found: {}".format(best_n_estimators))
print("Best average score: {}".format(best_score))
print("Score on training/validation set: {}".format(gbt.score(X_trainval, y_trainval)))
print("Score on test set: {}".format(gbt.score(X_test, y_test)))

Best best_max_depth found: 5
Best best_learning_rate found: 0.2
Best best_n_estimators found: 1000
Best average score: 0.997752808988764
Score on training/validation set: 1.0
Score on test set: 0.9990375360923965


In [27]:
#Predict test data set.
y_pred = gbt.predict(X_test)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       835
           1       1.00      1.00      1.00       204

    accuracy                           1.00      1039
   macro avg       1.00      1.00      1.00      1039
weighted avg       1.00      1.00      1.00      1039



0.9975490196078431