In [21]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn as sk
import seaborn as sns
import warnings; warnings.simplefilter('ignore')
import matplotlib.pyplot as plt
import mglearn
from sklearn.tree import DecisionTreeClassifier
import graphviz
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (confusion_matrix,precision_score,recall_score,f1_score,
    roc_curve,roc_auc_score,precision_recall_curve,accuracy_score,classification_report)

In [22]:
data = pd.read_csv("Energi_Viborg_Dandas_data.csv")

#drop columns not needed after asking the company about the meaning of these features

columns_to_be_removed = ['ID', 'mslink', 'XKoordinat','YKoordinat','LedningID','Dobbeltled','EjerKompon','SystemKode','KategoriAf','DatoUdf']
data=data.drop(columns_to_be_removed,axis='columns')

# in the column DatoSaneri is the date of repairing and if there is no date it means it is not repaired

data['DatoSaneri'].fillna(0, inplace=True)

# take only the pipes that are broken(by TV insection) now and the repaired ones

data_with_TVObsAndSaneri = data[data['TVObsKode'].isin([1]) | data['DatoSaneri'] > 0]

In [23]:
#get around 2077 rows with not broken pipes

data_not_broken = data[~data['TVObsKode'].isin([0]) | data['DatoSaneri'] == 0]
data_not_broken = data_not_broken.sample(n=2077) 

In [24]:
frames = [data_with_TVObsAndSaneri, data_not_broken]
  
data_final = pd.concat(frames)
data_final

Unnamed: 0,fra_kote,til_kote,Laengde,Fald,DiameterIn,MaterialeK,anlag_aar,TransportK,Funktionsk,TVObsKode,DatoOprett,DatoOpdate,DatoSaneri
36,34.72,33.48,64.88,19.112207,300.0,1.0,1939.0,1,0,0.0,2010,2014,1997.0
42,39.46,39.16,91.75,3.269755,400.0,1.0,1939.0,1,0,1.0,2010,2014,0.0
43,39.71,39.48,87.69,2.622876,300.0,1.0,1939.0,1,0,1.0,2010,2014,0.0
64,40.55,40.08,52.11,9.019382,250.0,1.0,1945.0,1,0,1.0,2010,2014,0.0
65,40.38,40.55,68.39,-2.485744,250.0,1.0,1945.0,1,0,1.0,2010,2014,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19467,43.11,42.66,38.79,11.600928,200.0,4.0,1970.0,1,0,0.0,2014,2014,0.0
6594,53.05,52.47,14.82,39.136302,250.0,1.0,1989.0,1,0,0.0,2010,2014,0.0
19417,12.41,12.27,25.89,5.407493,1200.0,1.0,2014.0,1,0,0.0,2014,2014,0.0
19491,18.49,17.38,59.00,18.813559,400.0,1.0,1974.0,1,0,0.0,2014,2014,0.0


In [25]:
data = data_final

In [26]:
# data_fs= np.where(np.isnan(data_features))
# data_fs
print("Number of rows before removing NaNs: {}".format(data.shape[0]))
data = data.dropna()
print("Number of rows after removing NaNs: {}".format(data.shape[0]))

Number of rows before removing NaNs: 4154
Number of rows after removing NaNs: 4154


In [27]:
#get data copied
datacopy = data


# add  age column

#get current year
from datetime import date
now = date.today().year


def age_df(datacopy):

    if (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri'] > 0) :
        return (now - datacopy['DatoSaneri'])
    elif (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri']== 0):
        return (now - datacopy['anlag_aar'])
    elif (datacopy['TVObsKode'] == 0) and (datacopy['DatoSaneri'] > 0):
        return (now - datacopy['DatoSaneri'])
    elif (datacopy['TVObsKode']== 0) and (datacopy['DatoSaneri']== 0):
        return (now - datacopy['anlag_aar'])

datacopy['Age'] = datacopy.apply(age_df, axis = 1)

In [28]:
# add a column 'PipeStatus'
# 1 as broken and 0 as not broken

def broken_df(datacopy):

    if (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri'] < (datacopy['DatoOpdate'])) and (datacopy['DatoSaneri'] != 0):
        return 1
    elif (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri'] >= (datacopy['DatoOpdate'])) and (datacopy['DatoSaneri'] != 0):
        return 0
    elif (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri']== 0):
        return 1
    elif (datacopy['TVObsKode'] == 0) and (datacopy['DatoSaneri'] > 0):
        return 0
    elif (datacopy['TVObsKode']== 0) and (datacopy['DatoSaneri']== 0):
        return 0

datacopy['PipeStatus'] = datacopy.apply(broken_df, axis = 1)

In [29]:
# datacopy = datacopy.sample(n=22) 
# datacopy

In [30]:
# data_fs= np.where(np.isnan(datacopy))
# data_fs
# row = datacopy.iloc[369] #index=1 => second row
# print(row)

In [31]:
# data_fs= np.where(np.isnan(data_features))
# data_fs
print("Number of rows before removing NaNs: {}".format(datacopy.shape[0]))
datacopy = datacopy.dropna()
print("Number of rows after removing NaNs: {}".format(datacopy.shape[0]))

Number of rows before removing NaNs: 4154
Number of rows after removing NaNs: 4154


In [32]:
#drop columns not needed after adding new features

columns_to_be_removed = ['DatoOprett', 'DatoOpdate']
datacopy=datacopy.drop(columns_to_be_removed,axis='columns')
datacopy[0:-1]

Unnamed: 0,fra_kote,til_kote,Laengde,Fald,DiameterIn,MaterialeK,anlag_aar,TransportK,Funktionsk,TVObsKode,DatoSaneri,Age,PipeStatus
36,34.72,33.48,64.88,19.112207,300.0,1.0,1939.0,1,0,0.0,1997.0,24.0,0
42,39.46,39.16,91.75,3.269755,400.0,1.0,1939.0,1,0,1.0,0.0,82.0,1
43,39.71,39.48,87.69,2.622876,300.0,1.0,1939.0,1,0,1.0,0.0,82.0,1
64,40.55,40.08,52.11,9.019382,250.0,1.0,1945.0,1,0,1.0,0.0,76.0,1
65,40.38,40.55,68.39,-2.485744,250.0,1.0,1945.0,1,0,1.0,0.0,76.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20528,16.85,15.88,146.23,6.633386,300.0,1.0,1993.0,1,0,0.0,0.0,28.0,0
19467,43.11,42.66,38.79,11.600928,200.0,4.0,1970.0,1,0,0.0,0.0,51.0,0
6594,53.05,52.47,14.82,39.136302,250.0,1.0,1989.0,1,0,0.0,0.0,32.0,0
19417,12.41,12.27,25.89,5.407493,1200.0,1.0,2014.0,1,0,0.0,0.0,7.0,0


In [33]:
# creating features set and target

columns_to_be_removed = ['PipeStatus']
data_features= datacopy.drop(columns_to_be_removed,axis='columns')
columns_to_be_removed = ['fra_kote','til_kote', 'Laengde','Fald','DiameterIn','MaterialeK','anlag_aar','TransportK','Funktionsk','TVObsKode','DatoSaneri','Age']
data_target=datacopy.drop(columns_to_be_removed,axis='columns')

In [34]:
# data_fs= np.where(np.isnan(data_features))
# data_fs
print("Number of rows before removing NaNs: {}".format(data.shape[0]))
data = data.dropna()
print("Number of rows after removing NaNs: {}".format(data.shape[0]))

Number of rows before removing NaNs: 4154
Number of rows after removing NaNs: 4154


# Tuning parameters with test-set split and grid search cross validation

In [35]:
# Divide the data into training and test
X_train, X_test, y_train, y_test = train_test_split(
    data_features, data_target, stratify=data_target, random_state=42)

In [37]:
#List Hyperparameters that we want to tune by cross validation

max_depth = [5,10,20,30,40,50,60,70,80,90,100]
max_features = ['sqrt', 'log2']
n_estimators = [10, 100, 1000]

#Convert to dictionary
hyperparameters = dict(max_depth=max_depth, max_features=max_features, n_estimators=n_estimators)

#Create new forest object
forest = RandomForestClassifier()

#Use GridSearch
clf = GridSearchCV(forest, hyperparameters, cv=5,return_train_score=True, scoring="recall")

#Fit the model
best_model = clf.fit(X_train, y_train)

#Print The value of best Hyperparameters
print('Best max_depth:', best_model.best_estimator_.get_params()['max_depth'])
print('Best max_features:', best_model.best_estimator_.get_params()['max_features'])
print('Best n_estimators:', best_model.best_estimator_.get_params()['n_estimators'])
    
print("Accuracy on training set: {:.3f}".format(best_model.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(best_model.score(X_test, y_test)))

Best max_depth: 5
Best max_features: log2
Best n_estimators: 1000
Accuracy on training set: 0.994
Accuracy on test set: 1.000


In [38]:
#Create forest Object.
a = best_model.best_estimator_.get_params()['max_depth']
b = best_model.best_estimator_.get_params()['max_features']
c = best_model.best_estimator_.get_params()['n_estimators']
forest = RandomForestClassifier(n_estimators= c, max_depth= a, max_features= b, random_state=0)

#Create x and y variables.
x = data_features
y = data_target

#Split data into training and testing.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42)

#Training the model.
forest.fit(X_train, y_train)

#Predict test data set.
y_pred = forest.predict(X_test)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       835
           1       1.00      1.00      1.00       204

    accuracy                           1.00      1039
   macro avg       1.00      1.00      1.00      1039
weighted avg       1.00      1.00      1.00      1039



0.9975490196078431

# Tuning parameters with validation set split

In [39]:
# Divide the data into training, test and validation

X_trainval, X_test, y_trainval, y_test = train_test_split(data_features, data_target, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, random_state=43)
print("Size of training set:{}".format(X_train.shape[0]))
print("Size of validation set:{}".format(X_val.shape[0]))
print("Size of test set:{}".format(X_test.shape[0]))

Size of training set:2336
Size of validation set:779
Size of test set:1039


In [44]:
best_score = 0

for max_depth in [5,10,20,30,40,50,60,70,80,90,100]:
    for max_features in ['sqrt', 'log2']:
        for n_estimators in [10, 100, 1000]:
             # Learn the model 
                    forest = RandomForestClassifier(max_depth=max_depth, max_features=max_features, n_estimators=n_estimators)
                    forest.fit(X_train, y_train)
                        
                        # Evaluate the model
                    score = forest.score(X_val, y_val)
                        
                         # If improvement, store score and parameter
                    if score>best_score:
                        best_score = score
                        best_max_depth = max_depth
                        best_max_features= max_features
                        best_n_estimators = n_estimators


# Build a model on the combine training and valiation data
forest = RandomForestClassifier(max_depth= best_max_depth, max_features=best_max_features, n_estimators=best_n_estimators)
forest.fit(X_trainval, y_trainval)

print("Best best_max_depth found: {}".format(best_max_depth))
print("Best best_max_features found: {}".format(best_max_features))
print("Best best_n_estimators found: {}".format(best_n_estimators))
print("Best score on validation set: {}".format(best_score))
print("Score on training/validation set: {}".format(forest.score(X_trainval, y_trainval)))
print("Score on test set: {}".format(forest.score(X_test, y_test)))

Best best_max_depth found: 20
Best best_max_features found: sqrt
Best best_n_estimators found: 100
Best score on validation set: 1.0
Score on training/validation set: 1.0
Score on test set: 0.9990375360923965


In [45]:
#Predict test data set.
y_pred = forest.predict(X_test)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       835
           1       1.00      1.00      1.00       204

    accuracy                           1.00      1039
   macro avg       1.00      1.00      1.00      1039
weighted avg       1.00      1.00      1.00      1039



0.9975490196078431

# Tuning parameters with cross validation

In [46]:
# Divide the data into training and test , no validation data

X_trainval, X_test, y_trainval, y_test = train_test_split(data_features, data_target, random_state=42)
print("Size of training set:{}".format(X_train.shape[0]))
print("Size of test set:{}".format(X_test.shape[0]))

Size of training set:2336
Size of test set:1039


In [49]:
best_score = 0

for max_depth in [5,10,20,30,40,50,60,70,80,90,100]:
    for max_features in ['sqrt', 'log2']:
        for n_estimators in [10, 100, 1000]:
            # Learn the model 
            forest = RandomForestClassifier(max_depth=max_depth, max_features=max_features, n_estimators=n_estimators)
        
            # Perform cross validation
            scores = cross_val_score(forest, X_trainval, y_trainval, cv=5)
        
            # Compute the mean score
            score = scores.mean()
        
        
           # If improvement, store score and parameter
            if score>best_score:
                best_score = score
                best_max_depth = max_depth
                best_max_features= max_features
                best_n_estimators = n_estimators

# Build a model on the combine training and valiation data
forest = RandomForestClassifier(max_depth= best_max_depth, max_features=best_max_features, n_estimators=best_n_estimators)
forest.fit(X_trainval, y_trainval)

print("Best best_max_depth found: {}".format(best_max_depth))
print("Best best_max_features found: {}".format(best_max_features))
print("Best best_n_estimators found: {}".format(best_n_estimators))
print("Best average score: {}".format(best_score))
print("Score on training/validation set: {}".format(forest.score(X_trainval, y_trainval)))
print("Score on test set: {}".format(forest.score(X_test, y_test)))

Best best_max_depth found: 5
Best best_max_features found: sqrt
Best best_n_estimators found: 100
Best average score: 0.9987158908507224
Score on training/validation set: 0.9990369181380417
Score on test set: 0.9990375360923965


In [50]:
#Predict test data set.
y_pred = forest.predict(X_test)

#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

#Checking performance our model with ROC Score.
roc_auc_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       835
           1       1.00      1.00      1.00       204

    accuracy                           1.00      1039
   macro avg       1.00      1.00      1.00      1039
weighted avg       1.00      1.00      1.00      1039



0.9975490196078431