In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn as sk
import seaborn as sns
import warnings; warnings.simplefilter('ignore')
import matplotlib.pyplot as plt
import mglearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error

# Data Preparation and Feature Engineering:

In [2]:
def read_xyz(fileName):
    
    xyz_coordinates = [] #put xyz in an array
    
    with open(fileName,"r") as file:
        for line_number,line in enumerate(file):
            x,y,z = line.split()
            
            xyz_coordinates.append([int(x),int(y),float(z)])
    return xyz_coordinates

In [3]:
groundwater_map = read_xyz("GRW_MBS_50m.xyz")

In [4]:
#display in dataframe 
#add cplumns of name
my_array = np.array(groundwater_map)

df = pd.DataFrame(my_array, columns = ['XKoordinat','YKoordinat','Depth'])



Now we have a table with x,y coordinates and depths. Next step is to copmare with csv file coordinates and get a column to state the ground water level.

In [5]:
data = pd.read_csv("Energi_Viborg_Dandas_data.csv")

#drop columns not needed after asking the company about the meaning of these features

columns_to_be_removed = [ 'mslink','LedningID','Dobbeltled','EjerKompon','SystemKode','KategoriAf','DatoUdf']
data=data.drop(columns_to_be_removed,axis='columns')

# in the column DatoSaneri is the date of repairing and if there is no date it means it is not repaired

data['DatoSaneri'].fillna(0, inplace=True)

In [6]:
# take only the pipes that are broken(by TV insection) now and the repaired ones

data_with_TVObsAndSaneri = data[data['TVObsKode'].isin([1]) | data['DatoSaneri'] > 0]

get matched depth withground water of broken pipes

In [7]:
def get_matched_depth(select_x,select_y):
#     select_x = data['XKoordinat']
#     select_y = data['YKoordinat']
    #select_y = data.loc[data['XKoordinat'] == select_x]['YKoordinat'].values[0]
    length = data_with_TVObsAndSaneri.loc[data_with_TVObsAndSaneri['XKoordinat'] == select_x]['Laengde'].values[0]
    angle = data_with_TVObsAndSaneri.loc[data_with_TVObsAndSaneri['XKoordinat'] == select_x]['Fald'].values[0]
#     length=data['Laengde']
#     angle = data['Fald']
    #calculate another point by length:
    end_x = select_x+ (length * np.cos(angle))
    end_y = select_y+ (length * np.sin(angle))
    if(end_x > select_x):
        max_x=end_x
        min_x=select_x
    else:
        min_x=end_x
        max_x=select_x   
    if(end_y > select_y):
        max_y=end_y
        min_y=select_y
    else:
        min_y=end_y
        max_y=select_y   


    matched_depth_col = df.loc[(df['XKoordinat'] <= max_x)&(df['XKoordinat'] >= min_x)
    &(df['YKoordinat'] <= max_y)&(df['YKoordinat'] >= min_y)]['Depth']
    #test if there is a value
    if(matched_depth_col.size > 0):
        matched_depth = matched_depth_col.values[0]
    else:
        matched_depth = np.NaN
    
    return matched_depth

In [8]:
def add_depth(datacopy):
    select_x = datacopy['XKoordinat']
    select_y = datacopy['YKoordinat']
    return get_matched_depth(select_x,select_y)
data_with_TVObsAndSaneri['Depth'] = data_with_TVObsAndSaneri.apply(add_depth,axis =1)
data_with_TVObsAndSaneri    
        

Unnamed: 0,ID,XKoordinat,YKoordinat,fra_kote,til_kote,Laengde,Fald,DiameterIn,MaterialeK,anlag_aar,TransportK,Funktionsk,TVObsKode,DatoOprett,DatoOpdate,DatoSaneri,Depth
36,87810,529911.05,6252443.83,34.720000,33.480000,64.88,19.112207,300.0,1.0,1939.0,1,0,0.0,2010,2014,1997.0,
42,87832,530405.37,6252578.04,39.460000,39.160000,91.75,3.269755,400.0,1.0,1939.0,1,0,1.0,2010,2014,0.0,2.314121
43,87834,530493.05,6252579.67,39.710000,39.480000,87.69,2.622876,300.0,1.0,1939.0,1,0,1.0,2010,2014,0.0,
64,87901,530791.62,6252572.03,40.550000,40.080000,52.11,9.019382,250.0,1.0,1945.0,1,0,1.0,2010,2014,0.0,4.478954
65,87903,530857.05,6252552.13,40.380000,40.550000,68.39,-2.485744,250.0,1.0,1945.0,1,0,1.0,2010,2014,0.0,5.462685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23957,222195,500582.01,6260561.72,11.940000,10.550000,60.32,23.043767,350.0,1.0,1968.0,1,0,1.0,2018,2018,0.0,9.960063
24062,222448,544401.67,6256588.02,25.720000,22.820000,44.37,65.359477,160.0,4.0,2017.0,1,0,1.0,2018,2018,0.0,
24073,222942,530696.61,6245426.58,28.490000,27.730000,84.29,9.016491,200.0,1.0,1995.0,1,0,1.0,2018,2018,0.0,
24090,222967,530819.00,6245345.46,27.162706,27.902884,18.82,-39.329328,315.0,4.0,1965.0,1,0,1.0,2018,2018,0.0,


In [9]:
data_with_TVObsAndSaneri_Groundwater = data_with_TVObsAndSaneri.dropna()


handle unbroken pipe by adding groundwater depth and get around 619 rows randomly

In [10]:
data_not_broken = data[~data['TVObsKode'].isin([0]) | data['DatoSaneri'] == 0]
data_not_broken = data_not_broken.sample(n=4000) 

In [11]:
def get_matched_depth_unbroken(select_x,select_y):
#     select_x = data['XKoordinat']
#     select_y = data['YKoordinat']
    #select_y = data.loc[data['XKoordinat'] == select_x]['YKoordinat'].values[0]
    length = data_not_broken.loc[data_not_broken['XKoordinat'] == select_x]['Laengde'].values[0]
    angle = data_not_broken.loc[data_not_broken['XKoordinat'] == select_x]['Fald'].values[0]
#     length=data['Laengde']
#     angle = data['Fald']
    #calculate another point by length:
    end_x = select_x+ (length * np.cos(angle))
    end_y = select_y+ (length * np.sin(angle))
    if(end_x > select_x):
        max_x=end_x
        min_x=select_x
    else:
        min_x=end_x
        max_x=select_x   
    if(end_y > select_y):
        max_y=end_y
        min_y=select_y
    else:
        min_y=end_y
        max_y=select_y   


    matched_depth_col = df.loc[(df['XKoordinat'] <= max_x)&(df['XKoordinat'] >= min_x)
    &(df['YKoordinat'] <= max_y)&(df['YKoordinat'] >= min_y)]['Depth']
    #test if there is a value
    if(matched_depth_col.size > 0):
        matched_depth = matched_depth_col.values[0]
    else:
        matched_depth = np.NaN
    
    return matched_depth

In [12]:
def add_depth_unbroken(datacopy):
    select_x = datacopy['XKoordinat']
    select_y = datacopy['YKoordinat']
    return get_matched_depth_unbroken(select_x,select_y)
data_not_broken['Depth'] = data_not_broken.apply(add_depth_unbroken,axis =1)
        

In [13]:
data_not_broken_Groundwater = data_not_broken.dropna()
data_not_broken_Groundwater =data_not_broken_Groundwater.sample(n=619)

In [14]:
frames = [data_not_broken_Groundwater,data_with_TVObsAndSaneri_Groundwater]
data = pd.concat(frames)

In [15]:
datacopy = data


# add  age column

#get current year
from datetime import date
now = date.today().year


def age_df(datacopy):

    if (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri'] > 0) :
        return (now - datacopy['DatoSaneri'])
    elif (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri']== 0):
        return (now - datacopy['anlag_aar'])
    elif (datacopy['TVObsKode'] == 0) and (datacopy['DatoSaneri'] > 0):
        return (now - datacopy['DatoSaneri'])
    elif (datacopy['TVObsKode']== 0) and (datacopy['DatoSaneri']== 0):
        return (now - datacopy['anlag_aar'])

datacopy['Age'] = datacopy.apply(age_df, axis = 1)

In [16]:
# add a column 'PipeStatus'
# 1 as broken and 0 as not broken

def broken_df(datacopy):

    if (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri'] < (datacopy['DatoOpdate'])) and (datacopy['DatoSaneri'] != 0):
        return 1
    elif (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri'] >= (datacopy['DatoOpdate'])) and (datacopy['DatoSaneri'] != 0):
        return 0
    elif (datacopy['TVObsKode'] == 1) and (datacopy['DatoSaneri']== 0):
        return 1
    elif (datacopy['TVObsKode'] == 0) and (datacopy['DatoSaneri'] > 0):
        return 0
    elif (datacopy['TVObsKode']== 0) and (datacopy['DatoSaneri']== 0):
        return 0

datacopy['PipeStatus'] = datacopy.apply(broken_df, axis = 1)

In [17]:
# data_fs= np.where(np.isnan(data_features))
# data_fs
print("Number of rows before removing NaNs: {}".format(datacopy.shape[0]))
datacopy = datacopy.dropna()
print("Number of rows after removing NaNs: {}".format(datacopy.shape[0]))

Number of rows before removing NaNs: 1238
Number of rows after removing NaNs: 1238


In [18]:
#drop columns not needed after adding new features

columns_to_be_removed = ['DatoOprett', 'DatoOpdate']
datacopy=datacopy.drop(columns_to_be_removed,axis='columns')
datacopy[0:-1]

Unnamed: 0,ID,XKoordinat,YKoordinat,fra_kote,til_kote,Laengde,Fald,DiameterIn,MaterialeK,anlag_aar,TransportK,Funktionsk,TVObsKode,DatoSaneri,Depth,Age,PipeStatus
23485,219236,525938.45,6253508.25,25.30,22.58,81.79,33.255899,160.0,4.0,2017.0,1,0,0.0,0.0,14.219084,4.0,0
7636,128787,527530.04,6258546.98,33.15,32.62,63.64,8.328096,400.0,1.0,1975.0,1,0,0.0,0.0,11.205647,46.0,0
2754,103332,544840.68,6265213.81,21.97,21.46,70.90,7.193230,160.0,4.0,1979.0,1,0,0.0,0.0,0.986856,42.0,0
12888,156449,518003.62,6252260.86,45.60,45.15,87.33,5.152868,400.0,1.0,1969.0,1,0,0.0,0.0,25.464317,52.0,0
354,89279,538072.30,6259752.41,55.05,54.56,94.36,5.192878,200.0,4.0,1975.0,1,0,0.0,0.0,14.199490,46.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23825,221075,505295.19,6253688.34,41.00,40.24,77.12,9.854772,200.0,1.0,1979.0,1,0,1.0,0.0,14.129904,42.0,1
23835,221099,505487.19,6253719.27,40.77,40.36,57.64,7.113116,200.0,1.0,1979.0,1,0,1.0,0.0,14.236778,42.0,1
23838,221112,505553.82,6253730.01,41.26,40.78,67.49,7.112165,200.0,1.0,1979.0,1,0,1.0,0.0,15.404106,42.0,1
23841,221115,505234.88,6253724.12,42.49,41.53,109.15,8.795236,315.0,4.0,1992.0,1,0,1.0,0.0,16.049837,29.0,1


In [19]:
# val = datacopy.nunique()
# val

In [20]:
# creating features set and target

columns_to_be_removed = ['Age','ID']
data_features= datacopy.drop(columns_to_be_removed,axis='columns')
columns_to_be_removed = ['fra_kote','til_kote', 'Laengde','Fald','DiameterIn','MaterialeK','anlag_aar','TransportK',
                         'Funktionsk','TVObsKode','DatoSaneri','PipeStatus','ID','XKoordinat','YKoordinat','Depth']
data_target=datacopy.drop(columns_to_be_removed,axis='columns')

In [21]:
# data_fs= np.where(np.isnan(data_features))
# data_fs
data = datacopy
print("Number of rows before removing NaNs: {}".format(data.shape[0]))
data = data.dropna()
print("Number of rows after removing NaNs: {}".format(data.shape[0]))

Number of rows before removing NaNs: 1238
Number of rows after removing NaNs: 1238


In [22]:
data_target

Unnamed: 0,Age
23485,4.0
7636,46.0
2754,42.0
12888,52.0
354,46.0
...,...
23835,42.0
23838,42.0
23841,29.0
23957,53.0


# Tuning Alpha for Lasso Model with Train-Test split and Normalization:

In [23]:
# Divide the data into training and test
X_train, X_test, y_train, y_test = train_test_split(data_features, data_target, random_state=42)

In [24]:
# Learn the model with a certain numnber of alphas
lassocv = LassoCV(alphas = None, cv = 10, max_iter = 100000, normalize = True)
lassocv.fit(X_train, y_train)
print("Best alpha value found: {}".format(lassocv.alpha_))

Best alpha value found: 0.0010181925038167624


In [25]:
# coefficients associated with the chosen alpha
pd.Series(lassocv.coef_ , index = data_features.columns)

XKoordinat     0.000000
YKoordinat    -0.000008
fra_kote      -0.024735
til_kote      -0.000000
Laengde       -0.010591
Fald          -0.033541
DiameterIn     0.000243
MaterialeK    -0.000000
anlag_aar     -0.739035
TransportK     7.534099
Funktionsk    -0.474766
TVObsKode     -6.585158
DatoSaneri    -0.020611
Depth          0.060034
PipeStatus    10.440272
dtype: float64

In [26]:
# to chech model performance
lasso = Lasso(max_iter = 10000, normalize = True)
lasso.set_params(alpha=lassocv.alpha_)
lasso.fit(X_train, y_train)
mse = mean_squared_error(y_test, lasso.predict(X_test))
print("The MSE associated with alpha value: {}".format(mse))

The MSE associated with alpha value: 71.50568407196843


The $R^2$ corresponding to this value for alpha are:

In [27]:
# R^2 of the associated alpha
print("R^2 on train data is {} and on test data is {}".format(lasso.score(X_train, y_train), 
                                                              lasso.score(X_test,y_test)))

R^2 on train data is 0.8487090630038366 and on test data is 0.8595752986912264


# Tuning Alpha for Lasso Model with validation set split and Normalization:

In [28]:
# Divide the data into training, test and validation

X_trainval, X_test, y_trainval, y_test = train_test_split(data_features, data_target, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, random_state=43)

In [29]:
best_score = 100
for alphas in 10**np.linspace(-10, 10, 100):
    # Learn the model with a certain numnber of alphas
    lasso = Lasso(max_iter = 10000, normalize=True, alpha=alphas)
    lasso.fit(X_train, y_train)
    
    # Evaluate the model
    score = mean_squared_error(y_val, lasso.predict(X_val))
    
    
    # If improvement, store score and parameter
    if score < best_score:
        best_score = score
        best_alphas = alphas

# Build a model on the combine training and valiation data
lasso = Lasso(max_iter = 10000, normalize=True, alpha = best_alphas)
lasso.fit(X_trainval, y_trainval)

print("Best alpha found: {}".format(best_alphas))
print("Best MSE on validation set: {}".format(best_score))
print("MSE on training/validation set: {}".format(mean_squared_error(y_trainval, lasso.predict(X_trainval))))
print("MSE on test set: {}".format(mean_squared_error(y_test, lasso.predict(X_test))))

Best alpha found: 0.0029836472402833404
Best MSE on validation set: 78.0308614035536
MSE on training/validation set: 81.51161228457471
MSE on test set: 72.05196006269517


In [30]:
# to chech model performance
lasso1 = Lasso(max_iter = 10000, alpha = best_alphas, normalize = True)
lasso1.fit(X_train, y_train)
mse = mean_squared_error(y_test, lasso1.predict(X_test))
print("The MSE associated with alpha value: {}".format(mse))

The MSE associated with alpha value: 71.8422312800897


In [31]:
# coefficients associated with the chosen alpha
pd.Series(lasso.coef_ , index = data_features.columns)

XKoordinat    0.000000
YKoordinat   -0.000000
fra_kote     -0.013950
til_kote     -0.000000
Laengde      -0.008214
Fald         -0.029694
DiameterIn    0.000023
MaterialeK   -0.000000
anlag_aar    -0.735169
TransportK    5.734465
Funktionsk   -0.454999
TVObsKode    -3.743897
DatoSaneri   -0.020593
Depth         0.037773
PipeStatus    7.548493
dtype: float64

In [32]:
# R^2 of the associated alpha
print("R^2 on train data is {} and on test data is {}".format(lasso1.score(X_train, y_train), 
                                                              lasso1.score(X_test,y_test)))

R^2 on train data is 0.8459587366202788 and on test data is 0.8589143786288552


# Tuning Alpha for Lasso Model with cross validation split and Normalization:

In [33]:
# Divide the data into training, test and validation

X_trainval, X_test, y_trainval, y_test = train_test_split(data_features, data_target, random_state=42)

In [34]:
best_score = 0
for alphas in 10**np.linspace(-10, 10, 100):
    # Set a certain number of alphas
    lasso1 = Lasso(max_iter = 10000, normalize=True, alpha=alphas)
    
    # Perform cross validation
    scores = cross_val_score(lasso1, X_trainval, y_trainval, cv=5)
    
    # Compute the mean score
    score = scores.mean()
    
    # If improvement, store score and parameter
    if score > best_score:
        best_score = score
        best_alphas = alphas

# Build a model on the combine training and valiation data
lasso1 = Lasso(max_iter = 10000, normalize=True, alpha = best_alphas)
lasso1.fit(X_trainval, y_trainval)

print("Best alpha found: {}".format(best_alphas))
print("Best MSE on validation set: {}".format(best_score))
print("MSE on training/validation set: {}".format(mean_squared_error(y_trainval, lasso1.predict(X_trainval))))
print("MSE on test set: {}".format(mean_squared_error(y_test, lasso1.predict(X_test))))

Best alpha found: 0.0004641588833612782
Best MSE on validation set: 0.8414049435891628
MSE on training/validation set: 81.06724582509685
MSE on test set: 71.4023350641215


In [35]:
# to chech model performance
lasso = Lasso(max_iter = 10000, alpha = best_alphas, normalize = True)
lasso.fit(X_train, y_train)
mse = mean_squared_error(y_test, lasso.predict(X_test))
print("The MSE associated with alpha value: {}".format(mse))

The MSE associated with alpha value: 71.51491280449564


In [36]:
# coefficients associated with the chosen alpha
pd.Series(lasso.coef_ , index = data_features.columns)

XKoordinat    -0.000007
YKoordinat    -0.000019
fra_kote      -0.000000
til_kote      -0.019917
Laengde       -0.016853
Fald          -0.039530
DiameterIn    -0.001005
MaterialeK    -0.007105
anlag_aar     -0.729629
TransportK     8.590976
Funktionsk    -0.594643
TVObsKode     -9.239413
DatoSaneri    -0.020439
Depth          0.079083
PipeStatus    13.667519
dtype: float64

In [37]:
# R^2 of the associated alpha
print("R^2 on train data is {} and on test data is {}".format(lasso.score(X_train, y_train), 
                                                              lasso.score(X_test,y_test)))

R^2 on train data is 0.8467358138913377 and on test data is 0.8595571750689519


# Tuning Alpha for Lasso Model with Train-Test split and Standardization:

In [38]:
# Divide the data into training and test
X_train, X_test, y_train, y_test = train_test_split(data_features, data_target, random_state=42)

# preprocessing using 0-1 scaling
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [39]:
# Learn the model with a certain numnber of alphas
lassocv = LassoCV(alphas = None, cv = 10, max_iter = 100000, normalize = True)
lassocv.fit(X_train, y_train)
print("Best alpha value found: {}".format(lassocv.alpha_))

Best alpha value found: 0.0010181925038167624


In [40]:
# ridge.fit(data_features, data_target)
pd.Series(lassocv.coef_ , index = data_features.columns)

XKoordinat     0.000000
YKoordinat    -0.000008
fra_kote      -0.024735
til_kote      -0.000000
Laengde       -0.010591
Fald          -0.033541
DiameterIn     0.000243
MaterialeK    -0.000000
anlag_aar     -0.739035
TransportK     7.534099
Funktionsk    -0.474766
TVObsKode     -6.585158
DatoSaneri    -0.020611
Depth          0.060034
PipeStatus    10.440272
dtype: float64

In [41]:
# to chech model performance
lasso = Lasso(max_iter = 10000)
lasso.set_params(alpha=lassocv.alpha_)
lasso.fit(X_train, y_train)
mse = mean_squared_error(y_test, lasso.predict(X_test))
print("The MSE associated with alpha value: {}".format(mse))

The MSE associated with alpha value: 71.20640492627466


In [42]:
# R^2 of the associated alpha
print("R^2 on train data is {} and on test data is {}".format(lasso.score(X_train_scaled, y_train), 
                                                              lasso.score(X_test_scaled,y_test)))

R^2 on train data is -4503.905838632152 and on test data is -4747.028838167456


# Tuning Alpha for Lasso Model with validation set split and Standardization:

In [43]:
# Divide the data into training, test and validation

X_trainval, X_test, y_trainval, y_test = train_test_split(data_features, data_target, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, random_state=43)

# preprocessing using zero mean and unit variance scaling
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)
X_trainval_scaled = scaler.transform( X_trainval)


In [44]:
best_score = 100
for alphas in 10**np.linspace(-10, 10, 100):
    # Learn the model with a certain numnber of alphas
    lasso1 = Lasso(max_iter = 10000, alpha=alphas)
    lasso1.fit(X_train_scaled, y_train)
    
    # Evaluate the model
    score = mean_squared_error(y_val, lasso1.predict(X_val_scaled))
    
    
    # If improvement, store score and parameter
    if score < best_score:
        best_score = score
        best_alphas = alphas

# Build a model on the combine training and valiation data
lasso1 = Lasso(max_iter = 10000, alpha = best_alphas)
lasso1.fit(X_trainval_scaled, y_trainval)

print("Best alpha found: {}".format(best_alphas))
print("Best MSE on validation set: {}".format(best_score))
print("MSE on training/validation set: {}".format(mean_squared_error(y_trainval, lasso1.predict(X_trainval_scaled))))
print("MSE on test set: {}".format(mean_squared_error(y_test, lasso1.predict(X_test_scaled))))

Best alpha found: 0.07742636826811278
Best MSE on validation set: 78.03522154326286
MSE on training/validation set: 81.39219654399346
MSE on test set: 71.91809282428129


In [45]:
# to chech model performance
lasso = Lasso(max_iter = 10000, alpha = best_alphas, normalize = True)
lasso.fit(X_train_scaled, y_train)
mse = mean_squared_error(y_test, lasso.predict(X_test_scaled))
print("The MSE associated with alpha value: {}".format(mse))

The MSE associated with alpha value: 87.4841885227234


In [46]:
# coefficients associated with the chosen alpha
pd.Series(lasso.coef_ , index = data_features.columns)

XKoordinat    -0.000000
YKoordinat     0.000000
fra_kote      -0.000000
til_kote      -0.000000
Laengde       -0.000000
Fald          -0.000000
DiameterIn    -0.000000
MaterialeK    -0.000000
anlag_aar    -11.926735
TransportK     0.000000
Funktionsk    -0.000000
TVObsKode      0.000000
DatoSaneri   -16.339255
Depth         -0.000000
PipeStatus     1.067147
dtype: float64

In [47]:
# R^2 of the associated alpha
print("R^2 on train data is {} and on test data is {}".format(lasso.score(X_train_scaled, y_train), 
                                                              lasso.score(X_test_scaled,y_test)))

R^2 on train data is 0.8176583760313347 and on test data is 0.8281963007279332


# Tuning Alpha for Lasso Model with cross validation split and Standardization:

In [48]:
# Divide the data into training, test and validation

X_trainval, X_test, y_trainval, y_test = train_test_split(data_features, data_target, random_state=42)

# preprocessing using 0-1 scaling
scaler = StandardScaler()
scaler.fit(X_train)

X_test_scaled = scaler.transform(X_test)
X_trainval_scaled = scaler.transform( X_trainval)

In [49]:
best_score = 0
for alphas in 10**np.linspace(-10, 10, 100):
    # Set a certain number of alphas
    lasso1 = Lasso(max_iter = 10000, alpha=alphas)
    
    # Perform cross validation
    scores = cross_val_score(lasso1, X_trainval_scaled, y_trainval, cv=5)
    
    # Compute the mean score
    score = scores.mean()
    
    # If improvement, store score and parameter
    if score > best_score:
        best_score = score
        best_alphas = alphas

# Build a model on the combine training and valiation data
lasso1 = Lasso(max_iter = 10000, alpha = best_alphas)
lasso1.fit(X_trainval_scaled, y_trainval)

print("Best alpha found: {}".format(best_alphas))
print("Best MSE on validation set: {}".format(best_score))
print("MSE on training/validation set: {}".format(mean_squared_error(y_trainval, lasso1.predict(X_trainval_scaled))))
print("MSE on test set: {}".format(mean_squared_error(y_test, lasso1.predict(X_test_scaled))))

Best alpha found: 0.012045035402587835
Best MSE on validation set: 0.841411838580839
MSE on training/validation set: 81.06413511733778
MSE on test set: 71.39389947094443


In [50]:
# to chech model performance
lasso = Lasso(max_iter = 10000, alpha = best_alphas)
lasso.fit(X_trainval_scaled,y_trainval)
mse = mean_squared_error(y_test, lasso.predict(X_test_scaled))
print("The MSE associated with alpha value: {}".format(mse))

The MSE associated with alpha value: 71.39389947094443


In [51]:
# coefficients associated with the chosen alpha
pd.Series(lasso.coef_ , index = data_features.columns)

XKoordinat     0.012543
YKoordinat    -0.091126
fra_kote      -0.442091
til_kote      -0.000000
Laengde       -0.343501
Fald          -0.635901
DiameterIn     0.067370
MaterialeK     0.000000
anlag_aar    -14.770206
TransportK     0.304777
Funktionsk    -0.673584
TVObsKode     -3.264835
DatoSaneri   -18.887953
Depth          0.644239
PipeStatus     4.848557
dtype: float64

In [52]:
# R^2 of the associated alpha
print("R^2 on train/validation data is {} and on test data is {}".format(lasso.score(X_trainval_scaled,y_trainval), 
                                                              lasso.score(X_test_scaled,y_test)))

R^2 on train/validation data is 0.8487956800355503 and on test data is 0.8597948241655078
