In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Apply seaborn's default plotting theme to all subsequent figures
sns.set()

In [35]:
# Load the dataset from the Excel workbook that sits next to the notebook
df = pd.read_excel("illite.xlsx")

In [36]:
# List the column names of the loaded dataset
df.columns

Index(['Depth', 'B.Density', 'API', 'resis.90', 'DT-C', 'DT_SH',
       'Illite from XRD'],
      dtype='object')

In [37]:
# Pairwise correlation matrix between all columns of the dataset
df.corr()

Unnamed: 0,Depth,B.Density,API,resis.90,DT-C,DT_SH,Illite from XRD
Depth,1.0,-0.120963,0.19823,0.082312,-0.449168,0.191432,-0.158212
B.Density,-0.120963,1.0,-0.422898,0.011434,-0.106291,-0.269532,0.264321
API,0.19823,-0.422898,1.0,0.307387,0.041697,0.025347,-0.263613
resis.90,0.082312,0.011434,0.307387,1.0,-0.224173,-0.181945,-0.127384
DT-C,-0.449168,-0.106291,0.041697,-0.224173,1.0,-0.591326,-0.03119
DT_SH,0.191432,-0.269532,0.025347,-0.181945,-0.591326,1.0,0.015535
Illite from XRD,-0.158212,0.264321,-0.263613,-0.127384,-0.03119,0.015535,1.0


In [38]:
# Features are every column except the last one; the label is the last column
X, y = df.iloc[:, :-1], df.iloc[:, -1]
(X.shape, y.shape)

((206, 6), (206,))

In [39]:
# Split X and y into training and testing sets, then add polynomial feature terms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Features are every column except the last one; the label is the last column
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=10
)

# Degree of the polynomial expansion; change it to suit your requirement
degree = 2
poly = PolynomialFeatures(degree=degree, include_bias=False)

# Fit the expansion on the training split only, then reuse it on the testing split
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)


In [40]:
# Function to convert a duration in seconds into hours, minutes and seconds.
# It is used to report the time taken by the models.
def convert(seconds):
    """Format an elapsed duration as ``H:MM:SS``.

    Parameters
    ----------
    seconds : int or float
        Elapsed time in seconds. Fractional seconds are truncated by the
        ``%d`` formatting.

    Returns
    -------
    str
        The duration as ``"H:MM:SS"``. Hours are not wrapped at 24, so a run
        longer than a day is reported in full (e.g. 90000 s -> ``"25:00:00"``)
        instead of silently losing whole days.
    """
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return "%d:%02d:%02d" % (hours, minutes, secs)

In [41]:
import os

# Create the ./output/gradient_boosting_B directory if it doesn't exist;
# exist_ok=True makes re-running this cell a no-op
output_dir = "./output/gradient_boosting_B"
os.makedirs(output_dir, exist_ok=True)

In [42]:
# Hyper-parameter search for GradientBoostingRegressor using GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
import time

# Every combination of these values is evaluated (2 * 3 * 4 * 2 * 4 = 192 candidates)
param_grid = {
    'loss': ['squared_error', 'absolute_error'],
    'learning_rate': [0.01, 0.001, 0.1],
    'n_estimators': [40, 50, 60, 80],
    'criterion': ['squared_error', 'friedman_mse'],
    'max_depth': [10, 12, 14, 16],
}

regr = GridSearchCV(
    # Seed the estimator so repeated runs of the search give the same scores
    # (same seed value as the train/test split)
    GradientBoostingRegressor(random_state=10),
    param_grid,
    cv=10,
    scoring=['neg_mean_squared_error', 'r2'],
    # With several scorers, refit names the one used to pick the best candidate
    refit='r2',
    verbose=2)

# perf_counter is a monotonic clock, so the measured duration is not affected
# by system clock adjustments
start = time.perf_counter()

# Fit the model
regr.fit(X_train_poly, y_train)

end = time.perf_counter()

print('run_time:', convert(end-start), 'h:m:s')

Fitting 10 folds for each of 192 candidates, totalling 1920 fits


[CV] END criterion=squared_error, learning_rate=0.01, loss=squared_error, max_depth=10, n_estimators=40; total time=   0.1s
[CV] END criterion=squared_error, learning_rate=0.01, loss=squared_error, max_depth=10, n_estimators=40; total time=   0.2s
[CV] END criterion=squared_error, learning_rate=0.01, loss=squared_error, max_depth=10, n_estimators=40; total time=   0.1s
[CV] END criterion=squared_error, learning_rate=0.01, loss=squared_error, max_depth=10, n_estimators=40; total time=   0.1s
[CV] END criterion=squared_error, learning_rate=0.01, loss=squared_error, max_depth=10, n_estimators=40; total time=   0.1s
[CV] END criterion=squared_error, learning_rate=0.01, loss=squared_error, max_depth=10, n_estimators=40; total time=   0.1s
[CV] END criterion=squared_error, learning_rate=0.01, loss=squared_error, max_depth=10, n_estimators=40; total time=   0.1s
[CV] END criterion=squared_error, learning_rate=0.01, loss=squared_error, max_depth=10, n_estimators=40; total time=   0.0s
[CV] END

In [43]:
# Convert the grid-search results to a readable DataFrame.
# A dedicated name is used so the dataset held in `df` is not overwritten,
# which would break re-running the earlier feature-selection and split cells.
cv_results = pd.DataFrame(regr.cv_results_)

# Keep every parameter combination together with its mean cross-validated scores
result = cv_results[['param_n_estimators', 'param_learning_rate', 'param_loss',
                     'param_criterion', 'param_max_depth', 'mean_test_r2',
                     'mean_test_neg_mean_squared_error']]
result.to_csv('./output/gradient_boosting_B/performance_result_for_gradient_boosting_gridsearchCV.csv')
result

Unnamed: 0,param_n_estimators,param_learning_rate,param_loss,param_criterion,param_max_depth,mean_test_r2,mean_test_neg_mean_squared_error
0,40,0.01,squared_error,squared_error,10,-0.043856,-28.585709
1,50,0.01,squared_error,squared_error,10,-0.075537,-29.414551
2,60,0.01,squared_error,squared_error,10,-0.108951,-30.386001
3,80,0.01,squared_error,squared_error,10,-0.186910,-32.351874
4,40,0.01,squared_error,squared_error,12,-0.049634,-28.795266
...,...,...,...,...,...,...,...
187,80,0.10,absolute_error,friedman_mse,14,0.097372,-24.452156
188,40,0.10,absolute_error,friedman_mse,16,0.123282,-23.788551
189,50,0.10,absolute_error,friedman_mse,16,0.085483,-24.823139
190,60,0.10,absolute_error,friedman_mse,16,0.131105,-23.698746


In [44]:
# Show the best-performing parameter combination, then its score
# (the search was configured with refit='r2', so this is the R2 score)
print(regr.best_params_, regr.best_score_, sep="\n")

{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'absolute_error', 'max_depth': 16, 'n_estimators': 60}
0.1311050725750161


In [45]:
import time
from sklearn.ensemble import GradientBoostingRegressor

# Rebuild the regressor from the best grid-search parameters. The fixed
# random_state makes the fitted model, and therefore the reported metrics and
# the saved model file, reproducible across runs.
regressor = GradientBoostingRegressor(
    **regr.best_params_,
    random_state=10,
    verbose=1)

# perf_counter is a monotonic clock, so the measured duration is not affected
# by system clock adjustments
start = time.perf_counter()  # start of the timed section
regressor.fit(X_train_poly, y_train)  # fit/train the model
end = time.perf_counter()  # end of the timed section
print('run_time:', convert(end - start), 'h:m:s')

pred_1 = regressor.predict(X_train_poly)  # prediction for training set
pred_2 = regressor.predict(X_test_poly)  # prediction for testing set
                                  

      Iter       Train Loss   Remaining Time 
         1           3.8257            0.85s
         2           3.5019            0.86s
         3           3.2158            0.91s
         4           2.9625            0.87s
         5           2.7494            0.84s
         6           2.5418            0.79s
         7           2.3596            0.75s
         8           2.1911            0.72s
         9           2.0221            0.69s
        10           1.9021            0.68s
        20           1.1094            0.50s
        30           0.7481            0.36s
        40           0.5364            0.23s
        50           0.4068            0.11s
        60           0.3151            0.00s
run_time: 0:00:00 h:m:s


In [46]:
# Save the true and predicted values of the training and testing sets
# to one Excel file, each set on its own sheet

# training: true values next to the model's predictions
a = pd.DataFrame(np.column_stack([y_train, pred_1]),
                 columns=['y_train', 'y_train_pred'])

# testing: true values next to the model's predictions
b = pd.DataFrame(np.column_stack([y_test, pred_2]),
                 columns=['y_test', 'y_test_pred'])

with pd.ExcelWriter("./output/gradient_boosting_B/predicted_gradient_boosting_model.xlsx") as writer:
    # write each frame to the sheet named after its data split
    for sheet, frame in {"training": a, "testing": b}.items():
        frame.to_excel(writer, sheet_name=sheet, index=True)


In [47]:
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error,
    explained_variance_score,
    max_error,
)


def _score_split(y_true, y_pred):
    """Return [R2, MAE, MSE, MAPE, EV, maxE, minE] for one data split.

    y_true holds the observed labels and y_pred the model predictions for
    the same rows. minE is the smallest absolute residual.
    """
    scorers = (r2_score, mean_absolute_error, mean_squared_error,
               mean_absolute_percentage_error, explained_variance_score,
               max_error)
    scores = [scorer(y_true, y_pred) for scorer in scorers]
    scores.append(min(abs(y_true - y_pred)))
    return scores


# scores for the training set
(train_r2, train_mae, train_mse, train_mape,
 train_ev, train_maxE, train_minE) = _score_split(y_train, pred_1)

# scores for the testing set
(test_r2, test_mae, test_mse, test_mape,
 test_ev, test_maxE, test_minE) = _score_split(y_test, pred_2)

metrics = {
    'performance_metrics': ['R2', 'MAE', 'MSE', 'MAPE', 'EV', 'maxE', 'minE'],
    'training': [train_r2, train_mae, train_mse, train_mape, train_ev, train_maxE, train_minE],
    'testing': [test_r2, test_mae, test_mse, test_mape, test_ev, test_maxE, test_minE],
}

# one row per dict key after transposing: metric names, training, testing
performance_metrics = pd.DataFrame(metrics)
performance = performance_metrics.transpose()
performance.to_csv('./output/gradient_boosting_B/performance_gradient_boosting.csv')
performance

Unnamed: 0,0,1,2,3,4,5,6
performance_metrics,R2,MAE,MSE,MAPE,EV,maxE,minE
training,0.973203,0.315097,0.761548,0.036658,0.974079,6.698594,0.000033
testing,0.535097,2.546885,13.302084,0.267382,0.537303,12.041139,0.024787


In [48]:
# Persist the fitted regressor to disk with joblib
from joblib import dump, load
dump(regressor, './output/gradient_boosting_B/trained_gradient_boosting_model.joblib')

# To restore it later: load('./output/gradient_boosting_B/trained_gradient_boosting_model.joblib')
# NOTE: joblib files are pickle-based; only load files from a trusted source.

['./output/gradient_boosting_B/trained_gradient_boosting_model.joblib']

In [49]:
#from sklearn.externals import joblib
from joblib import dump, load