In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [30]:
# Read the workbook 'illite.xlsx' (path is relative to the working directory) into a DataFrame.
df = pd.read_excel('illite.xlsx')

In [31]:
# Every column except the last one is an input feature; the last column is the label.
X, y = df.iloc[:, :-1], df.iloc[:, -1]
# Show both shapes as a quick sanity check of the split into features/label.
(X.shape, y.shape)

((206, 6), (206,))

In [32]:
# Splitting X and y into training and testing sets, then expanding the
# features with polynomial terms.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Define features (X) and label (y): the last column of df is the label,
# every column before it is a feature.
X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]   # Label

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)

# Apply Polynomial Features
degree = 2  # You can change the degree based on your requirement
poly = PolynomialFeatures(degree=degree, include_bias=False)
# Fit the expansion on the training rows only, then apply the very same
# transformation to the test rows.
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [33]:
def convert(seconds):
    """Format an elapsed duration in seconds as an ``H:MM:SS`` string.

    Used to report how long model fitting took.

    Parameters
    ----------
    seconds : int or float
        Elapsed time in seconds. Fractions of a second are truncated.
        Negative values (e.g. from a clock adjustment) are reported as zero.

    Returns
    -------
    str
        Hours, minutes and seconds as ``"H:MM:SS"``. Hours are not wrapped
        at 24, so a run of 25 hours is reported as ``"25:00:00"`` instead of
        silently losing a whole day.
    """
    # Work on whole, non-negative seconds so the divmod arithmetic stays exact.
    total = max(int(seconds), 0)
    minutes, secs = divmod(total, 60)
    hours, minutes = divmod(minutes, 60)
    return "%d:%02d:%02d" % (hours, minutes, secs)

In [34]:
# Loading the XGBoost regressor and GridSearchCV from model_selection
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import time

# Exhaustive grid search over the XGBoost hyper-parameters listed below.
# Each candidate is scored with 10-fold cross-validation on the training
# split; two metrics are recorded and the candidate with the best mean R2
# is refitted on the whole training split (refit='r2').
regr = GridSearchCV(
    XGBRegressor(),
    {
        'n_estimators': [40, 50, 60, 80, 100],
        'max_depth': [5, 8, 10, 12, 15],
        'learning_rate': [0.001, 0.01, 0.1],
    },
    cv=10,
    scoring=['neg_mean_squared_error', 'r2'],
    refit='r2',
    verbose=1)

# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments during the search.
start = time.perf_counter()

# Fit the search on the training split only. The hold-out test split is
# intentionally NOT passed as eval_set: no early stopping is configured, so
# it would not change the selected model, but it would be evaluated in every
# one of the CV fits and would expose test data to the model-selection step.
regr.fit(X_train_poly, y_train)

end = time.perf_counter()

print('gridsearch_run_time:', convert(end - start), 'h:m:s')

#regr.cv_results_

Fitting 10 folds for each of 75 candidates, totalling 750 fits
gridsearch_run_time: 0:02:46 h:m:s


In [35]:
import os

# Make sure the folder that receives the XGBoost outputs exists before anything is written to it.
output_dir = "./output/xgboost"
os.makedirs(name=output_dir, exist_ok=True)

In [36]:
# Convert the grid-search results to a DataFrame for readability.
# A dedicated name is used on purpose: rebinding `df` here would overwrite the
# raw dataset loaded at the top of the notebook, and re-running the feature
# selection / split cells would then operate on the CV results instead.
cv_results = pd.DataFrame(regr.cv_results_)

# Keep one row per parameter combination with its mean cross-validated scores.
result = cv_results[['param_max_depth', 'param_n_estimators',
                     'param_learning_rate',
                     'mean_test_r2', 'mean_test_neg_mean_squared_error']]
result.to_csv('./output/xgboost/performance_result_for_xgboost_gridsearchCV.csv')
result

Unnamed: 0,param_max_depth,param_n_estimators,param_learning_rate,mean_test_r2,mean_test_neg_mean_squared_error
0,5,40,0.001,-0.047234,-28.391930
1,5,50,0.001,-0.043531,-28.309539
2,5,60,0.001,-0.039840,-28.227917
3,5,80,0.001,-0.032793,-28.072944
4,5,100,0.001,-0.026176,-27.931166
...,...,...,...,...,...
70,15,40,0.100,-0.056736,-29.418380
71,15,50,0.100,-0.071255,-29.836510
72,15,60,0.100,-0.079102,-30.044209
73,15,80,0.100,-0.085853,-30.224253


In [37]:
# Report the parameter combination selected by the search, followed on the
# next line by the best score the search recorded for it.
print(regr.best_params_, regr.best_score_, sep="\n")

{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
0.08910192042773019


In [38]:
import time
from xgboost import XGBRegressor

# Rebuild the regressor with the hyper-parameters selected by the grid search.
# XGBoost's constructor argument for log level is `verbosity` (0 = silent);
# the previous `verbose=0` was not a recognised parameter and only triggered
# the 'Parameters: { "verbose" } are not used.' warning.
regressor = XGBRegressor(
    n_estimators=regr.best_params_['n_estimators'],
    max_depth=regr.best_params_['max_depth'],
    learning_rate=regr.best_params_['learning_rate'],
    verbosity=0)

start = time.perf_counter()  # monotonic clock: start of the timed section
# eval_set makes XGBoost record the per-round metric on the test split;
# verbose=False keeps that per-round log from being printed.
regressor.fit(X_train_poly, y_train, eval_set=[(X_test_poly, y_test)], verbose=False)  # fit/train the model
end = time.perf_counter()  # end of the timed section
print('training_run_time:', convert(end - start), 'h:m:s')

pred_1 = regressor.predict(X_train_poly)  # prediction for training set
pred_2 = regressor.predict(X_test_poly)  # prediction for testing set


Parameters: { "verbose" } are not used.



training_run_time: 0:00:00 h:m:s


In [39]:
# Pair the observed labels with the model's predictions, one table per split,
# and write both tables to a single Excel workbook (one sheet per split).
a = pd.DataFrame(np.column_stack((y_train, pred_1)),
                 columns=['y_train', 'y_train_pred'])  # training
b = pd.DataFrame(np.column_stack((y_test, pred_2)),
                 columns=['y_test', 'y_test_pred'])  # testing

with pd.ExcelWriter("./output/xgboost/predicted_xgboost_model.xlsx") as writer:
    # Each frame goes to its own named sheet; the positional row index is kept.
    for frame, sheet in ((a, "training"), (b, "testing")):
        frame.to_excel(writer, sheet_name=sheet, index=True)
  

In [40]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error

def _score_split(y_true, y_pred):
    """Return [R2, MAE, MSE, MAPE, EV, maxE, minE] for one split, in that order."""
    return [
        r2_score(y_true, y_pred),                        # coefficient of determination
        mean_absolute_error(y_true, y_pred),             # mean absolute error
        mean_squared_error(y_true, y_pred),              # mean squared error
        mean_absolute_percentage_error(y_true, y_pred),  # mean absolute percentage error
        explained_variance_score(y_true, y_pred),        # explained variance
        max_error(y_true, y_pred),                       # largest absolute residual
        min(abs(y_true - y_pred)),                       # smallest absolute residual
    ]


# Scores on the training split.
(train_r2, train_mae, train_mse, train_mape,
 train_ev, train_maxE, train_minE) = _score_split(y_train, pred_1)

# Scores on the testing split.
(test_r2, test_mae, test_mse, test_mape,
 test_ev, test_maxE, test_minE) = _score_split(y_test, pred_2)

metrics = {
    'performance_metrics': ['R2', 'MAE', 'MSE', 'MAPE', 'EV', 'maxE', 'minE'],
    'training': [train_r2, train_mae, train_mse, train_mape, train_ev, train_maxE, train_minE],
    'testing': [test_r2, test_mae, test_mse, test_mape, test_ev, test_maxE, test_minE],
}

# One row per dictionary key after transposing: metric names, training, testing.
performance_metrics = pd.DataFrame(metrics)
performance = performance_metrics.transpose()
performance.to_csv('./output/xgboost/performance_xgboost.csv')
performance

Unnamed: 0,0,1,2,3,4,5,6
performance_metrics,R2,MAE,MSE,MAPE,EV,maxE,minE
training,0.662902,2.418983,9.579868,0.340441,0.662986,10.711189,0.01531
testing,0.45344,2.898638,15.638505,0.328095,0.454666,12.885036,0.061711


In [41]:
#from sklearn.externals import joblib
from joblib import dump, load
# Persist the fitted regressor to disk; the cell displays the list of written file names.
dump(value=regressor, filename='./output/xgboost/trained_xgboost_model.joblib')

#clf = load('./output/xgboost/trained_xgboost_model.joblib')  # joblib files are pickles: only load files you trust

['./output/xgboost/trained_xgboost_model.joblib']

In [42]:
from joblib import dump, load