In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from numpy.linalg import pinv
import xgboost as xgb
from sklearn.linear_model import SGDRegressor, LinearRegression

In [2]:
data = pd.read_csv('data_new.csv')
data.head(3)

Unnamed: 0,DES,HBA,HBD,MR,T/K,Dexp,MCI,Tbm,Tcm,Vcm,Pcm,ωm,MW
0,DES2,ChCl,EG,1.2,308.57,1.1163,1.7208,439,610.73,290.69,59.92,0.94,87.6648
1,DES2,ChCl,EG,1.2,312.56,1.1142,1.7208,439,610.73,290.69,59.92,0.94,87.6648
2,DES2,ChCl,EG,1.2,313.52,1.1137,1.7208,439,610.73,290.69,59.92,0.94,87.6648


In [3]:
# from matplotlib import pyplot as plt
# temp = data.iloc[:,4]
# density = data.iloc[:,5]
# plt.scatter(temp,density)
# plt.show()

In [4]:
X = data[['MR','T/K','MCI','ωm','MW']]
y = data['Dexp']
print(X.shape, y.shape)

(494, 5) (494,)


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
y_test = y_test.to_numpy()

In [8]:
# # Without Regularization

# lr = LinearRegression()
# lr.fit(X_train, y_train)

# pred = lr.predict(X_test)

# sum = 0
# for i in range(len(pred)):
#     sum += abs(pred[i] - y_test[i]) / y_test[i]
# mape = sum / len(y_test)
# print(f'MAPE: {mape*100}')
# print(f'Accuracy: {(1 - mape)*100}')

In [6]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

objectives = ['reg:gamma', 'reg:squarederror', 'reg:squaredlogerror', 'reg:tweedie', 'reg:pseudohubererror','reg:absoluteerror']

for obj in objectives:
    params = {
        'objective': obj,
        'eval_metric': 'mape',
        'seed': 42
    }

    model = xgb.train(params, dtrain, 1000)
    pred = model.predict(dtest)

    sum = 0
    for i in range(len(pred)):
        sum += abs(pred[i] - y_test[i]) / y_test[i]
    mape = sum / len(y_test)
    print(f'\nObjective: {obj}')
    print(f'MAPE: {mape*100}')
    print(f'Accuracy: {(1 - mape)*100}\n')


Objective: reg:gamma
MAPE: 0.2905368439971696
Accuracy: 99.70946315600283


Objective: reg:squarederror
MAPE: 0.2874872669564596
Accuracy: 99.71251273304354


Objective: reg:squaredlogerror
MAPE: 0.2555065402356315
Accuracy: 99.74449345976437


Objective: reg:tweedie
MAPE: 0.24419405196512486
Accuracy: 99.75580594803488


Objective: reg:pseudohubererror
MAPE: 0.2702517272963837
Accuracy: 99.72974827270362


Objective: reg:absoluteerror
MAPE: 4.9285095644821535
Accuracy: 95.07149043551784



In [None]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

eval_metric = ["rmse","rmsle","mae"]
objectives = ['reg:gamma', 'reg:squarederror', 'reg:squaredlogerror', 'reg:tweedie', 'reg:pseudohubererror','reg:absoluteerror']

for obj in objectives:
        print("Objective : "+obj)
        for i in eval_metric:

                params = {
                'objective': obj,
                'eval_metric': i,
                'seed': 42
                }


                model = xgb.train(params, dtrain, 1000)
                pred = model.predict(dtest)

                if i =="rmse":
                        sum = 0
                        for j in range(len(pred)):
                                sum += (pred[j] - y_test[j])**2
                        sum/=len(pred)
                        sum=(sum)**(0.5)
                        sum = round(sum,6)
                        print("RMSE")
                        print(f'Error:{sum*100}')
                        print(f'Accuracy: {(1 - sum)*100}')
                        
                elif i == "rmsle":
        
                        n = len(y_test)
                        msle = round(np.sqrt(np.mean(np.square(np.log1p(pred) - np.log1p(y_test)))),6)
                        print("RMSLE")
                        print(f'Error:{msle*100}')
                        print(f'Accuracy: {(1 - msle)*100}')
        
                elif i == "mae":
                        sum = 0
                        for i in range(len(pred)):
                                sum += abs(pred[i] - y_test[i]) / y_test[i]
                        mape = round(sum / len(y_test),6)
                        print("MAE")
                        print(f'Error: {mape*100}')
                        print(f'Accuracy: {(1 - mape)*100}\n') 


                                        

In [16]:
# X_train=np.array(X_train)
# type(X_train)
y_train  = np.array(y_train)

# K - Fold Validation

In [19]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)

scores = []
for train_index, test_index in kf.split(X_train):
    x_train, x_test = X_train[train_index], X_train[test_index]
    ytrain, ytest = y_train[train_index], y_train[test_index]

    dtrain = xgb.DMatrix(x_train, label=ytrain)
    dtest = xgb.DMatrix(x_test, label=ytest)

    params = {
                'objective': "reg:tweedie",
                'eval_metric': "mape",
                'seed': 42
            }
    
    model = xgb.train(params, dtrain, 1000)
    pred = model.predict(dtest)

    sum = 0
    for i in range(len(pred)):
        sum += abs(pred[i] - y_test[i]) / y_test[i]
    mape = sum / len(y_test)
    
    scores.append(mape)

avg = np.mean(np.array(scores))

print(f'Error: {avg*100}')
print(f'Accuracy: {(1 - avg)*100}\n') 


Error: 5.723766163251757
Accuracy: 94.27623383674825

