In [25]:
import pandas as pd
import numpy as np

In [26]:
data = pd.read_csv('Copper.csv')

In [27]:
data.head()

Unnamed: 0,item_date,quantity tons,country,status,item type,application,thickness,width,product_ref,delivery date,selling_price
0,2021-04-01,4.04,28.0,1,5,10.0,0.65,1500.0,1670798778,2021-07-01,9.41
1,2021-04-01,6.78,25.0,1,5,41.0,-0.23,1210.0,1668701718,2021-04-01,9.79
2,2021-04-01,6.06,30.0,1,6,28.0,-1.06,952.0,628377,2021-01-01,8.88
3,2021-04-01,5.39,32.0,1,3,59.0,0.77,1317.0,1668701718,2021-01-01,9.21
4,2021-04-01,6.8,28.0,1,5,10.0,1.23,1984.5,640665,2021-03-01,8.68


In [28]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['item_date'] = le.fit_transform(data['item_date'])

In [29]:
data['delivery date'] = pd.to_datetime(data['delivery date']).dt.year

In [31]:
X = data.drop(['selling_price'],axis = 1)
y = data['selling_price']

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn import metrics

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Random Forest Regression Model

In [34]:
rf = RandomForestRegressor(n_estimators=150)
rf.fit(X_train, y_train)

In [35]:
rf_train_predictions = rf.predict(X_train)
rf_test_predictions = rf.predict(X_test)

In [37]:
print("RANDOM FOREST REGRESSION")
print("--------------------------")
print("Evaluating Train Data")
print("--------------------------")

print('R2 score:',metrics.r2_score(y_train,rf_train_predictions))
print("MAE :",metrics.mean_absolute_error(y_train,rf_train_predictions))
print("MSE :",metrics.mean_squared_error(y_train,rf_train_predictions))
print('RMSE :', np.sqrt(metrics.mean_squared_error(y_train,rf_train_predictions)))

print('---------------------------------------------------------------')

print("Evaluating Test Data")
print("-----------------------")
print('R2 score:',metrics.r2_score(y_test,rf_test_predictions))
print("MAE :",metrics.mean_absolute_error(y_test,rf_test_predictions))
print("MSE :",metrics.mean_squared_error(y_test,rf_test_predictions))
print('RMSE :', np.sqrt(metrics.mean_squared_error(y_test,rf_test_predictions)))

RANDOM FOREST REGRESSION
--------------------------
Evaluating Train Data
--------------------------
R2 score: 0.992667794276853
MAE : 0.023265194290276146
MSE : 0.001458518539052824
RMSE : 0.0381905556263957
---------------------------------------------------------------
Evaluating Test Data
-----------------------
R2 score: 0.9506099592148898
MAE : 0.06182425632686653
MSE : 0.009754574671836033
RMSE : 0.09876525032538536


# 2. Decsion Tree Regression Model

In [13]:
dtree = DecisionTreeRegressor()
dtree.fit(X_train,y_train)

In [14]:
dt_train_predictions = dtree.predict(X_train)
dt_test_predictions = dtree.predict(X_test)

In [15]:
print("DECISION TREE REGRESSION")
print("----------------------------")
print("Evaluating Train Data")
print("----------------------------")

print('R2 score:',metrics.r2_score(y_train,dt_train_predictions))
print("MAE :",metrics.mean_absolute_error(y_train,dt_train_predictions))
print("MSE :",metrics.mean_squared_error(y_train,dt_train_predictions))
print('RMSE :', np.sqrt(metrics.mean_squared_error(y_train,dt_train_predictions)))

print('---------------------------------------------------------------')

print("Evaluating Test Data")
print("-----------------------")
print('R2 score:',metrics.r2_score(y_test,dt_test_predictions))
print("MAE :",metrics.mean_absolute_error(y_test,dt_test_predictions))
print("MSE :",metrics.mean_squared_error(y_test,dt_test_predictions))
print('RMSE :', np.sqrt(metrics.mean_squared_error(y_test,dt_test_predictions)))

DECISION TREE REGRESSION
----------------------------
Evaluating Train Data
----------------------------
R2 score: 0.9990836294797852
MAE : 0.001419776351203239
MSE : 0.00018228394603760217
RMSE : 0.013501257202112779
---------------------------------------------------------------
Evaluating Test Data
-----------------------
R2 score: 0.9126792096818074
MAE : 0.07583236946189385
MSE : 0.01724592966562878
RMSE : 0.13132375895331652


# 3. Extra Trees Regression Model

In [16]:
ext = ExtraTreesRegressor(n_estimators=150,max_depth=5)
ext.fit(X_train,y_train)

In [17]:
ex_train_pred = ext.predict(X_train)
ex_test_pred = ext.predict(X_test)

In [18]:
print("EXTRA TREE REGRESSION")
print("----------------------------")
print("Evaluating Train Data")
print("----------------------------")

print('R2 score:',metrics.r2_score(y_train,ex_train_pred ))
print("MAE :",metrics.mean_absolute_error(y_train,ex_train_pred ))
print("MSE :",metrics.mean_squared_error(y_train,ex_train_pred ))
print('RMSE :', np.sqrt(metrics.mean_squared_error(y_train,ex_train_pred )))

print('---------------------------------------------------------------')

print("Evaluating Test Data")
print("-----------------------")
print('R2 score:',metrics.r2_score(y_test,ex_test_pred))
print("MAE :",metrics.mean_absolute_error(y_test,ex_test_pred))
print("MSE :",metrics.mean_squared_error(y_test,ex_test_pred))
print('RMSE :', np.sqrt(metrics.mean_squared_error(y_test,ex_test_pred)))

EXTRA TREE REGRESSION
----------------------------
Evaluating Train Data
----------------------------
R2 score: 0.7170484679883752
MAE : 0.18320940153699714
MSE : 0.056284571202023806
RMSE : 0.23724369581091886
---------------------------------------------------------------
Evaluating Test Data
-----------------------
R2 score: 0.7154558585444026
MAE : 0.18313219317056134
MSE : 0.056197707698570555
RMSE : 0.23706055702830564


# 4. XGBoost Regression Model

In [19]:
xgb  = XGBRegressor()
xgb.fit(X_train,y_train)

AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)

In [20]:
xg_train_pred = xgb.predict(X_train)
xg_test_pred = xgb.predict(X_test)

In [21]:
print("XGBoost REGRESSION")
print("----------------------------")
print("Evaluating Train Data")
print("----------------------------")

print('R2 score:',metrics.r2_score(y_train,xg_train_pred ))
print("MAE :",metrics.mean_absolute_error(y_train,xg_train_pred ))
print("MSE :",metrics.mean_squared_error(y_train,xg_train_pred))
print('RMSE :', np.sqrt(metrics.mean_squared_error(y_train,xg_train_pred)))

print('---------------------------------------------------------------')

print("Evaluating Test Data")
print("-----------------------")
print('R2 score:',metrics.r2_score(y_test,xg_test_pred))
print("MAE :",metrics.mean_absolute_error(y_test,xg_test_pred))
print("MSE :",metrics.mean_squared_error(y_test,xg_test_pred))
print('RMSE :', np.sqrt(metrics.mean_squared_error(y_test,xg_test_pred)))

XGBoost REGRESSION
----------------------------
Evaluating Train Data
----------------------------
R2 score: 0.9396971330836226
MAE : 0.07601472160211677
MSE : 0.01199541484193685
RMSE : 0.1095235812139872
---------------------------------------------------------------
Evaluating Test Data
-----------------------
R2 score: 0.9318699978224029
MAE : 0.08014061234165101
MSE : 0.013455732837420055
RMSE : 0.11599884843143941




*   From the above metrics The Random forest Regressor has the 95% r2 score with minimum errors. Thus the model explains or predicts 95% of the relationship between the dependent and independent variables. Hence choosing the Ransdom Forest Regression Model




# Predicting the Price with the model

In [41]:
import numpy as np
from scipy.special import inv_boxcox

# Your input data 'ip'
ip = [[109, 5.41, 26.0, 1, 3, 15.0, 0.65, 1250.0, 164141591, 2021]]

# Get the feature names from the trained Random Forest model
columns = rf.feature_names_in_

# Convert 'ip' into a pandas DataFrame with the correct feature names
ip_df = pd.DataFrame(ip, columns=columns)

# Now, make the prediction
predict = rf.predict(ip_df)

# Convert the prediction using Box-Cox inverse transformation
selling_price = inv_boxcox(predict[0].round(2), 0.09343054475928997)

print(selling_price)

720.7349666862001


# Saving the model as pickle file

In [42]:
import pickle
with open('rf_reg.pkl','wb') as file:
    pickle.dump(rf,file)