#### *Importing Libraries*

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor

##### *Loading Data*

In [None]:
model_df = pd.read_csv(r'Copper.csv')

##### *Model Building*

##### *Querying Won and Lost*

In [None]:
# Win/Lost
query_df = model_df.query("status == 'Won' or status == 'Lost'")
print(query_df['status'].value_counts())

Won     59278
Lost     8741
Name: status, dtype: int64


In [None]:
query_df['item_type']

##### Reducing Skewness

In [6]:
skew_df = query_df.drop('material_ref',axis = 1)

In [7]:
skew_df = skew_df.query("thickness > 0 and quantity_tons > 0")

In [8]:
# Skewness

print(skew_df['thickness'].skew())
print(skew_df['width'].skew())
print(skew_df['selling_price'].skew())
print(skew_df['quantity_tons'].skew())

34.70559093594388
0.5234370646044206
149.540788051666
30.275586829290734


##### Log Transformation

In [9]:
skew_df['thickness'] = np.log(skew_df['thickness'])
skew_df['width'] = np.log(skew_df['width'])
skew_df['quantity_tons'] = np.log(skew_df['quantity_tons'])
skew_df['selling_price'] = np.log(skew_df['selling_price'])

In [10]:
# Skewness

print(skew_df['thickness'].skew())
print(skew_df['width'].skew())
print(skew_df['selling_price'].skew())
print(skew_df['quantity_tons'].skew())

0.42781018418508565
-1.9383285820075098
0.09219786229335214
0.21051434919918785


In [12]:
skew_df.to_csv('Copper_log_transformed.csv')

##### *Training the Model*

In [21]:
x1 = skew_df.drop(['selling_price','status'],axis = 1)
y1 = skew_df['selling_price']

x_train1,x_test1,y_train1,y_test1 = train_test_split(x1,y1,test_size=0.25,random_state=42)
x_train1.sample()

Unnamed: 0,item_date,country,item type,application,thickness,width,product_ref,delivery date,quantity_tons
80302,2020,26.0,S,10.0,1.386294,7.600902,1670798778,2020,3.258097


In [22]:
trans1 = ColumnTransformer([
    ('trans1',OrdinalEncoder(categories=[[1995,2020,2021],[2020,2021]]),[0,7]),
    ('trans2',OneHotEncoder(sparse_output = False,handle_unknown = 'ignore'),[1,2,3,6])], remainder = 'passthrough')

trans2 = RandomForestRegressor(n_estimators=100,random_state=42)

trans3 = XGBRegressor()

trans4 = HistGradientBoostingRegressor(max_depth = 5,categorical_features = [0,1,2,3,6,7],random_state=42)

trans5 = ExtraTreesRegressor(n_estimators=100,max_depth = 5)

pipe = Pipeline([
    ('trans1',trans1),
    ('trans2',trans2)
])


pipe_1 = Pipeline([
    ('trans1',trans1),
    ('trans3',trans3)
])

pipe_2 = Pipeline([
    ('trans1',trans1),
    ('trans4',trans4)
])

pipe_3 = Pipeline([
    ('trans1',trans1),
    ('trans5',trans5)
])

In [23]:
# Random forest regressor
result = pipe.fit(x_train1,y_train1)

# Xgboost
result_1 = pipe_1.fit(x_train1,y_train1)

# HistGradientBoostingRegressor
result_2 = pipe_2.fit(x_train1,y_train1)

# ExtraTreesRegressor
result_3 = pipe_3.fit(x_train1,y_train1)

##### *Prediction*

In [24]:
# Random forest regressor
y_pred = pipe.predict(x_test1)

# Xgboost
y_pred_1 = pipe_1.predict(x_test1)

# HistGradientBoostingRegressor
y_pred_2 = pipe_2.predict(x_test1)

# ExtraTreeRegressor
y_pred_3 = pipe_3.predict(x_test1)

##### Evaluation Metrics

In [25]:
# Random forest regressor
r2 = r2_score(y_test1,y_pred)
print(f"R-squared (R2): {r2}")

# Xgboost
r2 = r2_score(y_test1,y_pred_1)
print(f"R-squared (R2): {r2}")

# HistGradientBoostingRegressor
r2 = r2_score(y_test1,y_pred_2)
print(f"R-squared (R2): {r2}")

# ExtraTreeRegressor
r2 = r2_score(y_test1,y_pred_3)
print(f"R-squared (R2): {r2}")

R-squared (R2): 0.8915284158110927
R-squared (R2): 0.8860394511560808
R-squared (R2): 0.8868001273164166
R-squared (R2): 0.8064224440008776


##### *User Prediction*

In [16]:
# Real world Prediction

item_year = 2021
country = 32
status = 'Lost'
item_type = 'W'
application = 10
product_ref = 1670798778
delivery_year = 2021

thickness = 0.75
log_thickness = np.log(thickness)

width = 1000
log_width = np.log(width)

quantity_tons = 20
log_quantity = np.log(quantity_tons)

In [26]:
user_input = pd.DataFrame([[item_year,country,item_type,application,log_thickness,log_width,product_ref,delivery_year,quantity_tons]],
                          columns = ['item_date','country','item type','application','thickness','width','product_ref','delivery date','quantity_tons'])
user_input

Unnamed: 0,item_date,country,item type,application,thickness,width,product_ref,delivery date,quantity_tons
0,2021,32,W,10,-0.287682,6.907755,1670798778,2021,20


In [27]:
prediction = pipe.predict(user_input)
prediction

array([6.75652757])

In [28]:
np.exp(prediction)

array([859.65192275])

##### *Saving the model*

In [29]:
pickle.dump(pipe,open('price_prediction.pkl','wb'))