#### *Importing Libraries*

In [187]:
import pandas as pd
import numpy as np
import plotly.express as px
import pickle
from imblearn.over_sampling import SMOTEN
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report

##### *Loading Data*

In [2]:
model_df = pd.read_csv(r'Copper.csv')

##### *Querying Won and Lost*

In [213]:
# Win/Lost
query_df = model_df.query("status == 'Won' or status == 'Lost'")
print(query_df['status'].value_counts())

Won     59278
Lost     8741
Name: status, dtype: int64


In [44]:
fig = px.bar(x = query_df['status'].value_counts().values)
fig.show()

##### *The Data is Highly Imbalanced*

In [214]:
new_df = pd.read_csv(r'F:\GUVI_DATA_SCIENCE\Project\Copper_Industry_Modeling\Copper_log_transformed.csv')

In [218]:
query_df = new_df.drop(['application','Unnamed: 0','item_date'],axis = 1)
query_df.sample()

Unnamed: 0,country,status,item type,thickness,width,product_ref,delivery date,selling_price,quantity_tons
4456,27.0,Lost,S,2.302585,6.907755,640665,2021,6.922644,4.007333


##### *Model Building*

In [219]:
x = query_df.drop(['status'],axis = 1)
y = query_df['status']

In [220]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [224]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42)

In [227]:
x_train.shape,x_test.shape
x_train.sample()

Unnamed: 0,country,item type,thickness,width,product_ref,delivery date,selling_price,quantity_tons
57932,32.0,W,1.098612,7.31322,628377,2020,6.391917,2.639057


In [228]:
trans1 = ColumnTransformer([
    ('trans1',OneHotEncoder(sparse_output = False,handle_unknown = 'ignore'),[0,1,4,5])
],remainder = 'passthrough')

In [229]:
x_train_trans = trans1.fit_transform(x_train)
x_train_trans.shape

(50698, 52)

In [230]:
x_test_trans = trans1.transform(x_test)
x_test_trans.shape

(16900, 52)

##### *Sampling*

In [182]:
# SMOTEENN - Combination

sampling = SMOTEENN()

x_res,y_res = sampling.fit_resample(x_train_trans,y_train)

In [231]:
# SMOTEN - Over sampling

sampling = SMOTEN()

x_res,y_res = sampling.fit_resample(x_train_trans,y_train)

##### *Training the Model*

In [232]:
model = RandomForestClassifier(random_state=42)

model_1 = XGBClassifier()

model_2 = HistGradientBoostingClassifier(random_state=42)

model_3 = ExtraTreesClassifier()

In [233]:
result = model.fit(x_res,y_res)
result = model_1.fit(x_res,y_res)
result = model_2.fit(x_res,y_res)
result = model_3.fit(x_res,y_res)

##### *Predictions*

In [234]:
# Random forest regressor
y_pred = model.predict(x_test_trans)
y_pred_1 = model_1.predict(x_test_trans)
y_pred_2 = model_2.predict(x_test_trans)
y_pred_3 = model_3.predict(x_test_trans)

##### *Classification Report*

In [168]:
# SMOTEENN - Default data
report = classification_report(y_test,y_pred)
print(report)

report = classification_report(y_test,y_pred_1)
print(report)

report = classification_report(y_test,y_pred_2)
print(report)

report = classification_report(y_test,y_pred_3)
print(report)

              precision    recall  f1-score   support

           0       0.64      0.75      0.69      3102
           1       0.96      0.94      0.95     20705

    accuracy                           0.91     23807
   macro avg       0.80      0.84      0.82     23807
weighted avg       0.92      0.91      0.91     23807

              precision    recall  f1-score   support

           0       0.52      0.70      0.60      3102
           1       0.95      0.90      0.93     20705

    accuracy                           0.88     23807
   macro avg       0.74      0.80      0.76     23807
weighted avg       0.90      0.88      0.88     23807

              precision    recall  f1-score   support

           0       0.46      0.70      0.56      3102
           1       0.95      0.88      0.91     20705

    accuracy                           0.86     23807
   macro avg       0.71      0.79      0.74     23807
weighted avg       0.89      0.86      0.87     23807

              preci

In [186]:
# SMOTEENN - Processed Data
report = classification_report(y_test,y_pred)
print(report)

report = classification_report(y_test,y_pred_1)
print(report)

report = classification_report(y_test,y_pred_2)
print(report)

report = classification_report(y_test,y_pred_3)
print(report)

              precision    recall  f1-score   support

           0       0.54      0.86      0.66      3003
           1       0.98      0.89      0.93     20657

    accuracy                           0.89     23660
   macro avg       0.76      0.88      0.80     23660
weighted avg       0.92      0.89      0.90     23660

              precision    recall  f1-score   support

           0       0.44      0.83      0.58      3003
           1       0.97      0.85      0.90     20657

    accuracy                           0.84     23660
   macro avg       0.71      0.84      0.74     23660
weighted avg       0.90      0.84      0.86     23660

              precision    recall  f1-score   support

           0       0.39      0.85      0.53      3003
           1       0.97      0.81      0.88     20657

    accuracy                           0.81     23660
   macro avg       0.68      0.83      0.71     23660
weighted avg       0.90      0.81      0.84     23660

              preci

In [212]:
# SMOTEN - Default data

report = classification_report(y_test,y_pred)
print(report)

report = classification_report(y_test,y_pred_1)
print(report)

report = classification_report(y_test,y_pred_2)
print(report)

report = classification_report(y_test,y_pred_3)
print(report)

              precision    recall  f1-score   support

           0       0.72      0.65      0.68      3102
           1       0.95      0.96      0.96     20705

    accuracy                           0.92     23807
   macro avg       0.83      0.81      0.82     23807
weighted avg       0.92      0.92      0.92     23807

              precision    recall  f1-score   support

           0       0.53      0.59      0.56      3102
           1       0.94      0.92      0.93     20705

    accuracy                           0.88     23807
   macro avg       0.73      0.76      0.74     23807
weighted avg       0.88      0.88      0.88     23807

              precision    recall  f1-score   support

           0       0.47      0.56      0.51      3102
           1       0.93      0.91      0.92     20705

    accuracy                           0.86     23807
   macro avg       0.70      0.73      0.72     23807
weighted avg       0.87      0.86      0.87     23807

              preci

In [235]:
# SMOTEN - Processed Data
report = classification_report(y_test,y_pred)
print(report)

report = classification_report(y_test,y_pred_1)
print(report)

report = classification_report(y_test,y_pred_2)
print(report)

report = classification_report(y_test,y_pred_3)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.66      0.69      2143
           1       0.95      0.96      0.96     14757

    accuracy                           0.93     16900
   macro avg       0.84      0.81      0.82     16900
weighted avg       0.92      0.93      0.92     16900

              precision    recall  f1-score   support

           0       0.51      0.61      0.55      2143
           1       0.94      0.92      0.93     14757

    accuracy                           0.88     16900
   macro avg       0.73      0.76      0.74     16900
weighted avg       0.89      0.88      0.88     16900

              precision    recall  f1-score   support

           0       0.44      0.57      0.50      2143
           1       0.93      0.90      0.92     14757

    accuracy                           0.86     16900
   macro avg       0.69      0.73      0.71     16900
weighted avg       0.87      0.86      0.86     16900

              preci

In [None]:
# SMOTEN - Processed Data - Has an accuracy of 93%

##### *User Prediction*

In [240]:
x_train.sample()

Unnamed: 0,country,item type,thickness,width,product_ref,delivery date,selling_price,quantity_tons
19641,78.0,W,-0.693147,7.075809,611993,2021,7.082549,4.75359


In [262]:
# Real world Prediction

country = 32
item_type = 'W'
product_ref = 1670798778
delivery_year = 2021

thickness = 0.75
log_thickness = np.log(thickness)

width = 1000
log_width = np.log(width)

quantity_tons = 20
log_quantity = np.log(quantity_tons)

selling_price = 20
log_selling_price = np.log(selling_price)

In [263]:
pred_df = pd.DataFrame([[country,item_type,log_thickness,log_width,product_ref,delivery_year,log_selling_price,log_quantity]],
                       columns = ['country','item type','thickness','width','product_ref','delivery date','selling_price','quantity_tons'])

pred_df

Unnamed: 0,country,item type,thickness,width,product_ref,delivery date,selling_price,quantity_tons
0,32,W,-0.287682,6.907755,1670798778,2021,2.995732,2.995732


In [264]:
pref_df_trans = trans1.transform(pred_df)

In [265]:
y_pred = model.predict(pref_df_trans)

if y_pred == 1:
    print('Status : Won')
else:
    print('Status : Lost')

Status : Lost


##### *Saving the Encoder and Model*

In [236]:
pickle.dump(model,open('Status_Prediction.pkl','wb'))

In [237]:
pickle.dump(trans1,open('encoder.pkl','wb'))