# Here we are going to do all the eda , processing and training part using pipeline method

### Exploratory Data analysis

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [4]:
df=pd.read_csv('gemstones.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


### we are going to remove the id column because it's not useful

In [5]:
df.drop('id',axis='columns',inplace=True)

In [6]:
df['cut'].value_counts()

Ideal        92454
Premium      49910
Very Good    37566
Good         11622
Fair          2021
Name: cut, dtype: int64

In [7]:
df['color'].value_counts()

G    44391
E    35869
F    34258
H    30799
D    24286
I    17514
J     6456
Name: color, dtype: int64

In [8]:
df['clarity'].value_counts()

SI1     53272
VS2     48027
VS1     30669
SI2     30484
VVS2    15762
VVS1    10628
IF       4219
I1        512
Name: clarity, dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193573 entries, 0 to 193572
Data columns (total 10 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   carat    193573 non-null  float64
 1   cut      193573 non-null  object 
 2   color    193573 non-null  object 
 3   clarity  193573 non-null  object 
 4   depth    193573 non-null  float64
 5   table    193573 non-null  float64
 6   x        193573 non-null  float64
 7   y        193573 non-null  float64
 8   z        193573 non-null  float64
 9   price    193573 non-null  int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 14.8+ MB


In [10]:
obj_columns=[]
numerical_columns=[]
for c in df.columns:
    if df[f'{c}'].dtype==object:
        obj_columns.append(c)
    else:
        numerical_columns.append(c)

In [11]:
obj_columns

['cut', 'color', 'clarity']

In [12]:
numerical_columns.remove('price')

In [13]:
numerical_columns

['carat', 'depth', 'table', 'x', 'y', 'z']

In [14]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [15]:
df[obj_columns]

Unnamed: 0,cut,color,clarity
0,Premium,F,VS2
1,Very Good,J,SI2
2,Ideal,G,VS1
3,Ideal,G,VS1
4,Premium,G,VS2
...,...,...,...
193568,Ideal,D,VVS2
193569,Premium,G,VVS2
193570,Very Good,F,SI1
193571,Very Good,D,SI1


### Now we are going to to ordinal encoding and that too in a pipeline way

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.model_selection import train_test_split

In [17]:
# For categorical columns
cat_pipeline=Pipeline(steps=[('imputer',SimpleImputer(strategy='most_frequent')),
                ('encoder',OrdinalEncoder()),
                ('scaler',StandardScaler())])
# For numerical columns
numeric_pipeline=Pipeline(steps=[('imputer',SimpleImputer(strategy='median')),
                ('scaler',StandardScaler())])

### Now we will combine both the pipelines using column transformer

In [18]:
ct=ColumnTransformer([('cat_pipeline',cat_pipeline,obj_columns),
                      ('numeric_pipeline',numeric_pipeline,numerical_columns)])

In [19]:
ct

In [20]:
transformed_data=ct.fit_transform(df)

In [21]:
x=pd.DataFrame(transformed_data,columns=ct.get_feature_names_out())
y=df['price']

### Train test split of the data

In [22]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

### Now we are going to try various models all at once to find which ones are the best

In [60]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [63]:
def model_selector(x_train,y_train,x_test,y_test):
  from sklearn.ensemble import RandomForestRegressor
  from sklearn.linear_model import LinearRegression,Lasso,Ridge
  from sklearn.tree import DecisionTreeRegressor
  from sklearn.svm import LinearSVC
  from sklearn.ensemble import GradientBoostingRegressor
  import xgboost as xgb
  from sklearn.metrics import mean_squared_error,mean_absolute_error
  stats=[]
  scores=[]
  mae=[]
  mse=[]
  rfr=RandomForestRegressor()
  lr=LinearRegression()
  l=Lasso()
  r=Ridge()
  dtr=DecisionTreeRegressor()
  gbr=GradientBoostingRegressor()
  xgbr=xgb.XGBRegressor()

  models=[rfr,lr,l,dtr,gbr,xgbr]
  for i in models:
      i.fit(x_train,y_train)
      stats.append({f'{i}':i.score(x_test,y_test)})
      scores.append(i.score(x_test,y_test))
      mae.append({f'{i}':mean_absolute_error(y_true=y_test,y_pred=i.predict(x_test))})
      mse.append({f'{i}':mean_squared_error(y_true=y_test,y_pred=i.predict(x_test))})
  index=scores.index(max(scores))
  best_score_model={str(models[index]):scores[index]}
  return stats,best_score_model,mae,mse



In [64]:
stats,best_score_model,mae,mse=model_selector(x_train,y_train,x_test,y_test)

In [57]:
stats

[{'RandomForestRegressor()': 0.9770376367071751},
 {'LinearRegression()': 0.925060999906643},
 {'Lasso()': 0.9250261634582762},
 {'DecisionTreeRegressor()': 0.9564053250594363},
 {'GradientBoostingRegressor()': 0.9738254070667935},
 {'XGBRegressor(base_score=None, booster=None, callbacks=None,\n             colsample_bylevel=None, colsample_bynode=None,\n             colsample_bytree=None, device=None, early_stopping_rounds=None,\n             enable_categorical=False, eval_metric=None, feature_types=None,\n             gamma=None, grow_policy=None, importance_type=None,\n             interaction_constraints=None, learning_rate=None, max_bin=None,\n             max_cat_threshold=None, max_cat_to_onehot=None,\n             max_delta_step=None, max_depth=None, max_leaves=None,\n             min_child_weight=None, missing=nan, monotone_constraints=None,\n             multi_strategy=None, n_estimators=None, n_jobs=None,\n             num_parallel_tree=None, random_state=None, ...)': 0.9786

In [58]:
best_score_model

{'XGBRegressor(base_score=None, booster=None, callbacks=None,\n             colsample_bylevel=None, colsample_bynode=None,\n             colsample_bytree=None, device=None, early_stopping_rounds=None,\n             enable_categorical=False, eval_metric=None, feature_types=None,\n             gamma=None, grow_policy=None, importance_type=None,\n             interaction_constraints=None, learning_rate=None, max_bin=None,\n             max_cat_threshold=None, max_cat_to_onehot=None,\n             max_delta_step=None, max_depth=None, max_leaves=None,\n             min_child_weight=None, missing=nan, monotone_constraints=None,\n             multi_strategy=None, n_estimators=None, n_jobs=None,\n             num_parallel_tree=None, random_state=None, ...)': 0.9786461303569776}

In [65]:
mae

[{'RandomForestRegressor()': 309.71643614652453},
 {'LinearRegression()': 706.8343173265145},
 {'Lasso()': 707.8161997780155},
 {'DecisionTreeRegressor()': 424.82731068922465},
 {'GradientBoostingRegressor()': 352.32574164956634},
 {'XGBRegressor(base_score=None, booster=None, callbacks=None,\n             colsample_bylevel=None, colsample_bynode=None,\n             colsample_bytree=None, device=None, early_stopping_rounds=None,\n             enable_categorical=False, eval_metric=None, feature_types=None,\n             gamma=None, grow_policy=None, importance_type=None,\n             interaction_constraints=None, learning_rate=None, max_bin=None,\n             max_cat_threshold=None, max_cat_to_onehot=None,\n             max_delta_step=None, max_depth=None, max_leaves=None,\n             min_child_weight=None, missing=nan, monotone_constraints=None,\n             multi_strategy=None, n_estimators=None, n_jobs=None,\n             num_parallel_tree=None, random_state=None, ...)': 297.454

In [66]:
mse

[{'RandomForestRegressor()': 370368.34588019457},
 {'LinearRegression()': 1210982.4462739604},
 {'Lasso()': 1211545.3884991077},
 {'DecisionTreeRegressor()': 708525.8896336476},
 {'GradientBoostingRegressor()': 422970.3164039049},
 {'XGBRegressor(base_score=None, booster=None, callbacks=None,\n             colsample_bylevel=None, colsample_bynode=None,\n             colsample_bytree=None, device=None, early_stopping_rounds=None,\n             enable_categorical=False, eval_metric=None, feature_types=None,\n             gamma=None, grow_policy=None, importance_type=None,\n             interaction_constraints=None, learning_rate=None, max_bin=None,\n             max_cat_threshold=None, max_cat_to_onehot=None,\n             max_delta_step=None, max_depth=None, max_leaves=None,\n             min_child_weight=None, missing=nan, monotone_constraints=None,\n             multi_strategy=None, n_estimators=None, n_jobs=None,\n             num_parallel_tree=None, random_state=None, ...)': 345069.

### Hence we can conclude that xg boost  regressor works best in our case so we will pickle this model file for further work 

In [67]:
import pickle

In [68]:
pickle.dump(xgbr,open('xgbr.pkl','wb'))

In [69]:
load_model=pickle.load(open('xgbr.pkl','rb'))
load_model.score(x_test,y_test)

0.9786461303569776