In [1]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso, Ridge,ElasticNet
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [2]:
# Load the gemstone dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/gemstone.csv')

In [3]:
# Preview the first rows of the freshly loaded frame to check which columns were read.
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
# Disabled: uncomment to work on a random 10,000-row sample instead of the full frame.
# df=df.sample(n=10000)

In [5]:
# (rows, columns) of the loaded frame.
df.shape

(193573, 11)

In [6]:
# Remove the row identifier and the x/y/z columns so they are not used as features.
# Reassigning instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises KeyError.
df=df.drop(columns=['id','x','y','z'],errors='ignore')

In [7]:
# Re-check the frame after the column drop.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,1.52,Premium,F,VS2,62.2,58.0,13619
1,2.03,Very Good,J,SI2,62.0,58.0,13387
2,0.7,Ideal,G,VS1,61.2,57.0,2772
3,0.32,Ideal,G,VS1,61.6,56.0,666
4,1.7,Premium,G,VS2,62.6,59.0,14453


In [8]:
# Split predictors from the target by column name.
# Selecting by name (rather than "everything except the last column") keeps the
# split correct even if the column order of the input file changes.
X=df.drop(columns=['price'])
y=df['price']

In [9]:
# Display the feature frame (the price column is excluded).
X

Unnamed: 0,carat,cut,color,clarity,depth,table
0,1.52,Premium,F,VS2,62.2,58.0
1,2.03,Very Good,J,SI2,62.0,58.0
2,0.70,Ideal,G,VS1,61.2,57.0
3,0.32,Ideal,G,VS1,61.6,56.0
4,1.70,Premium,G,VS2,62.6,59.0
...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0
193569,0.70,Premium,G,VVS2,60.3,58.0
193570,0.73,Very Good,F,SI1,63.1,57.0
193571,0.34,Very Good,D,SI1,62.9,55.0


In [10]:
# Partition the feature columns by dtype so each group can get its own preprocessing.
# 'number' is the pandas selector for every numeric dtype, so numeric columns stored
# in a narrower or unsigned type are not silently left out of the numeric group.
categorical_column=X.select_dtypes(include=['object']).columns
numerical_column=X.select_dtypes(include='number').columns

In [11]:
# Inspect which columns landed in each group.
categorical_column,numerical_column

(Index(['cut', 'color', 'clarity'], dtype='object'),
 Index(['carat', 'depth', 'table'], dtype='object'))

In [12]:
# Category orderings handed to the OrdinalEncoder: the position of a value in its
# list becomes the integer code for that value.
# NOTE(review): cut and clarity look like they run from lowest to highest grade,
# while color is simply alphabetical - confirm the intended direction.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = list('DEFGHIJ')
clarity_categories = [
    'I1', 'SI2', 'SI1', 'VS2',
    'VS1', 'VVS2', 'VVS1', 'IF',
]

In [13]:
# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

In [14]:
# Categorical branch: most-frequent imputation -> ordinal codes -> standardisation.
# The `categories` argument is positional: its entries must line up with the
# columns in the order they are passed to this pipeline (cut, color, clarity).
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "encoder",
        OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories]
        ),
    ),
    ("scaler", StandardScaler()),
])

In [15]:
# Route each column group through its own sub-pipeline and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, list(numerical_column)),
        ("cat_pipeline", cat_pipeline, list(categorical_column)),
    ]
)

In [16]:
# Render the assembled ColumnTransformer.
preprocessor

In [19]:
# Linear regression estimator with default settings.
lr=LinearRegression()

In [20]:
# Chain the preprocessing and the linear model into a single estimator.
lr_pipe=make_pipeline(preprocessor,lr)

In [21]:
# Render the linear-regression pipeline.
lr_pipe

In [22]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=41,
)

In [23]:
# Fit the preprocessing steps and the linear model on the training split.
lr_pipe.fit(x_train,y_train)

In [24]:
# Predicted prices for the held-out rows.
lr_pipe.predict(x_test)

array([ 1132.02324305,  5356.35229857, 10776.99764265, ...,
        4906.41040871,  3814.6708344 , 10011.0442559 ])

In [25]:
# R^2 of the linear pipeline on the test split (predictions are recomputed here).
r2_score(y_test,lr_pipe.predict(x_test))

0.9252236703596447

In [27]:
# Random forest regressor using all available cores.
# random_state is fixed (same seed as the train/test split) so the fitted forest,
# its score, and the pickled model are reproducible across kernel restarts.
rf=RandomForestRegressor(n_jobs=-1,random_state=41)

In [28]:
# Chain preprocessing and the random forest into a single estimator.
# make_pipeline stores the transformer object it is given without copying it, so an
# unfitted clone is used here: fitting this pipeline then cannot refit or otherwise
# alter the preprocessor instance held by lr_pipe.
rf_pipe=make_pipeline(clone(preprocessor),rf)

In [29]:
# Fit the preprocessing steps and the random forest on the training split.
rf_pipe.fit(x_train,y_train)

In [30]:
# R^2 of the random-forest pipeline on the test split.
r2_score(y_test,rf_pipe.predict(x_test))

0.9746565351658298

In [31]:
import pickle

In [34]:
# Persist the fitted random-forest pipeline (preprocessing + model) to the parent directory.
with open('../model.pkl', mode='wb') as model_file:
    pickle.dump(rf_pipe, model_file)

In [39]:
# Reload the pipeline from disk to verify the round trip.
# pickle.load can execute arbitrary code, so only use it on files produced by a
# trusted source such as this notebook.
with open('../model.pkl', mode='rb') as model_file:
    obj = pickle.load(model_file)

In [40]:
# Display the unpickled pipeline.
obj

In [45]:
# Second test row; the 1:2 slice keeps it as a one-row DataFrame rather than a Series.
x_test.iloc[1:2,:]

Unnamed: 0,carat,cut,color,clarity,depth,table
108621,1.01,Very Good,H,VS2,63.4,59.0


In [58]:
# Display the held-out feature rows.
x_test

Unnamed: 0,carat,cut,color,clarity,depth,table
124816,0.41,Premium,D,VS2,62.5,58.0
108621,1.01,Very Good,H,VS2,63.4,59.0
141921,1.52,Very Good,F,VS2,62.6,53.3
68212,0.32,Ideal,H,SI2,62.4,56.0
106490,0.76,Premium,F,SI1,59.0,60.0
...,...,...,...,...,...,...
99623,0.56,Ideal,F,SI1,60.8,56.0
40262,2.01,Premium,I,SI1,59.4,58.0
183828,0.91,Premium,G,VS2,61.6,58.0
98897,0.70,Premium,D,VS2,61.5,58.0


In [49]:
# Scratch literal mirroring the row shown by x_test.iloc[1:2].
# NOTE(review): carat is quoted here, so it is the string '1.01' rather than a float.
['1.01','Very Good','H','VS2', 63.4, 59.0]

['1.01', 'Very Good', 'H', 'VS2', 63.4, 59.0]

In [54]:
# Column names of the working frame, including the target.
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price'], dtype='object')

In [55]:
# Wrap the literal row in a one-row DataFrame labelled with the feature column names.
# NOTE(review): carat is the string '1.01' here, so that column is not numeric.
pd.DataFrame([['1.01','Very Good','H','VS2', 63.4, 59.0]],columns=df.drop(columns=['price']).columns)

Unnamed: 0,carat,cut,color,clarity,depth,table
0,1.01,Very Good,H,VS2,63.4,59.0


In [63]:
# Feature column names (everything except price) as a plain list.
list(df.drop(columns=['price']).columns)

['carat', 'cut', 'color', 'clarity', 'depth', 'table']

In [56]:
# Same row without column labels: pandas falls back to integer column names 0..5.
pd.DataFrame([['1.01','Very Good','H','VS2', 63.4, 59.0]])

Unnamed: 0,0,1,2,3,4,5
0,1.01,Very Good,H,VS2,63.4,59.0


In [64]:
# Score one hand-built observation with the reloaded pipeline.
# The row is written as a record keyed by feature name, so it does not depend on
# positional column order. carat is given as a float to match the numeric feature
# instead of relying on implicit string-to-number coercion downstream.
sample_row=pd.DataFrame([{
    'carat':1.01,
    'cut':'Very Good',
    'color':'H',
    'clarity':'VS2',
    'depth':63.4,
    'table':59.0,
}])
obj.predict(sample_row)

array([4964.25416667])