# Here we do all the EDA, preprocessing, and model training using the scikit-learn pipeline approach

### Exploratory data analysis

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Read the raw dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='raw.csv')
df.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


### We remove the `id` column because it is not a useful feature

In [3]:
# Drop the `id` column by reassignment rather than in place. With errors='ignore'
# the cell can be re-run safely: once the column is gone the call is a no-op
# instead of raising KeyError.
df = df.drop(columns='id', errors='ignore')

In [4]:
# Frequency of each cut grade, most common first.
df['cut'].value_counts(sort=True, ascending=False)

cut
Ideal        92454
Premium      49910
Very Good    37566
Good         11622
Fair          2021
Name: count, dtype: int64

In [5]:
# Frequency of each color grade, most common first.
df['color'].value_counts(sort=True, ascending=False)

color
G    44391
E    35869
F    34258
H    30799
D    24286
I    17514
J     6456
Name: count, dtype: int64

In [6]:
# Frequency of each clarity grade, most common first.
df['clarity'].value_counts(sort=True, ascending=False)

clarity
SI1     53272
VS2     48027
VS1     30669
SI2     30484
VVS2    15762
VVS1    10628
IF       4219
I1        512
Name: count, dtype: int64

In [7]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193573 entries, 0 to 193572
Data columns (total 10 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   carat    193573 non-null  float64
 1   cut      193573 non-null  object 
 2   color    193573 non-null  object 
 3   clarity  193573 non-null  object 
 4   depth    193573 non-null  float64
 5   table    193573 non-null  float64
 6   x        193573 non-null  float64
 7   y        193573 non-null  float64
 8   z        193573 non-null  float64
 9   price    193573 non-null  int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 14.8+ MB


In [8]:
# Partition the column names by dtype: object-dtype columns are treated as
# categorical, everything else as numerical. Reading `df.dtypes` once avoids
# indexing the frame per column, and avoids the string-formatted lookup
# (df[f'{c}']), which fails for non-string column labels.
obj_columns = []
numerical_columns = []
for column_name, column_dtype in df.dtypes.items():
    if column_dtype == object:
        obj_columns.append(column_name)
    else:
        numerical_columns.append(column_name)

In [9]:
# Display the names of the object-dtype (categorical) columns.
obj_columns

['cut', 'color', 'clarity']

In [10]:
# `price` is the prediction target, so it must not be listed as a feature.
# Filtering (instead of list.remove) keeps the cell idempotent: re-running it
# after `price` is already gone does not raise ValueError.
numerical_columns = [name for name in numerical_columns if name != 'price']

In [11]:
# Display the names of the numerical feature columns.
numerical_columns

['carat', 'depth', 'table', 'x', 'y', 'z']

In [12]:
# Count missing values per column.
df.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [13]:
# Show only the categorical columns of the frame.
df.loc[:, obj_columns]

Unnamed: 0,cut,color,clarity
0,Premium,F,VS2
1,Very Good,J,SI2
2,Ideal,G,VS1
3,Ideal,G,VS1
4,Premium,G,VS2
...,...,...,...
193568,Ideal,D,VVS2
193569,Premium,G,VVS2
193570,Very Good,F,SI1
193571,Very Good,D,SI1


### Now we apply ordinal encoding to the categorical columns, inside a pipeline

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.model_selection import train_test_split

In [15]:
# For categorical columns
cat_pipeline=Pipeline(steps=[('imputer',SimpleImputer(strategy='most_frequent')),
                ('encoder',OrdinalEncoder()),
                ('scaler',StandardScaler())])
# For numerical columns
numeric_pipeline=Pipeline(steps=[('imputer',SimpleImputer(strategy='median')),
                ('scaler',StandardScaler())])

### Now we combine both pipelines using a `ColumnTransformer`

In [16]:
# Send the categorical columns through `cat_pipeline` and the numerical columns
# through `numeric_pipeline`, combining both outputs in one transformer.
ct = ColumnTransformer(
    transformers=[
        ('cat_pipeline', cat_pipeline, obj_columns),
        ('numeric_pipeline', numeric_pipeline, numerical_columns),
    ]
)

In [17]:
# Display the assembled ColumnTransformer.
ct

In [18]:
# Fit the preprocessing on the frame and transform it in one step.
transformed_data = ct.fit_transform(X=df)

In [19]:
# Target: the price column of the original frame.
y = df.loc[:, 'price']
# Features: the transformed array, labelled with the transformer's output names.
x = pd.DataFrame(data=transformed_data, columns=ct.get_feature_names_out())

### Train/test split of the data

In [20]:
# Split the raw rows first, then fit the preprocessing on the training rows only.
# Fitting the imputers/encoder/scalers on the full frame before splitting lets
# statistics from the held-out rows leak into the training features and makes
# the test score optimistic.
# The row count and random_state are unchanged, so the same rows land in each
# split as when splitting the already-transformed frame.
features_train, features_test, y_train, y_test = train_test_split(
    df.drop(columns='price'), df['price'], test_size=0.2, random_state=42)

# Learn the preprocessing from the training rows and apply it to them.
x_train = pd.DataFrame(ct.fit_transform(features_train),
                       columns=ct.get_feature_names_out(),
                       index=features_train.index)
# Re-use the training-set statistics on the held-out rows (transform only).
x_test = pd.DataFrame(ct.transform(features_test),
                      columns=ct.get_feature_names_out(),
                      index=features_test.index)

### Now we try several models at once to find which ones perform best

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVR
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb

In [22]:
# Candidate regressors with default hyper-parameters. Models that use randomness
# get a fixed seed so the comparison is reproducible across runs.
rfr = RandomForestRegressor(n_jobs=-1, random_state=42)
lr = LinearRegression()
l = Lasso()
r = Ridge()
dtr = DecisionTreeRegressor(random_state=42)
# Price is a continuous target, so use the linear support-vector *regressor*.
# LinearSVC is a classifier: it would treat every distinct price as a class and
# report accuracy rather than R^2.
ls = LinearSVR(random_state=42)
gbr = GradientBoostingRegressor(random_state=42)
# `xgb` is the xgboost module itself and has no fit/score methods; the estimator
# to evaluate is its scikit-learn style regressor.
xgbr = xgb.XGBRegressor(random_state=42)

# Ridge (`r`) is included here; it was instantiated but previously left out.
models = [rfr, lr, l, r, dtr, ls, gbr, xgbr]

# Fit every candidate on the training split and record its score on the test
# split (R^2 for regressors), one single-entry dict per model keyed by class name.
scores = []
for model in models:
    model.fit(x_train, y_train)
    scores.append({type(model).__name__: model.score(x_test, y_test)})
scores

In [23]:
# Fit the linear regression model on the training split.
lr.fit(X=x_train, y=y_train)

In [24]:
# Score the fitted linear regression on the held-out split.
lr.score(X=x_test, y=y_test)

0.9250609999066431

In [25]:
# Fit the random forest on the training split and score it on the held-out split.
# `fit` returns the estimator itself, so the two calls can be chained.
rfr.fit(X=x_train, y=y_train).score(X=x_test, y=y_test)

KeyboardInterrupt: 