In [140]:
import pandas as pd

In [141]:
# Read the raw gemstone table from disk into a DataFrame.
gemstone = pd.read_csv(filepath_or_buffer='../code/data/gemstone.csv')
# Show the first five rows as a quick sanity check of the columns.
gemstone.head(n=5)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
4,5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


In [142]:
# The preview above lists two leftover row-counter columns ('Unnamed: 0.1' and
# 'Unnamed: 0'). Dropping only one of them would leave the other in the feature
# set, where it is later selected as a numeric column and fed to the model.
# Remove every such column; rebinding (instead of inplace=True) keeps this cell
# safe to re-run, because an already-clean frame simply yields an empty drop list.
index_artifact_cols = [
    col for col in gemstone.columns if str(col).startswith('Unnamed')
]
gemstone = gemstone.drop(columns=index_artifact_cols)

In [143]:
# Predictors: every column except the target.
x = gemstone.drop(columns=['price'])
# Target: kept as a single-column DataFrame (not a Series).
y = gemstone.loc[:, ['price']]

In [144]:
# Split the predictor columns by dtype: object-typed columns are treated as
# categorical, everything else as numerical.
categorical_cols = x.select_dtypes(include=['object']).columns
numerical_cols = x.select_dtypes(exclude=['object']).columns

In [145]:
# Explicit category orderings for the ordinal encoder: the position of a label
# in its list is the integer code it is encoded to (first entry -> 0).
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D',
    'E',
    'F',
    'G',
    'H',
    'I',
    'J',
]
clarity_categories = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]

In [146]:
from sklearn.impute import SimpleImputer # Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding

In [147]:
# Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [148]:
# Numerical features: fill missing values with the column median, then
# standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent label,
# map labels to integer codes using the explicit orderings, then standardize.
# NOTE: the category lists are matched to columns by position, so their order
# here must follow the column order in categorical_cols.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories]
        ),
    ),
    ('scaler', StandardScaler()),
])

# Route each column group through its own pipeline and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols),
])

In [149]:
# Train Test Split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. A fixed random_state makes the shuffle
# deterministic, so the split (and every downstream metric) is reproducible
# across kernel restarts instead of changing on each run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [150]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer on the test split so no test-set statistics influence the
# imputation or scaling parameters.
#
# The transformer output is wrapped back into a DataFrame. Passing the original
# index keeps the row labels identical to those of y_train / y_test; without it
# the frames would be relabelled 0..n-1 and any later label-aligned operation
# (concat, join, arithmetic with y) would silently pair the wrong rows.
x_train = pd.DataFrame(
    preprocessor.fit_transform(x_train),
    columns=preprocessor.get_feature_names_out(),
    index=x_train.index,
)

x_test = pd.DataFrame(
    preprocessor.transform(x_test),
    columns=preprocessor.get_feature_names_out(),
    index=x_test.index,
)