In [1]:
import pandas as pd
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
import joblib


In [2]:
df = sns.load_dataset('diamonds')
 
X = df.drop('cut', axis = 1)
y = df['cut'] 

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [11]:
df.head(10)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
5,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
6,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
7,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
8,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
9,0.23,Very Good,H,VS1,59.4,61.0,338,4.0,4.05,2.39


In [5]:
df.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [6]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)


In [7]:
categorical_cols = ['color', 'clarity']
numerical_cols = ['carat', 'depth', 'table', 'x', 'y', 'z']


In [8]:
label_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    LabelEncoder()
)

numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    MinMaxScaler()
)
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False)
)

column_transformer =make_column_transformer(
    (numerical_pipeline, numerical_cols),
    (categorical_pipeline, categorical_cols)
)

pipeline = make_pipeline(
    column_transformer,
    RandomForestClassifier(random_state=42)
)
pipeline.fit(X, y_encoded)
print('R2 en train', pipeline.score(X, y_encoded))


R2 en train 0.9981275491286615


In [9]:
joblib.dump(pipeline, '../models/pipeline_clasificacion.joblib')

['../models/pipeline_clasificacion.joblib']

In [None]:
X_new = pd.DataFrame({
            'carat': [0.22],
            'color': ['E'],
            'clarity': ['VS2'],
            'depth':[65.1],
            'table': [61],
            'price':[337],
            'x':[3.87],
            'y':[3.78],
            'z':[2.49],                 
        })

y_pred = pipeline.predict(X_new)
y_pred_prob = pipeline.predict_proba(X_new)

y_pred
le.inverse_transform(y_pred)

array([0])