In [61]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

import jovian

<IPython.core.display.Javascript object>

In [62]:
# Location of melb_data.csv. The default is the original local path; set the
# MELB_DATA_CSV environment variable to run the notebook against a copy stored elsewhere.
data_path = os.environ.get(
    'MELB_DATA_CSV',
    '/media/arunachal/New Volume/Certification Courses/Kaggle/melb_data.csv',
)
df = pd.read_csv(data_path)

In [63]:
# columns that contain at least one missing value

cols_missing = df.columns[df.isnull().any()].tolist()

# columns stored with the generic 'object' dtype (treated as categorical)

cols_categorical = [name for name, dtype in df.dtypes.items() if dtype == 'object']

# columns stored as 64-bit integers or floats

cols_numerical = [name for name, dtype in df.dtypes.items() if dtype in ('int64', 'float64')]

# number of distinct non-null values per column, computed once and reused below

n_unique = df.nunique()

# columns with fewer than 10 distinct values

cols_low_cardinality = [col for col in df.columns if n_unique[col] < 10]

# categorical columns with fewer than 10 distinct values

cols_categorical_low_cardinality = [col for col in cols_categorical if n_unique[col] < 10]

# low-cardinality categorical columns followed by all numerical columns

cols_cat_num = [*cols_categorical_low_cardinality, *cols_numerical]

# of those, keep only the columns without missing values; these feed the
# encoders and the RandomForestRegressor

my_cols = [col for col in cols_cat_num if col not in cols_missing]

In [64]:
# restrict the frame to the usable columns selected above
data = df.loc[:, my_cols]

In [65]:
# split the frame into the target ('Price') and the remaining predictor columns

y = data['Price']
X = data.drop(columns=['Price'])

In [66]:
# 80/20 train/test split; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [67]:
# names of the predictor columns that still hold 'object' (categorical) data

cols_train_categorical = [name for name, dtype in X_train.dtypes.items() if dtype == 'object']

In [68]:
# shared scorer used to compare the differently encoded datasets

def score(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split and return its MAPE on the test split.

    A RandomForestRegressor (100 trees, random_state=0) is trained on
    (X_train, y_train); its predictions for X_test are compared with y_test
    using mean_absolute_percentage_error, and that value is returned.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    predictions = forest.fit(X_train, y_train).predict(X_test)
    return mean_absolute_percentage_error(y_test, predictions)

In [69]:
# baseline: discard every 'object' (categorical) column from both splits and score

X_train_drop, X_test_drop = (
    split.select_dtypes(exclude=['object']) for split in (X_train, X_test)
)

score(X_train_drop, X_test_drop, y_train, y_test)

0.16426659861408727

In [70]:
# return MAPE after ordinal encoding columns with categorical data

# work on copies so the original splits stay untouched for the other experiments
X_train_ordinal = X_train.copy()
X_test_ordinal = X_test.copy()

# categories that never occur in the training split are encoded as -1 instead of
# raising at transform time (mirrors handle_unknown='ignore' on the one-hot encoder)
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# learn the category -> integer mapping from the training split only, then apply
# that same mapping to the test split; re-fitting on the test split could assign
# different integers to the same category and leaks test information
X_train_ordinal[cols_train_categorical] = ordinal_encoder.fit_transform(X_train[cols_train_categorical])
X_test_ordinal[cols_train_categorical] = ordinal_encoder.transform(X_test[cols_train_categorical])

score(X_train_ordinal, X_test_ordinal, y_train, y_test)

0.15308848822388954

In [71]:
# return MAPE after one hot encoding columns with categorical data

# handle_unknown='ignore' encodes categories unseen during fit as all-zero rows;
# sparse=False makes the encoder return a dense array
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` - confirm against the installed version
oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# learn the categories from the training split only and reuse the fitted encoder
# on the test split, so both frames get the same dummy columns in the same order;
# the original row index is restored so the concat below aligns rows correctly
oh_cols_train = pd.DataFrame(
    oh_encoder.fit_transform(X_train[cols_train_categorical]),
    index=X_train.index,
)
oh_cols_test = pd.DataFrame(
    oh_encoder.transform(X_test[cols_train_categorical]),
    index=X_test.index,
)

# the dummy columns are labelled 0, 1, 2, ...; cast the labels to strings so the
# combined frame does not mix str and int column names
oh_cols_train.columns = oh_cols_train.columns.astype(str)
oh_cols_test.columns = oh_cols_test.columns.astype(str)

# replace the categorical columns with their one-hot counterparts
X_train_num = X_train.drop(cols_train_categorical, axis=1)
X_test_num = X_test.drop(cols_train_categorical, axis=1)

X_train_oh = pd.concat([X_train_num, oh_cols_train], axis=1)
X_test_oh = pd.concat([X_test_num, oh_cols_test], axis=1)

score(X_train_oh, X_test_oh, y_train, y_test)



0.15333861737297402

In [72]:
# Commit this notebook to Jovian; on success the cell output shows the project URL.
jovian.commit()

<IPython.core.display.Javascript object>

[jovian] Committed successfully! https://jovian.ai/kr-arunachal/error-comparison-dropping-ordinal-encoding-one-hot-encoding


'https://jovian.ai/kr-arunachal/error-comparison-dropping-ordinal-encoding-one-hot-encoding'