In [1]:
# Basic data analysis
import pandas as pd
import numpy as np
import missingno as msno

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px

# Automatic data analysis
from ydata_profiling import ProfileReport
# Statistics
from scipy import stats
import distfit 
from distfit import distfit

from tqdm import tqdm

In [2]:
# Read the imputed train and test CSVs in one pass; the order of the paths
# determines which frame is bound to which name.
df_train, df_test = map(
    pd.read_csv,
    ("../data/imputed/train.csv", "../data/imputed/test.csv"),
)

In [3]:
from src.features.encoding.encoding_model import Encoder
from src.utils.encoding_utils import MultipleEncoder, DoubleValidationEncoderNumerical
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
from src.utils.encoding_utils import read_data, save_data
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.backward_difference import BackwardDifferenceEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.one_hot import OneHotEncoder

In [36]:
# Build the project's Encoder wrapper, selecting the encoder named
# "MEstimateEncoder" and the "Double" categorical-validation mode.
# NOTE(review): "Double" presumably selects a nested (double) cross-validation
# scheme -- confirm against src.features.encoding.encoding_model.
encoder = Encoder(encoder_name="MEstimateEncoder", cat_validation="Double")

In [37]:
# Load three objects from the imputed-data directory via the project helper.
# "Transported" is passed as the second argument (presumably the target column
# name -- confirm in src.utils.encoding_utils.read_data).
# Note: this rebinds df_test, replacing the frame loaded with pd.read_csv in the
# earlier loading cell.
df_train_X, df_train_Y, df_test = read_data("../data/imputed", "Transported")

In [38]:
# Inspect the target returned by read_data; the recorded output shows a NumPy
# array of 0/1 values.
df_train_Y

array([0, 1, 0, ..., 1, 0, 1])

In [39]:
# Fit the encoder on the training data and replace df_train_X with the result.
# NOTE(review): this cell rebinds its own input, so it is not idempotent --
# re-running it without first re-running the read_data cell feeds the
# already-encoded frame back into fit_transform.
# The call prints "shapes before encoder: ..." lines; five pairs appear in the
# recorded output (presumably one per validation fold -- confirm in Encoder).
df_train_X = encoder.fit_transform(df_train_X, df_train_Y)

shapes before encoder: (6954, 14), (1739, 14)
shapes before encoder: (6954, 14), (1739, 14)
shapes before encoder: (6954, 14), (1739, 14)
shapes before encoder: (6955, 14), (1738, 14)
shapes before encoder: (6955, 14), (1738, 14)


In [40]:
# Display the encoded training frame (the recorded repr is truncated to the
# first and last rows).
df_train_X

Unnamed: 0,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_deck,ID_num,Group_size,HomePlanet,Cabin_side
0,0.326776,0.471077,39.0,0.506250,0.0,0.0,0.0,0.0,0.0,0.743048,1,1,0.671196,0.451328
1,0.329623,0.472318,24.0,0.506482,109.0,9.0,25.0,549.0,44.0,0.438753,1,1,0.425181,0.555677
2,0.329623,0.472318,58.0,0.386657,43.0,3576.0,0.0,6715.0,49.0,0.520930,1,2,0.659144,0.555677
3,0.329623,0.472318,33.0,0.506482,0.0,1283.0,371.0,3329.0,193.0,0.520930,2,2,0.659144,0.555677
4,0.328265,0.470065,16.0,0.506325,303.0,70.0,151.0,565.0,2.0,0.436362,1,1,0.425214,0.555119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.328265,0.618567,41.0,0.389536,0.0,6819.0,0.0,1643.0,74.0,0.479389,1,1,0.657783,0.450989
8689,0.817297,0.503184,18.0,0.506250,0.0,0.0,0.0,0.0,0.0,0.511703,1,1,0.419018,0.555192
8690,0.329120,0.473884,26.0,0.506915,0.0,0.0,1872.0,1.0,0.0,0.516378,1,1,0.422466,0.555949
8691,0.329692,0.608881,32.0,0.506104,0.0,1049.0,0.0,353.0,3235.0,0.366622,1,2,0.651380,0.553491


In [41]:
# Sanity-check the encoded training frame: the recorded output shows 14 columns,
# no missing values, and only float64/int64 dtypes.
df_train_X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CryoSleep     8693 non-null   float64
 1   Destination   8693 non-null   float64
 2   Age           8693 non-null   float64
 3   VIP           8693 non-null   float64
 4   RoomService   8693 non-null   float64
 5   FoodCourt     8693 non-null   float64
 6   ShoppingMall  8693 non-null   float64
 7   Spa           8693 non-null   float64
 8   VRDeck        8693 non-null   float64
 9   Cabin_deck    8693 non-null   float64
 10  ID_num        8693 non-null   int64  
 11  Group_size    8693 non-null   int64  
 12  HomePlanet    8693 non-null   float64
 13  Cabin_side    8693 non-null   float64
dtypes: float64(12), int64(2)
memory usage: 1018.7 KB


In [42]:
# Apply the already-fitted encoder to the test frame; the result gets a new name,
# so df_test itself is left untouched and the cell can be re-run safely.
# NOTE(review): in the recorded outputs the encoded test values (e.g. CryoSleep
# ~0.47/0.57, VIP constant 0.504125) differ markedly from the train-side
# encodings (CryoSleep ~0.33/0.82, VIP ~0.39/0.51) -- verify that transform maps
# test categories as intended.
df_test_enc = encoder.transform(df_test)

In [43]:
# Display the encoded test frame (the recorded repr is truncated to the first
# and last rows).
df_test_enc

Unnamed: 0,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_deck,ID_num,Group_size,HomePlanet,Cabin_side
0,0.565893,0.497181,27.000000,0.504125,0.0,0.0,0.0,0.0,0.0,0.506255,1,1,0.488422,0.513603
1,0.468843,0.497181,19.000000,0.504125,0.0,9.0,0.0,2823.0,0.0,0.491282,1,1,0.488422,0.513603
2,0.565893,0.524681,31.000000,0.504125,0.0,0.0,0.0,0.0,0.0,0.536499,1,1,0.533181,0.513603
3,0.468843,0.497181,38.000000,0.504125,0.0,6652.0,0.0,181.0,585.0,0.536499,1,1,0.533181,0.513603
4,0.468843,0.497181,20.000000,0.504125,10.0,0.0,635.0,0.0,0.0,0.491282,1,1,0.488422,0.513603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,0.565893,0.497181,34.000000,0.504125,0.0,0.0,0.0,0.0,0.0,0.506255,2,2,0.488422,0.513603
4273,0.468843,0.497181,42.000000,0.504125,0.0,847.0,17.0,10.0,144.0,0.491282,1,1,0.488422,0.513603
4274,0.565893,0.524681,33.213298,0.504125,0.0,0.0,0.0,0.0,0.0,0.486573,1,1,0.507838,0.493542
4275,0.468843,0.497181,35.343316,0.504125,0.0,2680.0,0.0,0.0,523.0,0.486573,1,1,0.533181,0.493542
