In [4]:
# IMPORT ------
    #* Dataframe 
import pandas as pd 
    #* Matrices 
import numpy as np
    #* Preprocessing 
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
    #* Splitter 
from sklearn.model_selection import train_test_split

    # Misc
import os 

In [5]:
# PATH ------
# Project root. The default is the original absolute location; set the
# MATURITE_DENTAIRE_BASE environment variable to run the notebook from a
# checkout that lives elsewhere, without editing this cell.
base_path = os.environ.get("MATURITE_DENTAIRE_BASE", "/Maturite_dentaire")
# Data folder and file name, both relative to base_path (see the chdir below).
data_path = "data/Teeth"
data_name = "dataset.csv"

# CWD ------
# data_path is relative, so the working directory has to be the project root
# before any file is read or written.
os.chdir(base_path)

# IMPORT TYPES ------
# dtype mapping handed to pd.read_csv: ID is read as an integer, every VAL_*
# column as a pandas categorical.
_categorical_columns = ('VAL_I1', 'VAL_I2', 'VAL_C1', 'VAL_PM1',
                        'VAL_PM2', 'VAL_M1', 'VAL_M2', 'VAL_M3')
pandas_types = {'ID': 'int',
                **{column: 'category' for column in _categorical_columns}}

# ORDINAL ENCODING ------
# Letters A..H are mapped to the consecutive integers 2..9.
# NOTE(review): the codes start at 2 rather than 0 - presumably lower codes are
# reserved for values not listed here; confirm against the dataset description.
ord_encoding = {letter: code for code, letter in enumerate('ABCDEFGH', start=2)}

# RANDOM STATE ------
# Seed reused wherever a random_state is needed.
rs = 12345

# PREPROCESSING 

## IMPORT DATA 

In [6]:
# LOAD DATA ------
# Semicolon-separated CSV; column dtypes are taken from pandas_types.
csv_file = os.path.join(data_path, data_name)
df = pd.read_csv(csv_file, sep=';', dtype=pandas_types)

# Split features / target ---
# PAT_AGE is the target; ID and PAT_AGE are both left out of the feature frame.
Y = df['PAT_AGE']
X = df.drop(columns=['ID', 'PAT_AGE'])

## Train/test split

In [41]:
# Seeded hold-out split. test_size=0.25 is scikit-learn's default when neither
# size is given; it is spelled out here so the ratio is visible to the reader.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=rs,
)

In [42]:
# Report the dimensions of each split: features first, then targets.
print(
    f"Shape Training Data : {X_train.shape}",
    f"Shape Testing Data : {X_test.shape}",
    f"Shape Training Target : {y_train.shape}",
    f"Shape Testing Target : {y_test.shape}",
    sep="\n",
)

Shape Training Data : (2135, 9)
Shape Testing Data : (712, 9)
Shape Training Target : (2135,)
Shape Testing Target : (712,)


In [43]:
# Save to csv ---
# Persist the raw (not yet encoded) splits, without the pandas index.
raw_splits = {
    "X_train.csv": X_train,
    "y_train.csv": y_train,
    "X_test.csv": X_test,
    "y_test.csv": y_test,
}
for file_name, split in raw_splits.items():
    split.to_csv(os.path.join(data_path, file_name), index=False)

## Ordinal encoding 

In [44]:
# Simple Ordinal Encoding ---
# Rebind the names instead of calling replace(..., inplace=True): the frames
# come out of train_test_split, and mutating such derived frames in place is
# what triggers pandas' copy warnings. Rebinding gives the same encoded values.
# NOTE(review): the mapping is applied to every column. X has 9 columns while
# only 8 VAL_* columns are declared categorical, so a letter A-H appearing in
# the remaining column would be recoded as well - confirm this is harmless.
# NOTE(review): recent pandas releases deprecate replace() on categorical
# columns in favour of Series.cat.rename_categories - confirm against the
# pandas version in use.
X_train = X_train.replace(ord_encoding)
X_test = X_test.replace(ord_encoding)

# Saving Ordinal encoded data ---
X_train.to_csv(os.path.join(data_path, "1_X_train_encoded.csv"), index=False)
X_test.to_csv(os.path.join(data_path, "1_X_test_encoded.csv"), index=False)

## Imputing 

In [64]:
# IMPUTING STRATEGY ------
# Alternative considered (disabled): drop the rows with a missing VAL_PM1,
# VAL_PM2 or VAL_M2 instead of imputing them.
# The imputer is fitted on the training split only and then applied unchanged
# to the test split.
knni = KNNImputer(n_neighbors=40, weights="distance", add_indicator=True)
X_train_imp = knni.fit_transform(X_train.values)
X_test_imp = knni.transform(X_test.values)

# New columns ---
# add_indicator=True appends one indicator column per feature that had missing
# values during fit. Ask the fitted indicator which features those are instead
# of assuming "every column except the first": a wrong assumption either
# mislabels the indicator columns or makes the DataFrame constructor fail on a
# column-count mismatch.
missing_feature_idx = knni.indicator_.features_
indicator_features = [
    f"{X_train.columns[i]}_missing_indicator" for i in missing_feature_idx
]
new_cols = [*X_train.columns, *indicator_features]

# Saving imputed data ---
# (file names still say "encoded"; they are kept unchanged for downstream readers)
pd.DataFrame(X_train_imp, columns=new_cols).to_csv(
    os.path.join(data_path, "2_X_train_encoded.csv"), index=False)
pd.DataFrame(X_test_imp, columns=new_cols).to_csv(
    os.path.join(data_path, "2_X_test_encoded.csv"), index=False)

## Outlier removal 

In [66]:
# Isolation forest ---
# Fit the detector once, on the training split only, and reuse it for the test
# split. Fitting a second forest on the test data would define "outlier"
# differently for each split and would let the held-out data shape its own
# preprocessing.
# NOTE(review): X_train / X_test are the ordinal-encoded frames, not the
# imputed matrices X_train_imp / X_test_imp; if they still contain missing
# values the forest may reject them - confirm which input is intended.
iso_forest = IsolationForest(random_state=1).fit(X_train)

# train ---
# predict() returns 1 for inliers and -1 for outliers; keep the inliers.
Y_pred_train = iso_forest.predict(X_train)
X_train = X_train[Y_pred_train == 1]
# Select the targets by index label rather than by the positional mask, so that
# re-running this cell on already-filtered features keeps X and y aligned.
Y_train = y_train.loc[X_train.index]

# test ---
Y_pred_test = iso_forest.predict(X_test)
X_test = X_test[Y_pred_test == 1]
Y_test = y_test.loc[X_test.index]

In [None]:
#PCA
# Disabled alternative to the isolation forest: drop rows based on their PCA
# reconstruction error. Everything below is commented out; nothing in this
# cell executes.
# NOTE(review): MSE_score.argsort()[:ids] selects the rows with the SMALLEST
# reconstruction error, so the 10% removed would be the best-reconstructed
# rows; outliers would be the largest errors (argsort()[-ids:]) - confirm
# before re-enabling.
# NOTE(review): as written, the test split gets its own PCA fit instead of
# reusing the one fitted on the training split.
# train ---
# pca = PCA(n_components = 4, random_state=1)
# pca_dataset = pca.fit_transform(X_train)
# inverse_transform_dataset = pca.inverse_transform(pca_dataset)
# MSE_score = ((X_train-inverse_transform_dataset)**2).sum(axis=1)
# ids = int(0.1*len(MSE_score))
# to_remove = MSE_score.argsort()[:ids]
# X_train = pd.DataFrame(X_train)
# Y_train = pd.DataFrame(Y_train)
# X_train.drop(X_train.index[to_remove], inplace=True)
# Y_train.drop(Y_train.index[to_remove], inplace=True)
# Y_train = np.ravel(Y_train)

# test ---
# pca = PCA(n_components = 4, random_state=1)
# pca_dataset = pca.fit_transform(X_test)
# inverse_transform_dataset = pca.inverse_transform(pca_dataset)
# MSE_score = ((X_test-inverse_transform_dataset)**2).sum(axis=1)
# ids = int(0.1*len(MSE_score))
# to_remove = MSE_score.argsort()[:ids]
# X_test = pd.DataFrame(X_test)
# Y_test = pd.DataFrame(Y_test)
# X_test.drop(X_test.index[to_remove], inplace=True)
# Y_test.drop(Y_test.index[to_remove], inplace=True)
# Y_test = np.ravel(Y_test)

### New shapes

In [None]:
# Shapes after outlier removal ---
# The filtered targets are stored in Y_train / Y_test (capital Y) by the
# outlier-removal cell; y_train / y_test still hold the unfiltered split, so
# printing them would report row counts that no longer match the features.
print(f"Shape Training Data : {X_train.shape}")
print(f"Shape Testing Data : {X_test.shape}")

print(f"Shape Training Target : {Y_train.shape}")
print(f"Shape Testing Target : {Y_test.shape}")