In [1]:
# IMPORT ------
    #* Dataframe 
import pandas as pd 
    #* Matrices 
import numpy as np
    #* Preprocessing 
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
    #* Splitter 
from sklearn.model_selection import train_test_split

    # Misc
import os 

In [2]:
# PATH ------
#   base_path : directory the notebook switches to before doing any file access
#               (the original trailing comment also mentions "/Maturite_dentaire")
#   data_path : dataset folder, given relative to base_path
#   data_name : raw CSV file located inside data_path
base_path, data_path, data_name = "/home/jovyan/work", "data/Teeth", "dataset.csv"

# CWD ------
# Relative paths built from data_path in later cells resolve from base_path.
os.chdir(base_path)

# IMPORT TYPES ------
# 'ID' is parsed as an integer; each VAL_* column is read as a pandas category.
_val_columns = ('VAL_I1', 'VAL_I2', 'VAL_C1', 'VAL_PM1',
                'VAL_PM2', 'VAL_M1', 'VAL_M2', 'VAL_M3')
pandas_types = {'ID': 'int', **dict.fromkeys(_val_columns, 'category')}

# ORDINAL ENCODING ------
# Letters 'A'..'H' map to the consecutive integers 2..9.
ord_encoding = dict(zip('ABCDEFGH', range(2, 10)))

# RANDOM STATE ------
# Shared seed passed to every stochastic step of the notebook.
rs = 12345

# PREPROCESSING 

## IMPORT DATA 

In [3]:
# IMPORT DATA ------
# Semicolon-separated file; column dtypes are taken from pandas_types.
raw_file = os.path.join(data_path, data_name)
df = pd.read_csv(raw_file, sep=';', dtype=pandas_types)

# Split features / target ---
# 'PAT_AGE' is the target; 'ID' and 'PAT_AGE' are both left out of the features.
Y = df['PAT_AGE']
X = df.drop(columns=['ID', 'PAT_AGE'])

# Train test split 

In [4]:
# Hold-out split of features and target. test_size is not given, so the
# scikit-learn default test fraction applies; rs makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=rs,
)

In [5]:
# Report the shapes of the train / test features and targets, one per line.
print(
    f"Shape Training Data : {X_train.shape}",
    f"Shape Testing Data : {X_test.shape}",
    f"Shape Training Target : {y_train.shape}",
    f"Shape Testing Target : {y_test.shape}",
    sep="\n",
)

Shape Training Data : (2135, 9)
Shape Testing Data : (712, 9)
Shape Training Target : (2135,)
Shape Testing Target : (712,)


In [6]:
# Save the freshly split (not yet encoded) data to csv ---
raw_splits = (
    ("X_train.csv", X_train),
    ("y_train.csv", y_train),
    ("X_test.csv", X_test),
    ("y_test.csv", y_test),
)
for file_name, table in raw_splits:
    # index=False: the row index is not written to the file
    table.to_csv(os.path.join(data_path, file_name), index=False)

## Ordinal encoding 

In [7]:
# Simple Ordinal Encoding ---
# Every value that is a key of ord_encoding is replaced by its integer code.
# The result is re-bound rather than written with inplace=True: X_train / X_test
# are derived from another frame by train_test_split, and mutating such frames in
# place can trigger SettingWithCopyWarning and hides the state change from readers.
X_train = X_train.replace(ord_encoding)
X_test = X_test.replace(ord_encoding)

# Saving Ordinal encoded data ---
X_train.to_csv(os.path.join(data_path, "1_X_train_encoded.csv"), index=False)
X_test.to_csv(os.path.join(data_path, "1_X_test_encoded.csv"), index=False)

## Removing samples with too many missing values

In [8]:
# Remove all training samples with at least 7 missing values ---
# NOTE(review): the count runs over every column of X_train, not only the 8 teeth
# columns — confirm that the remaining column can never be missing.
n_missing = X_train.isna().sum(axis=1).to_numpy()  # number of missing values per row
to_rm = np.flatnonzero(n_missing >= 7)  # positional indexes of the rows that are dropped
keep = n_missing < 7  # boolean mask of the rows that stay

# Apply the same positional mask to features and labels, then reset indexes ---
X_train = X_train.loc[keep].reset_index(drop=True)
y_train = y_train.loc[keep].reset_index(drop=True)

# Printing n° of rows removed (a plain count, not the shape tuple) :
print(f"Number of samples removed from training data : {to_rm.size}")

Number of samples removed from trainin data : (118,)


# TRAIN VAL TEST SPLIT 

- Training 
- Validation 
- Testing

In [9]:
# Carve the validation set out of the training data: 10 % of the rows go to
# X_val / y_val, the rest stays in X_train / y_train. rs fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=rs,
)

In [10]:
# Shapes of the three splits. "Data" lines report the feature matrices (X_*),
# "Target" lines report the label vectors (y_*). The validation labels used to be
# swapped (X_val was printed as "Target" and y_val as "Data").
print(f"Shape Training Data : {X_train.shape}")
print(f"Shape Training Target : {y_train.shape}")
print("\n")
print(f"Shape Validation Data : {X_val.shape}")
print(f"Shape Validation Target : {y_val.shape}")
print("\n")
print(f"Shape Testing Data : {X_test.shape}")
print(f"Shape Testing Target : {y_test.shape}")

Shape Training Data : (1815, 9)
Shape Training Target : (1815,)


Shape Validation Target : (202, 9)
Shape Validation Data : (202,)


Shape Testing Data : (712, 9)
Shape Testing Target : (712,)


## Imputing 

In [11]:
# IMPUTING STRATEGY ------
# The KNN imputer is fitted on the training split only and then applied unchanged
# to the validation and test splits. add_indicator=True appends one binary
# "was missing" column for each feature that had missing values during fit.
knni = KNNImputer(n_neighbors=40, weights="distance", add_indicator=True)
X_train = knni.fit_transform(X_train.values)
X_val = knni.transform(X_val.values)
X_test = knni.transform(X_test.values)

# New columns ---
# Name the indicator columns after the features the fitted imputer actually flagged
# (knni.indicator_.features_) instead of assuming "every column except the first".
indicator_features = [
    f"{X.columns[i]}_missing_indicator" for i in knni.indicator_.features_
]
new_cols = [*X.columns, *indicator_features]
# Fail early with a readable message if names and imputed columns do not line up.
assert len(new_cols) == X_train.shape[1], (
    f"{len(new_cols)} column names for {X_train.shape[1]} imputed columns"
)

# Saving imputed data (file names keep their "_encoded" suffix) ---
pd.DataFrame(X_train, columns=new_cols).to_csv(os.path.join(data_path, "2_X_train_encoded.csv"), index=False)
pd.DataFrame(X_val, columns=new_cols).to_csv(os.path.join(data_path, "2_X_val_encoded.csv"), index=False)
pd.DataFrame(X_test, columns=new_cols).to_csv(os.path.join(data_path, "2_X_test_encoded.csv"), index=False)

## Outlier removal 

In [12]:
# isolation forest
# train ---
# fit_predict returns -1 for outliers and 1 for inliers.
iso_forest = IsolationForest(contamination=0.05, bootstrap=True, random_state=rs)
Y_pred_train = iso_forest.fit_predict(X_train)
inliers = np.flatnonzero(Y_pred_train == 1)  # positions of the samples to keep

# Keep non outliers: same positions in the feature array and in the labels.
X_train = X_train[inliers]
y_train = y_train.reset_index(drop=True).iloc[inliers]

In [13]:
# Shapes of the training features and labels after the outlier filter.
print(
    f"Shape Training Data : {X_train.shape}",
    f"Shape Training Labels Data : {y_train.shape}",
    sep="\n",
)

Shape Training Data : (1726, 17)
Shape Training Labels Data : (1726,)


In [15]:
# Saving preprocessed data ---
# One (features file, labels file, features, labels) entry per split, written in
# the order train, val, test. Features get the new_cols header, labels are
# written as a single column named "Age".
exports = (
    ("3_X_train_outlier.csv", "3_y_train_outlier.csv", X_train, y_train),  # train
    ("3_X_val_outlier.csv", "3_y_val_outlier.csv", X_val, y_val),          # val
    ("3_X_test_outlier.csv", "3_y_test_outlier.csv", X_test, y_test),      # test
)
for features_file, labels_file, features, labels in exports:
    pd.DataFrame(features, columns=new_cols).to_csv(os.path.join(data_path, features_file), index=False)
    pd.Series(labels, name="Age").to_csv(os.path.join(data_path, labels_file), index=False)