In [5]:
# Librairies classiques
import pandas as pd
import numpy as np

# Visualisation de données
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import seaborn as sns
from plotly.subplots import make_subplots

# Machine Learning
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [20]:
# Load the AIDA competition results and print the column summary.
# A forward slash is used as the separator: it works on every OS and avoids
# the invalid "\A" escape sequence that a backslash would introduce.
df = pd.read_csv("Data/AIDA_Results_IA_Institut.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26841 entries, 0 to 26840
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Start           26841 non-null  int64  
 1   Diver           26841 non-null  object 
 2   Gender          26841 non-null  object 
 3   Discipline      26841 non-null  object 
 4   Line            4694 non-null   float64
 5   Official Top    26841 non-null  object 
 6   AP              26841 non-null  int64  
 7   RP              26841 non-null  object 
 8   Card            26841 non-null  object 
 9   Points          26841 non-null  object 
 10  Remarks         26836 non-null  object 
 11  Title Event     26841 non-null  object 
 12  Event Type      26841 non-null  object 
 13  Day             26841 non-null  object 
 14  Category Event  26841 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 3.1+ MB


In [18]:
# Display the DataFrame as the cell's output.
df

Unnamed: 0,Start,Diver,Gender,Discipline,Official Top,AP,RP,Card,Points,Remarks,Title Event,Event Type,Day,Category Event
0,1,Tasos Grillakis (GRC),M,FIM,00:00,33,23 m,YELLOW,12.0,-,Depth Event 2016,Depth Competition,2016-07-17,other
1,2,Antonis Papantonatos (GRC),M,FIM,00:00,55,47 m,YELLOW,38.0,-,Depth Event 2016,Depth Competition,2016-07-17,other
2,3,Dimitris Koumoulos (GRC),M,CNF,00:00,55,55 m,WHITE,55.0,-,Depth Event 2016,Depth Competition,2016-07-17,other
3,4,Christos Papadopoulos (GRC),M,CWT,00:00,55,55 m,WHITE,55.0,OK,Depth Event 2016,Depth Competition,2016-07-17,other
4,5,Anna Chalari (GRC),F,CWT,00:00,15,15 m,WHITE,15.0,OK,Depth Event 2016,Depth Competition,2016-07-17,other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26836,7,Karine Le Flanchec (FRA),F,CWTB,09:40,57,57 m,WHITE,57,Ok,The AIDA Cormorant Depth Challenge,Depth Competition,2023-07-23,other
26837,8,Anne-Sophie Passalboni (FRA),F,CWT,09:50,52,52 m,WHITE,52,Ok,The AIDA Cormorant Depth Challenge,Depth Competition,2023-07-23,other
26838,9,Clementine Marie (FRA),F,CNF,10:00,40,40 m,WHITE,40,Ok,The AIDA Cormorant Depth Challenge,Depth Competition,2023-07-23,other
26839,10,Eisve Treciakauskaite (LTU),F,CWT,10:10,33,28 m,YELLOW,22,"No tag,under ap",The AIDA Cormorant Depth Challenge,Depth Competition,2023-07-23,other


## Modifications des données

### Drop de colonnes

In [21]:
# Drop the "Line" column (df.info() reports only 4694 non-null values out of
# 26841 rows for it — presumably the reason it is discarded).
# errors="ignore" keeps the cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
df = df.drop(columns=["Line"], errors="ignore")

### Autres

## ...

## One hot encoder

In [None]:
# Select only the columns of interest and copy them into a new DataFrame so
# that later changes do not have side effects on df.
# NOTE(review): the column list is currently empty, so df_mod has no columns —
# TODO fill in the columns to keep.
df_mod = df[[]].copy()
df_mod

Unnamed: 0,carat_weight,depth_percent,table_percent,meas_length,meas_width,meas_depth,culet_size,fluor_color,eye_clean,total_sales_price
0,0.09,62.7,59.0,2.85,2.87,1.79,N,unknown,unknown,200
1,0.09,61.9,59.0,2.84,2.89,1.78,N,unknown,unknown,200
2,0.09,61.1,59.0,2.88,2.90,1.77,unknown,unknown,unknown,200
3,0.09,62.0,59.0,2.86,2.88,1.78,unknown,unknown,unknown,200
4,0.09,64.9,58.5,2.79,2.83,1.82,N,unknown,unknown,200
...,...,...,...,...,...,...,...,...,...,...
219698,10.65,61.3,58.0,14.06,14.18,8.66,N,unknown,unknown,1210692
219699,5.17,64.8,65.0,11.55,8.81,5.71,unknown,unknown,unknown,1292500
219700,18.07,60.2,59.0,17.06,17.10,10.20,N,unknown,unknown,1315496
219701,0.90,70.8,72.0,5.22,4.90,3.47,N,unknown,unknown,1350000


In [None]:
# OneHotEncoder turns categorical columns into several numeric columns.
# NOTE(review): this instance is not referenced by the transformer cell, which
# builds its own OneHotEncoder() — confirm whether it is still needed.
one_hot_encoder = OneHotEncoder()

In [None]:
# Build the column transformer: the listed columns are one-hot encoded and
# every other column is passed through unchanged.
# NOTE(review): the list of columns to encode is still empty — TODO fill in
# the categorical columns of df_mod.
# sparse_threshold=0 forces a dense NumPy array as output; with the default
# threshold a mostly-zero result comes back as a SciPy sparse matrix, which
# cannot be passed directly to pd.DataFrame(..., columns=...).
transformer = make_column_transformer(
    (OneHotEncoder(), []),
    remainder='passthrough',
    sparse_threshold=0,
)

In [None]:
# Fit the transformer on df_mod and apply it; the result is displayed as the
# cell's output.
transformed = transformer.fit_transform(df_mod)
transformed

array([[0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 2.870000e+00,
        1.790000e+00, 2.000000e+02],
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 2.890000e+00,
        1.780000e+00, 2.000000e+02],
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 2.900000e+00,
        1.770000e+00, 2.000000e+02],
       ...,
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 1.710000e+01,
        1.020000e+01, 1.315496e+06],
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 4.900000e+00,
        3.470000e+00, 1.350000e+06],
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 1.109000e+01,
        7.390000e+00, 1.449881e+06]])

In [None]:
# Wrap the transformed data in a DataFrame whose column labels are the
# feature names generated by the transformer.
feature_names = transformer.get_feature_names_out()
transformed_df = pd.DataFrame(data=transformed, columns=feature_names)

In [None]:
# Show the names of all the columns produced by the transformer.
transformer.get_feature_names_out()

array(['onehotencoder__culet_size_EL', 'onehotencoder__culet_size_L',
       'onehotencoder__culet_size_M', 'onehotencoder__culet_size_N',
       'onehotencoder__culet_size_S', 'onehotencoder__culet_size_SL',
       'onehotencoder__culet_size_VL', 'onehotencoder__culet_size_VS',
       'onehotencoder__culet_size_unknown',
       'onehotencoder__fluor_color_Blue',
       'onehotencoder__fluor_color_Green',
       'onehotencoder__fluor_color_Orange',
       'onehotencoder__fluor_color_White',
       'onehotencoder__fluor_color_Yellow',
       'onehotencoder__fluor_color_unknown',
       'onehotencoder__eye_clean_Borderline',
       'onehotencoder__eye_clean_E1', 'onehotencoder__eye_clean_No',
       'onehotencoder__eye_clean_Yes', 'onehotencoder__eye_clean_unknown',
       'remainder__carat_weight', 'remainder__depth_percent',
       'remainder__table_percent', 'remainder__meas_length',
       'remainder__meas_width', 'remainder__meas_depth',
       'remainder__total_sales_price'], dtype

In [None]:
# Check that the data is complete: count the missing values in each column.
transformed_df.isna().sum()

onehotencoder__culet_size_EL           0
onehotencoder__culet_size_L            0
onehotencoder__culet_size_M            0
onehotencoder__culet_size_N            0
onehotencoder__culet_size_S            0
onehotencoder__culet_size_SL           0
onehotencoder__culet_size_VL           0
onehotencoder__culet_size_VS           0
onehotencoder__culet_size_unknown      0
onehotencoder__fluor_color_Blue        0
onehotencoder__fluor_color_Green       0
onehotencoder__fluor_color_Orange      0
onehotencoder__fluor_color_White       0
onehotencoder__fluor_color_Yellow      0
onehotencoder__fluor_color_unknown     0
onehotencoder__eye_clean_Borderline    0
onehotencoder__eye_clean_E1            0
onehotencoder__eye_clean_No            0
onehotencoder__eye_clean_Yes           0
onehotencoder__eye_clean_unknown       0
remainder__carat_weight                0
remainder__depth_percent               0
remainder__table_percent               0
remainder__meas_length                 0
remainder__meas_

In [None]:
# Summary statistics for a quick overview of the data.
transformed_df.describe()

Unnamed: 0,onehotencoder__culet_size_EL,onehotencoder__culet_size_L,onehotencoder__culet_size_M,onehotencoder__culet_size_N,onehotencoder__culet_size_S,onehotencoder__culet_size_SL,onehotencoder__culet_size_VL,onehotencoder__culet_size_VS,onehotencoder__culet_size_unknown,onehotencoder__fluor_color_Blue,onehotencoder__fluor_color_Green,onehotencoder__fluor_color_Orange,onehotencoder__fluor_color_White,onehotencoder__fluor_color_Yellow,onehotencoder__fluor_color_unknown,onehotencoder__eye_clean_Borderline,onehotencoder__eye_clean_E1,onehotencoder__eye_clean_No,onehotencoder__eye_clean_Yes,onehotencoder__eye_clean_unknown,remainder__carat_weight,remainder__depth_percent,remainder__table_percent,remainder__meas_length,remainder__meas_width,remainder__meas_depth,remainder__total_sales_price
count,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0,219703.0
mean,1.8e-05,0.000264,0.000742,0.600351,0.002167,6.4e-05,1.8e-05,0.006122,0.390254,0.069271,0.00025,4.6e-05,0.000191,0.001821,0.928422,0.002344,0.001365,0.000187,0.281885,0.714219,0.755176,61.683768,57.747585,5.548853,5.135626,3.285699,6908.062
std,0.004267,0.016246,0.027228,0.489827,0.046496,0.007982,0.004267,0.078003,0.487808,0.253915,0.01582,0.006746,0.013825,0.04263,0.257789,0.048359,0.036927,0.013659,0.449919,0.451787,0.845894,9.915266,9.959928,1.763924,1.374529,2.054822,25959.49
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,200.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.31,61.2,57.0,4.35,4.31,2.68,958.0
50%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.5,62.4,58.0,5.06,4.8,3.03,1970.0
75%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,63.5,60.0,6.35,5.7,3.63,5207.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,19.35,98.7,94.0,93.66,62.3,76.3,1449881.0
