In [60]:
import pandas as pd

In [61]:
df = pd.read_csv('Data/AIDA_Results_IA_Institut.csv')

In [62]:
# Mapping for gender and discipline
gender_map = {'M': 0, 'F': 1}
discipline_map = {'CNF': 0, 'FIM': 1, 'CWT': 2, 'CWT-B': 3}

# Transforming the 'Gender' column
df['Gender'] = df['Gender'].map(gender_map)

# Transforming the 'Discipline' column
df['Discipline'] = df['Discipline'].map(discipline_map)

# Removing rows where either 'Gender' or 'Discipline' is NaN (i.e., was other than specified categories)
df.dropna(subset=['Gender', 'Discipline'], inplace=True)



In [63]:


# 1. Extracting the month from the 'Day' column
df['Month'] = pd.to_datetime(df['Day']).dt.month

# 2. Calculating total dive experience
# Sorting data by 'Diver' and 'Day'
df.sort_values(by=['Diver', 'Day'], inplace=True)

# Cumulative count of dives per diver
df['Experience Dive'] = df.groupby('Diver').cumcount()

# 3. Calculating experience per discipline
# Cumulative count of dives per diver per discipline
df['Experience Discipline'] = df.groupby(['Diver', 'Discipline']).cumcount()

# Displaying the modified dataframe
df[['Diver', 'Day', 'Month', 'Discipline', 'Experience Dive', 'Experience Discipline']]

Unnamed: 0,Diver,Day,Month,Discipline,Experience Dive,Experience Discipline
13306,(),2000-01-01,1,2.0,0,0
13343,(),2001-11-10,11,2.0,1,1
11224,(),2002-07-27,7,2.0,2,2
2639,(),2002-10-31,10,2.0,3,3
8973,(),2003-05-29,5,2.0,4,4
...,...,...,...,...,...,...
19120,Ícaro Do Valle (BRA),2019-10-20,10,0.0,17,7
24941,圭一 福里 (JPN),2022-09-29,9,1.0,0,0
24949,圭一 福里 (JPN),2022-09-30,9,1.0,1,1
25753,보걸 강 (KOR),2023-05-25,5,0.0,0,0


In [64]:
df[df['Remarks']=='DQBO']

Unnamed: 0,Start,Diver,Gender,Discipline,Line,Official Top,AP,RP,Card,Points,Remarks,Title Event,Event Type,Day,Category Event,Month,Experience Dive,Experience Discipline
9156,2,(),1,0.0,,00:00,41,41 m,RED,0.0,DQBO,AIDA Cyprus Depth Games 2011,Depth Competition,2011-10-16,other,10,15,1
9166,12,(),1,2.0,,00:00,46,46 m,RED,0.0,DQBO,AIDA Cyprus Depth Games 2011,Depth Competition,2011-10-16,other,10,16,12
5119,289,(),1,2.0,,00:00,50,50 m,RED,0.0,DQBO,World Championship 2012,World Championship,2012-09-09,World Championship,9,20,14
9120,7,(),1,2.0,,00:00,49,49 m,RED,0.0,DQBO,AIDA CYPRUS Depth Games 2012,Depth Competition,2012-10-14,other,10,22,15
14182,3,Adam Sellars (AUS),0,1.0,,00:00,70,70 m,RED,0.0,DQBO,AIDA World Depth Championship 2017,World Championship,2017-08-23,World Championship,8,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4293,140,Yuki Muto (JPN),1,2.0,,00:00,60,60 m,RED,0.0,DQBO,Suunto Vertical Blue 2012,Depth Competition,2012-11-20,VB,11,15,12
4295,142,Yuki Muto (JPN),1,2.0,,00:00,58,58 m,RED,0.0,DQBO,Suunto Vertical Blue 2012,Depth Competition,2012-11-20,VB,11,17,14
2893,68,Yuki Muto (JPN),1,2.0,,00:00,58,58 m,RED,0.0,DQBO,Suunto Vertical Blue 2015,Depth Competition,2015-04-27,VB,4,26,21
9482,70,Yuki Muto (JPN),1,0.0,,00:00,35,35 m,RED,0.0,DQBO,2015 AIDA Depth World Championships,World Championship,2015-09-11,World Championship,9,33,3


In [65]:

# Analyzing the 'Remarks' column in detail to identify different types of comments
remarks_counts = df['Remarks'].value_counts()

# Displaying the most common remarks for analysis
remarks_counts.head(30)


Remarks
OK                    8628
Ok                    5341
-                     2555
PEN                   2219
No tag,rp<ap           484
DQSP                   439
DQBO                   381
No tag,under ap        353
NR                     346
DQOTHER                251
No tag                 245
Dqsp                   186
Rpap,no tag            166
DNS                     93
Early Turn, No Tag      85
Pen, no tag             77
Dqbo-surface            76
Dqbo                    73
National Record         65
Dqairways               64
Dqbo-uw                 58
SHORT & NO TAG          54
Dqother                 50
Short, no tag           50
W                       46
Short                   46
Early                   43
Early turn, no tag      37
Dqpull                  34
DQ BO                   33
Name: count, dtype: int64

In [66]:
df['Remarks']

13306      OK
13343      OK
11224      OK
2639       OK
8973      PEN
         ... 
19120      Ok
24941    Dqsp
24949      Ok
25753      Ok
26685    Dqsp
Name: Remarks, Length: 23613, dtype: object

In [67]:
for value in df["Remarks"].value_counts().index:
    print(value)

OK
Ok
-
PEN
No tag,rp<ap
DQSP
DQBO
No tag,under ap
NR
DQOTHER
No tag
Dqsp
Rpap,no tag
DNS
Early Turn, No Tag
Pen, no tag
Dqbo-surface
Dqbo
National Record
Dqairways
Dqbo-uw
SHORT & NO TAG
Dqother
Short, no tag
W
Short
Early
Early turn, no tag
Dqpull
DQ BO
Grab
DQ SP
No tag,rp<ap,dqother
No tag,rp<ap,dqsp
Early turn
RP<AP no tag
No tag,under ap,dqsp
Pen, Short, no tag
BO
Other
Dqsp,dqairways
SP
National record
0
Short, tag
Early Turn, Tag
RP<AP
Pen
Y
No tag & tag
Dqairways,dqbo-surface
No tag,under ap,dqpull
Rpap,no tag,dqsp
ET
RP<AP No tag
Short,no tag,rp<ap
DQ Pull
Early turn and No tag
Short no tag
AP>RP no tag
Pull
ET NO TAG
No tag,rp<ap,dqairways
Dqtouch
DQPULL
WR
Early turn, Tag
Early,no tag,rp<ap
DQ pull
No Tag
Tag
No tag,under ap,dqbo-uw
SHORT/NO TAG
EARLY TURN+MISSING TAG
Early Turn
Short/No tag
Other,no tag,rp<ap
Grab,no tag,under ap
Grab,no tag,rp<ap
Early turn, no tag.
Airway
No tag,under ap,dqother
SHORT, TAG
No tag,dqcheck-in
Short, No Tag
ET(-4), no Tag(-1)
Dqother, pull


In [68]:
# Adding a new column 'Syncope' to classify each dive as syncope (1) or non-syncope (0)
df['Syncope'] = df['Remarks'].apply(lambda x: 1 if 'DQBO' in str(x) or 'Dqbo-surface' in str(x) else 0)

# Display the first few rows of the modified dataframe to verify the changes
df.head()


Unnamed: 0,Start,Diver,Gender,Discipline,Line,Official Top,AP,RP,Card,Points,Remarks,Title Event,Event Type,Day,Category Event,Month,Experience Dive,Experience Discipline,Syncope
13306,136,(),1,2.0,,00:00,0,37 m,WHITE,37.0,OK,Compiled rankings for year 2000,Competition,2000-01-01,other,1,0,0,0
13343,140,(),1,2.0,,00:00,0,40 m,WHITE,40.0,OK,Compiled rankings for year 2001,Competition,2001-11-10,other,11,1,1,0
11224,10,(),1,2.0,,00:00,0,31 m,WHITE,31.0,OK,1st Portuguese Freediving Championship,Competition,2002-07-27,other,7,2,2,0
2639,162,(),1,2.0,,00:00,0,33 m,WHITE,33.0,OK,AIDA Pacific Cup 2002,Mixed Competition,2002-10-31,other,10,3,3,0
8973,137,(),1,2.0,,00:00,34,32 m,YELLOW,12.0,PEN,Sony Freediver Open Classic 2003,Mixed Competition,2003-05-29,other,5,4,4,0


In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

data = df

# Supprimer ou remplacer les valeurs manquantes
data = data.fillna(method='ffill') 
data['Day'] = pd.to_datetime(data['Day'])
data.sort_values(by=['Diver', 'Day'], inplace=True)
data['Experience'] = data.groupby('Diver').cumcount()
# Encoder les variables catégorielles
label_encoders = {}
for column in ['Gender', 'Discipline', 'Event Type']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Diviser les données en ensembles d'entraînement et de test
X = data[['Discipline', 'AP', 'Gender', 'Event Type', 'Experience']]
y = data['Syncope']  # Assurez-vous que 'Syncope' est votre variable cible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


# Entraîner un modèle de régression logistique
model = LogisticRegression()
model.fit(X_train, y_train)

# Évaluer le modèle
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))


  data = data.fillna(method='ffill')


              precision    recall  f1-score   support

           0       0.98      1.00      0.99      4629
           1       0.00      0.00      0.00        94

    accuracy                           0.98      4723
   macro avg       0.49      0.50      0.49      4723
weighted avg       0.96      0.98      0.97      4723



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
data[data['Syncope']==1].count()


Start                    468
Diver                    468
Gender                   468
Discipline               468
Line                     464
Official Top             468
AP                       468
RP                       468
Card                     468
Points                   468
Remarks                  468
Title Event              468
Event Type               468
Day                      468
Category Event           468
Month                    468
Experience Dive          468
Experience Discipline    468
Syncope                  468
Experience               468
dtype: int64

In [71]:
df[df['Syncope']==0].count()

Start                    23145
Diver                    23145
Gender                   23145
Discipline               23145
Line                      3217
Official Top             23145
AP                       23145
RP                       23145
Card                     23145
Points                   23145
Remarks                  23140
Title Event              23145
Event Type               23145
Day                      23145
Category Event           23145
Month                    23145
Experience Dive          23145
Experience Discipline    23145
Syncope                  23145
dtype: int64

In [72]:
df['Remarks'].value_counts()

Remarks
OK                            8628
Ok                            5341
-                             2555
PEN                           2219
No tag,rp<ap                   484
                              ... 
NR (fresh water)                 1
DQ Pull Start                    1
No tag,under ap,dqcheck-in       1
ET(-10), no Tag(-1)              1
Rpap,no tag,turn,early           1
Name: count, Length: 391, dtype: int64

In [73]:
df['Category Event'].value_counts()

Category Event
other                 16425
World Championship     3720
VB                     1836
Panglao                 992
NAC                     640
Name: count, dtype: int64