In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
pd.set_option('display.max_colwidth',1000)
pd.set_option('display.max_columns',200)
pd.set_option('display.max_rows',1000)

path_train = "drive/MyDrive/Colab Notebooks/train.csv"
path_test = "drive/MyDrive/Colab Notebooks/test.csv"
path_spray = "drive/MyDrive/Colab Notebooks/spray.csv"
path_weather = "drive/MyDrive/Colab Notebooks/weather.csv"
path_sample = "drive/MyDrive/Colab Notebooks/sampleSubmission.csv"

train_df = pd.read_csv(path_train)
test_df = pd.read_csv(path_test)
spray_df = pd.read_csv(path_spray)
weather_df = pd.read_csv(path_weather)

# Preprocessing

## Preprocess weather datasets

In [47]:
# On remplace les données manquantes ('M', '-') par None
weather_df.replace(['M', '-'], np.nan, inplace=True)
print(weather_df.isnull().sum())

Station           0
Date              0
Tmax              0
Tmin              0
Tavg             11
Depart         1472
DewPoint          0
WetBulb           4
Heat             11
Cool             11
Sunrise        1472
Sunset         1472
CodeSum           0
Depth          1472
Water1         2944
SnowFall       1472
PrecipTotal       2
StnPressure       4
SeaLevel          9
ResultSpeed       0
ResultDir         0
AvgSpeed          3
dtype: int64


In [48]:
# Remplacer NaN par la moyenne de chaque colonne
col = ['Tavg', 'Depart', 'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'Depth', 'Water1', 'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'AvgSpeed']
for column in col:
        weather_df[column] = pd.to_numeric(weather_df[column], errors='coerce')
        mean_value = weather_df[column].mean()
        weather_df[column].fillna(mean_value, inplace=True)

# Features gênantes pour l'entrainement du modèle (de type object) pourront potentiellement être ajoutée après
weather_df = weather_df.drop(['Water1', 'CodeSum'], axis=1)
# Vérification qu'il n'y a plus de valeurs non définie
print(weather_df.isnull().sum())

Station        0
Date           0
Tmax           0
Tmin           0
Tavg           0
Depart         0
DewPoint       0
WetBulb        0
Heat           0
Cool           0
Sunrise        0
Sunset         0
Depth          0
SnowFall       0
PrecipTotal    0
StnPressure    0
SeaLevel       0
ResultSpeed    0
ResultDir      0
AvgSpeed       0
dtype: int64


## Preprocess train dataset

In [49]:
target = train_df['WnvPresent']
target

0        0
1        0
2        0
3        0
4        0
        ..
10501    1
10502    0
10503    0
10504    0
10505    0
Name: WnvPresent, Length: 10506, dtype: int64

In [50]:
train = train_df.drop(["WnvPresent", "Address", "AddressNumberAndStreet", "Street", "Trap", "NumMosquitos"], axis=1)

station1 = (41.995, -87.933)
station2 = (41.786, -87.752)

def euclidean_distance(lat1, lon1, lat2, lon2):
    return np.sqrt((lat1 - lat2) ** 2 + (lon1 - lon2) ** 2)

def closest_station(row):
    dist_to_station1 = euclidean_distance(row['Latitude'], row['Longitude'], station1[0], station1[1])
    dist_to_station2 = euclidean_distance(row['Latitude'], row['Longitude'], station2[0], station2[1])
    return 1 if dist_to_station1 < dist_to_station2 else 2

train['Station'] = train.apply(closest_station, axis=1)

train = train.drop(["Latitude", "Longitude"], axis=1)

train

Unnamed: 0,Date,Species,Block,AddressAccuracy,Station
0,2007-05-29,CULEX PIPIENS/RESTUANS,41,9,1
1,2007-05-29,CULEX RESTUANS,41,9,1
2,2007-05-29,CULEX RESTUANS,62,9,1
3,2007-05-29,CULEX PIPIENS/RESTUANS,79,8,1
4,2007-05-29,CULEX RESTUANS,79,8,1
...,...,...,...,...,...
10501,2013-09-26,CULEX PIPIENS/RESTUANS,51,8,2
10502,2013-09-26,CULEX PIPIENS/RESTUANS,58,8,2
10503,2013-09-26,CULEX PIPIENS/RESTUANS,17,9,2
10504,2013-09-26,CULEX PIPIENS/RESTUANS,71,9,1


### Merge train and weather

In [51]:
train = pd.merge(train, weather_df, on=['Date', 'Station'], how='left')
train

Unnamed: 0,Date,Species,Block,AddressAccuracy,Station,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,Depth,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,2007-05-29,CULEX PIPIENS/RESTUANS,41,9,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5
1,2007-05-29,CULEX RESTUANS,41,9,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5
2,2007-05-29,CULEX RESTUANS,62,9,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5
3,2007-05-29,CULEX PIPIENS/RESTUANS,79,8,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5
4,2007-05-29,CULEX RESTUANS,79,8,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10501,2013-09-26,CULEX PIPIENS/RESTUANS,51,8,2,75,55,65.0,1.954484,52,58.0,0.0,0.0,480.646739,1847.347826,0.0,0.000068,0.0,29.40,30.04,4.1,9,4.6
10502,2013-09-26,CULEX PIPIENS/RESTUANS,58,8,2,75,55,65.0,1.954484,52,58.0,0.0,0.0,480.646739,1847.347826,0.0,0.000068,0.0,29.40,30.04,4.1,9,4.6
10503,2013-09-26,CULEX PIPIENS/RESTUANS,17,9,2,75,55,65.0,1.954484,52,58.0,0.0,0.0,480.646739,1847.347826,0.0,0.000068,0.0,29.40,30.04,4.1,9,4.6
10504,2013-09-26,CULEX PIPIENS/RESTUANS,71,9,1,75,50,63.0,3.000000,52,58.0,2.0,0.0,543.000000,1742.000000,0.0,0.000000,0.0,29.34,30.04,3.8,8,4.2


### Type handling

In [52]:
train = pd.get_dummies(train, columns=['Species'])

date = pd.to_datetime(train['Date'])
train['Year'] = date.dt.year
train['Month'] = date.dt.month
train['Day'] = date.dt.day
train = train.drop(["Date"], axis=1)

for column in train.columns:
    if train[column].dtype == 'bool':
        train[column] = train[column].astype(int)

train

Unnamed: 0,Block,AddressAccuracy,Station,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,Depth,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Species_CULEX ERRATICUS,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS,Year,Month,Day
0,41,9,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5,0,0,1,0,0,0,0,2007,5,29
1,41,9,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5,0,0,0,1,0,0,0,2007,5,29
2,62,9,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5,0,0,0,1,0,0,0,2007,5,29
3,79,8,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5,0,0,1,0,0,0,0,2007,5,29
4,79,8,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5,0,0,0,1,0,0,0,2007,5,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10501,51,8,2,75,55,65.0,1.954484,52,58.0,0.0,0.0,480.646739,1847.347826,0.0,0.000068,0.0,29.40,30.04,4.1,9,4.6,0,0,1,0,0,0,0,2013,9,26
10502,58,8,2,75,55,65.0,1.954484,52,58.0,0.0,0.0,480.646739,1847.347826,0.0,0.000068,0.0,29.40,30.04,4.1,9,4.6,0,0,1,0,0,0,0,2013,9,26
10503,17,9,2,75,55,65.0,1.954484,52,58.0,0.0,0.0,480.646739,1847.347826,0.0,0.000068,0.0,29.40,30.04,4.1,9,4.6,0,0,1,0,0,0,0,2013,9,26
10504,71,9,1,75,50,63.0,3.000000,52,58.0,2.0,0.0,543.000000,1742.000000,0.0,0.000000,0.0,29.34,30.04,3.8,8,4.2,0,0,1,0,0,0,0,2013,9,26


## Preprocess test dataset

In [53]:
test_id = test_df["Id"]

In [54]:
test = test_df.drop(["Id", "Address", "AddressNumberAndStreet", "Street", "Trap"], axis=1)

test['Station'] = test.apply(closest_station, axis=1)
test = test.drop(["Latitude", "Longitude"], axis=1)

test

Unnamed: 0,Date,Species,Block,AddressAccuracy,Station
0,2008-06-11,CULEX PIPIENS/RESTUANS,41,9,1
1,2008-06-11,CULEX RESTUANS,41,9,1
2,2008-06-11,CULEX PIPIENS,41,9,1
3,2008-06-11,CULEX SALINARIUS,41,9,1
4,2008-06-11,CULEX TERRITANS,41,9,1
...,...,...,...,...,...
116288,2014-10-02,CULEX SALINARIUS,21,8,2
116289,2014-10-02,CULEX TERRITANS,21,8,2
116290,2014-10-02,CULEX TARSALIS,21,8,2
116291,2014-10-02,UNSPECIFIED CULEX,21,8,2


### Merge test and weather

In [55]:
test = pd.merge(test, weather_df, on=['Date', 'Station'], how='left')
test

Unnamed: 0,Date,Species,Block,AddressAccuracy,Station,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,Depth,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,2008-06-11,CULEX PIPIENS/RESTUANS,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0
1,2008-06-11,CULEX RESTUANS,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0
2,2008-06-11,CULEX PIPIENS,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0
3,2008-06-11,CULEX SALINARIUS,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0
4,2008-06-11,CULEX TERRITANS,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116288,2014-10-02,CULEX SALINARIUS,21,8,2,75,66,71.0,1.954484,63,65.0,0.0,6.0,480.646739,1847.347826,0.0,0.000068,0.72,29.10,29.78,7.2,17,7.9
116289,2014-10-02,CULEX TERRITANS,21,8,2,75,66,71.0,1.954484,63,65.0,0.0,6.0,480.646739,1847.347826,0.0,0.000068,0.72,29.10,29.78,7.2,17,7.9
116290,2014-10-02,CULEX TARSALIS,21,8,2,75,66,71.0,1.954484,63,65.0,0.0,6.0,480.646739,1847.347826,0.0,0.000068,0.72,29.10,29.78,7.2,17,7.9
116291,2014-10-02,UNSPECIFIED CULEX,21,8,2,75,66,71.0,1.954484,63,65.0,0.0,6.0,480.646739,1847.347826,0.0,0.000068,0.72,29.10,29.78,7.2,17,7.9


### Type handling

In [56]:
test = pd.get_dummies(test, columns=['Species'])

date = pd.to_datetime(test['Date'])
test['Year'] = date.dt.year
test['Month'] = date.dt.month
test['Day'] = date.dt.day
test = test.drop(["Date", "Species_UNSPECIFIED CULEX"], axis=1)

for column in test.columns:
    if test[column].dtype == 'bool':
        test[column] = test[column].astype(int)
test

Unnamed: 0,Block,AddressAccuracy,Station,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,Depth,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Species_CULEX ERRATICUS,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS,Year,Month,Day
0,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0,0,0,1,0,0,0,0,2008,6,11
1,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0,0,0,0,1,0,0,0,2008,6,11
2,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0,0,1,0,0,0,0,0,2008,6,11
3,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0,0,0,0,0,1,0,0,2008,6,11
4,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0,0,0,0,0,0,0,1,2008,6,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116288,21,8,2,75,66,71.0,1.954484,63,65.0,0.0,6.0,480.646739,1847.347826,0.0,0.000068,0.72,29.10,29.78,7.2,17,7.9,0,0,0,0,1,0,0,2014,10,2
116289,21,8,2,75,66,71.0,1.954484,63,65.0,0.0,6.0,480.646739,1847.347826,0.0,0.000068,0.72,29.10,29.78,7.2,17,7.9,0,0,0,0,0,0,1,2014,10,2
116290,21,8,2,75,66,71.0,1.954484,63,65.0,0.0,6.0,480.646739,1847.347826,0.0,0.000068,0.72,29.10,29.78,7.2,17,7.9,0,0,0,0,0,1,0,2014,10,2
116291,21,8,2,75,66,71.0,1.954484,63,65.0,0.0,6.0,480.646739,1847.347826,0.0,0.000068,0.72,29.10,29.78,7.2,17,7.9,0,0,0,0,0,0,0,2014,10,2


In [80]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import tensorflow as tf

In [81]:
import numpy as np

def create_sequences(data, n_steps):
    X, y = [], []
    for i in range(len(data) - n_steps):
        # Séquencer les données d'entrée et de sortie
        seq_x = data[i:i+n_steps, :-1]  # Toutes les colonnes sauf la dernière
        seq_y = data[i+n_steps, -1]  # Dernière colonne, à n_steps plus tard
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

In [82]:
n_steps = 5

data_with_target = pd.concat([train, target], axis=1)

# Créer des séquences
X, y = create_sequences(data_with_target.values, n_steps)

print("Shape of X:", X.shape)  # Devrait être (nombre d'échantillons, 5, nombre de features - 1)
print("Shape of y:", y.shape)

Shape of X: (10501, 5, 31)
Shape of y: (10501,)


In [83]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)

In [103]:

# Création du modèle
model = Sequential()

# Ajout d'une couche LSTM
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))

# Ajout de couches LSTM supplémentaires si nécessaire
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))

# Couche de sortie
model.add(Dense(units=1, activation='sigmoid'))

def weighted_binary_crossentropy(pos_weight):
    def loss(y_true, y_pred):
        # Calcul de la perte de cross-entropie binaire standard
        entropy = tf.keras.losses.binary_crossentropy(y_true, y_pred)
        # Application des poids
        weight_vector = y_true * pos_weight + (1 - y_true) * (1 - pos_weight)
        weighted_entropy = weight_vector * entropy
        return tf.reduce_mean(weighted_entropy)
    return loss

# Poids pour la classe positive, supposant que c'est la classe minoritaire
pos_weight = 3.0

# Compilation du modèle
model.compile(optimizer='adam', loss=weighted_binary_crossentropy(pos_weight), metrics=['accuracy','auc'])

In [104]:
from sklearn.utils.class_weight import compute_class_weight

# Supposons que train_Y soit votre vecteur de labels
class_weights = compute_class_weight('balanced', classes=np.unique(Y_train), y=Y_train)
class_weight_dict = {i : class_weights[i] for i in range(len(class_weights))}

In [105]:
model.fit(X_train, Y_train, epochs=20, batch_size=32, validation_split=0.2)

Epoch 1/20


TypeError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1155, in train_step
        return self.compute_metrics(x, y, y_pred, sample_weight)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1249, in compute_metrics
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/compile_utils.py", line 620, in update_state
        metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/metrics_utils.py", line 77, in decorated
        result = update_state_fn(*args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/metrics/base_metric.py", line 140, in update_state_fn
        return ag_update_state(*args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/metrics/base_metric.py", line 723, in update_state  **
        matches = ag_fn(y_true, y_pred, **self._fn_kwargs)

    TypeError: 'str' object is not callable


In [92]:
Y_pred = model.predict(X_test)

roc_auc = roc_auc_score(Y_test, Y_pred)
print("ROC-AUC Score:", roc_auc)

ROC-AUC Score: 0.5


In [93]:
val = pd.DataFrame(Y_train).value_counts()
val

0.0    7945
1.0     455
Name: count, dtype: int64