In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy

In [None]:
# Raise the pandas display limits (column width, column count, row count).
display_options = {
    'display.max_colwidth': 1000,
    'display.max_columns': 200,
    'display.max_rows': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Read the three CSV inputs from the current working directory.
train_df, test_df, weather_df = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv", "weather.csv")
)

In [None]:
# Replace the missing-data placeholders ('M', '-') with NaN so that pandas
# recognises them as missing values.
weather_df = weather_df.replace(to_replace=['M', '-'], value=np.nan)

# Number of missing values per column after the replacement.
print(weather_df.isna().sum())

Station           0
Date              0
Tmax              0
Tmin              0
Tavg             11
Depart         1472
DewPoint          0
WetBulb           4
Heat             11
Cool             11
Sunrise        1472
Sunset         1472
CodeSum           0
Depth          1472
Water1         2944
SnowFall       1472
PrecipTotal       2
StnPressure       4
SeaLevel          9
ResultSpeed       0
ResultDir         0
AvgSpeed          3
dtype: int64


In [None]:
# Convert the listed columns to numeric and replace NaN with the column mean.
col = ['Tavg', 'Depart', 'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'Depth', 'Water1', 'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'AvgSpeed']
for column in col:
    # NOTE(review): errors='coerce' turns *any* non-numeric token into NaN,
    # which is then mean-filled below -- confirm that is intended for every
    # non-numeric marker present in the raw file.
    numeric_values = pd.to_numeric(weather_df[column], errors='coerce')
    # Assign the filled Series back explicitly. Calling
    # `weather_df[column].fillna(..., inplace=True)` operates on an intermediate
    # object: it raises a FutureWarning on pandas 2.x and does not update the
    # frame at all under Copy-on-Write.
    weather_df[column] = numeric_values.fillna(numeric_values.mean())

# Drop features that are awkward for model training ('Water1' is entirely
# missing per the null counts above, 'CodeSum' is of object type); they could
# potentially be added back later.
weather_df = weather_df.drop(['Water1', 'CodeSum'], axis=1)
# Check that no undefined values remain.
print(weather_df.isnull().sum())

Station        0
Date           0
Tmax           0
Tmin           0
Tavg           0
Depart         0
DewPoint       0
WetBulb        0
Heat           0
Cool           0
Sunrise        0
Sunset         0
Depth          0
SnowFall       0
PrecipTotal    0
StnPressure    0
SeaLevel       0
ResultSpeed    0
ResultDir      0
AvgSpeed       0
dtype: int64


In [None]:
# Label column the model is trained to predict.
target = train_df.loc[:, 'WnvPresent']

In [None]:
# The label and the address/trap identifier columns are not used as features.
unused_train_columns = ["WnvPresent", "Address", "AddressNumberAndStreet", "Street", "Trap"]
train = train_df.drop(columns=unused_train_columns)

# (latitude, longitude) of the two weather stations.
station1, station2 = (41.995, -87.933), (41.786, -87.752)

def euclidean_distance(lat1, lon1, lat2, lon2):
    """Return the straight-line distance between two (lat, lon) points.

    The coordinates are treated as plain planar values: the result is
    sqrt(dlat^2 + dlon^2) in the same unit as the inputs.
    """
    delta_lat = lat1 - lat2
    delta_lon = lon1 - lon2
    squared_sum = delta_lat ** 2 + delta_lon ** 2
    return np.sqrt(squared_sum)

def closest_station(row):
    """Return the number (1 or 2) of the weather station nearest to `row`.

    `row` must expose 'Latitude' and 'Longitude' entries. Station 2 is
    returned when station 1 is not strictly closer (ties go to station 2).
    """
    lat, lon = row['Latitude'], row['Longitude']
    distance_first = euclidean_distance(lat, lon, station1[0], station1[1])
    distance_second = euclidean_distance(lat, lon, station2[0], station2[1])
    if distance_first < distance_second:
        return 1
    return 2

# Tag each row with its nearest weather station, then discard the raw
# coordinates that were only needed for that lookup.
train = (
    train
    .assign(Station=train.apply(closest_station, axis=1))
    .drop(columns=["Latitude", "Longitude"])
)

train

Unnamed: 0,Date,Species,Block,AddressAccuracy,NumMosquitos,Station
0,2007-05-29,CULEX PIPIENS/RESTUANS,41,9,1,1
1,2007-05-29,CULEX RESTUANS,41,9,1,1
2,2007-05-29,CULEX RESTUANS,62,9,1,1
3,2007-05-29,CULEX PIPIENS/RESTUANS,79,8,1,1
4,2007-05-29,CULEX RESTUANS,79,8,4,1
...,...,...,...,...,...,...
10501,2013-09-26,CULEX PIPIENS/RESTUANS,51,8,6,2
10502,2013-09-26,CULEX PIPIENS/RESTUANS,58,8,5,2
10503,2013-09-26,CULEX PIPIENS/RESTUANS,17,9,1,2
10504,2013-09-26,CULEX PIPIENS/RESTUANS,71,9,5,1


In [None]:
# Attach the weather readings of the matching date and station to every
# training row; the left join keeps all training rows.
train = train.merge(weather_df, how='left', on=['Date', 'Station'])
train

Unnamed: 0,Date,Species,Block,AddressAccuracy,NumMosquitos,Station,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,Depth,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,2007-05-29,CULEX PIPIENS/RESTUANS,41,9,1,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5
1,2007-05-29,CULEX RESTUANS,41,9,1,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5
2,2007-05-29,CULEX RESTUANS,62,9,1,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5
3,2007-05-29,CULEX PIPIENS/RESTUANS,79,8,1,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5
4,2007-05-29,CULEX RESTUANS,79,8,4,1,88,60,74.0,10.000000,58,65.0,0.0,9.0,421.000000,1917.000000,0.0,0.000000,0.0,29.39,30.11,5.8,18,6.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10501,2013-09-26,CULEX PIPIENS/RESTUANS,51,8,6,2,75,55,65.0,1.954484,52,58.0,0.0,0.0,480.646739,1847.347826,0.0,0.000068,0.0,29.40,30.04,4.1,9,4.6
10502,2013-09-26,CULEX PIPIENS/RESTUANS,58,8,5,2,75,55,65.0,1.954484,52,58.0,0.0,0.0,480.646739,1847.347826,0.0,0.000068,0.0,29.40,30.04,4.1,9,4.6
10503,2013-09-26,CULEX PIPIENS/RESTUANS,17,9,1,2,75,55,65.0,1.954484,52,58.0,0.0,0.0,480.646739,1847.347826,0.0,0.000068,0.0,29.40,30.04,4.1,9,4.6
10504,2013-09-26,CULEX PIPIENS/RESTUANS,71,9,5,1,75,50,63.0,3.000000,52,58.0,2.0,0.0,543.000000,1742.000000,0.0,0.000000,0.0,29.34,30.04,3.8,8,4.2


In [None]:
# Keep the row identifiers aside; they are needed to build the submission.
test_id = test_df.loc[:, "Id"]

In [None]:
# Drop the identifier and address/trap columns that are not used as features.
unused_test_columns = ["Id", "Address", "AddressNumberAndStreet", "Street", "Trap"]
test = test_df.drop(columns=unused_test_columns)

# Tag each row with its nearest weather station, then discard the coordinates.
test = (
    test
    .assign(Station=test.apply(closest_station, axis=1))
    .drop(columns=["Latitude", "Longitude"])
)

test

Unnamed: 0,Date,Species,Block,AddressAccuracy,Station
0,2008-06-11,CULEX PIPIENS/RESTUANS,41,9,1
1,2008-06-11,CULEX RESTUANS,41,9,1
2,2008-06-11,CULEX PIPIENS,41,9,1
3,2008-06-11,CULEX SALINARIUS,41,9,1
4,2008-06-11,CULEX TERRITANS,41,9,1
...,...,...,...,...,...
116288,2014-10-02,CULEX SALINARIUS,21,8,2
116289,2014-10-02,CULEX TERRITANS,21,8,2
116290,2014-10-02,CULEX TARSALIS,21,8,2
116291,2014-10-02,UNSPECIFIED CULEX,21,8,2


In [None]:
# Attach the weather readings of the matching date and station to every
# test row; the left join keeps all test rows.
test = test.merge(weather_df, how='left', on=['Date', 'Station'])
test

Unnamed: 0,Date,Species,Block,AddressAccuracy,Station,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,Depth,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,2008-06-11,CULEX PIPIENS/RESTUANS,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0
1,2008-06-11,CULEX RESTUANS,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0
2,2008-06-11,CULEX PIPIENS,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0
3,2008-06-11,CULEX SALINARIUS,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0
4,2008-06-11,CULEX TERRITANS,41,9,1,86,61,74.0,7.000000,56,64.0,0.0,9.0,416.000000,1926.000000,0.0,0.000000,0.00,29.28,29.99,8.9,18,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116288,2014-10-02,CULEX SALINARIUS,21,8,2,75,66,71.0,1.954484,63,65.0,0.0,6.0,480.646739,1847.347826,0.0,0.000068,0.72,29.10,29.78,7.2,17,7.9
116289,2014-10-02,CULEX TERRITANS,21,8,2,75,66,71.0,1.954484,63,65.0,0.0,6.0,480.646739,1847.347826,0.0,0.000068,0.72,29.10,29.78,7.2,17,7.9
116290,2014-10-02,CULEX TARSALIS,21,8,2,75,66,71.0,1.954484,63,65.0,0.0,6.0,480.646739,1847.347826,0.0,0.000068,0.72,29.10,29.78,7.2,17,7.9
116291,2014-10-02,UNSPECIFIED CULEX,21,8,2,75,66,71.0,1.954484,63,65.0,0.0,6.0,480.646739,1847.347826,0.0,0.000068,0.72,29.10,29.78,7.2,17,7.9


In [None]:
# One-hot encode the mosquito species.
train = pd.get_dummies(train, columns=['Species'])

# Split the date into numeric year / month / day features and drop the
# original 'Date' column.
date = pd.to_datetime(train['Date'])
train = train.assign(
    Year=date.dt.year,
    Month=date.dt.month,
    Day=date.dt.day,
).drop(columns=["Date"])

# Cast every boolean column to int.
bool_columns = [name for name in train.columns if train[name].dtype == 'bool']
train = train.astype({name: int for name in bool_columns})

In [None]:
# One-hot encode the mosquito species.
test = pd.get_dummies(test, columns=['Species'])

# Split the date into numeric year / month / day features and drop the
# original 'Date' column.
date = pd.to_datetime(test['Date'])
test = test.assign(
    Year=date.dt.year,
    Month=date.dt.month,
    Day=date.dt.day,
).drop(columns=["Date"])

# Cast every boolean column to int.
bool_columns = [name for name in test.columns if test[name].dtype == 'bool']
test = test.astype({name: int for name in bool_columns})

In [None]:
# Class balance of the label: count for class 0, then count for class 1.
val = target.value_counts()
for label in (0, 1):
    print(val[label])

9955
551


In [None]:
# Hold out 20% of the labelled rows for evaluation.
# - random_state makes the split (and everything downstream) reproducible.
# - stratify keeps the positive rate the same in both parts, which matters
#   because the positive class is rare (551 positives vs 9955 negatives).
train_X, test_X, train_Y, test_Y = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

In [None]:
# Small feed-forward binary classifier: one hidden ReLU layer of 10 units
# followed by a single sigmoid output unit.
model = Sequential()
model.add(Dense(10, activation='relu', input_dim=train_X.shape[1]))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_49"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_108 (Dense)           (None, 10)                330       
                                                                 
 dense_109 (Dense)           (None, 1)                 11        
                                                                 
Total params: 341 (1.33 KB)
Trainable params: 341 (1.33 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs, with 20% of the training split set aside for validation.
model.fit(
    x=train_X,
    y=train_Y,
    validation_split=0.2,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f6a26e7e380>

In [None]:
# Score the model on the held-out split; evaluate() returns the loss followed
# by the compiled metric (accuracy).
evaluation = model.evaluate(test_X, test_Y)
loss, accuracy = evaluation
print(f"Test loss: {loss}", f"Test accuracy: {accuracy}", sep="\n")

Test loss: 2.127587080001831
Test accuracy: 0.30827784538269043


In [None]:
# Align the test features with the exact columns (and column order) the model
# was trained on. The train and test frames were built independently, so they
# do not share the same columns: for example, 'NumMosquitos' exists only in the
# training frame. Feeding `test` as-is would hand the network values in the
# wrong input positions.
# - Columns missing from `test` are created and filled with 0.
# - Columns that exist only in `test` are dropped.
# NOTE(review): filling 'NumMosquitos' with 0 is only a placeholder; since the
# feature is unavailable at prediction time, consider dropping it from the
# training features instead.
test_features = test.reindex(columns=train_X.columns, fill_value=0)

# Predicted probabilities, thresholded at 0.5 to obtain 0/1 labels.
predictions = model.predict(test_features)
predictions = (predictions > 0.5).astype(int)

submission = pd.DataFrame({
    'Id': test_id,  # row identifiers saved from test_df before 'Id' was dropped
    'WnvPresent': predictions.flatten()  # (n, 1) model output flattened to 1-D
})
submission



In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)