In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeClassifier, RidgeClassifierCV, SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier


from imblearn.under_sampling import RandomUnderSampler, CondensedNearestNeighbour
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [12]:
df = pd.read_csv('../data_saved/df_final.csv')
df_gps = pd.read_csv('../data/cities_coordinates_gps.csv', index_col = 0).T

df.columns

Index(['id_Location', 'id_Date', 'Date', 'Location', 'MinTemp', 'MaxTemp',
       'Rainfall', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
       'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow', 'WindGustDir_cos',
       'WindGustDir_sin', 'WindDir9am_cos', 'WindDir9am_sin', 'WindDir3pm_cos',
       'WindDir3pm_sin', 'Month_cos', 'Month_sin', 'Season_cos', 'Season_sin',
       'Climate_Desert', 'Climate_Grassland', 'Climate_Subtropical',
       'Climate_Temperate', 'Climate_Tropical', 'Year_2007', 'Year_2008',
       'Year_2009', 'Year_2010', 'Year_2011', 'Year_2012', 'Year_2013',
       'Year_2014', 'Year_2015', 'Year_2016', 'Year_2017'],
      dtype='object')

In [47]:
df['RainTomorrow'].value_counts(normalize = True)

RainTomorrow
0.0    0.778709
1.0    0.221291
Name: proportion, dtype: float64

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135600 entries, 0 to 135599
Data columns (total 44 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id_Location          135600 non-null  object 
 1   id_Date              135600 non-null  object 
 2   Date                 135600 non-null  object 
 3   Location             135600 non-null  object 
 4   MinTemp              135600 non-null  float64
 5   MaxTemp              135600 non-null  float64
 6   Rainfall             135600 non-null  float64
 7   WindGustSpeed        135600 non-null  float64
 8   WindSpeed9am         135600 non-null  float64
 9   WindSpeed3pm         135600 non-null  float64
 10  Humidity9am          135600 non-null  float64
 11  Humidity3pm          135600 non-null  float64
 12  Pressure9am          135600 non-null  float64
 13  Pressure3pm          135600 non-null  float64
 14  Temp9am              135600 non-null  float64
 15  Temp3pm          

# encodage des locations

In [14]:
df['gps_lat'] = df['Location'].map(lambda x : df_gps.loc[x, 'lat'])
df['gps_lon'] = df['Location'].map(lambda x : df_gps.loc[x, 'long'])

df['sin_lat'] = df['gps_lat'].apply(lambda x : np.sin(np.radians(x)))
df['cos_lat'] = df['gps_lat'].apply(lambda x : np.cos(np.radians(x)))

df['sin_lon'] = df['gps_lon'].apply(lambda x : np.round(np.sin(np.radians(x)), 6))
df['cos_lon'] = df['gps_lon'].apply(lambda x : np.round(np.cos(np.radians(x)), 6))

df = df.drop(columns = ['id_Location', 'gps_lat', 'gps_lon'])

In [9]:
df.head()

Unnamed: 0,id_Date,Date,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,Year_2012,Year_2013,Year_2014,Year_2015,Year_2016,Year_2017,sin_lat,cos_lat,sin_lon,cos_lon
0,2008-07-01,2008-07-01,8.8,15.7,5.0,48.0,13.0,15.0,92.0,67.0,...,0,0,0,0,0,0,-0.572554,0.819867,0.661303,-0.750119
1,2008-07-02,2008-07-02,12.7,15.8,0.8,35.0,13.0,15.0,75.0,52.0,...,0,0,0,0,0,0,-0.572554,0.819867,0.661303,-0.750119
2,2008-07-03,2008-07-03,6.2,15.1,0.0,20.0,2.0,11.0,81.0,56.0,...,0,0,0,0,0,0,-0.572554,0.819867,0.661303,-0.750119
3,2008-07-04,2008-07-04,5.3,15.9,0.0,30.0,6.0,13.0,71.0,46.0,...,0,0,0,0,0,0,-0.572554,0.819867,0.661303,-0.750119
4,2008-07-07,2008-07-07,7.6,11.2,16.2,46.0,17.0,13.0,83.0,88.0,...,0,0,0,0,0,0,-0.572554,0.819867,0.661303,-0.750119


In [15]:
df = df.drop(columns = ['id_Date'])


# 1 - modelisation simple

In [12]:
X = df.drop(columns = 'RainTomorrow')
y = df['RainTomorrow']

### a - tres simple

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression()

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))


print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

score accuracy :  0.8478982300884956

f1 score :  0.5855520948457752
roc-auc score :  0.7194746831985506
[[20081  1106]
 [ 3019  2914]]
              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     21187
         1.0       0.72      0.49      0.59      5933

    accuracy                           0.85     27120
   macro avg       0.80      0.72      0.75     27120
weighted avg       0.84      0.85      0.84     27120



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### b - tres simple avec stratify

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 1000)

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

score accuracy :  0.8476401179941003

f1 score :  0.5931469082315872
roc-auc score :  0.7238972332522922
[[19976  1143]
 [ 2989  3012]]
              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     21119
         1.0       0.72      0.50      0.59      6001

    accuracy                           0.85     27120
   macro avg       0.80      0.72      0.75     27120
weighted avg       0.84      0.85      0.84     27120



### c - avec mise à l'echelle [-1, 1]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

scaler = MinMaxScaler(feature_range = (-1, 1))
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 1000)

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

score accuracy :  0.846976401179941

f1 score :  0.5917765099350777
roc-auc score :  0.7232325003774212
[[19962  1157]
 [ 2993  3008]]
              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     21119
         1.0       0.72      0.50      0.59      6001

    accuracy                           0.85     27120
   macro avg       0.80      0.72      0.75     27120
weighted avg       0.84      0.85      0.84     27120



### d - avec standardscaler

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 500)

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

score accuracy :  0.8474926253687316

f1 score :  0.5929935052155088
roc-auc score :  0.723862175881967
[[19971  1148]
 [ 2988  3013]]
              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     21119
         1.0       0.72      0.50      0.59      6001

    accuracy                           0.85     27120
   macro avg       0.80      0.72      0.75     27120
weighted avg       0.84      0.85      0.84     27120



In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 500)

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

score accuracy :  0.8471976401179941

f1 score :  0.5921259842519685
roc-auc score :  0.723374552557922
[[19968  1151]
 [ 2993  3008]]
              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     21119
         1.0       0.72      0.50      0.59      6001

    accuracy                           0.85     27120
   macro avg       0.80      0.72      0.75     27120
weighted avg       0.84      0.85      0.84     27120



conclusion 1 : je continue avec un minmaxscaler [-1, 1], augmenter le nom d'iter ne sert à rien, je garde straify = True

# 2 - Re *Sampling

### a - undersampling

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 500)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

2025-01-17 16:14:45.058979: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-17 16:14:45.341454: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-17 16:14:45.341531: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-17 16:14:45.343004: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-17 16:14:45.491259: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-17 16:14:45.493451: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

AttributeError: _ARRAY_API not found

ImportError: numpy.core._multiarray_umath failed to import

ImportError: numpy.core.umath failed to import

score accuracy :  0.7934734513274336

f1 score :  0.6235634115195914
roc-auc score :  0.7861590479904785
[[16880  4239]
 [ 1362  4639]]
              precision    recall  f1-score   support

         0.0       0.93      0.80      0.86     21119
         1.0       0.52      0.77      0.62      6001

    accuracy                           0.79     27120
   macro avg       0.72      0.79      0.74     27120
weighted avg       0.84      0.79      0.81     27120



### b - oversampling 

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = SMOTE(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 500)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

score accuracy :  0.7923672566371681

f1 score :  0.621496269409155
roc-auc score :  0.784494481754501
[[16866  4253]
 [ 1378  4623]]
              precision    recall  f1-score   support

         0.0       0.92      0.80      0.86     21119
         1.0       0.52      0.77      0.62      6001

    accuracy                           0.79     27120
   macro avg       0.72      0.78      0.74     27120
weighted avg       0.84      0.79      0.80     27120



conclusion : les stratégie d'undersampling est la plus efficace

# 3 - test différence model

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegressionCV(max_iter = 500)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

score accuracy :  0.7932890855457227

f1 score :  0.6232020432853879
roc-auc score :  0.7858617389233682
[[16878  4241]
 [ 1365  4636]]
              precision    recall  f1-score   support

         0.0       0.93      0.80      0.86     21119
         1.0       0.52      0.77      0.62      6001

    accuracy                           0.79     27120
   macro avg       0.72      0.79      0.74     27120
weighted avg       0.84      0.79      0.81     27120



In [21]:
from sklearn.linear_model import SGDClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = SGDClassifier(max_iter = 2000, n_jobs = -1)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

score accuracy :  0.782853982300885

f1 score :  0.6125912768896783
roc-auc score :  0.7803544927432466
[[16575  4544]
 [ 1345  4656]]
              precision    recall  f1-score   support

         0.0       0.92      0.78      0.85     21119
         1.0       0.51      0.78      0.61      6001

    accuracy                           0.78     27120
   macro avg       0.72      0.78      0.73     27120
weighted avg       0.83      0.78      0.80     27120



In [22]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = RandomForestClassifier(n_jobs = -1)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

score accuracy :  0.8164454277286136

f1 score :  0.6619125237707145
roc-auc score :  0.8148655149012011
[[17269  3850]
 [ 1128  4873]]
              precision    recall  f1-score   support

         0.0       0.94      0.82      0.87     21119
         1.0       0.56      0.81      0.66      6001

    accuracy                           0.82     27120
   macro avg       0.75      0.81      0.77     27120
weighted avg       0.85      0.82      0.83     27120



conclusion : on va partir sur un RandomForrestClassifier

# 4 - exploration des hyperparametres

In [25]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import cross_val_score


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

smote = SMOTE(random_state=42)
X_train_s, y_train_s = smote.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = RandomForestClassifier(
    n_estimators = 200,
    max_features = 'sqrt',
    criterion = 'entropy',
    max_depth = 30,
    bootstrap = False,
    min_samples_split = 5,
    min_samples_leaf = 2,
    n_jobs = -1
)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))


print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

score accuracy :  0.8692109144542773

f1 score :  0.671175762068007
roc-auc score :  0.7740828609629506
[[19949  1170]
 [ 2379  3622]]
              precision    recall  f1-score   support

         0.0       0.89      0.94      0.92     21119
         1.0       0.76      0.60      0.67      6001

    accuracy                           0.87     27120
   macro avg       0.82      0.77      0.79     27120
weighted avg       0.86      0.87      0.86     27120



# essai de modelisation par ville

In [17]:

df_sydney = df[df['Location'] == 'Sydney']

In [21]:
print(df_sydney.columns, end = '\n\n')
print(df_sydney.shape, end = '\n\n')
print(df_sydney['RainTomorrow'].value_counts(normalize = True))

df_sydney = df_sydney.drop(columns = ['Date', 'Location', 'sin_lat', 'cos_lat', 'sin_lon', 'cos_lon',
                  'Climate_Desert','Climate_Grassland', 'Climate_Subtropical',
                  'Climate_Temperate', 'Climate_Tropical'])

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm', 'RainToday',
       'RainTomorrow', 'WindGustDir_cos', 'WindGustDir_sin', 'WindDir9am_cos',
       'WindDir9am_sin', 'WindDir3pm_cos', 'WindDir3pm_sin', 'Month_cos',
       'Month_sin', 'Season_cos', 'Season_sin', 'Climate_Desert',
       'Climate_Grassland', 'Climate_Subtropical', 'Climate_Temperate',
       'Climate_Tropical', 'Year_2007', 'Year_2008', 'Year_2009', 'Year_2010',
       'Year_2011', 'Year_2012', 'Year_2013', 'Year_2014', 'Year_2015',
       'Year_2016', 'Year_2017', 'sin_lat', 'cos_lat', 'sin_lon', 'cos_lon'],
      dtype='object')

(3337, 46)

RainTomorrow
0.0    0.740785
1.0    0.259215
Name: proportion, dtype: float64


In [22]:
X = df_sydney.drop(columns = 'RainTomorrow')
y = df_sydney['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2, stratify = y)

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 10000, verbose = 1)

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print('score accuracy :', model.score(X_test_scaled, y_test))
print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

score accuracy : 0.8577844311377245
f1 score :  0.6905537459283387
roc-auc score :  0.7780755532200619
[[467  28]
 [ 67 106]]
              precision    recall  f1-score   support

         0.0       0.87      0.94      0.91       495
         1.0       0.79      0.61      0.69       173

    accuracy                           0.86       668
   macro avg       0.83      0.78      0.80       668
weighted avg       0.85      0.86      0.85       668



In [24]:
X = df_sydney.drop(columns = 'RainTomorrow')
y = df_sydney['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2, stratify = y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 10000, verbose = 1, class_weight = {0 : 0.26, 1 : 0.74})

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print('score accuracy :', model.score(X_test_scaled, y_test))
print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


print('après essai sur grid search, le modele simple de regressionlogicitc est le meilleur')

score accuracy : 0.8293413173652695
f1 score :  0.7076923076923077
roc-auc score :  0.8190459508378584
[[416  79]
 [ 35 138]]
              precision    recall  f1-score   support

         0.0       0.92      0.84      0.88       495
         1.0       0.64      0.80      0.71       173

    accuracy                           0.83       668
   macro avg       0.78      0.82      0.79       668
weighted avg       0.85      0.83      0.83       668

après essai sur grid search, le modele simple de regressionlogicitc est le meilleur


# Essai avec date

In [42]:
df_sydney = df[df['Location'] == 'Sydney']

df_sydney = df_sydney.set_index('Date')

df_sydney = df_sydney.drop(columns = ['Location', 'sin_lat', 'cos_lat', 'sin_lon', 'cos_lon',
                  'Climate_Desert','Climate_Grassland', 'Climate_Subtropical',
                  'Climate_Temperate', 'Climate_Tropical'])

print(df_sydney['RainTomorrow'].value_counts(normalize = True))


RainTomorrow
0.0    0.740785
1.0    0.259215
Name: proportion, dtype: float64


In [61]:
X = df_sydney.drop(columns = 'RainTomorrow')
y = df_sydney['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2, stratify = y)

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 10000, verbose = 1, class_weight = 'balanced')

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print('score accuracy :', model.score(X_test_scaled, y_test))
print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

score accuracy : 0.8308383233532934
f1 score :  0.710997442455243
roc-auc score :  0.821936124248263
[[416  79]
 [ 34 139]]
              precision    recall  f1-score   support

         0.0       0.92      0.84      0.88       495
         1.0       0.64      0.80      0.71       173

    accuracy                           0.83       668
   macro avg       0.78      0.82      0.80       668
weighted avg       0.85      0.83      0.84       668



In [64]:
X = df_sydney.drop(columns = 'RainTomorrow')
y = df_sydney['RainTomorrow']

cut_index = df_sydney.index[int(0.8 * len(df_sydney))]  # 80% des données pour l'entraînement

    # Diviser les données en fonction de la date
X_train = X[df_sydney.index <= cut_index]
y_train = y[df_sydney.index <= cut_index]
X_test = X[df_sydney.index > cut_index]
y_test = y[df_sydney.index > cut_index]

print(y_train.value_counts(normalize = True))
print(y_test.value_counts(normalize = True))


scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 10000, verbose = 1, class_weight = {0 : 0.73, 1 : 0.27})

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print('score accuracy :', model.score(X_test_scaled, y_test))
print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

RainTomorrow
0.0    0.741948
1.0    0.258052
Name: proportion, dtype: float64
RainTomorrow
0.0    0.736132
1.0    0.263868
Name: proportion, dtype: float64
score accuracy : 0.815592203898051
f1 score :  0.48535564853556484
roc-auc score :  0.6596810775782262
[[486   5]
 [118  58]]
              precision    recall  f1-score   support

         0.0       0.80      0.99      0.89       491
         1.0       0.92      0.33      0.49       176

    accuracy                           0.82       667
   macro avg       0.86      0.66      0.69       667
weighted avg       0.84      0.82      0.78       667



In [69]:
model =RandomForestClassifier( n_jobs = -1, class_weight = {0 : 4, 1 : 1})

grid_search = GridSearchCV(estimator = model, param_grid = param_grid, n_jobs = -1, verbose = 2, scoring = 'recall')

grid_search.fit(X_train_scaled, y_train)

print('best_estimator : ', grid_search.best_estimator_)
print('best_params :', grid_search.best_params_)

y_pred = grid_search.predict(X_test_scaled)

print('score accuracy :', grid_search.score(X_test_scaled, y_test))
print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END bootstrap=False, criterion=log_loss, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=7, n_estimators=924; total time=   8.6s
[CV] END bootstrap=False, criterion=log_loss, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=7, n_estimators=924; total time=   8.7s
[CV] END bootstrap=False, criterion=log_loss, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=7, n_estimators=924; total time=   8.7s
[CV] END bootstrap=False, criterion=log_loss, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=7, n_estimators=924; total time=   8.8s
[CV] END bootstrap=False, criterion=log_loss, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=7, n_estimators=924; total time=   8.8s
best_estimator :  RandomForestClassifier(bootstrap=False, class_weight={0: 4, 1: 1},
                       criterion='log_loss', max_depth=30, max_features