In [33]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeClassifier, RidgeClassifierCV
from sklearn.metrics import confusion_matrix, classification_report, f1_score, roc_auc_score

from imblearn.under_sampling import RandomUnderSampler, CondensedNearestNeighbour
from imblearn.over_sampling import RandomOverSampler, SMOTE


In [34]:
df = pd.read_csv('../data_saved/data_mat.csv')
df_gps = pd.read_csv('../data/cities_coordinates_gps.csv', index_col = 0).T

df.head()

Unnamed: 0,id_Location,id_Date,Date,Location,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,...,Climate,Year,Month,Season,WindGustDir_cos,WindGustDir_sin,WindDir9am_cos,WindDir9am_sin,WindDir3pm_cos,WindDir3pm_sin
0,Adelaide,2008-07-01,2008-07-01,Adelaide,8.8,15.7,5.0,48.0,13.0,15.0,...,Temperate,2008,July,Winter,0.7071068,-0.707107,-0.707107,-0.707107,-1.83697e-16,-1.0
1,Adelaide,2008-07-02,2008-07-02,Adelaide,12.7,15.8,0.8,35.0,13.0,15.0,...,Temperate,2008,July,Winter,-0.7071068,-0.707107,-0.92388,-0.382683,-0.7071068,-0.707107
2,Adelaide,2008-07-03,2008-07-03,Adelaide,6.2,15.1,0.0,20.0,2.0,11.0,...,Temperate,2008,July,Winter,-1.83697e-16,-1.0,0.92388,0.382683,-0.7071068,-0.707107
3,Adelaide,2008-07-04,2008-07-04,Adelaide,5.3,15.9,0.0,30.0,6.0,13.0,...,Temperate,2008,July,Winter,0.9238795,0.382683,0.92388,0.382683,0.7071068,0.707107
4,Adelaide,2008-07-07,2008-07-07,Adelaide,7.6,11.2,16.2,46.0,17.0,13.0,...,Temperate,2008,July,Winter,-0.3826834,-0.92388,0.382683,-0.92388,-0.7071068,-0.707107


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135600 entries, 0 to 135599
Data columns (total 28 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id_Location      135600 non-null  object 
 1   id_Date          135600 non-null  object 
 2   Date             135600 non-null  object 
 3   Location         135600 non-null  object 
 4   MinTemp          135600 non-null  float64
 5   MaxTemp          135600 non-null  float64
 6   Rainfall         135600 non-null  float64
 7   WindGustSpeed    135600 non-null  float64
 8   WindSpeed9am     135600 non-null  float64
 9   WindSpeed3pm     135600 non-null  float64
 10  Humidity9am      135600 non-null  float64
 11  Humidity3pm      135600 non-null  float64
 12  Pressure9am      135600 non-null  float64
 13  Pressure3pm      135600 non-null  float64
 14  Temp9am          135600 non-null  float64
 15  Temp3pm          135600 non-null  float64
 16  RainToday        135600 non-null  floa

# encodage des locations

In [36]:
df['gps_lat'] = df['Location'].map(lambda x : df_gps.loc[x, 'lat'])
df['gps_lon'] = df['Location'].map(lambda x : df_gps.loc[x, 'long'])

df['sin_lat'] = df['gps_lat'].apply(lambda x : np.sin(np.radians(x)))
df['cos_lat'] = df['gps_lat'].apply(lambda x : np.cos(np.radians(x)))

df['sin_lon'] = df['gps_lon'].apply(lambda x : np.round(np.sin(np.radians(x)), 6))
df['cos_lon'] = df['gps_lon'].apply(lambda x : np.round(np.cos(np.radians(x)), 6))

df = df.drop(columns = ['Location', 'id_Location', 'gps_lat', 'gps_lon'])

# encodage Month

In [37]:
dict_mois = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

df['Month'] = df['Month'].map(dict_mois)

df['sin_month'] = df['Month'].apply(lambda x : np.sin((2 * np.pi) * (( x - 1 ) / 12)))
df['cos_month'] = df['Month'].apply(lambda x : np.cos((2 * np.pi) * (( x - 1 ) / 12)))


df = df.drop(columns = 'Month')

# encodage season

In [38]:
df['Season'].unique()

array(['Winter', 'Spring', 'Summer', 'Autumn'], dtype=object)

In [39]:
dict_season = {
    'Spring' : 0,
    'Summer' : 1,
    'Autumn' : 2,
    'Winter' : 3
}

df['Season'] = df['Season'].map(dict_season)

df['sin_season'] = df['Season'].apply(lambda x : round(np.sin((2 * np.pi) * (x / 4))), 1)
df['cos_season'] = df['Season'].apply(lambda x : round(np.cos((2 * np.pi) * ( x / 4))), 1)

df = df.drop(columns = 'Season')

  df['sin_season'] = df['Season'].apply(lambda x : round(np.sin((2 * np.pi) * (x / 4))), 1)
  df['cos_season'] = df['Season'].apply(lambda x : round(np.cos((2 * np.pi) * ( x / 4))), 1)


# encodage Climate

In [40]:
ohe = OneHotEncoder(sparse_output = False)

climate_transform = ohe.fit_transform(df[['Climate']])
df_climate_ohe = pd.DataFrame(climate_transform, columns = ohe.get_feature_names_out())

df = pd.concat([df, df_climate_ohe], axis = 1)

df = df.drop(columns = 'Climate')

In [41]:
df.columns

Index(['id_Date', 'Date', 'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm', 'RainToday',
       'RainTomorrow', 'Year', 'WindGustDir_cos', 'WindGustDir_sin',
       'WindDir9am_cos', 'WindDir9am_sin', 'WindDir3pm_cos', 'WindDir3pm_sin',
       'sin_lat', 'cos_lat', 'sin_lon', 'cos_lon', 'sin_month', 'cos_month',
       'sin_season', 'cos_season', 'Climate_Desert', 'Climate_Grassland',
       'Climate_Subtropical', 'Climate_Temperate', 'Climate_Tropical'],
      dtype='object')

In [42]:
df.head()

Unnamed: 0,id_Date,Date,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,cos_lon,sin_month,cos_month,sin_season,cos_season,Climate_Desert,Climate_Grassland,Climate_Subtropical,Climate_Temperate,Climate_Tropical
0,2008-07-01,2008-07-01,8.8,15.7,5.0,48.0,13.0,15.0,92.0,67.0,...,-0.750119,1.224647e-16,-1.0,-1,0,0.0,0.0,0.0,1.0,0.0
1,2008-07-02,2008-07-02,12.7,15.8,0.8,35.0,13.0,15.0,75.0,52.0,...,-0.750119,1.224647e-16,-1.0,-1,0,0.0,0.0,0.0,1.0,0.0
2,2008-07-03,2008-07-03,6.2,15.1,0.0,20.0,2.0,11.0,81.0,56.0,...,-0.750119,1.224647e-16,-1.0,-1,0,0.0,0.0,0.0,1.0,0.0
3,2008-07-04,2008-07-04,5.3,15.9,0.0,30.0,6.0,13.0,71.0,46.0,...,-0.750119,1.224647e-16,-1.0,-1,0,0.0,0.0,0.0,1.0,0.0
4,2008-07-07,2008-07-07,7.6,11.2,16.2,46.0,17.0,13.0,83.0,88.0,...,-0.750119,1.224647e-16,-1.0,-1,0,0.0,0.0,0.0,1.0,0.0


In [43]:
df = df.drop(columns = ['id_Date', 'Date'])

# 1 - modelisation simple

In [44]:
X = df.drop(columns = 'RainTomorrow')
y = df['RainTomorrow']

### a - tres simple

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression()

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))


confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

score accuracy :  0.8478982300884956

f1 score :  0.5855520948457752
roc-auc score :  0.7194746831985506
              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     21187
         1.0       0.72      0.49      0.59      5933

    accuracy                           0.85     27120
   macro avg       0.80      0.72      0.75     27120
weighted avg       0.84      0.85      0.84     27120



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### b - tres simple avec stratify

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 1000)

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

score accuracy :  0.8476401179941003

f1 score :  0.5931469082315872
roc-auc score :  0.7238972332522922
              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     21119
         1.0       0.72      0.50      0.59      6001

    accuracy                           0.85     27120
   macro avg       0.80      0.72      0.75     27120
weighted avg       0.84      0.85      0.84     27120



### c - avec mise à l'echelle [-1, 1]

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

scaler = MinMaxScaler(feature_range = (-1, 1))
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 1000)

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

score accuracy :  0.846976401179941

f1 score :  0.5917765099350777
roc-auc score :  0.7232325003774212
              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     21119
         1.0       0.72      0.50      0.59      6001

    accuracy                           0.85     27120
   macro avg       0.80      0.72      0.75     27120
weighted avg       0.84      0.85      0.84     27120



### d - avec standardscaler

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 500)

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

score accuracy :  0.8474926253687316

f1 score :  0.5929935052155088
roc-auc score :  0.723862175881967
              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     21119
         1.0       0.72      0.50      0.59      6001

    accuracy                           0.85     27120
   macro avg       0.80      0.72      0.75     27120
weighted avg       0.84      0.85      0.84     27120



In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 500)

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

score accuracy :  0.8471976401179941

f1 score :  0.5921259842519685
roc-auc score :  0.723374552557922
              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     21119
         1.0       0.72      0.50      0.59      6001

    accuracy                           0.85     27120
   macro avg       0.80      0.72      0.75     27120
weighted avg       0.84      0.85      0.84     27120



conclusion 1 : je continue avec un minmaxscaler [-1, 1], augmenter le nom d'iter ne sert à rien, je garde straify = True

# 2 - Re *Sampling

### a - undersampling

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 500)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

score accuracy :  0.7934734513274336

f1 score :  0.6235634115195914
roc-auc score :  0.7861590479904785
              precision    recall  f1-score   support

         0.0       0.93      0.80      0.86     21119
         1.0       0.52      0.77      0.62      6001

    accuracy                           0.79     27120
   macro avg       0.72      0.79      0.74     27120
weighted avg       0.84      0.79      0.81     27120



### b - oversampling 

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = SMOTE(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 500)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

score accuracy :  0.7923672566371681

f1 score :  0.621496269409155
roc-auc score :  0.784494481754501
              precision    recall  f1-score   support

         0.0       0.92      0.80      0.86     21119
         1.0       0.52      0.77      0.62      6001

    accuracy                           0.79     27120
   macro avg       0.72      0.78      0.74     27120
weighted avg       0.84      0.79      0.80     27120



conclusion : les stratégie d'undersampling est la plus efficace

# 3 - test différence model

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegressionCV(max_iter = 500)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

score accuracy :  0.7932890855457227

f1 score :  0.6232020432853879
roc-auc score :  0.7858617389233682
              precision    recall  f1-score   support

         0.0       0.93      0.80      0.86     21119
         1.0       0.52      0.77      0.62      6001

    accuracy                           0.79     27120
   macro avg       0.72      0.79      0.74     27120
weighted avg       0.84      0.79      0.81     27120



In [53]:
from sklearn.linear_model import SGDClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = SGDClassifier(max_iter = 2000, n_jobs = -1)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

score accuracy :  0.7936578171091445

f1 score :  0.6204557786218122
roc-auc score :  0.7824005593903297
              precision    recall  f1-score   support

         0.0       0.92      0.80      0.86     21119
         1.0       0.52      0.76      0.62      6001

    accuracy                           0.79     27120
   macro avg       0.72      0.78      0.74     27120
weighted avg       0.83      0.79      0.81     27120



In [54]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = RandomForestClassifier(n_jobs = -1)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

score accuracy :  0.8165929203539823

f1 score :  0.6620923913043478
roc-auc score :  0.8149602163548684
              precision    recall  f1-score   support

         0.0       0.94      0.82      0.87     21119
         1.0       0.56      0.81      0.66      6001

    accuracy                           0.82     27120
   macro avg       0.75      0.81      0.77     27120
weighted avg       0.85      0.82      0.83     27120



In [55]:
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = DecisionTreeClassifier()

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

score accuracy :  0.7231932153392331

f1 score :  0.5382864874838551
roc-auc score :  0.7253474113990457
              precision    recall  f1-score   support

         0.0       0.90      0.72      0.80     21119
         1.0       0.43      0.73      0.54      6001

    accuracy                           0.72     27120
   macro avg       0.67      0.73      0.67     27120
weighted avg       0.80      0.72      0.74     27120



In [56]:
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = SVC(verbose = 2, max_iter = 500)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

[LibSVM]WARN: libsvm Solver reached max_iter
optimization finished, #iter = 500
obj = -985.535335, rho = -0.493495
nSV = 1000, nBSV = 1000
Total nSV = 1000




score accuracy :  0.7803466076696165

f1 score :  0.3735408560311284
roc-auc score :  0.6069696080058125
              precision    recall  f1-score   support

         0.0       0.82      0.92      0.87     21119
         1.0       0.51      0.30      0.37      6001

    accuracy                           0.78     27120
   macro avg       0.66      0.61      0.62     27120
weighted avg       0.75      0.78      0.76     27120



conclusion : on va partir sur un RandomForrestClassifier

# 4 - exploration des hyperparametres

In [57]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import cross_val_score


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

smote = SMOTE(random_state=42)
X_train_s, y_train_s = smote.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = RandomForestClassifier(
    n_estimators = 200,
    max_features = 'sqrt',
    criterion = 'entropy',
    max_depth = 30,
    bootstrap = False,
    min_samples_split = 5,
    min_samples_leaf = 2,
    n_jobs = -1
)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print('Confusion Matrix :')
print(confusion_matrix(y_test, y_pred))

score accuracy :  0.8686209439528023

f1 score :  0.6701231367465975
roc-auc score :  0.773572473625089
Confusion Matrix :
[[19938  1181]
 [ 2382  3619]]


In [58]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)



model = RandomForestClassifier(n_jobs = -1)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

score accuracy :  0.8169616519174041

f1 score :  0.6620370370370371
roc-auc score :  0.8145408850722742
[[17294  3825]
 [ 1139  4862]]
              precision    recall  f1-score   support

         0.0       0.94      0.82      0.87     21119
         1.0       0.56      0.81      0.66      6001

    accuracy                           0.82     27120
   macro avg       0.75      0.81      0.77     27120
weighted avg       0.85      0.82      0.83     27120



In [59]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix, classification_report


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

sample = RandomUnderSampler(random_state=42)
X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = MinMaxScaler(feature_range=(-1, 1))
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

param_grid = {
    'n_estimators': [200],
    'max_features': ['sqrt'],
    'criterion': ['entropy'],
    'max_depth': [30],
    'bootstrap': [False],
    'min_samples_split': [5],
    'min_samples_leaf': [2],
}

model = RandomForestClassifier(
    verbose=0,
    n_jobs=-1
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=0,
    n_jobs=-1,
    return_train_score=True
)

grid_search.fit(X_train_scaled, y_train_s)

print(f"Meilleurs paramètres : {grid_search.best_params_}")
print(f"Meilleur score F1 : {grid_search.best_score_}")

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

print('Score accuracy : ', best_model.score(X_test_scaled, y_test), end='\n\n')

Meilleurs paramètres : {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Meilleur score F1 : 0.811748518802989
f1 score :  0.6663512234689739
roc-auc score :  0.8191999488318625
[[17255  3864]
 [ 1072  4929]]
              precision    recall  f1-score   support

         0.0       0.94      0.82      0.87     21119
         1.0       0.56      0.82      0.67      6001

    accuracy                           0.82     27120
   macro avg       0.75      0.82      0.77     27120
weighted avg       0.86      0.82      0.83     27120

Score accuracy :  0.8179941002949852



In [60]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix, classification_report


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

sample = RandomUnderSampler(random_state=42)
X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = MinMaxScaler(feature_range=(-1, 1))
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

param_grid = {
    'n_estimators': [200],
    'max_features': ['sqrt'],
    'criterion': ['entropy'],
    'max_depth': [30],
    'bootstrap': [False],
    'min_samples_split': [5],
    'min_samples_leaf': [2],
}

model = RandomForestClassifier(
    verbose=0,
    n_jobs=-1
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=0,
    n_jobs=-1,
    return_train_score=True
)

grid_search.fit(X_train_scaled, y_train_s)

print(f"Meilleurs paramètres : {grid_search.best_params_}")
print(f"Meilleur score F1 : {grid_search.best_score_}")

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

print('Score accuracy : ', best_model.score(X_test_scaled, y_test), end='\n\n')

Meilleurs paramètres : {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Meilleur score F1 : 0.812449602854503
f1 score :  0.667882225823879
roc-auc score :  0.8205804107068381
[[17257  3862]
 [ 1056  4945]]
              precision    recall  f1-score   support

         0.0       0.94      0.82      0.88     21119
         1.0       0.56      0.82      0.67      6001

    accuracy                           0.82     27120
   macro avg       0.75      0.82      0.77     27120
weighted avg       0.86      0.82      0.83     27120

Score accuracy :  0.8186578171091445



In [61]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix, classification_report


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

sample = RandomUnderSampler(random_state=42)
X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = MinMaxScaler(feature_range=(-1, 1))
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

param_grid = {
    # 'n_estimators': [100, 200],
    'max_features': ['sqrt'],
    'criterion': ['entropy'],
    'max_depth': [10, 20, 30, None],
    # 'min_samples_split': [2, 5, 10],
    # 'min_samples_leaf': [1, 2, 4],
    # 'bootstrap': [True, False],
    # 'warm_start': [True, False]
}

model = ExtraTreesClassifier(
    verbose=0,
    n_jobs=-1
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=0,
    n_jobs=-1,
    return_train_score=True
)

grid_search.fit(X_train_scaled, y_train_s)

print(f"Meilleurs paramètres : {grid_search.best_params_}")
print(f"Meilleur score F1 : {grid_search.best_score_}")

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)

print('f1 score : ', f1_score(y_test, y_pred))
print('roc-auc score : ', roc_auc_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

print('Score accuracy : ', best_model.score(X_test_scaled, y_test), end='\n\n')

Meilleurs paramètres : {'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt'}
Meilleur score F1 : 0.8091781696461833
f1 score :  0.6595079516107109
roc-auc score :  0.8128553775216797
[[17258  3861]
 [ 1149  4852]]
              precision    recall  f1-score   support

         0.0       0.94      0.82      0.87     21119
         1.0       0.56      0.81      0.66      6001

    accuracy                           0.82     27120
   macro avg       0.75      0.81      0.77     27120
weighted avg       0.85      0.82      0.83     27120

Score accuracy :  0.8152654867256637

