In [2]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeClassifier, RidgeClassifierCV
from sklearn.metrics import confusion_matrix, classification_report

from imblearn.under_sampling import RandomUnderSampler, CondensedNearestNeighbour
from imblearn.over_sampling import RandomOverSampler, SMOTE


In [3]:
df = pd.read_csv('../data_saved/data_mat.csv')
df_gps = pd.read_csv('../data/cities_coordinates_gps.csv', index_col = 0).T

df.head()

Unnamed: 0,id_Location,id_Date,Date,Location,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,...,Climate,Year,Month,Season,WindGustDir_cos,WindGustDir_sin,WindDir9am_cos,WindDir9am_sin,WindDir3pm_cos,WindDir3pm_sin
0,Adelaide,2008-07-01,2008-07-01,Adelaide,8.8,15.7,5.0,48.0,13.0,15.0,...,Temperate,2008,July,Winter,0.7071068,-0.707107,-0.707107,-0.707107,-1.83697e-16,-1.0
1,Adelaide,2008-07-02,2008-07-02,Adelaide,12.7,15.8,0.8,35.0,13.0,15.0,...,Temperate,2008,July,Winter,-0.7071068,-0.707107,-0.92388,-0.382683,-0.7071068,-0.707107
2,Adelaide,2008-07-03,2008-07-03,Adelaide,6.2,15.1,0.0,20.0,2.0,11.0,...,Temperate,2008,July,Winter,-1.83697e-16,-1.0,0.92388,0.382683,-0.7071068,-0.707107
3,Adelaide,2008-07-04,2008-07-04,Adelaide,5.3,15.9,0.0,30.0,6.0,13.0,...,Temperate,2008,July,Winter,0.9238795,0.382683,0.92388,0.382683,0.7071068,0.707107
4,Adelaide,2008-07-07,2008-07-07,Adelaide,7.6,11.2,16.2,46.0,17.0,13.0,...,Temperate,2008,July,Winter,-0.3826834,-0.92388,0.382683,-0.92388,-0.7071068,-0.707107


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135600 entries, 0 to 135599
Data columns (total 28 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id_Location      135600 non-null  object 
 1   id_Date          135600 non-null  object 
 2   Date             135600 non-null  object 
 3   Location         135600 non-null  object 
 4   MinTemp          135600 non-null  float64
 5   MaxTemp          135600 non-null  float64
 6   Rainfall         135600 non-null  float64
 7   WindGustSpeed    135600 non-null  float64
 8   WindSpeed9am     135600 non-null  float64
 9   WindSpeed3pm     135600 non-null  float64
 10  Humidity9am      135600 non-null  float64
 11  Humidity3pm      135600 non-null  float64
 12  Pressure9am      135600 non-null  float64
 13  Pressure3pm      135600 non-null  float64
 14  Temp9am          135600 non-null  float64
 15  Temp3pm          135600 non-null  float64
 16  RainToday        135600 non-null  floa

# encodage des locations

In [5]:
df['gps_lat'] = df['Location'].map(lambda x : df_gps.loc[x, 'lat'])
df['gps_lon'] = df['Location'].map(lambda x : df_gps.loc[x, 'long'])

df['sin_lat'] = df['gps_lat'].apply(lambda x : np.sin(np.radians(x)))
df['cos_lat'] = df['gps_lat'].apply(lambda x : np.cos(np.radians(x)))

df['sin_lon'] = df['gps_lon'].apply(lambda x : np.round(np.sin(np.radians(x)), 6))
df['cos_lon'] = df['gps_lon'].apply(lambda x : np.round(np.cos(np.radians(x)), 6))

df = df.drop(columns = ['Location', 'id_Location', 'gps_lat', 'gps_lon'])

# encodage Month

In [6]:
dict_mois = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

df['Month'] = df['Month'].map(dict_mois)

df['sin_month'] = df['Month'].apply(lambda x : np.sin((2 * np.pi) * (( x - 1 ) / 12)))
df['cos_month'] = df['Month'].apply(lambda x : np.cos((2 * np.pi) * (( x - 1 ) / 12)))


df = df.drop(columns = 'Month')

# encodage season

In [7]:
df['Season'].unique()

array(['Winter', 'Spring', 'Summer', 'Autumn'], dtype=object)

In [8]:
dict_season = {
    'Spring' : 0,
    'Summer' : 1,
    'Autumn' : 2,
    'Winter' : 3
}

df['Season'] = df['Season'].map(dict_season)

df['sin_season'] = df['Season'].apply(lambda x : round(np.sin((2 * np.pi) * (x / 4))), 1)
df['cos_season'] = df['Season'].apply(lambda x : round(np.cos((2 * np.pi) * ( x / 4))), 1)

df = df.drop(columns = 'Season')

  df['sin_season'] = df['Season'].apply(lambda x : round(np.sin((2 * np.pi) * (x / 4))), 1)
  df['cos_season'] = df['Season'].apply(lambda x : round(np.cos((2 * np.pi) * ( x / 4))), 1)


# encodage Climate

In [9]:
ohe = OneHotEncoder(sparse_output = False)

climate_transform = ohe.fit_transform(df[['Climate']])
df_climate_ohe = pd.DataFrame(climate_transform, columns = ohe.get_feature_names_out())

df = pd.concat([df, df_climate_ohe], axis = 1)

df = df.drop(columns = 'Climate')

In [10]:
df.columns

Index(['id_Date', 'Date', 'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm', 'RainToday',
       'RainTomorrow', 'Year', 'WindGustDir_cos', 'WindGustDir_sin',
       'WindDir9am_cos', 'WindDir9am_sin', 'WindDir3pm_cos', 'WindDir3pm_sin',
       'sin_lat', 'cos_lat', 'sin_lon', 'cos_lon', 'sin_month', 'cos_month',
       'sin_season', 'cos_season', 'Climate_Desert', 'Climate_Grassland',
       'Climate_Subtropical', 'Climate_Temperate', 'Climate_Tropical'],
      dtype='object')

In [11]:
df.head()

Unnamed: 0,id_Date,Date,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,cos_lon,sin_month,cos_month,sin_season,cos_season,Climate_Desert,Climate_Grassland,Climate_Subtropical,Climate_Temperate,Climate_Tropical
0,2008-07-01,2008-07-01,8.8,15.7,5.0,48.0,13.0,15.0,92.0,67.0,...,-0.750119,1.224647e-16,-1.0,-1,0,0.0,0.0,0.0,1.0,0.0
1,2008-07-02,2008-07-02,12.7,15.8,0.8,35.0,13.0,15.0,75.0,52.0,...,-0.750119,1.224647e-16,-1.0,-1,0,0.0,0.0,0.0,1.0,0.0
2,2008-07-03,2008-07-03,6.2,15.1,0.0,20.0,2.0,11.0,81.0,56.0,...,-0.750119,1.224647e-16,-1.0,-1,0,0.0,0.0,0.0,1.0,0.0
3,2008-07-04,2008-07-04,5.3,15.9,0.0,30.0,6.0,13.0,71.0,46.0,...,-0.750119,1.224647e-16,-1.0,-1,0,0.0,0.0,0.0,1.0,0.0
4,2008-07-07,2008-07-07,7.6,11.2,16.2,46.0,17.0,13.0,83.0,88.0,...,-0.750119,1.224647e-16,-1.0,-1,0,0.0,0.0,0.0,1.0,0.0


In [12]:
df = df.drop(columns = ['id_Date', 'Date'])

# 1 - modelisation simple

In [13]:
X = df.drop(columns = 'RainTomorrow')
y = df['RainTomorrow']

### a - tres simple

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression()

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

confusion_matrix(y_test, y_pred)


score accuracy :  0.8475294985250738



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[20076,  1111],
       [ 3024,  2909]])

In [221]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.78      1.00      0.88     21187
         1.0       0.00      0.00      0.00      5933

    accuracy                           0.78     27120
   macro avg       0.39      0.50      0.44     27120
weighted avg       0.61      0.78      0.69     27120



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### b - tres simple avec stratify

In [252]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 1000)

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

confusion_matrix(y_test, y_pred)

score accuracy :  0.8475663716814159



array([[19977,  1142],
       [ 2992,  3009]])

### c - avec mise à l'echelle [-1, 1]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

scaler = MinMaxScaler(feature_range = (-1, 1))
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 1000)

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

confusion_matrix(y_test, y_pred)

score accuracy :  0.846976401179941



array([[19964,  1155],
       [ 2995,  3006]])

### d - avec standardscaler

In [266]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 500)

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

confusion_matrix(y_test, y_pred)

score accuracy :  0.847382005899705



array([[19967,  1152],
       [ 2987,  3014]])

In [273]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 500)

model.fit(X_train_scaled, y_train)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

confusion_matrix(y_test, y_pred)

score accuracy :  0.8471976401179941



array([[19967,  1152],
       [ 2992,  3009]])

conclusion 1 : je continue avec un minmaxscaler [-1, 1], augmenter le nom d'iter ne sert à rien, je garde straify = True

# 2 - Re *Sampling

### a - undersampling

In [291]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 500)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

confusion_matrix(y_test, y_pred)

score accuracy :  0.7935103244837758



array([[16881,  4238],
       [ 1362,  4639]])

### b - oversampling 

In [289]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = SMOTE(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter = 500)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

confusion_matrix(y_test, y_pred)

score accuracy :  0.7926991150442478



array([[16873,  4246],
       [ 1376,  4625]])

conclusion : les stratégie d'undersampling est la plus efficace

# 3 - test différence model

In [296]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegressionCV(max_iter = 500)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

confusion_matrix(y_test, y_pred)

score accuracy :  0.7937684365781711



array([[16889,  4230],
       [ 1363,  4638]])

In [301]:
from sklearn.linear_model import SGDClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = SGDClassifier(max_iter = 2000)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

confusion_matrix(y_test, y_pred)

score accuracy :  0.7860250737463127



array([[16679,  4440],
       [ 1363,  4638]])

In [305]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = RandomForestClassifier()

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

confusion_matrix(y_test, y_pred)

score accuracy :  0.8165929203539823



array([[17271,  3848],
       [ 1126,  4875]])

In [30]:
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = DecisionTreeClassifier()

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

confusion_matrix(y_test, y_pred)

score accuracy :  0.7230457227138644



array([[15238,  5881],
       [ 1630,  4371]])

In [28]:
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sample = RandomUnderSampler(random_state = 42)

X_train_s, y_train_s = sample.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_s)
X_test_scaled = scaler.transform(X_test)

model = SVC(verbose = 2, max_iter = 500)

model.fit(X_train_scaled, y_train_s)

print('score accuracy : ', model.score(X_test_scaled, y_test), end = '\n\n')

y_pred = model.predict(X_test_scaled)

confusion_matrix(y_test, y_pred)

[LibSVM]WARN: libsvm Solver reached max_iter
optimization finished, #iter = 500
obj = -985.535335, rho = -0.493495
nSV = 1000, nBSV = 1000
Total nSV = 1000




score accuracy :  0.7803466076696165



array([[19387,  1732],
       [ 4225,  1776]])

conclusion : on va partir sur un RandomForrestClassifier

# 4 - exploration des hyperparametres