In [39]:
import pandas as pd
import numpy as np

df = pd.read_csv('weather_data_no_outliers.csv', usecols=[1,2,3,4,5,6,7,8])

In [40]:
df.tail()

Unnamed: 0,Temperature(°C),Dew Point(°F),Humidity(°%),Wind Speed(°mph),Wind Gust(°mph),Pressure(°in),Precip.,Condition
43684,9,4,71,13,0,31,0.0,Mostly Cloudy
43685,9,4,71,12,0,31,0.0,Fair
43686,9,4,71,10,0,31,0.0,Fair
43687,9,4,71,12,0,31,0.0,Fair
43688,9,4,71,13,0,31,0.0,Fair


### Logistic Regression

In [57]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression

In [78]:
import warnings
warnings.filterwarnings('ignore')

1. Encode dependent variable

In [58]:
df_encoded = pd.get_dummies(df, columns=['Condition'])
df_encoded.tail()

Unnamed: 0,Temperature(°C),Dew Point(°F),Humidity(°%),Wind Speed(°mph),Wind Gust(°mph),Pressure(°in),Precip.,Condition_Fair,Condition_Light Rain Shower,Condition_Mostly Cloudy,Condition_Partly Cloudy
43684,9,4,71,13,0,31,0.0,0,0,1,0
43685,9,4,71,12,0,31,0.0,1,0,0,0
43686,9,4,71,10,0,31,0.0,1,0,0,0
43687,9,4,71,12,0,31,0.0,1,0,0,0
43688,9,4,71,13,0,31,0.0,1,0,0,0


2. Define X and Y variables clearly as numpy array

In [160]:
x = df_encoded.iloc[:,0:7].values
Y = df_encoded.iloc[:,7:] .values
Y_reshaped = np.argmax(Y, axis=1) 

(43689,)

3. Scale independent variables (x)

In [88]:
sc = StandardScaler()
X = sc.fit_transform(x)

4. Due to the frequencies of values are uneven, use Stratified K Fold Validation

In [89]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
logreg = LogisticRegression()
scores = cross_val_score(logreg, X, Y_reshaped, cv=skf)
#different accuracies for different scroing parameters:
print('Accurracy:', scores.mean())

Accurracy: 0.635789295349913


### LGBM

In [90]:
from lightgbm import LGBMClassifier

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
lgbm = LGBMClassifier()
scores = cross_val_score(lgbm, X ,Y_reshaped, cv = skf)
print('Accurracy:', scores.mean())


Accurracy: 0.6635538270729822


### Cat Boost

In [91]:
from catboost import CatBoostClassifier

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
catb = CatBoostClassifier()
scores = cross_val_score(catb, X, Y_reshaped, cv = skf)
print('Accurracy:', scores.mean())

Learning rate set to 0.094851
0:	learn: 1.3005144	total: 173ms	remaining: 2m 52s
1:	learn: 1.2334970	total: 187ms	remaining: 1m 33s
2:	learn: 1.1807120	total: 199ms	remaining: 1m 6s
3:	learn: 1.1365210	total: 212ms	remaining: 52.7s
4:	learn: 1.1016561	total: 225ms	remaining: 44.7s
5:	learn: 1.0731023	total: 238ms	remaining: 39.4s
6:	learn: 1.0472802	total: 252ms	remaining: 35.8s
7:	learn: 1.0267166	total: 267ms	remaining: 33.1s
8:	learn: 1.0083488	total: 282ms	remaining: 31.1s
9:	learn: 0.9926598	total: 296ms	remaining: 29.3s
10:	learn: 0.9784044	total: 310ms	remaining: 27.9s
11:	learn: 0.9658185	total: 325ms	remaining: 26.8s
12:	learn: 0.9549975	total: 340ms	remaining: 25.8s
13:	learn: 0.9454042	total: 358ms	remaining: 25.2s
14:	learn: 0.9369352	total: 387ms	remaining: 25.4s
15:	learn: 0.9289914	total: 415ms	remaining: 25.5s
16:	learn: 0.9233630	total: 433ms	remaining: 25s
17:	learn: 0.9171937	total: 460ms	remaining: 25.1s
18:	learn: 0.9121095	total: 475ms	remaining: 24.5s
19:	learn: 

### ANN

In [101]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)

def create_model():
    ann = Sequential()

    ann.add(Dense(units = X.shape[1], activation = 'relu'))
    ann.add(Dense(units = 6, activation = 'relu'))
    ann.add(Dense(units = 5, activation = 'relu'))
    ann.add(Dense(units = 4, activation = 'softmax'))

    ann.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

    return ann

keras_clf = KerasClassifier(build_fn=create_model, epochs=20, batch_size=32)

scores = cross_val_score(keras_clf, X, Y_reshaped, cv = skf, scoring='accuracy')
print('Accuracy:', scores.mean())


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

In [105]:
df_encoded.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temperature(°C),43689.0,16.556708,7.628737,-17.0,10.0,17.0,23.0,35.0
Dew Point(°F),43689.0,10.883495,6.671856,-17.0,7.0,10.0,17.0,25.0
Humidity(°%),43689.0,70.705921,14.185836,0.0,62.0,72.0,82.0,100.0
Wind Speed(°mph),43689.0,9.315388,4.917687,0.0,6.0,9.0,13.0,20.0
Wind Gust(°mph),43689.0,0.15988,2.128947,0.0,0.0,0.0,0.0,36.0
Pressure(°in),43689.0,30.147108,1.14785,0.0,30.0,30.0,30.0,31.0
Precip.,43689.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Condition_Fair,43689.0,0.574767,0.494384,0.0,0.0,1.0,1.0,1.0
Condition_Light Rain Shower,43689.0,0.05411,0.226237,0.0,0.0,0.0,0.0,1.0
Condition_Mostly Cloudy,43689.0,0.25503,0.435883,0.0,0.0,0.0,1.0,1.0


In [113]:
df.tail()

Unnamed: 0,Temperature(°C),Dew Point(°F),Humidity(°%),Wind Speed(°mph),Wind Gust(°mph),Pressure(°in),Precip.,Condition
43684,9,4,71,13,0,31,0.0,Mostly Cloudy
43685,9,4,71,12,0,31,0.0,Fair
43686,9,4,71,10,0,31,0.0,Fair
43687,9,4,71,12,0,31,0.0,Fair
43688,9,4,71,13,0,31,0.0,Fair


In [111]:
df_encoded.corr().iloc[:,7:].T

Unnamed: 0,Temperature(°C),Dew Point(°F),Humidity(°%),Wind Speed(°mph),Wind Gust(°mph),Pressure(°in),Precip.,Condition_Fair,Condition_Light Rain Shower,Condition_Mostly Cloudy,Condition_Partly Cloudy
Condition_Fair,0.232072,0.15363,-0.203931,-0.286171,-0.038857,-0.061069,,1.0,-0.278067,-0.680234,-0.42134
Condition_Light Rain Shower,-0.152014,-0.063821,0.238172,0.096952,0.031843,-0.008265,,-0.278067,1.0,-0.13994,-0.08668
Condition_Mostly Cloudy,-0.2689,-0.212937,0.159246,0.18968,0.023793,0.067934,,-0.680234,-0.13994,1.0,-0.212044
Condition_Partly Cloudy,0.115089,0.097715,-0.070162,0.115083,0.005105,0.007649,,-0.42134,-0.08668,-0.212044,1.0


Maybe I must have done better data preprocessing

1. Drop Precip. Column 
2. Drop Pressure(°in) Column
3. Drop Wind Gust(°mph) Column

In [130]:
# df.drop(columns= {'Pressure(°in)', 'Precip.'}, inplace=True)
# df.drop(columns= {'Wind Gust(°mph)'}, inplace=True)
df

Unnamed: 0,Temperature(°C),Dew Point(°F),Humidity(°%),Wind Speed(°mph),Condition
0,3,2,87,3,Partly Cloudy
1,8,3,76,9,Mostly Cloudy
2,8,4,81,8,Mostly Cloudy
3,7,3,76,6,Mostly Cloudy
4,5,3,81,5,Mostly Cloudy
...,...,...,...,...,...
43684,9,4,71,13,Mostly Cloudy
43685,9,4,71,12,Fair
43686,9,4,71,10,Fair
43687,9,4,71,12,Fair


In [131]:
df_encoded2 = pd.get_dummies(df, columns=['Condition'])
df_encoded2.tail()

Unnamed: 0,Temperature(°C),Dew Point(°F),Humidity(°%),Wind Speed(°mph),Condition_Fair,Condition_Light Rain Shower,Condition_Mostly Cloudy,Condition_Partly Cloudy
43684,9,4,71,13,0,0,1,0
43685,9,4,71,12,1,0,0,0
43686,9,4,71,10,1,0,0,0
43687,9,4,71,12,1,0,0,0
43688,9,4,71,13,1,0,0,0


In [180]:
x = df_encoded2.iloc[:,0:4].values
Y = df_encoded2.iloc[:,4:] .values
Y

array([[0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       ...,
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0]], dtype=uint8)

In [181]:
sc = StandardScaler()
X_sc = sc.fit_transform(x)

In [182]:
from sklearn.preprocessing import MinMaxScaler
min_max_sc = MinMaxScaler()
X_mmsc = min_max_sc.fit_transform(x)

with Standard Scaler:

In [183]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
logreg = LogisticRegression()
Y_reshaped = np.argmax(Y, axis=1) 
scores = cross_val_score(logreg, X_sc, Y_reshaped, cv=skf)
#different accuracies for different scroing parameters:
print('Accurracy:', scores.mean())

Accurracy: 0.6363844155419556


with Min-Max Scaler:

In [184]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
logreg = LogisticRegression()
Y_reshaped = np.argmax(Y, axis=1) 
scores = cross_val_score(logreg, X_mmsc, Y_reshaped, cv=skf)
#different accuracies for different scroing parameters:
print('Accurracy:', scores.mean())

Accurracy: 0.6364302083259926


So, let's say no changes

### LGBM

In [185]:
from lightgbm import LGBMClassifier

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
lgbm = LGBMClassifier()
scores = cross_val_score(lgbm, X_mmsc ,Y_reshaped, cv = skf)
print('Accurracy:', scores.mean())

Accurracy: 0.6546957081289501


### Cat Boost

In [186]:
from catboost import CatBoostClassifier

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
catb = CatBoostClassifier()
scores = cross_val_score(catb, X_mmsc, Y_reshaped, cv = skf)
print('Accurracy:', scores.mean())

Learning rate set to 0.094851
0:	learn: 1.3005546	total: 36.6ms	remaining: 36.6s
1:	learn: 1.2343394	total: 54.1ms	remaining: 27s
2:	learn: 1.1815793	total: 72.3ms	remaining: 24s
3:	learn: 1.1408459	total: 90.6ms	remaining: 22.6s
4:	learn: 1.1047467	total: 109ms	remaining: 21.7s
5:	learn: 1.0752887	total: 131ms	remaining: 21.7s
6:	learn: 1.0496185	total: 148ms	remaining: 20.9s
7:	learn: 1.0289503	total: 166ms	remaining: 20.5s
8:	learn: 1.0103403	total: 186ms	remaining: 20.5s
9:	learn: 0.9943235	total: 203ms	remaining: 20.1s
10:	learn: 0.9800849	total: 218ms	remaining: 19.6s
11:	learn: 0.9675831	total: 238ms	remaining: 19.6s
12:	learn: 0.9567867	total: 257ms	remaining: 19.5s
13:	learn: 0.9476434	total: 272ms	remaining: 19.1s
14:	learn: 0.9390303	total: 288ms	remaining: 18.9s
15:	learn: 0.9322730	total: 306ms	remaining: 18.8s
16:	learn: 0.9255218	total: 323ms	remaining: 18.7s
17:	learn: 0.9199968	total: 343ms	remaining: 18.7s
18:	learn: 0.9145958	total: 357ms	remaining: 18.4s
19:	learn: 

### ANN

In [187]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)

def create_model():
    ann = Sequential()

    ann.add(Dense(units = X.shape[1], activation = 'relu'))
    ann.add(Dense(units = 6, activation = 'relu'))
    ann.add(Dense(units = 5, activation = 'relu'))
    ann.add(Dense(units = 4, activation = 'softmax'))

    ann.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

    return ann

keras_clf = KerasClassifier(build_fn=create_model, epochs=20, batch_size=32)

scores = cross_val_score(keras_clf, X_mmsc, Y_reshaped, cv = skf, scoring='accuracy')
print('Accuracy:', scores.mean())

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20

At the end part, Let's try to tune LGBM

#### LGBM Model Tuning

In [165]:
?lgbm

[1;31mType:[0m           LGBMClassifier
[1;31mString form:[0m    LGBMClassifier()
[1;31mFile:[0m           c:\users\anerg\appdata\local\programs\python\python310\lib\site-packages\lightgbm\sklearn.py
[1;31mDocstring:[0m      LightGBM classifier.
[1;31mInit docstring:[0m
Construct a gradient boosting model.

Parameters
----------
boosting_type : str, optional (default='gbdt')
    'gbdt', traditional Gradient Boosting Decision Tree.
    'dart', Dropouts meet Multiple Additive Regression Trees.
    'goss', Gradient-based One-Side Sampling.
    'rf', Random Forest.
num_leaves : int, optional (default=31)
    Maximum tree leaves for base learners.
max_depth : int, optional (default=-1)
    Maximum tree depth for base learners, <=0 means no limit.
learning_rate : float, optional (default=0.1)
    Boosting learning rate.
    You can use ``callbacks`` parameter of ``fit`` method to shrink/adapt learning rate
    in training using ``reset_parameter`` callback.
    Note, that this will

In [168]:
lgbm_params = {
    'learning_rate':[0.01, 0.05, 0.1],
    'num_leaves':[20,31,40],
    'min_child_samples':[10,20], 
    'n_estimators': [30,50,70],
    'min_child_samples': [5,10,20]
}

In [169]:
from sklearn.model_selection import GridSearchCV

lgbm_cv_model = GridSearchCV( lgbm, lgbm_params, cv=10, verbose=2, n_jobs=4)

lgbm_cv_model.fit(X_mmsc, Y_reshaped)


Fitting 10 folds for each of 81 candidates, totalling 810 fits


In [170]:
lgbm_best_params = lgbm_cv_model.best_params_
lgbm_best_model = lgbm_cv_model.best_estimator_

In [171]:
print(lgbm_best_model)

LGBMClassifier(learning_rate=0.01, n_estimators=70, num_leaves=20)


In [172]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
lgbm = LGBMClassifier()
scores = cross_val_score(lgbm_best_model, X ,Y_reshaped, cv = skf)
print('Accurracy:', scores.mean())

Accurracy: 0.8589347681529421


In [None]:
# Accurracy: 0.8589347681529421

Okay I will not try more

## Final Model:

In [173]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
lgbm = LGBMClassifier()
scores = cross_val_score(lgbm, X ,Y_reshaped, cv = skf)
print('Accurracy:', scores.mean())

Accurracy: 0.8639703082522395


In [175]:
df_encoded2.to_csv('final_weather_data.csv', index=False)