In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("rainfall_final_df.csv")

In [4]:
df.head()

Unnamed: 0,pressure,temparature,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,0.108504,0.86,0.790323,0.89,0.198347,0.676471,0.332613,1
1,0.43695,0.752,0.532258,0.17,0.867769,0.176471,0.172786,0
2,0.222874,0.888,0.612903,0.27,0.892562,0.617647,0.092873,0
3,0.246334,0.712,0.935484,0.88,0.008264,0.088235,0.386609,1
4,0.665689,0.296,0.483871,0.18,0.834711,0.029412,0.431965,0


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [6]:
X = df.drop(columns=["rainfall"])  
y = df["rainfall"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "SVM": SVC(probability=True),
    "k-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0)
}

In [9]:
results = []

for name, model in models.items():
    pipeline = Pipeline([  
        ("classifier", model)
    ])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({"Model": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

results_df = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 94, number of negative: 93
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001509 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 187, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502674 -> initscore=0.010695
[LightGBM] [Info] Start training from score 0.010695


In [10]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
3,Gradient Boosting,0.765957,0.730769,0.826087,0.77551
2,Random Forest,0.744681,0.703704,0.826087,0.76
4,AdaBoost,0.744681,0.703704,0.826087,0.76
8,XGBoost,0.744681,0.72,0.782609,0.75
9,LightGBM,0.744681,0.72,0.782609,0.75
10,CatBoost,0.723404,0.678571,0.826087,0.745098
6,k-NN,0.680851,0.653846,0.73913,0.693878
1,Decision Tree,0.659574,0.62963,0.73913,0.68
7,Naive Bayes,0.659574,0.62069,0.782609,0.692308
0,Logistic Regression,0.638298,0.607143,0.73913,0.666667


In [31]:
rf_model = RandomForestClassifier(random_state=42)

param_grid_rf = {
    "n_estimators": [250, 300, 400, 500], 
    "max_features":['sqrt', 'log2'],
    "max_depth": [None, 5, 15, 25],
    "min_samples_split": [3, 7, 9, 11],
    "min_samples_leaf": [2, 5, 6, 9]
}

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv = 5, n_jobs = -1, verbose = 2)

In [34]:
grid_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 512 candidates, totalling 2560 fits


In [35]:
best_rf_model = grid_search_rf.best_estimator_

In [36]:
grid_search_rf.best_params_

{'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 7,
 'n_estimators': 250}

In [37]:
from sklearn.model_selection import cross_val_score

In [38]:
cv_scores = cross_val_score(best_rf_model, X_train, y_train, cv=5)
cv_scores

array([0.73684211, 0.89473684, 0.86486486, 0.81081081, 0.83783784])

In [39]:
np.mean(cv_scores)

0.829018492176387

In [40]:
y_pred = best_rf_model.predict(X_test)

In [41]:
accuracy_score(y_test, y_pred)

0.7446808510638298

In [42]:
from sklearn.metrics import confusion_matrix

In [43]:
confusion_matrix(y_test, y_pred)

array([[16,  8],
       [ 4, 19]], dtype=int64)

In [44]:
from sklearn.metrics import classification_report

In [45]:
report_dict = classification_report(y_test, y_pred, output_dict = True)

In [46]:
report_dict

{'0': {'precision': 0.8,
  'recall': 0.6666666666666666,
  'f1-score': 0.7272727272727273,
  'support': 24.0},
 '1': {'precision': 0.7037037037037037,
  'recall': 0.8260869565217391,
  'f1-score': 0.76,
  'support': 23.0},
 'accuracy': 0.7446808510638298,
 'macro avg': {'precision': 0.7518518518518519,
  'recall': 0.7463768115942029,
  'f1-score': 0.7436363636363637,
  'support': 47.0},
 'weighted avg': {'precision': 0.7528762805358552,
  'recall': 0.7446808510638298,
  'f1-score': 0.7432882011605416,
  'support': 47.0}}

In [47]:
import mlflow

In [48]:
mlflow.set_experiment("Rainfall")
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")

with mlflow.start_run(run_name="rainfall2"):
    mlflow.log_params(grid_search_rf.best_params_)
    mlflow.log_metrics({
        "Accuracy" : report_dict["accuracy"],
        "Precision_0" : report_dict["0"]["precision"],
        "Recall_0" : report_dict["0"]["recall"],

        "Precision_1" : report_dict["1"]["precision"],
        "Recall_1" : report_dict["1"]["recall"],
        "f1_score_macro":report_dict['macro avg']['f1-score']
        
    })
    mlflow.sklearn.log_model(best_rf_model, "Random Forest Model")



🏃 View run rainfall2 at: http://127.0.0.1:5000/#/experiments/370065289959288740/runs/ad51fce886c84c6baef18c4bc418a509
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/370065289959288740
