<a href="https://www.kaggle.com/code/nyagami/flood-prediction-eda?scriptVersionId=180543588" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import iqr

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, VotingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import xgboost as xgb

# Preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Evaluation
from sklearn.metrics import mean_squared_error

# Notebook-wide display settings: 'bmh' plot style, and show up to 50 columns
# when a DataFrame is rendered.
plt.style.use('bmh')
pd.options.display.max_columns = 50

import os
# Collect every file found under the Kaggle input directory, then print one path per line.
input_files = [
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
]
for input_file in input_files:
    print(input_file)

In [None]:
# Read the competition train/test CSVs, using the `id` column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/kaggle/input/playground-series-s4e5/train.csv',
        '/kaggle/input/playground-series-s4e5/test.csv',
    )
)

# Preview datasets

In [None]:
# Print the training frame's column names, dtypes, non-null counts and memory usage.
train.info()

In [None]:
# Print the test frame's column names, dtypes, non-null counts and memory usage.
test.info()

In [None]:
# Names of the predictor columns: every training column except the target.
cols = train.columns.drop('FloodProbability').tolist()

# Exploratory Data Analysis

In [None]:
# Draw one normalised histogram per feature.
#
# Bin edges sit on half-integers (-0.5, 0.5, 1.5, ...) so each integer value gets
# its own bar, centred on its tick. With integer edges the right-most bin is
# closed on both sides, which merged the two largest values of a feature into a
# single bar.
# NOTE(review): assumes the features are integer-valued scores — confirm.
for col in cols:
    fig, ax = plt.subplots(figsize=(6,2))
    max_val = round(train[col].max()) + 1
    train[col].hist(density=True, bins=np.arange(0, max_val + 1) - 0.5, ax=ax)
    # Same tick positions (0..19) on every plot so the figures are easy to compare.
    ax.set_xticks(np.arange(0, 20, 1))
    ax.set_title(col)
    plt.show()
    # Release the figure explicitly; one is created per feature.
    plt.close(fig)

## Descriptive analysis
The summary statistics show that all feature variables have a median of 5 and a mean of about 4.9, and that the variance and standard deviation are nearly identical from one feature to the next. The distributions are moderately right-skewed.

In [None]:
# Per-column summary statistics of the training data, rounded to 2 decimals
# and transposed so that each row is one variable.
summary_stats = ['min', 'mean', 'median', 'max', 'var', 'std', 'skew']
train.agg(summary_stats).round(2).T

In [None]:
# Per-column summary statistics of the test data, rounded to 2 decimals
# and transposed so that each row is one variable.
summary_stats = ['min', 'mean', 'median', 'max', 'var', 'std', 'skew']
test.agg(summary_stats).round(2).T

## Correlation
The feature variables show no appreciable correlation with one another.

In [None]:
# Pairwise correlation between the features only (target excluded).
corr = train.drop(columns='FloodProbability').corr()

# Boolean upper-triangle mask (diagonal included) hides the redundant half of
# the symmetric matrix.
mask = np.triu(np.ones(corr.shape, dtype=bool))

sns.heatmap(data=corr, mask=mask, linewidth=0.1)
plt.show()

In [None]:
# Pairwise correlation across every training column, target included.
corr = train.corr()

# Boolean upper-triangle mask (diagonal included) hides the redundant half of
# the symmetric matrix.
mask = np.triu(np.ones(corr.shape, dtype=bool))

sns.heatmap(data=corr, mask=mask, linewidth=0.1)
plt.show()

# Preprocessing
## Outliers
The boxplots below show that every variable has outliers.

In [None]:
# Horizontal boxplots of the training features (target excluded).
train_features = train.drop(columns='FloodProbability')
ax = train_features.plot.box(vert=False)
ax.set_title('Boxplot of train variables')
plt.show()

In [None]:
# Horizontal boxplots of every column in the test set.
ax = test.plot.box(vert=False)
ax.set_title('Boxplot of test variables')
plt.show()

### Removing outliers

In [None]:
# Replace IQR-rule outliers with NaN, one feature at a time. A value is an
# outlier when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
for col in cols:
    col_iqr = iqr(train[col])
    Q1, Q3 = np.quantile(train[col], [0.25, 0.75])
    lower, upper = Q1 - 1.5 * col_iqr, Q3 + 1.5 * col_iqr

    # Series.where keeps the in-range values and fills the rest with NaN,
    # returning a new (float, if needed) column. Writing np.nan into an integer
    # column through .loc is a silent upcast, which pandas >= 2.1 deprecates.
    train[col] = train[col].where(train[col].between(lower, upper))

The fraction of outliers is less than 3% in each variable, so we can drop the rows that contain them.

In [None]:
# Fraction of rows flagged as NaN (i.e. outliers) in each column.
train.isna().mean()

In [None]:
# Drop every row that has at least one NaN-marked outlier and report the
# frame's shape before and after.
shape_before = train.shape
train.dropna(axis=0, how='any', inplace=True)
print('Shape before :', shape_before)
print("Shape after :", train.shape)

In [None]:
# Separate the predictors (X) from the target column (y).
X = train.drop(columns='FloodProbability')
y = train['FloodProbability']

In [None]:
# Hold out a validation split BEFORE scaling. Fitting the scaler on all of X
# would let the validation rows influence the mean/std used to transform the
# training rows, which is data leakage. The split depends only on the number of
# rows and random_state, so the same rows are held out as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =  987)

# Learn the standardisation parameters from the training split only.
scaler = StandardScaler().fit(X_train)

# Apply that single fitted scaler to every frame used downstream, keeping each
# frame's index and column labels.
X_train, X_test, X, test = (
    pd.DataFrame(scaler.transform(frame), index=frame.index, columns=frame.columns)
    for frame in (X_train, X_test, X, test)
)

# Modelling

In [None]:
# Baseline candidates as (label, unfitted estimator) pairs, all with library
# default hyperparameters. Note that the 'xgb_reg' entry is xgboost's
# random-forest regressor (XGBRFRegressor).
regressors = list({
    'linear_reg': LinearRegression(),
    'random forest': RandomForestRegressor(),
    'decision tree': DecisionTreeRegressor(),
    'gradient': GradientBoostingRegressor(),
    'svr': SVR(),
    'adaboost': AdaBoostRegressor(),
    'bagging': BaggingRegressor(),
    'xgb_reg': xgb.XGBRFRegressor(objective = 'reg:squarederror'),
}.items())

In [None]:
# Fit each baseline on the training split and record its hold-out RMSE,
# keyed by the model's label.
evals = {}
for name, model in regressors:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    # The `squared=False` argument is deprecated in scikit-learn 1.4 and removed
    # in 1.6; taking the square root of the MSE works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    evals[name] = rmse

In [None]:
# Tabulate the hold-out RMSE per model, best (lowest) first.
evals_df = (
    pd.DataFrame(list(evals.items()), columns=['model', 'rmse'])
    .sort_values('rmse')
)

In [None]:
# Compare the models visually: RMSE on the x-axis, one row per model.
ax = sns.pointplot(data=evals_df, x='rmse', y='model')
plt.show()

# Hyperparameter tuning

In [None]:
# Estimators to tune, keyed by a short name. `param_grids` uses the same keys.
models = {
    'linear_reg': LinearRegression(),
    'random_forest': RandomForestRegressor(),
    'svr': SVR(),
    'xgboost': xgb.XGBRegressor(objective='reg:squarederror')
}

# Hyperparameter values to search for each model (keys match `models`).
param_grids = {
    'linear_reg': {
        # `normalize` is intentionally not searched: it is no longer a
        # LinearRegression parameter (removed in scikit-learn 1.2), so
        # GridSearchCV would reject it as an invalid parameter. The features
        # are standardised earlier in the notebook anyway.
        'fit_intercept': [True, False]
    },
    'random_forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'svr': {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto'],
        'kernel': ['linear', 'rbf', 'poly']
    },
    'xgboost': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 6, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }
}



In [None]:
# Build one 5-fold GridSearchCV per model. The grid is looked up by model name
# rather than by zipping the two dicts positionally, so the pairing cannot
# silently drift if either dict is reordered or extended.
grid_searches = {
    name: GridSearchCV(model, param_grids[name], cv=5, n_jobs=-1, verbose=1)
    for name, model in models.items()
}

# Best refitted estimator per model name.
best_estimators = {}

# Tune on the training split only, so the hold-out rows (X_test / y_test) stay
# unseen and can still give an honest estimate for the tuned models.
for name, gs in grid_searches.items():
    print(f"Running GridSearchCV for {name}")
    gs.fit(X_train, y_train)
    print(f"Best parameters for {name}: {gs.best_params_}")
    # No `scoring` is passed, so this is the estimator's default score
    # (R^2 for regressors), averaged over the folds — not an RMSE.
    print(f"Best score for {name}: {gs.best_score_}")
    best_estimators[name] = gs.best_estimator_

# Print the best estimators
for name, estimator in best_estimators.items():
    print(f"Best estimator for {name}: {estimator}")

In [None]:
# Ensemble the tuned models. VotingRegressor averages the individual
# predictions and fits clones of the estimators it is given.
voting_regressor = VotingRegressor(estimators=[
    ('linear_reg', best_estimators['linear_reg']),
    ('random_forest', best_estimators['random_forest']),
    ('svr', best_estimators['svr']),
    ('xgboost', best_estimators['xgboost'])
])

# Fit on the training split and predict the hold-out split.
voting_regressor.fit(X_train, y_train)
predictions = voting_regressor.predict(X_test)

# Hold-out error of the ensemble. `mean_squared_error` is already imported in
# the first cell, so it is not re-imported here.
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error of Voting Regressor: {mse}')

# Also report RMSE, the metric used for the baseline comparison, so the
# ensemble can be compared with the individual models on the same scale.
voting_rmse = np.sqrt(mse)
print(f'Root Mean Squared Error of Voting Regressor: {voting_rmse}')

In [None]:
# Final model: rebuild the voting ensemble from the tuned estimators, train it
# on all labelled data, and predict the competition test set.
ensemble_members = [
    (member, best_estimators[member])
    for member in ('linear_reg', 'random_forest', 'svr', 'xgboost')
]
voting_regressor = VotingRegressor(estimators=ensemble_members)

voting_regressor.fit(X, y)
y_preds = voting_regressor.predict(test)

In [None]:
# Assemble the submission table: one row per test id with its predicted probability.
submission = pd.DataFrame.from_dict({
    'id': test.index,
    'FloodProbability': y_preds,
})
submission

In [None]:
# Write the submission file; index=False because `id` is already a regular column.
submission.to_csv('/kaggle/working/submission.csv',index=False)