# Import Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import iqr

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import xgboost as xgb

# Preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Notebook-wide display and plotting defaults
pd.options.display.max_columns = 50  # render up to 50 columns of a frame
plt.style.use('bmh')

import os
# Collect the full path of every file found under the input directory, then list them.
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_path in input_paths:
    print(input_path)

In [None]:
# Read both competition files the same way, using the `id` column as the index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/kaggle/input/playground-series-s4e5/train.csv',
        '/kaggle/input/playground-series-s4e5/test.csv',
    )
)

# Preview datasets

In [None]:
# Print the training frame's summary (index, columns, non-null counts, dtypes, memory)
train.info()

In [None]:
# Print the test frame's summary (index, columns, non-null counts, dtypes, memory)
test.info()

In [None]:
# Feature names: every training column except the target
cols = list(train.columns.drop('FloodProbability'))

# Exploratory Data Analysis

In [None]:
# Draw one density-normalized histogram per feature.
# Bin edges are consecutive integers (assumes the features are non-negative
# integer scores -- TODO confirm). Histogram bins are half-open except the last
# one, which is closed on the right, so the edges must extend one past the
# column maximum; otherwise the largest value is merged into the bin of the
# value just below it.
for col in cols:
    fig, ax = plt.subplots(figsize=(6,2))
    max_val = round(train[col].max()) + 2
    train[col].hist(density=True, bins=np.arange(0, max_val, 1), ax=ax)
    ax.set_xticks(np.arange(0,20,1))  # same tick positions on every panel
    ax.set_title(col)
    plt.show()

## Descriptive analysis
The data shows that all variables have a median of 5 and a mean of 4.9, with variance and standard deviation nearly identical across the board. There is a moderate right skew in the data distribution.

In [None]:
# Per-column summary statistics of the training set, rounded to 2 decimals;
# transposed so each original column becomes a row.
(
    train
    .agg(['min','mean','median','max','var','std','skew'])
    .round(2)
    .T
)

In [None]:
# Per-column summary statistics of the test set, rounded to 2 decimals;
# transposed so each original column becomes a row.
(
    test
    .agg(['min','mean','median','max','var','std','skew'])
    .round(2)
    .T
)

## Correlation
There is no notable correlation between the variables.

In [None]:
# Correlation matrix of the features only (target column removed)
corr = train.drop(columns='FloodProbability').corr()
# True on and above the diagonal: those cells are hidden, leaving the lower triangle
mask = np.triu(np.ones(corr.shape, dtype=bool))

sns.heatmap(corr, mask=mask, linewidth=0.1, ax=plt.gca())
plt.show()

In [None]:
# Correlation matrix of every training column, target included
corr = train.corr()
# True on and above the diagonal: those cells are hidden, leaving the lower triangle
mask = np.triu(np.ones(corr.shape, dtype=bool))

sns.heatmap(corr, mask=mask, linewidth=0.1, ax=plt.gca())
plt.show()

# Preprocessing
## Outliers
All variables have outliers.

In [None]:
# Horizontal boxplots of the training features (target excluded)
train_box_ax = train.drop(columns='FloodProbability').plot.box(vert=False)
train_box_ax.set_title('Boxplot of train variables')
plt.show()

In [None]:
# Horizontal boxplots of every test column
test_box_ax = test.plot.box(vert=False)
test_box_ax.set_title('Boxplot of test variables')
plt.show()

### Removing outliers

In [None]:
# Replace outliers with NaN, feature by feature. A value is an outlier when it
# lies more than 1.5 * IQR below the first quartile or above the third quartile.
# Rows are not removed here; `train` is modified in place.
for col in cols:
    Q1, Q3 = np.quantile(train[col], [0.25, 0.75])
    col_iqr = iqr(train[col])
    lower_fence = Q1 - 1.5*col_iqr
    upper_fence = Q3 + 1.5*col_iqr

    # Both comparisons use the untouched column, then one assignment blanks them
    is_outlier = (train[col] < lower_fence) | (train[col] > upper_fence)
    train.loc[is_outlier, col] = np.nan

The fraction of outliers is less than 3% in each variable, so we can drop them.

In [None]:
# Share of NaN entries per column (the mean of a boolean mask is the fraction of True)
train.isna().mean()

In [None]:
# Remove every row that contains at least one NaN, reporting the shape around it
print('Shape before :', train.shape)
train = train.dropna(axis=0, how='any')
print("Shape after :", train.shape)

In [None]:
# Separate the feature matrix from the target column
X = train.drop(columns='FloodProbability')
y = train['FloodProbability']

After dropping the outliers, the variables are no longer skewed.

In [None]:
# One density-normalized histogram per feature of the cleaned data.
# Edges are consecutive integers running one past the column maximum.
for col in cols:
    fig, ax = plt.subplots(figsize=(6,2))
    max_val = round(X[col].max()) + 2
    integer_edges = np.arange(max_val)
    X[col].hist(density=True, bins=integer_edges, ax=ax)
    ax.set_title(col)
    plt.show()

# Feature engineering

In [None]:
# Hold out part of the cleaned data for evaluation (default split proportions).
# The split is seeded so the hold-out rows, and every score computed on them,
# are the same on each Restart & Run All; 987 matches the seed used for KFold.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=987)

# Modelling

In [None]:
# Candidate estimators. The stochastic scikit-learn regressors are seeded with
# the same value used for KFold so repeated runs give the same fits.

# Logistic Regression (can output probabilities for binary classification)
# NOTE(review): this is a classifier; it rejects a continuous target such as
# FloodProbability, so it cannot be fitted like the regressors defined here.
logistic = LogisticRegression()

# Random Forest Regression
forest = RandomForestRegressor(random_state=987)

# Decision tree regressor
dt = DecisionTreeRegressor(random_state=987)

# Gradient Boosting Regression
gradient = GradientBoostingRegressor(random_state=987)

# Support Vector Regression
svr = SVR()

# xgboost (random-forest variant); the logistic objective expects labels in [0, 1]
xgb_reg = xgb.XGBRFRegressor(objective = 'binary:logistic')

In [None]:
# Transform train probabilities to log odds
log_odds_y_train = np.log(y_train / (1 - y_train)).values

# Transform test probabilities to log odds
log_odds_y_test = np.log(y_test / (1 - y_test))

In [None]:
# Baseline: standardize the features, then fit the xgboost regressor directly on
# the raw probabilities.
scaler = StandardScaler()
steps = [('scaler', scaler),
        ('xgboost',xgb_reg)]
pipeline = Pipeline(steps)

# Seeded 5-fold splitter; defined here but not consumed in this cell.
kf = KFold(n_splits=5, shuffle = True, random_state = 987)


pipeline.fit(X_train,y_train)
y_pred_proba = pipeline.predict(X_test)

# The target is a continuous probability, so the hold-out score is the coefficient
# of determination, R^2 = 1 - SS_res / SS_tot. Rounding both sides to 0/1 and
# counting matches would only measure agreement about which side of 0.5 a value
# falls on, not how close the predictions are.
y_true = np.asarray(y_test)
ss_res = np.sum((y_true - y_pred_proba) ** 2)
ss_tot = np.sum((y_true - y_true.mean()) ** 2)
score = 1 - ss_res / ss_tot
print(score)

In [None]:
# Fit each candidate on the training split and report its hold-out R^2, measured
# in probability space so the numbers are comparable across models.
regressors = [
    ('random forest', forest),
    ('decision tree',dt),
    ('gradient boosting',gradient),
    ('support vector', svr),
    ('xgboost',xgb_reg)
]

# Seeded 5-fold splitter; defined here but not consumed in this cell.
kf = KFold(n_splits=5, shuffle = True, random_state = 987)

y_true = np.asarray(y_test)
ss_tot = np.sum((y_true - y_true.mean()) ** 2)

# NOTE(review): SVR fit time grows faster than quadratically with the number of
# rows; confirm it is feasible on the full training split.
for name, model in regressors:
    objective = str(model.get_params().get('objective') or '')
    if objective.endswith(':logistic'):
        # A logistic objective only accepts labels in [0, 1] and already predicts
        # probabilities, so train on the raw target rather than on log-odds.
        model.fit(X_train, y_train)
        y_pred_proba = model.predict(X_test)
    else:
        # Regressors expose predict(), not predict_proba(). They are trained on
        # log-odds, so map predictions back to probabilities with the inverse
        # transform p = 1 / (1 + exp(-z)) before comparing with y_test.
        model.fit(X_train, log_odds_y_train)
        y_pred_proba = 1 / (1 + np.exp(-model.predict(X_test)))

    # R^2 = 1 - SS_res / SS_tot against the untransformed hold-out target
    score = 1 - np.sum((y_true - y_pred_proba) ** 2) / ss_tot
    print(name, score)