In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are expected in the "dataset" directory under the current working directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under that directory

import os
# Walk the dataset folder under the current working directory and print the
# full path of every file found, including files in nested sub-folders.
for folder, _subfolders, files in os.walk(f'{os.getcwd()}/dataset'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
train_df = pd.read_csv("./dataset/train.csv")
test_df = pd.read_csv("./dataset/test.csv")
train_df

In [None]:
train_df.info()

In [None]:
train_df.describe()

In [None]:
train_df.columns

In [None]:
# For every column of the training frame, print its distinct values followed
# by its smallest and largest value.
for col in train_df.columns:
    column_values = train_df[col]
    print(f'{col}: {column_values.unique()}')
    print(f'{col} min: {column_values.min()}')
    print(f'{col} max: {column_values.max()}')

In [None]:
train_df[train_df['FloodProbability'] > 0.5]

In [None]:
%matplotlib inline
# only in a Jupyter notebook
import matplotlib.pyplot as plt
train_df[train_df['FloodProbability'] > 0.5].hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
%matplotlib inline
# only in a Jupyter notebook
import matplotlib.pyplot as plt
train_df.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# drop() already returns a new frame, so train_df is left untouched without
# paying for an extra full copy first.
# NOTE(review): every remaining column -- including the FloodProbability
# target -- goes into PCA unscaled; confirm that is intended.
train_df_dr = train_df.drop(columns=['id'])

# Apply PCA: project the rows onto the first two principal components
pca = PCA(n_components=2)
principal_components = pca.fit_transform(train_df_dr)

# Create a DataFrame with the principal components
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

# Plot the principal components
plt.figure(figsize=(8, 6))
sns.scatterplot(x='PC1', y='PC2', data=pca_df)
plt.title('PCA - 2 Principal Components')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

In [None]:
def get_groups(N):
    """Return the width of one bucket when the range up to ``N`` is cut in three.

    This is ``ceil(N / 3)``: the floor quotient ``N // 3``, bumped by one
    whenever the division leaves a remainder.

    Parameters
    ----------
    N : int or float
        Largest value of the feature being bucketed.

    Returns
    -------
    number
        ``N // 3`` when ``N`` is an exact multiple of 3, otherwise
        ``N // 3 + 1``.
    """
    q, r = divmod(N, 3)
    # Any non-zero remainder rounds up.  The earlier ``r >= 1`` test fell
    # through and implicitly returned None for fractional remainders
    # (e.g. N = 3.5, where r = 0.5).
    return q if r == 0 else q + 1

In [None]:
def grouping(col_df, N):
    """Label each value of ``col_df`` as 'low', 'intermediate' or 'high'.

    The bucket width is ``get_groups(N)``.  Values up to one width are 'low',
    values up to two widths are 'intermediate', everything else is 'high'.

    Parameters
    ----------
    col_df : iterable of numbers
        Values to label (the notebook passes a DataFrame column).
    N : number
        Upper bound used to size the buckets (the notebook passes the
        column's maximum).

    Returns
    -------
    list of str
        One label per input value, in input order.
    """
    # The thresholds do not depend on the element, so compute them once
    # instead of calling get_groups() up to twice per row.
    low_max = get_groups(N)
    intermediate_max = low_max * 2
    return [
        'low' if value <= low_max
        else 'intermediate' if value <= intermediate_max
        else 'high'
        for value in col_df
    ]

In [None]:
import math
def get_features(df, is_train=True):
    """Return a copy of ``df`` extended with the engineered features.

    Added columns, in order:

    * ``EnvironmentalStress``, ``InfrastructureVulnerability`` and
      ``DisasterRisk`` -- plain sums of three raw columns each.
    * one ``<column>Group`` label column ('low' / 'intermediate' / 'high')
      for each of the twenty raw feature columns, produced by ``grouping``.
    * ``FloodRisk`` and ``EnvironmentalDegradation`` -- weighted sums of
      three raw columns each.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw feature columns; it is not modified.
    is_train : bool, default True
        When true, the ``FloodProbability`` column is moved to the last
        position (so it must be present).

    Returns
    -------
    pandas.DataFrame
        The extended copy.
    """
    # Raw columns that get a '<name>Group' companion, in the order the new
    # columns are appended to the frame.
    group_columns = [
        'Urbanization', 'PopulationScore', 'MonsoonIntensity',
        'TopographyDrainage', 'RiverManagement', 'Deforestation',
        'DamsQuality', 'Siltation', 'IneffectiveDisasterPreparedness',
        'DrainageSystems', 'Landslides', 'Watersheds',
        'DeterioratingInfrastructure', 'WetlandLoss', 'InadequatePlanning',
        'PoliticalFactors', 'ClimateChange', 'AgriculturalPractices',
        'Encroachments', 'CoastalVulnerability',
    ]

    new_df = df.copy()
    new_df['EnvironmentalStress'] = new_df['Deforestation'] + new_df['Siltation'] + new_df['WetlandLoss']
    new_df['InfrastructureVulnerability'] = new_df['DeterioratingInfrastructure'] + new_df['DrainageSystems'] + new_df['DamsQuality']
    new_df['DisasterRisk'] = new_df['IneffectiveDisasterPreparedness'] + new_df['InadequatePlanning'] + new_df['PoliticalFactors']

    for col in group_columns:
        # NOTE(review): the bucket thresholds come from the maximum of the
        # frame passed in, so a train frame and a test frame with different
        # maxima are bucketed differently -- confirm this is acceptable.
        new_df[f'{col}Group'] = grouping(new_df[col], new_df[col].max())

    new_df['FloodRisk'] = 0.3 * new_df['MonsoonIntensity'] + 0.2 * new_df['TopographyDrainage'] + 0.5 * new_df['RiverManagement']
    new_df['EnvironmentalDegradation'] = 0.4 * new_df['Deforestation'] + 0.3 * new_df['Siltation'] + 0.3 * new_df['WetlandLoss']

    if is_train:
        # Move the FloodProbability target to the last column position.
        target = new_df.pop('FloodProbability')
        new_df['FloodProbability'] = target
    return new_df

In [None]:
new_train_df = get_features(train_df)
new_train_df

In [None]:
new_train_df.columns

In [None]:
import matplotlib.pyplot as plt

def pie_chart(df, label):
    """Show a pie chart of how often each distinct value occurs in ``df``.

    Parameters
    ----------
    df : object exposing ``value_counts()``
        The notebook passes a single DataFrame column.
    label : str
        Title drawn above the chart.
    """
    shares = df.value_counts()
    palette = ['#3487ff', '#73acff', '#a1c7ff']

    plt.figure(figsize=(5, 5))
    plt.pie(shares, labels=shares.index, autopct='%1.1f%%', colors=palette)
    plt.title(label)
    # An equal aspect ratio keeps the pie circular instead of elliptical.
    plt.axis('equal')
    plt.show()

In [None]:
pie_chart(new_train_df['UrbanizationGroup'], 'Urbanization Groups Distribution')

In [None]:
pie_chart(new_train_df['PopulationScoreGroup'], 'Population Score Groups Distribution')

In [None]:
pie_chart(new_train_df['MonsoonIntensityGroup'], 'Monsoon Intensity Groups Distribution')

In [None]:
def corr_matrix(cols, df=None):
    """Draw an annotated heatmap of the pairwise correlations of ``cols``.

    Parameters
    ----------
    cols : list of str
        Names of the columns to correlate.
    df : pandas.DataFrame, optional
        Frame to read the columns from.  Defaults to the notebook-level
        ``new_train_df`` so existing ``corr_matrix(cols)`` calls behave as
        before.
    """
    import seaborn as sns

    source = new_train_df if df is None else df
    # Selecting with a list already yields a new frame and .corr() does not
    # modify it, so no explicit copy is needed.  The local is named
    # ``correlations`` so it no longer shadows this function's own name.
    correlations = source[cols].corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(correlations, annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix Heatmap')
    plt.show()

In [None]:
corr_matrix(['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments', 'FloodProbability'])

In [None]:
corr_matrix(['IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors', 'FloodProbability'])

In [None]:
corr_matrix(['EnvironmentalStress', 'InfrastructureVulnerability', 'DisasterRisk', 'FloodRisk', 'EnvironmentalDegradation', 'FloodProbability'])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

def preprocessing(df, is_train=True):
    """Scale the numeric features and one-hot encode the '*Group' features.

    Numeric columns are mean-imputed and standardised; the categorical
    '*Group' columns are imputed with their most frequent value and one-hot
    encoded.  The ``id`` column (and, for training data, the
    ``FloodProbability`` target) is carried over unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of ``get_features``; must contain ``id`` and every column
        listed below.
    is_train : bool, default True
        When true, ``FloodProbability`` is appended as the last column.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, the numeric features, the one-hot columns and
        optionally ``FloodProbability``, indexed like ``df``.
    """
    numerical_features = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
                           'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
                           'Siltation', 'AgriculturalPractices', 'Encroachments',
                           'IneffectiveDisasterPreparedness', 'DrainageSystems',
                           'CoastalVulnerability', 'Landslides', 'Watersheds',
                           'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
                           'InadequatePlanning', 'PoliticalFactors', 'EnvironmentalStress',
                           'InfrastructureVulnerability', 'DisasterRisk', 'FloodRisk', 'EnvironmentalDegradation']
    categorical_features = ['UrbanizationGroup','PopulationScoreGroup', 'MonsoonIntensityGroup',
                            'TopographyDrainageGroup', 'RiverManagementGroup', 'DeforestationGroup',
                            'DamsQualityGroup', 'SiltationGroup',
                            'IneffectiveDisasterPreparednessGroup', 'DrainageSystemsGroup',
                            'LandslidesGroup', 'WatershedsGroup',
                            'DeterioratingInfrastructureGroup', 'WetlandLossGroup',
                            'InadequatePlanningGroup', 'PoliticalFactorsGroup',
                            'ClimateChangeGroup', 'AgriculturalPracticesGroup',
                            'EncroachmentsGroup', 'CoastalVulnerabilityGroup']

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])

    # fill_value is only consulted for strategy='constant', so the former
    # fill_value='missing' had no effect with 'most_frequent' and is dropped.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # sparse_threshold=0 forces a dense result.  With the default threshold
    # the output type depends on the density of the stacked matrix, and a
    # sparse result could not be wrapped in the DataFrame built below.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        sparse_threshold=0)

    pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
    # NOTE(review): the pipeline is fitted on whatever frame is passed in, so
    # a test frame is scaled and encoded with its own statistics rather than
    # the training ones -- consider fitting once on train and reusing it.
    transformed = pipeline.fit_transform(df)

    onehot = preprocessor.named_transformers_['cat'].named_steps['onehot']
    columns_transformed = numerical_features + list(onehot.get_feature_names_out(categorical_features))

    # Reuse df's index and attach id / target by position.  The earlier
    # label-based concat against a fresh 0..n-1 index misaligned rows (and
    # introduced NaNs) whenever df did not carry a default RangeIndex.
    df_transformed = pd.DataFrame(transformed, columns=columns_transformed, index=df.index)
    df_transformed.insert(0, 'id', df['id'].to_numpy())

    if is_train:
        df_transformed['FloodProbability'] = df['FloodProbability'].to_numpy()

    return df_transformed

In [None]:
processed_df = preprocessing(new_train_df)
processed_df

In [None]:
processed_df.columns

In [None]:
from sklearn.model_selection import train_test_split
# Every column produced by preprocessing() other than the row identifier and
# the target is a model input.  Deriving the list from processed_df keeps the
# same names in the same order as the former hand-written list, without
# needing to edit it whenever a feature is added or renamed.
y_columns = ['FloodProbability']
X_columns = [col for col in processed_df.columns if col not in ['id'] + y_columns]
X = processed_df[X_columns]
y = processed_df[y_columns]
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV

def findBestParams(model, params, X, y):
    """Grid-search ``params`` for ``model`` and return the best estimator.

    The search uses 5-fold cross-validation scored by R^2 and logs progress
    at verbosity level 10.  The winning parameter combination is printed.

    Parameters
    ----------
    model : estimator
        Unfitted estimator to tune.
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    X : feature matrix
    y : object exposing ``.values``
        Target; it is flattened to 1-D before fitting.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.
    """
    search = GridSearchCV(model, params, cv=5, scoring='r2', verbose=10)
    # Flatten the target to a 1-D array before handing it to the search.
    search.fit(X, y.values.ravel())

    print("Best Hyperparameters:", search.best_params_)
    return search.best_estimator_

In [None]:
from sklearn.linear_model import Lasso

# Define the parameter grid for GridSearch
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}
# findBestParams returns GridSearchCV's best_estimator_, which the search has
# already refit on all of X_train with the winning alpha, so fitting it a
# second time on the same data only repeated that work.
lasso_model = findBestParams(Lasso(), param_grid, X_train, y_train)
lasso_model

In [None]:
lasso_model_pred = lasso_model.predict(X_test)
lasso_model_pred

In [None]:
from sklearn.metrics import r2_score

def get_metrics(y_pred, y_true):
    r2_score_value = r2_score(y_true, y_pred)
    print(f"r2 score: {r2_score_value}")

In [None]:
get_metrics(lasso_model_pred, y_test)

In [None]:
from sklearn.linear_model import Ridge

# Define the parameter grid for GridSearch
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10],
}
# findBestParams returns GridSearchCV's best_estimator_, which the search has
# already refit on all of X_train with the winning alpha, so fitting it a
# second time on the same data only repeated that work.
ridge_model = findBestParams(Ridge(), param_grid, X_train, y_train)
ridge_model

In [None]:
ridge_pred = ridge_model.predict(X_test)
ridge_pred

In [None]:
get_metrics(ridge_pred, y_test)

In [None]:
from sklearn.linear_model import BayesianRidge

# Define the parameter grid for GridSearch
# scikit-learn 1.3 renamed BayesianRidge's `n_iter` to `max_iter` and 1.5
# removed the old name, so a hard-coded 'n_iter' key makes the grid search
# fail on current releases.  Use whichever name the installed version exposes.
iteration_param = 'max_iter' if 'max_iter' in BayesianRidge().get_params() else 'n_iter'
param_grid = {
    iteration_param: [300, 500, 1000],
    'alpha_1': [1e-6, 1e-5, 1e-4],
    'alpha_2': [1e-6, 1e-5, 1e-4],
}
# findBestParams returns GridSearchCV's best_estimator_, which the search has
# already refit on all of X_train, so no second fit is needed.
bayesian_ridge_model = findBestParams(BayesianRidge(), param_grid, X_train, y_train)
bayesian_ridge_model

In [None]:
bayesian_ridge_pred = bayesian_ridge_model.predict(X_test)
bayesian_ridge_pred

In [None]:
get_metrics(bayesian_ridge_pred, y_test)

In [None]:
test_df = get_features(test_df, False)
test_df

In [None]:
test_processed_df = preprocessing(test_df, False)
test_processed_df

In [None]:
def get_submission(y_pred, id_column, target_cols, filename):
    """Write predictions to ``./submission/<filename>.csv`` and print them.

    Parameters
    ----------
    y_pred : array-like
        Predicted values, one row per id.
    id_column : array-like
        Row identifiers, in the same order as ``y_pred``.
    target_cols : list of str
        Column name(s) for the predictions.
    filename : str
        Output file name without the ``.csv`` extension.

    Raises
    ------
    ValueError
        If ``id_column`` and ``y_pred`` differ in length.
    """
    df = pd.DataFrame(data=y_pred, columns=target_cols)
    # Attach the ids by position.  Assigning a Series aligns on index labels,
    # which produced NaN ids whenever id_column was not indexed 0..n-1.
    # Inserting at position 0 also yields the 'id' + target_cols column order
    # directly, instead of re-selecting a hard-coded 'FloodProbability'.
    df.insert(0, "id", pd.Series(id_column).to_numpy())
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs('./submission', exist_ok=True)
    df.to_csv('./submission/'+filename+'.csv', index=False)
    print(df)

In [None]:
# Predict on the competition test features and clamp to [0, 1].
# abs() mirrored a negative prediction to a positive value instead of
# flooring it at 0, and left predictions above 1 untouched.
# NOTE(review): assumes FloodProbability is a probability in [0, 1] -- confirm.
lasso_pred = np.clip(lasso_model.predict(test_processed_df[X_columns]), 0.0, 1.0)

In [None]:
# Predict on the competition test features and clamp to [0, 1].
# abs() mirrored a negative prediction to a positive value instead of
# flooring it at 0, and left predictions above 1 untouched.
# NOTE(review): assumes FloodProbability is a probability in [0, 1] -- confirm.
ridge_pred = np.clip(ridge_model.predict(test_processed_df[X_columns]), 0.0, 1.0)

In [None]:
get_submission(lasso_pred, test_processed_df["id"], ["FloodProbability"], "lasso_pred")

In [None]:
get_submission(ridge_pred, test_processed_df["id"], ["FloodProbability"], "ridge_pred")