In [2]:
# https://www.kaggle.com/datasets/brijlaldhankour/flood-prediction-factors
# https://www.kaggle.com/competitions/playground-series-s4e5
import sklearn
import numpy  as np
import pandas as pd
import zipfile
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import Perceptron, LinearRegression, Ridge, Lasso
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, make_scorer, mean_squared_error, r2_score, mean_absolute_percentage_error, max_error
from sklearn.pipeline import Pipeline, make_pipeline

In [4]:
# Open the archive a single time: list its members, then parse the CSV
# member straight from the zip without extracting it to disk.
with zipfile.ZipFile('./flood.csv.zip', 'r') as zip_file:
    print(zip_file.namelist())
    with zip_file.open('flood.csv') as csv_file:
        df = pd.read_csv(csv_file)

['flood.csv']


In [6]:
# Display the flood dataset read from the zip archive (pandas elides the
# middle rows/columns of a large frame when rendering).
df

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,3,8,6,6,4,4,6,2,3,2,...,10,7,4,2,3,4,3,2,6,0.450
1,8,4,5,7,7,9,1,5,5,4,...,9,2,6,2,1,1,9,1,3,0.475
2,3,10,4,1,7,5,4,7,4,9,...,7,4,4,8,6,1,8,3,6,0.515
3,4,4,2,7,3,4,1,4,6,4,...,4,2,6,6,8,8,6,6,10,0.520
4,3,7,5,2,5,8,5,2,7,5,...,7,6,5,3,3,4,4,3,4,0.475
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,3,7,4,7,5,9,4,6,10,4,...,7,3,8,8,6,1,5,4,2,0.535
49996,3,10,3,8,3,3,4,4,3,11,...,8,6,3,6,4,4,2,4,5,0.510
49997,4,4,5,7,2,1,4,5,6,7,...,4,6,4,1,5,1,6,4,3,0.430
49998,4,5,4,4,6,3,10,2,6,11,...,6,3,4,7,6,2,4,0,11,0.515


In [8]:
# Target is the FloodProbability column; every other column is a feature.
y = df['FloodProbability']
X = df.drop(columns = 'FloodProbability')

# Hold out 20% of the rows for testing (80% training), with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 27,
)

In [30]:
# Define the models
# Candidate regressors keyed by a short display name. The stochastic ensembles
# are seeded (same seed as the train/test split) so reruns give the same scores.
models = {
    'SVM'             : SVR(cache_size = 2000),
    #'LinearRegression': LinearRegression(),
    'LASSO'           : Lasso(),
    #'Ridge'           : Ridge(),
    'RandomForest'    : RandomForestRegressor(random_state = 27),
    'GradientBoost'   : GradientBoostingRegressor(random_state = 27),
    'Dummy'           : DummyRegressor()
}

# Define the scalers
# A None step is a pass-through in a Pipeline, i.e. features are used unscaled.
scalers = {
    'None': None
}

# Define hyperparameter grids for each model
# Keys use the 'model__' prefix because the estimator is the 'model' step of
# the Pipeline built below. Entries for models that are currently commented
# out above are kept so they can be re-enabled without further edits.
param_grids = {

    'SVM': [
        {
            'model__C': [0.1, 1, 100],
            'model__epsilon': [0.01, 0.1, 0.2]
        }
    ],
    
    'LinearRegression': [
        {
            'model__fit_intercept': [True]
        }
    ],
    
    # NOTE(review): the recorded run scored R^2 of about 0 for the best alpha
    # in this grid, i.e. no better than predicting the mean. The alphas are
    # probably too large for a target on this scale -- consider smaller values.
    'LASSO': [
        {
            'model__fit_intercept': [True],
            'model__alpha': [0.1, 1, 10, 100]
        }
    ],
    
    'Ridge': [
        {
            'model__fit_intercept': [True],
            'model__alpha': [0.1, 1, 10, 100]
        }
    ],
    
    'RandomForest': [
        {
            'model__n_estimators': [10, 100, 500],
            'model__max_depth': [None, 2]
        }
    ],
    
    'GradientBoost': [
        {
            'model__n_estimators': [10, 100, 500],
            'model__learning_rate': [0.1, 0.2]
        }
    ],

    # Each dict is a separate sub-grid because 'constant' and 'quantile'
    # are only meaningful together with their matching strategy.
    'Dummy': [
        {
            'model__strategy': ['mean', 'median']
        },

        {
            'model__strategy': ['constant'], 
            'model__constant': [0, 0.5, 1]
        },

        {
            'model__strategy': ['quantile'],
            'model__quantile': [0, 0.5, 1]
        }
    ]
}

scorer = r2_score

# The winner is chosen by its mean cross-validated score, so the held-out test
# split is only used to report performance and never to pick between models.
# R^2 can be negative, hence the -inf starting point; best_estimator is
# pre-bound so later cells fail clearly rather than with a NameError.
best_cv_score = float('-inf')
best_score = float('nan')  # test-set score of the selected model
best_model = None
best_estimator = None
for model_name, model in models.items():
    for scaler_name, scaler in scalers.items():

        pipeline = Pipeline(
            [
                ('scaler', scaler),
                ('model',  model)
            ]
        )
        print(
            f'Scaler: {scaler_name}'
            f'\n'
            f'Model: {model_name}'
        )
        grid_search = GridSearchCV(
            estimator  = pipeline,
            param_grid = param_grids[model_name],
            cv = 10, 
            scoring = make_scorer(scorer),
            refit = True,  # refit the best candidate on all of X_train
            n_jobs = -1,
            verbose = 4
        )

        grid_search.fit(X_train, y_train)
        
        # Evaluate the refitted best candidate on the held-out test set
        y_pred = grid_search.predict(X_test)
        test_score = scorer(y_test, y_pred)

        # Print results of grid parameter search
        # (best_score_ is the mean score across the CV folds, not a train-set score)
        print(f"Best parameters for {model_name} with {scaler_name}: {grid_search.best_params_}")
        print(f"Mean CV score for {model_name} with {scaler_name}: {grid_search.best_score_:.4f}")
        print( f"Test set score for {model_name} with {scaler_name}: {test_score:.4f}\n")
        
        # Update best model if necessary (selection uses the CV score only)
        if grid_search.best_score_ > best_cv_score:
            best_cv_score = grid_search.best_score_
            best_score = test_score
            best_model = model_name
            best_estimator = grid_search.best_estimator_

print(f"Best performing model: {best_model}")
print(f"Best Model's mean CV score: {best_cv_score:.4f}")
print(f"Best Model's Score on the test set: {best_score:.4f}")
print(f"Scorer was: {scorer.__name__}")

Scaler: None
Model: SVM
Fitting 10 folds for each of 9 candidates, totalling 90 fits
Best parameters for SVM with None: {'model__C': 1, 'model__epsilon': 0.01}
Train set score for SVM with None: 0.9857
Test set score for SVM with None: 0.9865

Scaler: None
Model: LASSO
Fitting 10 folds for each of 4 candidates, totalling 40 fits
Best parameters for LASSO with None: {'model__alpha': 0.1, 'model__fit_intercept': True}
Train set score for LASSO with None: -0.0003
Test set score for LASSO with None: -0.0003

Scaler: None
Model: RandomForest
Fitting 10 folds for each of 6 candidates, totalling 60 fits
Best parameters for RandomForest with None: {'model__max_depth': None, 'model__n_estimators': 500}
Train set score for RandomForest with None: 0.7345
Test set score for RandomForest with None: 0.7351

Scaler: None
Model: GradientBoost
Fitting 10 folds for each of 6 candidates, totalling 60 fits
Best parameters for GradientBoost with None: {'model__learning_rate': 0.1, 'model__n_estimators': 50

In [16]:
# Score the selected pipeline on the held-out split: R^2 and the largest
# absolute residual, one metric per output line.
y_pred = best_estimator.predict(X_test)
print(
    f"r2_score: {r2_score(y_test, y_pred):.4f}",
    f"max_error: {max_error(y_test, y_pred):.4f}",
    sep = "\n",
)

r2_score: 1.0000
max_error: 0.0000


In [26]:

        # Debug aid: echoes whatever `scaler` and `model` currently hold, i.e.
        # the loop variables left behind by the grid-search cell.
        print(f'Scaler: {scaler}\nModel: {model}')

Scaler: None
Model: RandomForestRegressor()


In [388]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [390]:
# Load the 'titanic' example dataset through seaborn.
# Note: this rebinds `df`, which held the flood data earlier in the notebook.
df = sns.load_dataset('titanic')

In [392]:
# Display the Titanic frame as loaded (before any cleaning).
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [476]:
# Discard rows with a missing embarkation value and renumber the index from 0.
df = df.dropna(
    subset = ['embark_town', 'embarked'],
    how = 'any',
    axis = 'index',
    ignore_index = True,
)

# Target column.
y = df['survived']

# Features: everything except the target, 'deck' and 'alive'.
# (In the displayed rows 'alive' restates the target as yes/no and 'deck'
# is frequently missing -- presumably why both are excluded.)
X = df.drop(columns = ['survived', 'deck', 'alive'])

In [478]:
# Count the missing values in each feature column.
print(X.isna().sum())

pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
embark_town      0
alone            0
dtype: int64


In [480]:
# Replace missing ages with the column mean, rounded to the nearest whole year.
# NOTE(review): the imputer is fitted on all of X, before the train/test split
# that follows, so the fill value is computed from rows that later land in the
# test set. Consider fitting it on the training rows only.
imputer = SimpleImputer(strategy = 'mean').fit(X[['age']])
X['age'] = np.rint(imputer.transform(X[['age']]))

# Confirm that no missing values remain.
print(X.isnull().sum())

pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alone          0
dtype: int64


In [482]:
# Hold out 25% of the rows for testing. The split is seeded so that the
# encoded columns, CV scores, feature importances and test accuracy computed
# from it are reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 11)

In [484]:
# Display the training features (rows keep their pre-split index labels).
X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alone
822,3,female,27.0,0,1,12.4750,S,Third,woman,False,Southampton,False
221,3,male,51.0,0,0,8.0500,S,Third,man,True,Southampton,True
31,1,female,30.0,1,0,146.5208,C,First,woman,False,Cherbourg,False
862,2,male,24.0,0,0,13.0000,S,Second,man,True,Southampton,True
223,1,male,38.0,1,0,90.0000,S,First,man,True,Southampton,False
...,...,...,...,...,...,...,...,...,...,...,...,...
554,1,male,62.0,0,0,26.5500,S,First,man,True,Southampton,True
170,3,male,4.0,4,1,29.1250,Q,Third,child,False,Queenstown,False
760,3,male,41.0,0,0,7.1250,S,Third,man,True,Southampton,True
866,3,male,30.0,0,0,9.5000,S,Third,man,True,Southampton,True


In [486]:
# Columns that get one-hot encoded; the remaining features ('age', 'fare')
# are passed through as numbers.
encodeCols = [
    'pclass', 'sex', 'sibsp', 'parch', 'embarked',
    'class', 'who', 'adult_male', 'embark_town', 'alone',
]

In [488]:
# One-hot encode the categorical columns, dropping the first level of each to
# avoid redundant indicator columns.
# handle_unknown='ignore': rare levels (e.g. an unusual sibsp/parch count) can
# end up only in the test split; with the default 'error' setting the later
# encoder.transform(X_test[...]) call would raise. Ignored levels are encoded
# as all zeros for that feature (same as the dropped level; sklearn warns).
encoder = OneHotEncoder(drop = 'first', handle_unknown = 'ignore')
X_train_encoded = encoder.fit_transform(X_train[encodeCols])

In [490]:
# Densify the sparse one-hot matrix into a labelled frame (fresh 0..n-1 index).
encoded_df = pd.DataFrame(
    X_train_encoded.toarray(),
    columns = encoder.get_feature_names_out(encodeCols),
)

# Put the untouched numeric columns next to the indicator columns. The numeric
# part is re-indexed from zero so both pieces line up row by row.
X_train_final = pd.concat(
    [
        X_train.drop(columns = encodeCols).reset_index(drop = True),
        encoded_df,
    ],
    axis = 1,
)

In [492]:
# Display the fully numeric training matrix (numeric columns + one-hot indicators).
X_train_final

Unnamed: 0,age,fare,pclass_2,pclass_3,sex_male,sibsp_1,sibsp_2,sibsp_3,sibsp_4,sibsp_5,...,embarked_Q,embarked_S,class_Second,class_Third,who_man,who_woman,adult_male_True,embark_town_Queenstown,embark_town_Southampton,alone_True
0,27.0,12.4750,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,51.0,8.0500,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
2,30.0,146.5208,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,24.0,13.0000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
4,38.0,90.0000,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661,62.0,26.5500,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
662,4.0,29.1250,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
663,41.0,7.1250,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
664,30.0,9.5000,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0


In [494]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [496]:
# Using StratifiedKFold and cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [498]:
# Classifiers to compare, keyed by display name. The forest is seeded so the
# cross-validation figures below are reproducible across runs.
estimators = {
    'RandomForestClassifier': RandomForestClassifier(random_state = 11)
}

In [500]:
# Every candidate is scored on the same seeded, shuffled 10-fold stratified
# split; report the mean and spread of the per-fold accuracy.
kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 11)
for name, estimator in estimators.items():
    cv_result = cross_val_score(
        estimator,
        X = X_train_final,
        y = y_train,
        cv = kfold,
        scoring = 'accuracy',
    )
    print(
        f'{name:>20}: '
        f'mean accuracy={cv_result.mean():.2%}; '
        f'standard deviation={cv_result.std():.2%}'
    )

RandomForestClassifier: mean accuracy=80.02%; standard deviation=4.15%


In [502]:
# Fit a forest on the full encoded training set. Seeded so the feature
# importances and the test accuracy reported below are reproducible.
rf = RandomForestClassifier(random_state = 11).fit(X_train_final, y_train)

In [504]:
# Raw importance array of the fitted forest, one value per column of X_train_final.
rf.feature_importances_

array([0.20639514, 0.22145054, 0.01435317, 0.0431963 , 0.09833742,
       0.01626365, 0.0050271 , 0.00771892, 0.0040988 , 0.00277272,
       0.00280059, 0.01458001, 0.01090624, 0.00094923, 0.0019006 ,
       0.00252805, 0.00058633, 0.00559466, 0.01113507, 0.00919544,
       0.04166249, 0.09242031, 0.05553999, 0.09500412, 0.00764436,
       0.01278763, 0.01515114])

In [506]:
# Label each importance with its column name and rank from most to least important.
feature_scores = (
    pd.Series(rf.feature_importances_, index = X_train_final.columns)
    .sort_values(ascending = False)
)

feature_scores

fare                       0.221451
age                        0.206395
sex_male                   0.098337
adult_male_True            0.095004
who_man                    0.092420
who_woman                  0.055540
pclass_3                   0.043196
class_Third                0.041662
sibsp_1                    0.016264
alone_True                 0.015151
parch_1                    0.014580
pclass_2                   0.014353
embark_town_Southampton    0.012788
embarked_S                 0.011135
parch_2                    0.010906
class_Second               0.009195
sibsp_3                    0.007719
embark_town_Queenstown     0.007644
embarked_Q                 0.005595
sibsp_2                    0.005027
sibsp_4                    0.004099
sibsp_8                    0.002801
sibsp_5                    0.002773
parch_5                    0.002528
parch_4                    0.001901
parch_3                    0.000949
parch_6                    0.000586
dtype: float64

In [510]:
# Apply the encoder fitted on the training rows to the test rows (no refit).
X_test_encoded = encoder.transform(X_test[encodeCols])

# Densify into a labelled frame (fresh 0..n-1 index).
encoded_df_test = pd.DataFrame(
    X_test_encoded.toarray(),
    columns = encoder.get_feature_names_out(encodeCols),
)

# Same layout as the training matrix: numeric columns first, then indicators.
X_test_final = pd.concat(
    [
        X_test.drop(columns = encodeCols).reset_index(drop = True),
        encoded_df_test,
    ],
    axis = 1,
)

In [512]:
# Display the encoded test matrix (same column layout as X_train_final).
X_test_final

Unnamed: 0,age,fare,pclass_2,pclass_3,sex_male,sibsp_1,sibsp_2,sibsp_3,sibsp_4,sibsp_5,...,embarked_Q,embarked_S,class_Second,class_Third,who_man,who_woman,adult_male_True,embark_town_Queenstown,embark_town_Southampton,alone_True
0,30.0,7.8958,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
1,20.0,4.0125,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
2,22.0,9.8375,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
3,49.0,76.7292,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,45.0,83.4750,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,30.0,7.2250,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
219,18.0,73.5000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
220,24.0,247.5208,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
221,34.0,32.5000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [514]:
# Accuracy of the fitted forest on the held-out test rows.
accuracy_score(y_test, rf.predict(X_test_final))

0.7757847533632287