In [1]:
import os
import pandas as pd

In [2]:
# Resolve the raw-data directory relative to the parent of the current working
# directory, then load and preview the training split.

root_path = os.path.dirname(os.getcwd())
# Pass each path component separately so os.path.join supplies the separator
# instead of relying on a hard-coded "/" inside a single component.
data_path = os.path.join(root_path, "data", "raw")
train_path = os.path.join(data_path, "train.csv")
train_df = pd.read_csv(train_path)
train_df.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0,5,8,5,8,6,4,4,3,3,...,5,3,3,5,4,7,5,7,3,0.445
1,1,6,7,4,4,8,8,3,5,4,...,7,2,0,3,5,3,3,4,3,0.45
2,2,6,5,6,7,3,7,1,5,4,...,7,3,7,5,6,8,2,3,3,0.53
3,3,3,4,6,5,4,8,4,7,6,...,2,4,7,4,4,6,5,7,5,0.535
4,4,5,3,2,6,4,4,3,3,3,...,2,2,6,6,4,1,2,3,5,0.415


In [3]:
# Display the (rows, columns) shape of the training frame.
train_df.shape

(1117957, 22)

In [4]:
# Load the test split from the same raw-data directory and preview its first rows.
test_path = os.path.join(data_path, "test.csv")
test_df = pd.read_csv(test_path)
test_df.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
0,1117957,4,6,3,5,6,7,8,7,8,...,8,5,7,5,6,3,6,4,4,5
1,1117958,4,4,2,9,5,5,4,7,5,...,2,4,7,4,5,1,7,4,4,3
2,1117959,1,3,6,5,7,2,4,6,4,...,7,9,2,5,5,2,3,6,8,3
3,1117960,2,4,4,6,4,5,4,3,4,...,7,8,4,6,7,6,4,2,4,4
4,1117961,6,3,2,4,6,4,5,5,3,...,4,3,2,6,4,6,8,4,5,5


In [19]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import make_pipeline


# Baseline: min-max scaling followed by linear regression, scored with
# cross-validated R^2.

# Target vector and feature matrix (every column except the row id and the target).
y = train_df["FloodProbability"].to_numpy()
X = train_df.drop(["id", "FloodProbability"], axis=1).to_numpy()

model = LinearRegression()
scaler = MinMaxScaler()
clf = make_pipeline(scaler, model)

# Shuffled K-fold with a fixed seed so the fold assignment is repeatable.
cv = KFold(random_state=123, shuffle=True)
outputs = cross_validate(clf, X, y, scoring="r2", cv=cv)
avg_score = outputs["test_score"].mean()

{'fit_time': array([1.23013854, 0.99859333, 0.96607947, 1.0585742 , 1.03410196]),
 'score_time': array([0.0430243 , 0.04124022, 0.03083849, 0.03715301, 0.03335834]),
 'test_score': array([0.84459758, 0.84520504, 0.84497443, 0.84473535, 0.84520258])}

In [20]:
# Average the per-fold test scores from cross_validate and display the result.
avg_score = outputs["test_score"].mean()
avg_score

0.8449429938334838

In [27]:
import optuna
from optuna import trial

def get_model_params(trial):
    """
    Sample the keyword arguments for the regression model from a trial.

    ``trial`` must expose ``suggest_categorical(name, choices)``. The only
    sampled setting is registered under the name ``"lr__fit_intercept"`` with
    the choices ``[True, False]``.

    Returns a dict with the single key ``"fit_intercept"``.
    """
    use_intercept = trial.suggest_categorical("lr__fit_intercept", [True, False])
    return dict(fit_intercept=use_intercept)

def objective(trial):
    """
    Score one trial: mean cross-validated R^2 of a scaled linear regression.

    The feature matrix and target are rebuilt from the module-level
    ``train_df`` on every call. Model keyword arguments come from
    ``get_model_params(trial)``.

    Returns the mean of the per-fold ``test_score`` values.
    """
    features = train_df.drop(["id", "FloodProbability"], axis=1).to_numpy()
    target = train_df["FloodProbability"].to_numpy()

    regressor = LinearRegression(**get_model_params(trial))
    pipeline = make_pipeline(MinMaxScaler(), regressor)

    # Same shuffled, seeded folds for every trial so scores are comparable.
    folds = KFold(random_state=123, shuffle=True)
    cv_result = cross_validate(pipeline, features, target, scoring="r2", cv=folds)
    return cv_result["test_score"].mean()

# Seed the sampler so the sequence of suggested parameters is repeatable across
# kernel restarts. TPE is already Optuna's default sampler; only the seed is new,
# and it reuses the 123 already used for the CV folds.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=123),
)
# NOTE(review): the search space currently holds a single boolean, so some of
# the 5 trials necessarily repeat a configuration - confirm n_trials once more
# parameters are added.
study.optimize(objective, n_trials=5)

# Print the best parameters found
print("Best trial:")
best_trial = study.best_trial
print("  Value: ", best_trial.value)
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

[I 2024-05-27 13:30:45,017] A new study created in memory with name: no-name-c9b07ed4-65ef-4e54-8484-bbefa93fac76
[I 2024-05-27 13:30:49,468] Trial 0 finished with value: 0.8449429938334838 and parameters: {'lr__fit_intercept': True}. Best is trial 0 with value: 0.8449429938334838.
[I 2024-05-27 13:30:53,924] Trial 1 finished with value: 0.8372836369904689 and parameters: {'lr__fit_intercept': False}. Best is trial 0 with value: 0.8449429938334838.
[I 2024-05-27 13:30:58,672] Trial 2 finished with value: 0.8449429938334838 and parameters: {'lr__fit_intercept': True}. Best is trial 0 with value: 0.8449429938334838.
[I 2024-05-27 13:31:03,493] Trial 3 finished with value: 0.8449429938334838 and parameters: {'lr__fit_intercept': True}. Best is trial 0 with value: 0.8449429938334838.
[I 2024-05-27 13:31:08,353] Trial 4 finished with value: 0.8449429938334838 and parameters: {'lr__fit_intercept': True}. Best is trial 0 with value: 0.8449429938334838.


Best trial:
  Value:  0.8449429938334838
  Params: 
    lr__fit_intercept: True


In [32]:
# Row-wise (axis=1) 25th percentile across the two selected columns.
train_df[["MonsoonIntensity", "TopographyDrainage"]].quantile(0.25, axis=1)

0          5.75
1          6.25
2          5.25
3          3.25
4          3.50
           ... 
1117952    3.00
1117953    2.00
1117954    4.00
1117955    4.00
1117956    4.25
Name: 0.25, Length: 1117957, dtype: float64

In [57]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion

def _row_sum_column(frame):
    """Sum each row of ``frame`` and return the totals as an (n, 1) array."""
    return frame.sum(axis=1).values.reshape((-1, 1))


# NOTE(review): orig_features is every column of train_df, so whatever the frame
# holds is passed through unchanged alongside the new column.
orig_features = train_df.columns
pipe1 = ColumnTransformer(
    transformers=[
        ("original", "passthrough", orig_features),
        ("teste", FunctionTransformer(_row_sum_column), ["MonsoonIntensity", "ClimateChange"]),
    ],
    remainder="drop",
)

out = pipe1.fit_transform(train_df)
out.shape

(1117957, 23)

In [137]:
# Index of the row labelled 0, i.e. the column names of train_df.
train_df.loc[0].index

Index(['id', 'MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors', 'FloodProbability'],
      dtype='object')

In [156]:
from copy import deepcopy
from sklearn.feature_selection import SelectKBest

def create_features(s: pd.DataFrame):
    """
    Append engineered features to a frame and return the result as an array.

    Parameters
    ----------
    s : pd.DataFrame
        Must contain the columns used by the two interaction terms
        (MonsoonIntensity, ClimateChange, Deforestation, Urbanization,
        AgriculturalPractices, Encroachments, DamsQuality, DrainageSystems,
        DeterioratingInfrastructure, RiverManagement,
        IneffectiveDisasterPreparedness, InadequatePlanning). A DataFrame is
        required because the statistics are reduced along ``axis=1``.
        ``s`` itself is not modified.

    Returns
    -------
    numpy.ndarray
        The columns of ``s`` followed by 14 new columns, in this order:
        ClimateAnthropogenicInteraction, InfrastructurePreventionInteraction,
        sum, std, mean, max, min, mode, median, q_25th, q_75th, skew, kurt,
        zscore.

    Notes
    -----
    Row statistics are computed over *all* columns of ``s`` as passed in.
    NOTE(review): if the caller passes an id or target column, those values
    take part in the statistics too - confirm this is intended.
    """
    climate = s["MonsoonIntensity"] + s["ClimateChange"]
    anthropogenic = (
        s["Deforestation"] + s["Urbanization"] + s["AgriculturalPractices"] + s["Encroachments"]
    )
    infrastructure = s["DamsQuality"] + s["DrainageSystems"] + s["DeterioratingInfrastructure"]
    prevention = s["RiverManagement"] + s["IneffectiveDisasterPreparedness"] + s["InadequatePlanning"]

    # Column-wise standardisation using the mean/std of the rows passed in.
    standardized = (s - s.mean()) / s.std()

    engineered = {
        "ClimateAnthropogenicInteraction": climate * anthropogenic,
        "InfrastructurePreventionInteraction": infrastructure * prevention,
        "sum": s.sum(axis=1),
        "std": s.std(axis=1),
        "mean": s.mean(axis=1),
        "max": s.max(axis=1),
        "min": s.min(axis=1),
        # mode(axis=1) can return several columns on ties; keep the first one.
        "mode": s.mode(axis=1)[0],
        "median": s.median(axis=1),
        "q_25th": s.quantile(0.25, axis=1),
        "q_75th": s.quantile(0.75, axis=1),
        "skew": s.skew(axis=1),
        "kurt": s.kurt(axis=1),
        "zscore": standardized.mean(axis=1),
    }

    # assign() works on a copy, so every statistic above was computed from the
    # original columns only and the caller's frame is left untouched.
    return s.assign(**engineered).values

# Wrap create_features in a FunctionTransformer and try it on the first 20 rows;
# the displayed shape shows how many columns the transformer emits.
new_features = FunctionTransformer(create_features)
out = new_features.fit_transform(train_df.head(20))
out.shape

# preprocess = Pipeline([
#     ("create_features", new_features),
#     ("scale", MinMaxScaler()),
#     ("univ_select", SelectKBest(k=10))
# ])

# out = preprocess.fit_transform(train_df.head(100), train_df.head(100)["FloodProbability"])
# out.shape

(20, 36)

In [158]:
from sklearn.metrics import get_scorer

# Look up the scorer object registered under the name "r2".
get_scorer("r2")

make_scorer(r2_score, response_method='predict')

In [104]:
import numpy as np


def anthtropogenic_interaction(s):
    """
    Multiply the climate term by the human-activity term, row by row.

    Climate term: MonsoonIntensity + ClimateChange.
    Human-activity term: Deforestation + Urbanization + AgriculturalPractices
    + Encroachments.

    Returns the products as a 2-D array with a single column.
    """
    climate_term = s["MonsoonIntensity"] + s["ClimateChange"]
    human_term = (
        s["Deforestation"]
        + s["Urbanization"]
        + s["AgriculturalPractices"]
        + s["Encroachments"]
    )
    product = climate_term * human_term
    return product.to_numpy().reshape(-1, 1)

def infrastructure_interaction(s):
    """
    Multiply the infrastructure term by the prevention term, row by row.

    Infrastructure term: DamsQuality + DrainageSystems
    + DeterioratingInfrastructure.
    Prevention term: RiverManagement + IneffectiveDisasterPreparedness
    + InadequatePlanning.

    Returns the products as a 2-D array with a single column.
    """
    infrastructure_term = (
        s["DamsQuality"] + s["DrainageSystems"] + s["DeterioratingInfrastructure"]
    )
    prevention_term = (
        s["RiverManagement"] + s["IneffectiveDisasterPreparedness"] + s["InadequatePlanning"]
    )
    product = infrastructure_term * prevention_term
    return product.to_numpy().reshape(-1, 1)

def _interaction_feature_name(transformer, input_features):
    """Name of the single column emitted by the interaction transformer."""
    return ["ClimateAnthropogenicInteraction"]


def _sum_feature_name(transformer, input_features):
    """Name of the single column emitted by the row-sum transformer."""
    return ["sum"]


def _row_sum(x):
    """Sum each row of ``x`` and return the totals as an (n, 1) array."""
    return np.sum(x, axis=1).values.reshape((-1, 1))


def _all_column_positions(X):
    """Select every column of ``X`` by position."""
    return list(range(X.shape[1]))


# NOTE(review): the "sum" entry selects every column of the input, so the row
# sum covers whatever columns the caller passes in - confirm this is intended.
new_features = ColumnTransformer(transformers=[
    (
        "ClimateAnthropogenicInteraction",
        FunctionTransformer(anthtropogenic_interaction, feature_names_out=_interaction_feature_name),
        [
            "MonsoonIntensity",
            "ClimateChange",
            "Deforestation",
            "Urbanization",
            "AgriculturalPractices",
            "Encroachments",
        ],
    ),
    # ("InfrastructurePreventionInteraction", FunctionTransformer(infrastructure_interaction), ["DamsQuality", "DrainageSystems", "DeterioratingInfrastructure", "RiverManagement", "IneffectiveDisasterPreparedness", "InadequatePlanning"]),
    (
        "sum",
        FunctionTransformer(_row_sum, feature_names_out=_sum_feature_name),
        _all_column_positions,
    ),
    # ("std", FunctionTransformer(lambda x: np.std(x, axis=1).values.reshape((-1, 1))), lambda X: list(range(X.shape[1]))),
    # ("min", FunctionTransformer(lambda x: np.min(x, axis=1).values.reshape((-1, 1))), lambda X: list(range(X.shape[1]))),
    # ("max", FunctionTransformer(lambda x: np.max(x, axis=1).values.reshape((-1, 1))), lambda X: list(range(X.shape[1]))),
    # ("mode", FunctionTransformer(lambda x: x.mode(axis=1)[0].values.reshape((-1, 1))), lambda X: list(range(X.shape[1]))),
    # ("median", FunctionTransformer(lambda x: np.median(x, axis=1).reshape((-1, 1))), lambda X: list(range(X.shape[1]))),
])

# Place the input columns side by side with the engineered columns.
union = FeatureUnion([
    # No func is given, only a feature_names_out callable that echoes the incoming names.
    ("original_features", FunctionTransformer(feature_names_out=lambda self, feat_in: feat_in)),
    ("new_features", new_features)
])
# NOTE(review): train_df.head(100) is passed whole; the saved feature-name
# output lists original_features__id and original_features__FloodProbability,
# so the row id and the target flow through - confirm before using for modelling.
out = union.fit_transform(train_df.head(100))
out.shape

(100, 24)

In [105]:
# List the output column names, each prefixed with the name of the union member that produced it.
union.get_feature_names_out()

array(['original_features__id', 'original_features__MonsoonIntensity',
       'original_features__TopographyDrainage',
       'original_features__RiverManagement',
       'original_features__Deforestation',
       'original_features__Urbanization',
       'original_features__ClimateChange',
       'original_features__DamsQuality', 'original_features__Siltation',
       'original_features__AgriculturalPractices',
       'original_features__Encroachments',
       'original_features__IneffectiveDisasterPreparedness',
       'original_features__DrainageSystems',
       'original_features__CoastalVulnerability',
       'original_features__Landslides', 'original_features__Watersheds',
       'original_features__DeterioratingInfrastructure',
       'original_features__PopulationScore',
       'original_features__WetlandLoss',
       'original_features__InadequatePlanning',
       'original_features__PoliticalFactors',
       'original_features__FloodProbability',
       'new_features__Climate

In [93]:
# Row-wise modes of the first five rows; rows with fewer tied modes are padded with NaN.
train_df.head().mode(axis=1)

Unnamed: 0,0,1,2,3
0,3.0,5.0,,
1,3.0,,,
2,3.0,5.0,6.0,7.0
3,4.0,,,
4,3.0,,,
