In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb


In [2]:
# Load the train and test splits; both are indexed by the "gwb_code_10" column.
train_data, test_data = (
    pd.read_csv(csv_name, index_col="gwb_code_10")
    for csv_name in ("train-data.csv", "test-data.csv")
)


In [3]:
# Feature matrices: everything except the two target columns.
X, X_test = (
    split.drop(columns=["loneliness", "sporting"]) for split in (train_data, test_data)
)

# Targets, taken from the training split and the test split respectively.
y_loneliness, y_sporting = train_data["loneliness"], train_data["sporting"]
y_test_loneliness, y_test_sporting = test_data["loneliness"], test_data["sporting"]


## Regression model


Ridge, Lasso, LinearRegression


In [15]:
# The imputer that will be used in all models:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsRegressor

# Iterative imputation with a 10-nearest-neighbours regressor as the per-feature estimator.
imputer = IterativeImputer(
    estimator=KNeighborsRegressor(n_neighbors=10),
    max_iter=20,
    tol=1e-3,
    random_state=100,
)

# Fit on the training features only; the test features are transformed with that fitted imputer.
X_pre_imputed = imputer.fit_transform(X)
X_test_pre_imputed = imputer.transform(X_test)

In [46]:
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from typing import Union
from sklearn.model_selection import GridSearchCV, train_test_split
from typing import Union, List, Dict
from sklearn.metrics import mean_absolute_error, r2_score


def grid_search_and_test(model: Pipeline, params: Dict[str, List[float]], target_variable: str = "loneliness", pre_imputed: bool = False) -> pd.DataFrame:
    """Tune ``model`` with a cross-validated grid search and score it on the test data.

    The data comes from notebook-level variables (``X``/``X_test`` or their
    pre-imputed counterparts, and the ``y_*`` series), which must be defined
    before this function is called.

    Args:
        model: Pipeline to tune. It is mutated: the best parameters are set on
            it and it is fitted on the full training data.
        params: Parameter grid handed to ``GridSearchCV``.
        target_variable: ``"loneliness"`` or ``"sporting"``.
        pre_imputed: When True, use ``X_pre_imputed`` / ``X_test_pre_imputed``
            instead of the raw ``X`` / ``X_test``.

    Returns:
        One-row DataFrame with the mean absolute error and R2 score of the
        tuned model on the test data. The best parameters are printed.

    Raises:
        ValueError: If ``target_variable`` is not one of the two known targets.
    """
    # Resolve the inputs first so a wrong target fails before any search is set up.
    if target_variable == "loneliness":
        y, y_test = y_loneliness, y_test_loneliness
    elif target_variable == "sporting":
        y, y_test = y_sporting, y_test_sporting
    else:
        raise ValueError("Wrong Target variable")

    if pre_imputed:
        X_grid, X_test_grid = X_pre_imputed, X_test_pre_imputed
    else:
        X_grid, X_test_grid = X, X_test

    # refit=False: the search only has to pick the parameters. The single final
    # fit happens on ``model`` below, so the winning configuration is not
    # trained twice. error_score=0.0 makes a failing candidate score 0.0
    # instead of aborting the whole search.
    gsc = GridSearchCV(model, param_grid=params, n_jobs=3, error_score=0.0, refit=False, verbose=3)
    gsc.fit(X_grid, y)
    print("Best parameters found for the model:", gsc.best_params_)

    model.set_params(**gsc.best_params_)
    model.fit(X_grid, y)

    pred = model.predict(X_test_grid)
    scores = {
        "Mean Absolute Error": [mean_absolute_error(y_test, pred)],
        "R2 score": [r2_score(y_test, pred)],
    }
    # Use this to see the top 5 results
    # display(pd.DataFrame(gsc.cv_results_).sort_values(by="rank_test_score").head(5))
    return pd.DataFrame(scores)


For Loneliness


In [47]:
# Testing a model only requires a pipeline: a scaling step followed by the estimator.
ridge_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("estimator", Ridge(random_state=100)),
    ]
)

# The parameter grid to search through: here 50 log-spaced alphas from 1e-5 to 1e1.
params = {"estimator__alpha": np.logspace(-5, 1, num=50)}

# Calling the function defined above returns the test scores and prints the best parameters.
# Keep the total number of fits reasonable (i.e. below ~1,000), otherwise the search takes far too long.
# pre_imputed=True makes the run considerably faster; it causes a small data leakage
# (the imputer was fitted on the whole training set, outside the CV folds), which is accepted here.
ridge_result = grid_search_and_test(
    model=ridge_pipe,
    params=params,
    target_variable="loneliness",
    pre_imputed=True,
)
ridge_result

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters found for the model: {'estimator__alpha': 0.00029470517025518097}


Unnamed: 0,Mean Absolute Error,R2 score
0,1.908699,0.76846


# The code below is unchanged!

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score


def grid_search_and_test_1(model: Pipeline, params: Dict[str, List[float]]) -> pd.DataFrame:
    """Grid-search ``model`` on the ``*_lone_train`` split and score it on ``*_lone_test``.

    Reads the notebook-level ``X_lone_train``, ``y_lone_train``, ``X_lone_test``
    and ``y_lone_test``; they must be defined before this function is called.

    Args:
        model: Estimator to tune. It is mutated: the best parameters are set on
            it and it is fitted on the training split.
        params: Parameter grid handed to ``GridSearchCV`` (``{}`` evaluates only
            the estimator's current settings).

    Returns:
        One-row DataFrame with the mean absolute error and R2 score on the test
        split. The best parameters are printed (the label is hard-coded to Lasso).
    """
    # refit=False: the search only selects the parameters; the one final fit is
    # done on ``model`` below, so the best configuration is not trained twice.
    gsc = GridSearchCV(model, param_grid=params, n_jobs=4, error_score=0.0, refit=False)
    gsc.fit(X_lone_train, y_lone_train)

    print("Best parameters found for Lasso:", gsc.best_params_)
    model.set_params(**gsc.best_params_)
    model.fit(X_lone_train, y_lone_train)

    pred = model.predict(X_lone_test)

    # The trailing space in the first key is kept so the returned column names stay unchanged.
    scores = {
        "Mean Absolute Error ": [mean_absolute_error(y_lone_test, pred)],
        "R2 score": [r2_score(y_lone_test, pred)],
    }
    return pd.DataFrame(scores)


In [None]:
# Lasso with its default settings; the empty grid means no hyper-parameters are searched.
# NOTE(review): the saved output of this cell is a TypeError whose cause is not visible here.
grid_search_and_test_1(model=get_classifier_and_scaler_1(Lasso()), params={})


TypeError: ignored

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score


def grid_search_and_test_1(model: Pipeline, params: Dict[str, List[float]]) -> pd.DataFrame:
    """Grid-search ``model`` on the ``*_lone_train`` split and score it on ``*_lone_test``.

    Reads the notebook-level ``X_lone_train``, ``y_lone_train``, ``X_lone_test``
    and ``y_lone_test``; they must be defined before this function is called.

    Args:
        model: Estimator to tune. It is mutated: the best parameters are set on
            it and it is fitted on the training split.
        params: Parameter grid handed to ``GridSearchCV`` (``{}`` evaluates only
            the estimator's current settings).

    Returns:
        One-row DataFrame with the mean absolute error and R2 score on the test
        split. The best parameters are printed (the label is hard-coded to
        LinearRegression).
    """
    # refit=False: the search only selects the parameters; the one final fit is
    # done on ``model`` below, so the best configuration is not trained twice.
    gsc = GridSearchCV(model, param_grid=params, n_jobs=4, error_score=0.0, refit=False)
    gsc.fit(X_lone_train, y_lone_train)

    print("Best parameters found for LinearRegression:", gsc.best_params_)
    model.set_params(**gsc.best_params_)
    model.fit(X_lone_train, y_lone_train)

    pred = model.predict(X_lone_test)

    # The trailing space in the first key is kept so the returned column names stay unchanged.
    scores = {
        "Mean Absolute Error ": [mean_absolute_error(y_lone_test, pred)],
        "R2 score": [r2_score(y_lone_test, pred)],
    }
    return pd.DataFrame(scores)


In [None]:
# Plain linear regression; the empty grid means no hyper-parameters are searched.
grid_search_and_test_1(model=get_classifier_and_scaler_1(LinearRegression()), params={})


Best parameters found for LinearRegression: {}


Unnamed: 0,Mean Absolute Error,R2 score
0,1.694412,0.800505


For sport


In [None]:
from sklearn.metrics import mean_absolute_error, r2_score


def grid_search_and_test_1(model: Pipeline, params: Dict[str, List[float]]) -> pd.DataFrame:
    """Grid-search ``model`` on the ``*_sport_train`` split and score it on ``*_sport_test``.

    Reads the notebook-level ``X_sport_train``, ``y_sport_train``,
    ``X_sport_test`` and ``y_sport_test``; they must be defined before this
    function is called.

    Args:
        model: Estimator to tune. It is mutated: the best parameters are set on
            it and it is fitted on the training split.
        params: Parameter grid handed to ``GridSearchCV`` (``{}`` evaluates only
            the estimator's current settings).

    Returns:
        One-row DataFrame with the mean absolute error and R2 score on the test
        split. The best parameters are printed (the label is hard-coded to Ridge).
    """
    # refit=False: the search only selects the parameters; the one final fit is
    # done on ``model`` below, so the best configuration is not trained twice.
    gsc = GridSearchCV(model, param_grid=params, n_jobs=4, error_score=0.0, refit=False)
    gsc.fit(X_sport_train, y_sport_train)

    print("Best parameters found for Ridge:", gsc.best_params_)
    model.set_params(**gsc.best_params_)
    model.fit(X_sport_train, y_sport_train)

    pred = model.predict(X_sport_test)

    # The trailing space in the first key is kept so the returned column names stay unchanged.
    scores = {
        "Mean Absolute Error ": [mean_absolute_error(y_sport_test, pred)],
        "R2 score": [r2_score(y_sport_test, pred)],
    }
    return pd.DataFrame(scores)


In [None]:
# Ridge with its default alpha (seeded); the empty grid means no hyper-parameters are searched.
grid_search_and_test_1(model=get_classifier_and_scaler_1(Ridge(random_state=100)), params={})


Best parameters found for Ridge: {}


Unnamed: 0,Mean Absolute Error,R2 score
0,3.267283,0.631361


In [None]:
from sklearn.metrics import mean_absolute_error, r2_score


def grid_search_and_test_1(model: Pipeline, params: Dict[str, List[float]]) -> pd.DataFrame:
    """Grid-search ``model`` on the ``*_sport_train`` split and score it on ``*_sport_test``.

    Reads the notebook-level ``X_sport_train``, ``y_sport_train``,
    ``X_sport_test`` and ``y_sport_test``; they must be defined before this
    function is called.

    Args:
        model: Estimator to tune. It is mutated: the best parameters are set on
            it and it is fitted on the training split.
        params: Parameter grid handed to ``GridSearchCV`` (``{}`` evaluates only
            the estimator's current settings).

    Returns:
        One-row DataFrame with the mean absolute error and R2 score on the test
        split. The best parameters are printed (the label is hard-coded to Lasso).
    """
    # refit=False: the search only selects the parameters; the one final fit is
    # done on ``model`` below, so the best configuration is not trained twice.
    gsc = GridSearchCV(model, param_grid=params, n_jobs=4, error_score=0.0, refit=False)
    gsc.fit(X_sport_train, y_sport_train)

    print("Best parameters found for Lasso:", gsc.best_params_)
    model.set_params(**gsc.best_params_)
    model.fit(X_sport_train, y_sport_train)

    pred = model.predict(X_sport_test)

    # The trailing space in the first key is kept so the returned column names stay unchanged.
    scores = {
        "Mean Absolute Error ": [mean_absolute_error(y_sport_test, pred)],
        "R2 score": [r2_score(y_sport_test, pred)],
    }
    return pd.DataFrame(scores)


In [None]:
# Lasso with its default settings; the empty grid means no hyper-parameters are searched.
# NOTE(review): the saved output of this cell is a TypeError whose cause is not visible here.
grid_search_and_test_1(model=get_classifier_and_scaler_1(Lasso()), params={})


TypeError: ignored

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score


def grid_search_and_test_1(model: Pipeline, params: Dict[str, List[float]]) -> pd.DataFrame:
    """Grid-search ``model`` on the ``*_sport_train`` split and score it on ``*_sport_test``.

    Reads the notebook-level ``X_sport_train``, ``y_sport_train``,
    ``X_sport_test`` and ``y_sport_test``; they must be defined before this
    function is called.

    Args:
        model: Estimator to tune. It is mutated: the best parameters are set on
            it and it is fitted on the training split.
        params: Parameter grid handed to ``GridSearchCV`` (``{}`` evaluates only
            the estimator's current settings).

    Returns:
        One-row DataFrame with the mean absolute error and R2 score on the test
        split. The best parameters are printed (the label is hard-coded to
        LinearRegression).
    """
    # refit=False: the search only selects the parameters; the one final fit is
    # done on ``model`` below, so the best configuration is not trained twice.
    gsc = GridSearchCV(model, param_grid=params, n_jobs=4, error_score=0.0, refit=False)
    gsc.fit(X_sport_train, y_sport_train)

    print("Best parameters found for LinearRegression:", gsc.best_params_)
    model.set_params(**gsc.best_params_)
    model.fit(X_sport_train, y_sport_train)

    pred = model.predict(X_sport_test)

    # The trailing space in the first key is kept so the returned column names stay unchanged.
    scores = {
        "Mean Absolute Error ": [mean_absolute_error(y_sport_test, pred)],
        "R2 score": [r2_score(y_sport_test, pred)],
    }
    return pd.DataFrame(scores)


In [None]:
# Plain linear regression; the empty grid means no hyper-parameters are searched.
grid_search_and_test_1(model=get_classifier_and_scaler_1(LinearRegression()), params={})


Best parameters found for LinearRegression: {}


Unnamed: 0,Mean Absolute Error,R2 score
0,3.267343,0.631227


DecisionTreeClassifier and RandomForestClassifier


In [None]:
# Columns that the classifier pipeline one-hot encodes.
# (The misspelled name is kept because other cells reference it.)
catetgorical = [
    "a_man",
    "a_vrouw",
    "a_00_14",
    "a_15_24",
    "a_25_44",
    "a_45_64",
    "a_65_oo",
    "a_ongeh",
    "a_gehuwd",
    "a_gesch",
    "a_verwed",
    "a_marok",
    "a_antaru",
    "a_suri",
    "a_tur",
    "a_ov_nw",
    "a_1p_hh",
    "a_hh_z_k",
    "a_hh_m_k",
    "bev_dich",
    "p_wcorpw",
    "p_ov_hw",
    "p_e_o_w",
    "p_bjj2k",
    "p_bjo2k",
    "g_ele_ko",
    "g_ele_hu",
    "g_ele_vw",
    "a_soz_ow",
    "a_soz_ww",
    "a_soz_ao",
    "a_soz_wb",
    "a_bed_ru",
    "a_bed_mn",
    "a_bed_kl",
    "a_bed_hj",
    "a_bed_gi",
    "a_bed_bf",
    "a_bed_a",
    "a_m2w",
    "a_bst_nb",
    "a_bst_b",
    "g_3km_sc",
    "g_afs_sc",
    "g_afs_kv",
    "g_afs_gs",
    "g_afs_hp",
    "ste_mvs",
    "ste_oad",
]
# Columns that the classifier pipeline scales when a scaler is supplied.
# Each column is listed once: "g_ele" used to appear twice, which would have
# selected (and scaled) that column twice.
# NOTE(review): the names containing spaces (e.g. "g_ gas_ ko") are kept as-is;
# confirm they match the CSV headers exactly.
numerical = [
    "a_inw",
    "a_w_all",
    "a_nw_all",
    "a_geb",
    "p_geb",
    "a_ste",
    "p_ste",
    "a_hh",
    "g_hhgro",
    "p_mgezw",
    "p_1gezw",
    "g_woz",
    "a_woning",
    "p_bewndw",
    "p_leegsw",
    "p_koopw",
    "p_huurw",
    "g_ele",
    "p_stadsv",
    "g_ gas_ ko",
    "g_ gas_ hu",
    "g_ gas_ vw",
    "g_gas_2w",
    "g_ gas_ hw",
    "g_ gas_ tw",
    "g_gas_ap",
    "g_gas",
    "g_ink_pi",
    "g_ink_po",
    "a_inkont",
    "g_gewsek",
    "g_vernoo",
    "g_wodief",
    "a_bedv",
    "g_pau_km",
    "g_pau_hh",
    "a_pau",
    "a_wat_ha",
    "a_lan_ha",
    "a_opp_ha",
    "pst_dekp",
]


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from typing import Union
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer

# A single encoder is fitted on each target in turn, so after this cell ``lab``
# only holds the classes of the sporting target (the last one fitted).
lab = preprocessing.LabelEncoder()
y_lone_transformed, y_sport_transformed = (
    lab.fit_transform(target) for target in (y_loneliness, y_sporting)
)

# Both splits use the same features and the same seed; only the encoded target differs.
X_lone_train, X_lone_test, y_lone_train, y_lone_test = train_test_split(
    X,
    y_lone_transformed,
    random_state=100,
)
X_sport_train, X_sport_test, y_sport_train, y_sport_test = train_test_split(
    X,
    y_sport_transformed,
    random_state=100,
)


def get_classifier_and_scaler_2(
    classifier: Union[DecisionTreeClassifier, RandomForestClassifier],
    scaler: Union[Union[StandardScaler, MinMaxScaler], None] = None,
) -> Pipeline:
    """Build a preprocessing + classifier pipeline.

    The columns in ``catetgorical`` are always one-hot encoded (unknown
    categories are ignored at transform time). The columns in ``numerical`` are
    only selected when a ``scaler`` is given, in which case they are passed
    through it. No ``remainder`` is set on the ColumnTransformer, so columns
    that are not selected fall under its default handling.

    Args:
        classifier: Final estimator of the pipeline (step name ``"classifier"``).
        scaler: Optional scaler applied to the numerical columns.

    Returns:
        Pipeline with the steps ``"scaler"`` (the ColumnTransformer) and
        ``"classifier"``.
    """
    transformers = [("onehot", OneHotEncoder(handle_unknown="ignore"), catetgorical)]
    if scaler:
        # The scaling transformer goes first, ahead of the one-hot block.
        transformers.insert(0, ("scaling", scaler, numerical))

    preprocessing_step = ColumnTransformer(transformers)
    return Pipeline([("scaler", preprocessing_step), ("classifier", classifier)])


For loneliness


In [None]:
from sklearn.metrics import mean_absolute_error, r2_score


def grid_search_and_test_2(model: Pipeline, params: Dict[str, List[float]]) -> pd.DataFrame:
    """Grid-search ``model`` on the ``*_lone_train`` split and score it on ``*_lone_test``.

    Reads the notebook-level ``X_lone_train``, ``y_lone_train``, ``X_lone_test``
    and ``y_lone_test``; they must be defined before this function is called.

    Args:
        model: Pipeline to tune. It is mutated: the best parameters are set on
            it and it is fitted on the training split.
        params: Parameter grid handed to ``GridSearchCV`` (``{}`` evaluates only
            the pipeline's current settings).

    Returns:
        One-row DataFrame with the mean absolute error and R2 score between
        ``y_lone_test`` and the predictions, i.e. in whatever encoding
        ``y_lone_test`` uses. The best parameters are printed (the label is
        hard-coded to RandomForestClassifier).
    """
    # refit=False: the search only selects the parameters; the one final fit is
    # done on ``model`` below, so the best configuration is not trained twice.
    gsc = GridSearchCV(model, param_grid=params, n_jobs=4, error_score=0.0, refit=False)
    gsc.fit(X_lone_train, y_lone_train)

    print("Best parameters found for RandomForestClassifier :", gsc.best_params_)
    model.set_params(**gsc.best_params_)
    model.fit(X_lone_train, y_lone_train)

    pred = model.predict(X_lone_test)

    # The trailing space in the first key is kept so the returned column names stay unchanged.
    scores = {
        "Mean Absolute Error ": [mean_absolute_error(y_lone_test, pred)],
        "R2 score": [r2_score(y_lone_test, pred)],
    }
    return pd.DataFrame(scores)


In [None]:
# Seeded random forest without a scaler; the empty grid means no hyper-parameters are searched.
grid_search_and_test_2(
    model=get_classifier_and_scaler_2(classifier=RandomForestClassifier(random_state=100)),
    params={},
)


Best parameters found for RandomForestClassifier : {}


Unnamed: 0,Mean Absolute Error,R2 score
0,30.535258,0.304213


In [None]:
from sklearn.metrics import mean_absolute_error, r2_score


def grid_search_and_test_2(model: Pipeline, params: Dict[str, List[float]]) -> pd.DataFrame:
    """Grid-search ``model`` on the ``*_lone_train`` split and score it on ``*_lone_test``.

    Reads the notebook-level ``X_lone_train``, ``y_lone_train``, ``X_lone_test``
    and ``y_lone_test``; they must be defined before this function is called.

    Args:
        model: Pipeline to tune. It is mutated: the best parameters are set on
            it and it is fitted on the training split.
        params: Parameter grid handed to ``GridSearchCV`` (``{}`` evaluates only
            the pipeline's current settings).

    Returns:
        One-row DataFrame with the mean absolute error and R2 score between
        ``y_lone_test`` and the predictions, i.e. in whatever encoding
        ``y_lone_test`` uses. The best parameters are printed (the label is
        hard-coded to DecisionTreeClassifier).
    """
    # refit=False: the search only selects the parameters; the one final fit is
    # done on ``model`` below, so the best configuration is not trained twice.
    gsc = GridSearchCV(model, param_grid=params, n_jobs=4, error_score=0.0, refit=False)
    gsc.fit(X_lone_train, y_lone_train)

    print("Best parameters found for DecisionTreeClassifier :", gsc.best_params_)
    model.set_params(**gsc.best_params_)
    model.fit(X_lone_train, y_lone_train)

    pred = model.predict(X_lone_test)

    # The trailing space in the first key is kept so the returned column names stay unchanged.
    scores = {
        "Mean Absolute Error ": [mean_absolute_error(y_lone_test, pred)],
        "R2 score": [r2_score(y_lone_test, pred)],
    }
    return pd.DataFrame(scores)


In [None]:
# Seeded decision tree without a scaler; the empty grid means no hyper-parameters are searched.
grid_search_and_test_2(
    model=get_classifier_and_scaler_2(classifier=DecisionTreeClassifier(random_state=100)),
    params={},
)


Best parameters found for DecisionTreeClassifier : {}


Unnamed: 0,Mean Absolute Error,R2 score
0,33.76,0.21801


For sport


In [None]:
from sklearn.metrics import mean_absolute_error, r2_score


def grid_search_and_test_2(model: Pipeline, params: Dict[str, List[float]]) -> pd.DataFrame:
    """Grid-search ``model`` on the ``*_sport_train`` split and score it on ``*_sport_test``.

    Reads the notebook-level ``X_sport_train``, ``y_sport_train``,
    ``X_sport_test`` and ``y_sport_test``; they must be defined before this
    function is called.

    Args:
        model: Pipeline to tune. It is mutated: the best parameters are set on
            it and it is fitted on the training split.
        params: Parameter grid handed to ``GridSearchCV`` (``{}`` evaluates only
            the pipeline's current settings).

    Returns:
        One-row DataFrame with the mean absolute error and R2 score between
        ``y_sport_test`` and the predictions, i.e. in whatever encoding
        ``y_sport_test`` uses. The best parameters are printed (the label is
        hard-coded to RandomForestClassifier).
    """
    # refit=False: the search only selects the parameters; the one final fit is
    # done on ``model`` below, so the best configuration is not trained twice.
    gsc = GridSearchCV(model, param_grid=params, n_jobs=4, error_score=0.0, refit=False)
    gsc.fit(X_sport_train, y_sport_train)

    print("Best parameters found for RandomForestClassifier :", gsc.best_params_)
    model.set_params(**gsc.best_params_)
    model.fit(X_sport_train, y_sport_train)

    pred = model.predict(X_sport_test)

    # The trailing space in the first key is kept so the returned column names stay unchanged.
    scores = {
        "Mean Absolute Error ": [mean_absolute_error(y_sport_test, pred)],
        "R2 score": [r2_score(y_sport_test, pred)],
    }
    return pd.DataFrame(scores)


In [None]:
# Seeded random forest without a scaler; the empty grid means no hyper-parameters are searched.
# NOTE(review): the saved output of this cell is a TerminatedWorkerError (a parallel
# worker died); possibly a memory limit — confirm before relying on this cell.
grid_search_and_test_2(
    model=get_classifier_and_scaler_2(classifier=RandomForestClassifier(random_state=100)),
    params={},
)


TerminatedWorkerError: ignored

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score


def grid_search_and_test_2(model: Pipeline, params: Dict[str, List[float]]) -> pd.DataFrame:
    """Grid-search ``model`` on the ``*_sport_train`` split and score it on ``*_sport_test``.

    Reads the notebook-level ``X_sport_train``, ``y_sport_train``,
    ``X_sport_test`` and ``y_sport_test``; they must be defined before this
    function is called.

    Args:
        model: Pipeline to tune. It is mutated: the best parameters are set on
            it and it is fitted on the training split.
        params: Parameter grid handed to ``GridSearchCV`` (``{}`` evaluates only
            the pipeline's current settings).

    Returns:
        One-row DataFrame with the mean absolute error and R2 score between
        ``y_sport_test`` and the predictions, i.e. in whatever encoding
        ``y_sport_test`` uses. The best parameters are printed (the label is
        hard-coded to DecisionTreeClassifier).
    """
    # refit=False: the search only selects the parameters; the one final fit is
    # done on ``model`` below, so the best configuration is not trained twice.
    gsc = GridSearchCV(model, param_grid=params, n_jobs=4, error_score=0.0, refit=False)
    gsc.fit(X_sport_train, y_sport_train)

    print("Best parameters found for DecisionTreeClassifier :", gsc.best_params_)
    model.set_params(**gsc.best_params_)
    model.fit(X_sport_train, y_sport_train)

    pred = model.predict(X_sport_test)

    # The trailing space in the first key is kept so the returned column names stay unchanged.
    scores = {
        "Mean Absolute Error ": [mean_absolute_error(y_sport_test, pred)],
        "R2 score": [r2_score(y_sport_test, pred)],
    }
    return pd.DataFrame(scores)


In [None]:
# Seeded decision tree without a scaler; the empty grid means no hyper-parameters are searched.
grid_search_and_test_2(
    model=get_classifier_and_scaler_2(classifier=DecisionTreeClassifier(random_state=100)),
    params={},
)


Best parameters found for DecisionTreeClassifier : {}


Unnamed: 0,Mean Absolute Error,R2 score
0,63.287835,-0.404066
