<a href="https://colab.research.google.com/github/ashleyrennnnnn/Assignments-Machine-Learning/blob/main/Regression_model_part.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb


In [None]:
# Read the train and test CSVs, using the "gwb_code_10" column as the row index of each frame.
train_data, test_data = (
    pd.read_csv(csv_name, index_col="gwb_code_10")
    for csv_name in ("train-data.csv", "test-data.csv")
)


In [None]:
# Training split: the features are every column except the two regression targets.
X = train_data.drop(columns=["loneliness", "sporting"])
y_loneliness, y_sporting = train_data["loneliness"], train_data["sporting"]

# Test split, separated into features and targets in exactly the same way.
X_test = test_data.drop(columns=["loneliness", "sporting"])
y_test_loneliness, y_test_sporting = test_data["loneliness"], test_data["sporting"]


## Regression model


Ridge, Lasso, LinearRegression


In [None]:
# The imputer that will be used in all models:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsRegressor

# Iterative imputer whose per-feature estimator is a 10-nearest-neighbours regressor.
# random_state is fixed so repeated runs produce the same imputed values.
imputer = IterativeImputer(
    estimator=KNeighborsRegressor(n_neighbors=10),
    max_iter=20,
    tol=1e-3,
    random_state=100,
)

# Fit on the training features only; the test features are transformed with the
# already-fitted imputer so nothing is learned from the test set.
X_pre_imputed = imputer.fit_transform(X)
X_test_pre_imputed = imputer.transform(X_test)

In [None]:
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from typing import Union
from sklearn.model_selection import GridSearchCV, train_test_split
from typing import Union, List, Dict
from sklearn.metrics import mean_absolute_error, r2_score


def grid_search_and_test(model: Pipeline, params: Dict[str, List[float]], target_variable: str = "loneliness", pre_imputed: bool = False, n_jobs: int = 3, verbose: int = 3) -> pd.DataFrame:
    """Grid-search ``model`` with cross-validation, then score the best setting on the test set.

    The function reads the notebook-level globals ``X`` / ``X_test`` (or
    ``X_pre_imputed`` / ``X_test_pre_imputed``) and the ``y_*`` / ``y_test_*``
    target variables; they must be defined before it is called.

    Parameters
    ----------
    model : Pipeline
        Pipeline to tune. It is modified in place: after the call it carries
        the best parameters found and has been fitted on the full training data.
    params : dict
        Parameter grid, passed to ``GridSearchCV`` as ``param_grid``.
    target_variable : str
        Either ``"loneliness"`` or ``"sporting"``.
    pre_imputed : bool
        If True, use the pre-imputed feature matrices instead of the raw ones.
    n_jobs : int
        Number of parallel jobs forwarded to ``GridSearchCV`` (default 3, the
        value that used to be hard-coded).
    verbose : int
        Verbosity forwarded to ``GridSearchCV`` (default 3, the value that used
        to be hard-coded). Lower it for large grids to avoid flooding the output.

    Returns
    -------
    pd.DataFrame
        One row with the columns ``"Mean Absolute Error"`` and ``"R2 score"``,
        both computed on the test set.

    Raises
    ------
    ValueError
        If ``target_variable`` is neither ``"loneliness"`` nor ``"sporting"``.
    """
    # Resolve the target first so that a typo fails immediately, before any search is set up.
    if target_variable == "loneliness":
        y, y_test = y_loneliness, y_test_loneliness
    elif target_variable == "sporting":
        y, y_test = y_sporting, y_test_sporting
    else:
        raise ValueError("Wrong Target variable")

    if pre_imputed:
        X_grid, X_test_grid = X_pre_imputed, X_test_pre_imputed
    else:
        X_grid, X_test_grid = X, X_test

    # refit=False: the best configuration is fitted explicitly on `model` below, so letting
    # GridSearchCV refit its own internal clone as well would only repeat that work.
    # NOTE(review): error_score=0.0 gives a candidate whose fit raises a score of 0.0 instead of
    # aborting the search. If valid candidates can score below 0, a failed candidate could be
    # ranked above them - confirm this is intended.
    gsc = GridSearchCV(model, param_grid=params, n_jobs=n_jobs, error_score=0.0, verbose=verbose, refit=False)
    gsc.fit(X_grid, y)
    print("Best parameters found for the model:", gsc.best_params_)

    # Configure the caller's pipeline with the winning parameters and train it on all training rows.
    model.set_params(**gsc.best_params_)
    model.fit(X_grid, y)

    pred = model.predict(X_test_grid)
    scores = {
        "Mean Absolute Error": [mean_absolute_error(y_test, pred)],
        "R2 score": [r2_score(y_test, pred)],
    }
    # Use this to see the top 5 results
    # display(pd.DataFrame(gsc.cv_results_).sort_values(by="rank_test_score").head(5))
    return pd.DataFrame(scores)


For Loneliness


In [None]:
# Testing a model only requires wrapping it in a pipeline: standardise the features, then fit the estimator.
ridge_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("estimator", Ridge(random_state=100)),
])

# The grid of parameters to search: here 50 log-spaced values of alpha between 1e-5 and 1e1.
params = {"estimator__alpha": np.logspace(-5, 1, num=50)}

# Calling the helper defined above prints the best parameters and returns a DataFrame with the test scores.
# Keep the total number of fits reasonable (i.e. not above roughly 1,000), otherwise the search takes far too long.
# pre_imputed=True makes the search significantly faster because the features are imputed only once up front.
# A small data leakage occurs in that case, which is accepted here.
ridge_result = grid_search_and_test(
    model=ridge_pipe,
    params=params,
    target_variable="loneliness",
    pre_imputed=True,
)
ridge_result

In [None]:
# Lasso on the loneliness target: standardise the features, then search over alpha.
lasso_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("estimator", Lasso(random_state=100)),
])
params = {"estimator__alpha": np.logspace(-5, 1, num=50)}
lasso_result = grid_search_and_test(
    model=lasso_pipe,
    params=params,
    target_variable="loneliness",
    pre_imputed=True,
)
lasso_result

In [None]:
# Plain linear regression on the loneliness target. The grid is empty because
# no hyper-parameters are tuned for this model.
lr_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("estimator", LinearRegression(n_jobs=-1)),
])
params = {}
lr_result = grid_search_and_test(
    model=lr_pipe,
    params=params,
    target_variable="loneliness",
    pre_imputed=True,
)
lr_result

For sporting

In [None]:
# Ridge on the sporting target: same pipeline and alpha grid as for loneliness.
ridge_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("estimator", Ridge(random_state=100)),
])
params = {"estimator__alpha": np.logspace(-5, 1, num=50)}

ridge_result = grid_search_and_test(
    model=ridge_pipe,
    params=params,
    target_variable="sporting",
    pre_imputed=True,
)
ridge_result

In [None]:
# Lasso on the sporting target: same pipeline and alpha grid as for loneliness.
lasso_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("estimator", Lasso(random_state=100)),
])
params = {"estimator__alpha": np.logspace(-5, 1, num=50)}
lasso_result = grid_search_and_test(
    model=lasso_pipe,
    params=params,
    target_variable="sporting",
    pre_imputed=True,
)
lasso_result

In [None]:
# Plain linear regression on the sporting target. The grid is empty because
# no hyper-parameters are tuned for this model.
lr_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("estimator", LinearRegression(n_jobs=-1)),
])
params = {}
lr_result = grid_search_and_test(
    model=lr_pipe,
    params=params,
    target_variable="sporting",
    pre_imputed=True,
)
lr_result

Decision Tree Regressor and Random Forest Regressor


For loneliness 

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from typing import List

In [None]:
# Decision tree on the loneliness target.
# random_state is fixed (same seed as the other models in this notebook) so that the search and
# the final test scores are reproducible; with max_features < 1 the tree is otherwise random.
# NOTE(review): the name `tree` rebinds the `tree` module imported from sklearn in the cell above;
# it is kept here so later cells that use this name still work - consider renaming.
tree = Pipeline([
    ("scaler", StandardScaler()),
    ("estimator", DecisionTreeRegressor(random_state=100)),
])
# 20 x 20 x 20 = 8,000 parameter combinations, each cross-validated - expect a long run.
params = {
    "estimator__max_depth": np.linspace(2, 30, num=20, dtype=int),
    "estimator__max_leaf_nodes": np.linspace(2, 50, num=20, dtype=int),
    "estimator__max_features": np.linspace(0.05, 1, num=20),
}
tree_result = grid_search_and_test(model=tree, params=params, target_variable="loneliness", pre_imputed=True)
tree_result

Fitting 5 folds for each of 25000 candidates, totalling 125000 fits


In [None]:
# Random forest on the loneliness target.
# random_state is fixed (same seed as the other models in this notebook) so that the search and
# the final test scores are reproducible across runs.
forest = Pipeline([
    ("scaler", StandardScaler()),
    ("estimator", RandomForestRegressor(random_state=100)),
])
# 20 x 20 x 20 = 8,000 parameter combinations, each cross-validated - expect a very long run.
params = {
    "estimator__max_depth": np.linspace(2, 30, num=20, dtype=int),
    "estimator__max_leaf_nodes": np.linspace(2, 50, num=20, dtype=int),
    "estimator__max_features": np.linspace(0.05, 1, num=20),
}
forest_result = grid_search_and_test(model=forest, params=params, target_variable="loneliness", pre_imputed=True)
forest_result

For sporting

In [None]:
# Decision tree on the sporting target.
# random_state is fixed (same seed as the other models in this notebook) so that the search and
# the final test scores are reproducible; with max_features < 1 the tree is otherwise random.
# NOTE(review): the name `tree` rebinds the `tree` module imported from sklearn earlier in the
# notebook; it is kept here so later cells that use this name still work - consider renaming.
tree = Pipeline([
    ("scaler", StandardScaler()),
    ("estimator", DecisionTreeRegressor(random_state=100)),
])
# 25 x 40 x 25 = 25,000 parameter combinations, each cross-validated - expect a long run.
params = {
    "estimator__max_depth": np.linspace(2, 30, num=25, dtype=int),
    "estimator__max_leaf_nodes": np.linspace(2, 50, num=40, dtype=int),
    "estimator__max_features": np.linspace(0.05, 1, num=25),
}
tree_result = grid_search_and_test(model=tree, params=params, target_variable="sporting", pre_imputed=True)
tree_result

In [None]:
# Random forest on the sporting target.
# random_state is fixed (same seed as the other models in this notebook) so that the search and
# the final test scores are reproducible across runs.
# max_depth is no longer passed to the constructor: the grid below sets it for every candidate
# and for the final fit, so a constructor value is never used.
forest = Pipeline([
    ("scaler", StandardScaler()),
    ("estimator", RandomForestRegressor(random_state=100)),
])
# 25 x 40 x 25 = 25,000 parameter combinations, each cross-validated - expect a very long run.
params = {
    "estimator__max_depth": np.linspace(2, 30, num=25, dtype=int),
    "estimator__max_leaf_nodes": np.linspace(2, 50, num=40, dtype=int),
    "estimator__max_features": np.linspace(0.05, 1, num=25),
}
forest_result = grid_search_and_test(model=forest, params=params, target_variable="sporting", pre_imputed=True)
forest_result