In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression, Ridge, RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.svm import SVC, SVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm.sklearn import LGBMRegressor

#from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import RFE, RFECV, SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    ShuffleSplit,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
    StandardScaler,
)

In [2]:
# Load the cleaned dataset and discard every row that contains a missing value.
# (Bare `df.shape` / `df.head()` expressions used to sit here; in the middle of
# a cell their values are computed and thrown away, so they were removed. Put
# them at the end of their own cell if you want to inspect the frame.)
df = pd.read_csv("../data/processed/clean_data.csv").dropna()

# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible across kernel restarts.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Separate the predictors from the regression target column "bloom_doy".
X_train, y_train = train_df.drop(columns=["bloom_doy"]), train_df["bloom_doy"]
X_test, y_test = test_df.drop(columns=["bloom_doy"]), test_df["bloom_doy"]

In [5]:
# Name of the column the models are trained to predict.
target = "bloom_doy"

# Columns excluded from the feature matrix.
drop_features = ["year"]

# Columns treated as categorical.
categorical_features = ["country", "city"]

# Columns treated as numeric.
numeric_features = [
    "lat", "long", "alt",
    "tmax", "tmin", "prcp",
    "agdd_winter", "tmax_winter", "prcp_winter",
    "co2_percapita", "co2_emission",
]

In [6]:
# Stand-alone transformer instances. They are not used by `preprocessor` below
# (which builds its own); they are kept because later cells may reference them.
scaler = StandardScaler()
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
    enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    # Older scikit-learn only knows the original `sparse` keyword (deprecated
    # in 1.2 and removed in 1.4, where passing it raises TypeError).
    enc = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Column-wise preprocessing shared by every model pipeline.
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),  # scaling on numeric features
    (OneHotEncoder(handle_unknown="ignore"), categorical_features),  # OHE on categorical features
    ("drop", drop_features),  # drop features
)

# Sanity check: shape of the transformed training matrix (rows, features).
preprocessor.fit_transform(X_train).shape

(4327, 120)

In [7]:
# Maps a model's display name to its formatted cross-validation summary.
results = dict()

In [8]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Run cross-validation and summarise every result column as "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        Forwarded unchanged to ``cross_validate`` (e.g. ``scoring``,
        ``return_train_score``).

    Returns
    ----------
        pandas Series of strings, one per key returned by ``cross_validate``
        (fit_time, score_time, test_*, train_*), each formatted as
        ``"<mean> (+/- <std>)"`` with three decimals. The std is pandas'
        default (sample) standard deviation across the folds.
    """

    # Build the per-fold table once and derive both statistics from it.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    mean_scores = scores.mean()
    std_scores = scores.std()

    # Both Series share the same label order, so pair their values directly.
    # Integer keys such as mean_scores[i] on a string-labelled Series rely on a
    # positional fallback that pandas deprecated in 2.x and removed in 3.0.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [9]:
# make a scorer function that we can pass into cross-validation

def mape(true, pred):
    """
    Mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    true : array-like
        Observed target values. Zeros are not handled specially: they lead to
        a division by zero and an inf/nan result.
    pred : array-like
        Predicted values, in the same order as ``true``.

    Returns
    ----------
        float, ``100 * mean(|pred - true| / |true|)``
    """
    # Coerce to plain float arrays so the arithmetic is purely positional.
    # Without this, two pandas Series with different indexes are aligned by
    # label (producing NaN), and plain Python lists do not work at all.
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return 100.0 * np.mean(np.abs((pred - true) / true))

# MAPE is an error (lower is better), so the scorer is built with
# greater_is_better=False; the values it reports come out sign-flipped.
mape_scorer = make_scorer(
    score_func=mape,
    greater_is_better=False,
)

In [10]:
# Metrics evaluated in every cross-validation run: display label -> scorer
# (either a built-in scikit-learn scorer name or a scorer object).
scoring_metrics = dict(
    [
        ("neg RMSE", "neg_root_mean_squared_error"),
        ("r2", "r2"),
        ("mape", mape_scorer),
    ]
)

In [11]:
# One pipeline per candidate regressor, keyed by display name. Every pipeline
# starts with the shared preprocessor, so all models are fed the same
# transformed features.
models = {
    "Baseline": make_pipeline(preprocessor, DummyRegressor()),
    "Ridge": make_pipeline(preprocessor, Ridge(random_state=123)),
    "Lasso": make_pipeline(preprocessor, Lasso(random_state=123)),
    "Decision Tree": make_pipeline(preprocessor, DecisionTreeRegressor(random_state=123)),
    "CatBoost": make_pipeline(preprocessor, CatBoostRegressor(random_state=123, verbose=0)),
    "XGBoost": make_pipeline(preprocessor, XGBRegressor(random_state=123, verbosity=0)),
    "LGBM": make_pipeline(preprocessor, LGBMRegressor(random_state=123)),
    "Support Vector": make_pipeline(preprocessor, SVR(epsilon=0.2)),
    "KNN": make_pipeline(
        preprocessor,
        KNeighborsRegressor(n_neighbors=30, weights="uniform", algorithm="auto"),
    ),
}

# Cross-validate each pipeline on the training split and record its formatted
# "mean (+/- std)" summary in the results dictionary under the model's name.
for model_name, model in models.items():
    results[model_name] = mean_std_cross_val_scores(
        model, X_train, y_train, return_train_score=True, scoring=scoring_metrics
    )

In [12]:
# Side-by-side comparison: one column per model, one row per CV statistic.
pd.DataFrame(data=results)

Unnamed: 0,Baseline,Ridge,Lasso,Decision Tree,CatBoost,XGBoost,LGBM,Support Vector,KNN
fit_time,0.009 (+/- 0.001),0.012 (+/- 0.003),0.037 (+/- 0.001),0.082 (+/- 0.001),1.516 (+/- 0.071),0.194 (+/- 0.010),0.137 (+/- 0.042),0.355 (+/- 0.020),0.009 (+/- 0.000)
score_time,0.005 (+/- 0.001),0.004 (+/- 0.001),0.004 (+/- 0.001),0.005 (+/- 0.001),0.008 (+/- 0.001),0.013 (+/- 0.001),0.008 (+/- 0.001),0.063 (+/- 0.001),0.112 (+/- 0.005)
test_neg RMSE,-21.795 (+/- 1.233),-5.781 (+/- 0.225),-9.869 (+/- 0.400),-6.500 (+/- 0.323),-4.252 (+/- 0.075),-4.356 (+/- 0.106),-4.271 (+/- 0.101),-9.266 (+/- 0.850),-6.888 (+/- 0.287)
train_neg RMSE,-21.806 (+/- 0.312),-5.552 (+/- 0.042),-9.820 (+/- 0.068),0.000 (+/- 0.000),-2.592 (+/- 0.027),-1.531 (+/- 0.066),-2.784 (+/- 0.027),-9.095 (+/- 0.098),-6.639 (+/- 0.113)
test_r2,-0.003 (+/- 0.002),0.929 (+/- 0.009),0.794 (+/- 0.008),0.910 (+/- 0.010),0.962 (+/- 0.004),0.960 (+/- 0.004),0.961 (+/- 0.004),0.819 (+/- 0.013),0.899 (+/- 0.011)
train_r2,0.000 (+/- 0.000),0.935 (+/- 0.002),0.797 (+/- 0.003),1.000 (+/- 0.000),0.986 (+/- 0.000),0.995 (+/- 0.000),0.984 (+/- 0.001),0.826 (+/- 0.004),0.907 (+/- 0.004)
test_mape,-21.041 (+/- 2.937),-5.116 (+/- 0.418),-9.745 (+/- 1.072),-5.193 (+/- 0.596),-3.635 (+/- 0.340),-3.593 (+/- 0.359),-3.600 (+/- 0.354),-8.964 (+/- 1.469),-5.962 (+/- 0.649)
train_mape,-21.031 (+/- 0.605),-4.934 (+/- 0.093),-9.696 (+/- 0.216),0.000 (+/- 0.000),-2.328 (+/- 0.047),-1.263 (+/- 0.037),-2.421 (+/- 0.058),-8.716 (+/- 0.247),-5.735 (+/- 0.162)


## Feature Selection 

In [13]:
# LGBM preceded by L1-based feature selection: after preprocessing,
# SelectFromModel uses a Lasso to choose features, and the LGBM regressor is
# trained on the output of that selection step.
pipe_l1_lgbm = make_pipeline(
    preprocessor,
    SelectFromModel(estimator=Lasso(max_iter=100_000, alpha=0.01)),
    LGBMRegressor(random_state=123),
)

In [14]:
# Cross-validate the feature-selection pipeline with the same metrics as the
# other models and add its summary to the results dictionary.
results["LGBM + L1 regularization"] = mean_std_cross_val_scores(
    pipe_l1_lgbm,
    X_train,
    y_train,
    return_train_score=True,
    scoring=scoring_metrics,
)

In [15]:
# Show the comparison table again, now including every model evaluated so far.
pd.DataFrame(data=results)

Unnamed: 0,Baseline,Ridge,Lasso,Decision Tree,CatBoost,XGBoost,LGBM,Support Vector,KNN,LGBM + L1 regularization
fit_time,0.009 (+/- 0.001),0.012 (+/- 0.003),0.037 (+/- 0.001),0.082 (+/- 0.001),1.516 (+/- 0.071),0.194 (+/- 0.010),0.137 (+/- 0.042),0.355 (+/- 0.020),0.009 (+/- 0.000),0.338 (+/- 0.036)
score_time,0.005 (+/- 0.001),0.004 (+/- 0.001),0.004 (+/- 0.001),0.005 (+/- 0.001),0.008 (+/- 0.001),0.013 (+/- 0.001),0.008 (+/- 0.001),0.063 (+/- 0.001),0.112 (+/- 0.005),0.008 (+/- 0.001)
test_neg RMSE,-21.795 (+/- 1.233),-5.781 (+/- 0.225),-9.869 (+/- 0.400),-6.500 (+/- 0.323),-4.252 (+/- 0.075),-4.356 (+/- 0.106),-4.271 (+/- 0.101),-9.266 (+/- 0.850),-6.888 (+/- 0.287),-4.299 (+/- 0.092)
train_neg RMSE,-21.806 (+/- 0.312),-5.552 (+/- 0.042),-9.820 (+/- 0.068),0.000 (+/- 0.000),-2.592 (+/- 0.027),-1.531 (+/- 0.066),-2.784 (+/- 0.027),-9.095 (+/- 0.098),-6.639 (+/- 0.113),-2.771 (+/- 0.032)
test_r2,-0.003 (+/- 0.002),0.929 (+/- 0.009),0.794 (+/- 0.008),0.910 (+/- 0.010),0.962 (+/- 0.004),0.960 (+/- 0.004),0.961 (+/- 0.004),0.819 (+/- 0.013),0.899 (+/- 0.011),0.961 (+/- 0.004)
train_r2,0.000 (+/- 0.000),0.935 (+/- 0.002),0.797 (+/- 0.003),1.000 (+/- 0.000),0.986 (+/- 0.000),0.995 (+/- 0.000),0.984 (+/- 0.001),0.826 (+/- 0.004),0.907 (+/- 0.004),0.984 (+/- 0.000)
test_mape,-21.041 (+/- 2.937),-5.116 (+/- 0.418),-9.745 (+/- 1.072),-5.193 (+/- 0.596),-3.635 (+/- 0.340),-3.593 (+/- 0.359),-3.600 (+/- 0.354),-8.964 (+/- 1.469),-5.962 (+/- 0.649),-3.619 (+/- 0.365)
train_mape,-21.031 (+/- 0.605),-4.934 (+/- 0.093),-9.696 (+/- 0.216),0.000 (+/- 0.000),-2.328 (+/- 0.047),-1.263 (+/- 0.037),-2.421 (+/- 0.058),-8.716 (+/- 0.247),-5.735 (+/- 0.162),-2.417 (+/- 0.058)
