In [None]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
from lightgbm.sklearn import LGBMRegressor
from xgboost import XGBRegressor, plot_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor, export_graphviz

from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import pickle
import torch
import csv

In [None]:
# torch is used here only to detect whether a CUDA GPU is visible; later cells
# branch on `device.type` to choose which file paths to read and write.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device.type}")

### Define scoring metrics and CV score function

In [None]:
# Scorers for scikit-learn cross-validation, as {display name: scorer id}.
# The scorer is the negated RMSE, so values closer to zero are better.
scoring_metrics = {"neg RMSE": "neg_root_mean_squared_error"}

In [None]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Cross-validate a model and summarise every reported column as "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn compatible estimator or pipeline
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        forwarded unchanged to ``cross_validate`` (for example ``scoring``,
        ``cv`` or ``return_train_score``)

    Returns
    ----------
        pandas Series of strings, indexed by the keys ``cross_validate``
        returned, each formatted as "mean (+/- std)" with three decimals
    """

    # Build the per-fold table once and derive both statistics from it.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    mean_scores = scores.mean()
    std_scores = scores.std()

    # Iterate over the values instead of using mean_scores[i]: the index holds
    # string labels, and integer lookups on such a Series rely on a positional
    # fallback that pandas has deprecated.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

### Load CSV files

In [None]:
# The CUDA environment reads from ../input/mds-snowbies, the CPU one from ../data.
train_path, test_path = (
    ('../input/mds-snowbies/train.csv', '../input/mds-snowbies/test.csv')
    if device.type == "cuda"
    else ('../data/train.csv', '../data/test.csv')
)

df = pd.read_csv(train_path)
X_test_submit = pd.read_csv(test_path)

### Any manual feature engineering before column transformation

In [None]:
# Table with a facility_type column and its facility_class label; the file sits in
# a different place depending on the environment.
facility_class_path = (
    "../input/mds-snowbies/f_type.csv" if device.type == "cuda" else "f_type.csv"
)
facility_class = pd.read_csv(facility_class_path)

# Display the distinct classes present in the lookup.
facility_class["facility_class"].unique()

In [None]:
# Attach the facility_class label to every row through its facility_type.
# The guard keeps this cell idempotent: merging a second time would produce
# facility_class_x / facility_class_y columns and break the column-grouping
# assertion further down.
# NOTE(review): this is pandas' default inner join, so rows whose facility_type
# is absent from the lookup are dropped - confirm that is intended.
if "facility_class" not in df.columns:
    df = pd.merge(df, facility_class, on="facility_type")

df.head(3)

In [None]:
# Display the (rows, columns) of df after the facility_class merge.
df.shape

In [None]:
# Exclusive lower bounds in degrees, checked from the highest down, with the label
# given to the first bound a value exceeds. Anything at or below 22.5 - and any
# missing value, since NaN fails every comparison - falls through to "N".
# NOTE(review): on a standard compass 45 degrees is NE and 315 is NW, so these
# labels look mirrored. They are kept exactly as they were because the submission
# cell applies the same mapping and the two must stay in sync.
direction_bins = [
    (337.5, "N"),
    (292.5, "NE"),
    (247.5, "E"),
    (202.5, "SE"),
    (157.5, "S"),
    (112.5, "SW"),
    (67.5, "W"),
    (22.5, "NW"),
]

for source_col, label_col in (
    ("direction_max_wind_speed", "dir_max_wind_speed"),
    ("direction_peak_wind_speed", "dir_peak_wind_speed"),
):
    degrees = df[source_col].to_numpy()
    df[label_col] = np.select(
        [degrees > bound for bound, _ in direction_bins],
        [label for _, label in direction_bins],
        default="N",
    )

##### Checking the data, I realized that the mean wind direction is 62 degrees, which aligns with the NE that we are getting above.

### Ditch MICE, use KNN instead

### Group columns for transformations

In [None]:
target = "site_eui"

# Calendar months, used to generate the per-month column names below.
_months = [
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
]
# Only these months keep their raw min/avg/max temperatures; the other ten months'
# temperature columns are similar and go to drop_features.
_kept_temp_months = ["january", "july"]
_temp_stats = ("min", "avg", "max")

numeric_features = (
    [
        "floor_area",
        "Year_Factor",
        "energy_star_rating",  # missing values are imputed before the main pipeline runs
    ]
    + [f"{month}_{stat}_temp" for month in _kept_temp_months for stat in _temp_stats]
    + [
        "cooling_degree_days",
        "heating_degree_days",
        "precipitation_inches",
        "snowfall_inches",
        "snowdepth_inches",
        "days_below_20F",
        "days_above_90F",
    ]
    # One "<month>_avg_temp_diff" column for February through December.
    + [f"{month}_avg_temp_diff" for month in _months[1:]]
)

# year_built gets its own transformer (constant imputation instead of kNN).
year_features = ["year_built"]
ordinal_features = []
categorical_features = [
    "State_Factor",
    "facility_class",
    "facility_type",
    "dir_max_wind_speed",  # compass label derived from direction_max_wind_speed
    "dir_peak_wind_speed",  # compass label derived from direction_peak_wind_speed
]

# Columns deliberately excluded from the model.
drop_features = (
    [
        "id",
        "building_class",
        "ELEVATION",
        "direction_max_wind_speed",
        "direction_peak_wind_speed",
    ]
    + [
        f"{month}_{stat}_temp"
        for month in _months
        if month not in _kept_temp_months
        for stat in _temp_stats
    ]
    + [
        "avg_temp",
        "days_below_30F",
        "days_below_10F",
        "days_below_0F",
        "days_above_80F",
        "days_above_100F",
        "days_above_110F",
        "max_wind_speed",
        "days_with_fog",
    ]
)

# Every column of df must belong to exactly one group. Comparing the names rather
# than only the counts catches a typo or a duplicate that happens to leave the
# total unchanged, and the messages say which columns are wrong.
grouped_columns = (
    numeric_features
    + year_features
    + ordinal_features
    + categorical_features
    + [target]
    + drop_features
)
duplicated_columns = sorted(
    {col for col in grouped_columns if grouped_columns.count(col) > 1}
)
unknown_columns = sorted(set(grouped_columns) - set(df.columns))
ungrouped_columns = sorted(set(df.columns) - set(grouped_columns))

assert not duplicated_columns, f"Columns listed in more than one group: {duplicated_columns}"
assert not unknown_columns, f"Grouped columns that are not in df: {unknown_columns}"
assert not ungrouped_columns, f"df columns not assigned to any group: {ungrouped_columns}"

### Split data for CV

In [None]:
# Hold out 20% of the labelled rows as a validation set (fixed seed for repeatability).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)

# Separate the predictors from the target in each split.
X_train = train_df.drop(columns=[target])
y_train = train_df[target]
X_test = test_df.drop(columns=[target])
y_test = test_df[target]

### Knn Regression for energy star rating

In [None]:
# Predictors for the auxiliary kNN model that estimates missing energy_star_rating.
knn_features = ["floor_area", "year_built", "facility_type"]

# Fit only on training rows where the rating is actually observed.
has_rating = X_train["energy_star_rating"].notna()
train_df_energy_notna = X_train[has_rating]
X_energy = train_df_energy_notna[knn_features]
y_energy = train_df_energy_notna["energy_star_rating"]

# Rows still lacking a rating; they are filled with kNN predictions later.
train_df_energy_isna = X_train[~has_rating]
test_df_energy_isna = X_test[X_test["energy_star_rating"].isna()]

In [None]:
# floor_area: fill gaps with KNNImputer, then standardise.
numeric_transformer_energy = make_pipeline(
    KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean'),
    StandardScaler(),
)
# year_built: missing years become the constant 1930 before standardising.
year_transformer_energy = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=1930),
    StandardScaler(),
)

# `sparse=True` is no longer passed explicitly. It is the encoder's default, and the
# keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4, so
# leaving it out gives the same sparse output on every version.
categorical_transformer_energy = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
)

# Preprocessing for the auxiliary kNN regressor: one transformer per kNN input.
preprocessor_energy = make_column_transformer(
    (numeric_transformer_energy, ["floor_area"]),
    (year_transformer_energy, ["year_built"]),
    (categorical_transformer_energy, ["facility_type"]),
)

### KNN Hyperparameter tuning

In [None]:
# for i in np.arange(1,10):

#     pipe_knn_energy = make_pipeline(
#         preprocessor_energy, KNeighborsRegressor(n_neighbors=i,n_jobs=-1)
#     )

#     results_energy["knn_"+str(i)] = mean_std_cross_val_scores(
#         pipe_knn_energy, X_train_energy, y_train_energy, return_train_score=True, scoring=scoring_metrics
#     )

In [None]:
# pd.DataFrame(results_energy).T

### Run if requires retrain

In [None]:
# kNN regressor for energy_star_rating: 3 neighbours, closer neighbours weigh more.
pipe_knn_energy = make_pipeline(
    preprocessor_energy,
    KNeighborsRegressor(weights="distance", n_neighbors=3, n_jobs=-1),
)

In [None]:
# Fit the kNN pipeline on the training rows that have an observed energy_star_rating.
pipe_knn_fitted = pipe_knn_energy.fit(X_energy, y_energy)

### Save the trained KNN regression model

### Load the trained KNN regression model

### Impute X_train

In [None]:
# kNN inputs of the training rows that have no energy_star_rating.
X_train_df_na = train_df_energy_isna[knn_features]

# Predict the missing ratings and cap them at 100 (presumably the top of the
# rating scale - confirm).
y_pred = np.minimum(pipe_knn_fitted.predict(X_train_df_na), 100)
y_pred_df = pd.DataFrame({"energy_star_rating": y_pred}, index=X_train_df_na.index)

# Write the predictions into exactly the rows that were missing a rating.
X_train.loc[y_pred_df.index, "energy_star_rating"] = y_pred_df["energy_star_rating"]

### Impute X_test (our validation set)

In [None]:
# kNN inputs of the validation rows that have no energy_star_rating.
X_test_df_na = test_df_energy_isna[knn_features]

# Same treatment as the training split: predict, then cap at 100.
y_pred = np.minimum(pipe_knn_fitted.predict(X_test_df_na), 100)
y_pred_df = pd.DataFrame({"energy_star_rating": y_pred}, index=X_test_df_na.index)

# Write the predictions into exactly the rows that were missing a rating.
X_test.loc[y_pred_df.index, "energy_star_rating"] = y_pred_df["energy_star_rating"]

In [None]:
# After imputation neither split may contain a missing energy_star_rating.
remaining_missing = sum(
    frame["energy_star_rating"].isna().sum() for frame in (X_train, X_test)
)
assert remaining_missing == 0

### End of Knn

### Column transformation & preprocessors

In [None]:
# Numeric columns: fill gaps with KNNImputer, then standardise.
numeric_transformer = make_pipeline(
    KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean'),
    StandardScaler(),
)
# year_built: missing years become the constant 1930 before standardising.
year_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=1930),
    StandardScaler(),
)
# `sparse=True` is no longer passed explicitly. It is the encoder's default, and the
# keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4, so
# leaving it out gives the same sparse output on every version.
categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
)

In [None]:
# Main column transformer. Keep the order of these entries: a later cell looks up
# the categorical pipeline under the auto-generated name "pipeline-3" and builds
# the output column names in numeric, year, categorical order.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (year_transformer, year_features),        
    (categorical_transformer, categorical_features),
    # Columns in drop_features are removed from the output.
    ("drop", drop_features),
)

### Check transformed df

In [None]:
# Fit the preprocessor on the training split and keep the transformed matrix for inspection.
X_train_transformed = preprocessor.fit_transform(X_train)

In [None]:
# Output column names in the order the preprocessor emits them: numeric, year,
# then the one-hot columns of the categorical pipeline ("pipeline-3" is the name
# make_column_transformer generated for the third transformer).
column_names = (
    numeric_features
    + year_features
    + preprocessor.named_transformers_["pipeline-3"]
    .named_steps["onehotencoder"]
    .get_feature_names_out(categorical_features)
    .tolist()
)

# ColumnTransformer only returns a scipy sparse matrix while the stacked output is
# sparse enough (its sparse_threshold); otherwise the result is already a dense
# ndarray, which has no .toarray().
X_train_dense = (
    X_train_transformed.toarray()
    if hasattr(X_train_transformed, "toarray")
    else X_train_transformed
)

X_train_transformed_df = pd.DataFrame(
    X_train_dense, columns=column_names, index=X_train.index
)

X_train_transformed_df.head()

### Dummy regressor as baseline

In [None]:
# Baseline: DummyRegressor with default settings, cross-validated on the training split.
# It is scored with the notebook's shared `scoring_metrics` (negated RMSE) instead of
# a one-off MSE scorer, so the baseline is on the same scale as the RMSE used for the
# models.
results = {}
pipe_dummy = DummyRegressor()
results["Dummy"] = mean_std_cross_val_scores(
    pipe_dummy, X_train, y_train, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T

### Feature selection

### Train several models (CV) and retrieve the score

In [None]:
# Ridge and random forest are defined for comparison but are left out of `models`.
pipe_ridge = make_pipeline(preprocessor, Ridge(random_state=123))
pipe_rf = make_pipeline(
    preprocessor,
    RandomForestRegressor(random_state=123, n_estimators=300, n_jobs=-1),
)

# The three boosted-tree models run on CPU settings; no GPU options are enabled.
xgb_params = dict(
    random_state=123,
    n_jobs=-1,
    verbosity=0,
    n_estimators=10000,
)
pipe_xgb = make_pipeline(preprocessor, XGBRegressor(**xgb_params))

lgbm_params = dict(
    random_state=123,
    verbose=0,
    n_estimators=10000,
    drop_seed=123,
)
pipe_lgbm = make_pipeline(preprocessor, LGBMRegressor(**lgbm_params))

catboost_params = dict(
    verbose=0,
    early_stopping_rounds=10,
    random_seed=123,
    max_depth=12,
    learning_rate=0.025,
    loss_function='RMSE',
    eval_metric='RMSE',
    iterations=20000,
)
pipe_catboost = make_pipeline(preprocessor, CatBoostRegressor(**catboost_params))

# Candidate pipelines by display name; this dict also feeds the stacking ensemble.
models = {
    "XGBoost": pipe_xgb,
    "LightGBM": pipe_lgbm,
    "CatBoost": pipe_catboost,
}


### Hyperparameter tuning

## Averaging

## Stacking

In [None]:
# Stack the candidate pipelines. The (name, estimator) pairs are passed as a real
# list because a dict view does not work with cross-validation.
stacking_model = StackingRegressor(estimators=list(models.items()))

### Test the selected model

### Generate csv for submission

In [None]:
# Transformation of test set
# Left join against a de-duplicated lookup so the submission keeps exactly one row
# per input row: the default inner join silently drops rows whose facility_type is
# missing from the lookup, and duplicate lookup keys would multiply rows.
# The guard makes the cell safe to re-run (a second merge would create
# facility_class_x / facility_class_y).
if "facility_class" not in X_test_submit.columns:
    X_test_submit = pd.merge(
        X_test_submit,
        facility_class.drop_duplicates(subset="facility_type"),
        on="facility_type",
        how="left",
    )
    # Rows without a match get a placeholder label. The encoder is configured with
    # handle_unknown="ignore", so a label it has not seen is encoded as all zeros.
    X_test_submit["facility_class"] = X_test_submit["facility_class"].fillna("unknown")

# Impute energy_star_rating by kNN
X_test_submit_energy_isna = X_test_submit[X_test_submit["energy_star_rating"].isna()]
X_test_submit_na = X_test_submit_energy_isna[knn_features]
# predict() raises on an empty frame, so skip the step when nothing is missing.
if not X_test_submit_na.empty:
    # Cap predictions at 100, as done for the training and validation splits.
    y_pred = np.minimum(pipe_knn_fitted.predict(X_test_submit_na), 100)
    X_test_submit.loc[X_test_submit_na.index, "energy_star_rating"] = y_pred

# Compass labels for the two wind-direction columns. The thresholds and labels are
# intentionally identical to the ones applied to the training data: exclusive lower
# bounds checked from the highest down, with anything at or below 22.5 (and any
# missing value) labelled "N".
# NOTE(review): on a standard compass 45 degrees is NE and 315 is NW, so the labels
# look mirrored; change both cells together if this is corrected.
direction_bins = [
    (337.5, "N"),
    (292.5, "NE"),
    (247.5, "E"),
    (202.5, "SE"),
    (157.5, "S"),
    (112.5, "SW"),
    (67.5, "W"),
    (22.5, "NW"),
]

for source_col, label_col in (
    ("direction_max_wind_speed", "dir_max_wind_speed"),
    ("direction_peak_wind_speed", "dir_peak_wind_speed"),
):
    degrees = X_test_submit[source_col].to_numpy()
    X_test_submit[label_col] = np.select(
        [degrees > bound for bound, _ in direction_bins],
        [label for _, label in direction_bins],
        default="N",
    )

#value_floor = X_test_submit["floor_area"]
#X_test_submit['ord_floor_area'] =  np.where(value_floor > 261980, 7,
#                              np.where(value_floor > 148466, 6,
#                                     np.where(value_floor > 105070, 5,
#                                            np.where(value_floor > 80088, 4,
#                                                  np.where(value_floor > 65333, 3,
#                                                         np.where(value_floor > 53250, 2, 1))))))

## Select your submission model

In [None]:
# Pipeline that is refit on all labelled data and used to predict the submission;
# the trailing comment names the stacking ensemble as the alternative.
select_model = pipe_xgb #stacking_model

In [None]:
# Refit the selected model on every labelled row (training + validation splits).
X_final = pd.concat((X_train, X_test), ignore_index=True)
y_final = pd.concat((y_train, y_test), ignore_index=True)

# The trailing semicolon suppresses the estimator repr in the cell output.
pipe_fitted = select_model.fit(X_final, y_final);

In [None]:
# Two-column submission frame: the row id and the predicted site_eui.
submission = X_test_submit[["id"]].assign(
    site_eui=select_model.predict(X_test_submit)
)
submission.head()

In [None]:
# Description attached to the Kaggle submission.
message = "add monthly temp diff with xgboost"

# Output location depends on the environment; `path` is reused by the submit cell.
path = "../working/test.csv" if device.type == "cuda" else "submission/test.csv"
submission.to_csv(path, index=False)

## Submit to Kaggle

In [None]:
submission = !kaggle competitions submit -c widsdatathon2022  -f $path -m "$message"

In [None]:
# show your latest score
result = !kaggle competitions submissions -c widsdatathon2022 -v
latest = pd.DataFrame(data=result)[0].str.split(',',expand=True).iloc[1:3,0:5]
latest.columns = latest.iloc[0]
latest = latest[1:]
latest

## Logging (after submission)

In [None]:
log = {}
result = !kaggle competitions submissions -c widsdatathon2022 -v
log['time'] = pd.DataFrame(data=result)[0].str.split(',',expand=True).loc[2,1]
log['model'] = pipe_xgb
log['columns'] = column_names
log['score'] =pd.DataFrame(data=result)[0].str.split(',',expand=True).loc[2, 4]
log['message'] = pd.DataFrame(data=result)[0].str.split(',',expand=True).loc[2,2]

In [None]:
# Append one row per submission. newline='' is what the csv module requires of files
# it writes to; without it, platforms that translate line endings produce a blank
# line after every row.
with open(r'log.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(list(log.values()))