In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

import scipy.optimize as so
from scipy.optimize import minimize
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error # Requires sklearn 0.24 (December 2020), update with conda/pip if needed.

from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold

from sklearn.linear_model import Ridge
from sklearn.metrics import roc_curve, precision_recall_curve, auc
from sklearn.preprocessing import PolynomialFeatures
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import make_scorer
# Seed value for reproducible randomness; pass it as `random_state` where needed.
seed = 42

# Statistical Significance 

In [3]:
# Load the forest-fires table from a CSV file in the working directory.
df = pd.read_csv('forestfires.csv')

In [4]:
# Share of rows per month, as a percentage rounded to two decimals.
month_counts = df.month.value_counts()
(month_counts / month_counts.sum() * 100).round(2)

aug    35.59
sep    33.27
mar    10.44
jul     6.19
feb     3.87
jun     3.29
oct     2.90
apr     1.74
dec     1.74
jan     0.39
may     0.39
nov     0.19
Name: month, dtype: float64

In [5]:
# Pool the rare month labels into two combined buckets so that every remaining
# category holds a workable share of the rows:
#   feb, jun                      -> 'fj'
#   oct, apr, dec, jan, may, nov  -> 'oadjmn'
month_groups = {
    'feb': 'fj', 'jun': 'fj',
    'oct': 'oadjmn', 'apr': 'oadjmn', 'dec': 'oadjmn',
    'jan': 'oadjmn', 'may': 'oadjmn', 'nov': 'oadjmn',
}

# Assign the result back rather than calling `df.month.replace(..., inplace=True)`:
# an in-place call on a column pulled out of the frame is chained assignment,
# which pandas warns about and which leaves `df` untouched under copy-on-write.
df['month'] = df['month'].replace(month_groups)

In [6]:
# Same percentage breakdown as before, recomputed on the grouped month labels.
month_counts = df.month.value_counts()
(month_counts / month_counts.sum() * 100).round(2)

aug       35.59
sep       33.27
mar       10.44
oadjmn     7.35
fj         7.16
jul        6.19
Name: month, dtype: float64

# Data Conversion
Convert all categorical columns into numerical indicator columns using `get_dummies`.

In [None]:
# One-hot encode the categorical columns. drop_first=True drops the first level
# of each encoded column, which avoids a redundant (perfectly collinear) dummy.
# NOTE: this rebinds `df`, so the un-encoded frame is no longer reachable afterwards.
df = pd.get_dummies(df, drop_first=True)

# Pipeline

In [None]:
# M1: baseline, a linear regression on the features as given.
M1 = Pipeline(steps=[('lr1', LinearRegression())])

# M2: expand the features to all degree-2 polynomial terms (no constant
# column), then fit a linear regression on the expanded set.
M2 = Pipeline(
    steps=[
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('lr3', LinearRegression()),
    ]
)

## Custom Transformer

`Temperature (temp)` and `Rain (rain)` may be important features, so let's extend model 1 by adding a *cubed* term for temp and a *squared* term for rain.

In [None]:
class KeyFeatures(BaseEstimator, TransformerMixin):
    """Append higher-order terms of ``temp`` and ``rain`` to a DataFrame.

    Stateless transformer: ``fit`` learns nothing. ``transform`` needs ``X``
    to expose ``temp`` and ``rain`` columns and an ``assign`` method, i.e. a
    pandas DataFrame rather than a bare array.
    """

    def fit(self, X, y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a new frame equal to ``X`` plus two derived columns.

        ``temp2`` holds ``temp`` cubed (despite the ``2`` suffix) and
        ``rain2`` holds ``rain`` squared.
        """
        return X.assign(temp2=X.temp ** 3, rain2=X.rain ** 2)

# M3: linear regression on the original features plus the temp-cubed and
# rain-squared columns added by KeyFeatures.
M3 = Pipeline(
    steps=[
        ('temp_cubic_rain_square', KeyFeatures()),
        ('lr2', LinearRegression()),
    ]
)

# K-fold cross-validation

In [None]:
def MSE(y, ypr):
    """Mean squared error between observed values ``y`` and predictions ``ypr``.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    or pandas Series of matching length). Returns the mean of the squared
    differences.
    """
    residuals = y - ypr
    squared = residuals ** 2
    return np.mean(squared)
    
# 4-fold CV without shuffling; the scorer reports the raw (positive) MSE per fold.
# NOTE(review): Xtrain / ytrain are not assigned in any earlier visible cell --
# confirm they are built from the forest-fires frame before this cell runs.
kf = KFold(n_splits=4)
sc = make_scorer(MSE)

cvsc1, cvsc2, cvsc3 = [
    cross_val_score(model, Xtrain, ytrain, cv=kf, scoring=sc)
    for model in (M1, M2, M3)
]

# Mean and spread of the per-fold loss for each model.
for label, scores in (("M1", cvsc1), ("M2", cvsc2), ("M3", cvsc3)):
    print("%s loss: %.4f +/- %.4f" % (label, scores.mean(), scores.std()))

# Trees

## Random Forest

In [None]:
# Load the appliances-energy table and separate the target from the predictors.
df = pd.read_csv('energy_appliances_standard.csv')
y = df["Appliances"]
X = df.drop("Appliances", axis=1)

# Single seed for the tree models: the split and the estimators below all use it.
RANDOM_STATE = 20201107

# Hold out 30% of the rows for testing. The split is seeded with RANDOM_STATE
# (not the unrelated global `seed`) so that it matches the split made in the
# XGBoost cell below and both models are scored on the same held-out rows.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

In [None]:
# Random forest of 250 trees; max_features=None lets every split consider all
# features, and oob_score=True additionally computes an out-of-bag score.
ff = RandomForestRegressor(
    n_estimators=250,
    max_features=None,
    oob_score=True,
    random_state=RANDOM_STATE,
)

ff.fit(Xtrain, ytrain)

# Calculate error over the test set.
# mean_absolute_percentage_error takes (y_true, y_pred) and scales each error by
# |y_true|, so the observed values must be passed first -- swapping the
# arguments divides by the predictions and reports a different number.
y_pred = ff.predict(Xtest)
err_test = mean_absolute_percentage_error(ytest, y_pred)

print("MAPE (test set): %f" % err_test)

## XGBoost

In [None]:
# Reload the data and rebuild the 70/30 split so this cell can run on its own.
df = pd.read_csv('energy_appliances_standard.csv')
y = df["Appliances"]
X = df.drop("Appliances", axis=1)
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# Hyper-parameters for the boosted-tree regressor, gathered in one place.
xgb_params = {
    "learning_rate": 0.1,
    "max_depth": 7,
    "n_estimators": 450,
    "verbosity": 1,                    # Logging level.
    "objective": 'reg:squarederror',   # Squared-error loss for a regression target.
    "booster": 'gbtree',               # Boost trees.
    "n_jobs": -1,                      # Number of parallel jobs.
    "gamma": 0.001,                    # Minimum loss reduction required to split a leaf further (controls growth).
    "subsample": 0.632,                # Row subsample ratio; can be set lower.
    "colsample_bytree": 1,             # Column subsample ratio per tree.
    "colsample_bylevel": 1,            # Column subsample ratio per level.
    "colsample_bynode": 1,             # Column subsample ratio per split.
    "base_score": 0.5,                 # Global bias (initial prediction).
    "random_state": RANDOM_STATE,      # Seed.
}
XGB_opt = XGBRegressor(**xgb_params)

XGB_opt.fit(Xtrain, ytrain)

In [None]:
# Score the boosted model on the held-out set. The metric's signature is
# (y_true, y_pred) and it scales each error by |y_true|, so the observed
# values go first.
y_pred_xgb = XGB_opt.predict(Xtest)
err_test_xgb = mean_absolute_percentage_error(ytest, y_pred_xgb)

In [None]:
# Variable importance
importances = XGB_opt.feature_importances_
indices = np.argsort(importances)[::-1] 

f, ax = plt.subplots(figsize=(3, 8))
plt.title("Variable Importance - XGBoosting")
sns.set_color_codes("pastel")
sns.barplot(y=[Xtrain.columns[i] for i in indices], x=importances[indices], 
            label="Total", color="b")
ax.set(ylabel="Variable",
       xlabel="Variable Importance")
sns.despine(left=True, bottom=True)

# Dimension Reduction

In [None]:
import umap.umap_ as umap
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.linear_model import LogisticRegressionCV

# Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score