# Importing Modules

In [1]:
# Standard imports
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import plotly.figure_factory as ff

import seaborn as sns
from tqdm import trange
from colorama import Fore
from glob import glob
import json
from pprint import pprint
import time
import cv2
from enum import Enum
from IPython.display import display
from pandas_profiling import ProfileReport
from IPython.display import HTML, display
import random
import inspect

# For Data preparation
from sklearn.preprocessing import *
from sklearn.model_selection import *
from sklearn.metrics import *

# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, VotingRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor




import warnings
warnings.filterwarnings("ignore")

# Config

In [2]:
class Config(Enum):
    '''
    Central place for the dataset paths and the column groups used below.

    Every member wraps a plain Python value (a path string or a list of
    column names); read it with ``Config.<MEMBER>.value``.
    '''

    def __str__(self):
        # __str__ must return a str. The list-valued members (FEATURES, LABEL)
        # would raise TypeError on print()/str() if the raw value were returned.
        return str(self.value)

    # Input files
    TRAIN_CSV = "../input/wipro-challenge/train.csv"
    TEST_CSV = "../input/wipro-challenge/test.csv"
    SAMPLE_CSV = "../input/wipro-challenge/sample_submission.csv"

    # Columns fed to the models as predictors
    FEATURES = [
            'Year',
            'Month',
            'Day',
            'Hour',
            'Minute',
            'Cloud Type',
            'Dew Point',
            'Temperature',
            'Pressure',
            'Relative Humidity',
            'Solar Zenith Angle',
            'Precipitable Water',
            'Wind Direction',
            'Wind Speed',
            'Fill Flag'
            ]

    # Columns to predict (one model is trained per entry)
    LABEL = [
            'Clearsky DHI', 
            'Clearsky DNI', 
            'Clearsky GHI',
            ]

# Helper functions

In [24]:
def plotCorrelation(df: pd.DataFrame):
    """
    Show the correlation matrix of `df` as an interactive plotly heatmap.

    Args:
        df: frame whose pairwise column correlations (`df.corr()`) are plotted.

    Returns:
        None. The figure is displayed via `fig.show()`.
    """
    # Compute the matrix once and label the axes from the result itself, so the
    # tick labels always line up with the cells even if `corr()` leaves some of
    # the frame's columns out (e.g. non-numeric ones on older pandas versions).
    corr = df.corr()

    data = [
        go.Heatmap(
            z=corr.values,
            x=corr.columns.values,
            y=corr.index.values,
            colorscale='Rainbow',
            reversescale=False,
            opacity=1.0)
    ]

    layout = go.Layout(
        title='Pearson Correlation plot',
        title_x=0.5,
        xaxis=dict(ticks='', nticks=36),
        yaxis=dict(ticks=''),
        width=900, height=700)

    fig = go.Figure(data=data, layout=layout)
    fig.show()
    
    
def plot_scatterMatrix(df: "dataframe", cols: list):
    """
    Draw a plotly scatter-plot matrix for the selected columns of `df`.

    A 1-based running row number is added as an extra `index` column and
    passed to plotly as the `index` argument. The figure is shown, nothing
    is returned.
    """
    selected = df.loc[:, cols]
    row_numbers = np.arange(1, len(selected) + 1)
    matrix_df = selected.assign(index=row_numbers)

    figure = ff.create_scatterplotmatrix(
        matrix_df,
        diag='box',
        index='index',
        colormap='Portland',
        colormap_type='cat',
        height=700,
        width=700,
    )
    figure.show()
    

def create_folds(data, target="label", regression=True, num_splits=5, random_state=None):
    """
    Shuffle `data` and tag every row with a stratified fold id in a `kfold` column.

    For regression the continuous target is first cut into equal-width bins
    (bin count from Sturges' rule, 1 + log2(n)) and the folds are stratified
    on the bin id; otherwise the target column itself is used for stratification.

    Args:
        data: input DataFrame; it is not modified.
        target: name of the target column.
        regression: bin the target before stratifying when True.
        num_splits: number of folds.
        random_state: optional seed for the row shuffle, for reproducible folds.

    Returns:
        A new, shuffled DataFrame with a fresh 0..n-1 index and a `kfold`
        column holding the validation-fold id of each row.
    """
    # Shuffle first: sample()/reset_index() return a new frame, so the columns
    # added below never leak into the caller's DataFrame.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    data["kfold"] = -1
    kf = StratifiedKFold(n_splits=num_splits)

    if regression:
        # Applying Sturges' rule to calculate the no. of bins for target
        num_bins = int(1 + np.log2(len(data)))
        strat_labels = pd.cut(data[target], bins=num_bins, labels=False).values
    else:
        strat_labels = data[target].values

    # Only the validation indices are needed: each row belongs to exactly one
    # validation fold, and that fold number becomes its `kfold` value.
    for fold, (_, val_idx) in enumerate(kf.split(X=data, y=strat_labels)):
        data.loc[val_idx, "kfold"] = fold

    return data


def rmse_score(y_label, y_preds):
    """
    Root of the mean squared error between the true values and the predictions.
    """
    mse = mean_squared_error(y_label, y_preds)
    return np.sqrt(mse)


def trainRegModels(df: pd.DataFrame, useStandardization: bool, features: list, label: str, sortByRMSE = True):
    """
    Cross-validate a fixed set of regression models and summarise their scores.

    For every model and every fold the model is fitted on the rows whose
    `kfold` differs from the fold id and scored on the rows whose `kfold`
    equals it. R2 and RMSE are averaged over the folds.

    Args:
        df: data with the feature columns, the label column and a `kfold`
            column; fold ids are taken to be 0..max(kfold).
        useStandardization: when True, features are standardized with a
            StandardScaler fitted on the training part of each fold.
        features: names of the predictor columns.
        label: name of the target column.
        sortByRMSE: rank by validation RMSE when True, by validation R2 otherwise.

    Returns:
        DataFrame with one row per model and the averaged train/validation
        R2 and RMSE scores, best model first.
    """
    regModels = {
        "LinearRegression": LinearRegression(),
        "KNeighborsRegressor": KNeighborsRegressor(n_neighbors=2),
        "AdaBoostRegressor": AdaBoostRegressor(random_state=0, n_estimators=100),
        "LGBMRegressor": LGBMRegressor(),
        "Ridge": Ridge(alpha=1.0),
        "ElasticNet": ElasticNet(random_state=0),
        "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
        "DecisionTreeRegressor": DecisionTreeRegressor(),
        "ExtraTreesRegressor": ExtraTreesRegressor(n_jobs=-1),
        "RandomForestRegressor": RandomForestRegressor(n_jobs=-1),
        "XGBRegressor": XGBRegressor(n_jobs=-1),
        "CatBoostRegressor": CatBoostRegressor(iterations=900, depth=5, learning_rate=0.05, loss_function='RMSE'),
    }

    # Will return this as a data frame.
    # NOTE: the "RSME" spelling is kept on purpose -- these keys become the
    # column names of the returned frame and callers may select on them.
    summary = {
        "Model": [],
        "Avg R2 Train Score": [],
        "Avg R2 Val Score": [],
        "Avg RSME Train Score": [],
        "Avg RSME Val Score": []
    }

    # Training
    folds = 1 + max(df.kfold.values)
    model_names = list(regModels)  # built once instead of on every iteration
    for idx in trange(len(model_names), desc=f"Models are training, LABEL: {label}...", bar_format="{l_bar}%s{bar:50}%s{r_bar}" % (Fore.CYAN, Fore.RESET), position=0, leave=True):
        name = model_names[idx]
        model = regModels[name]

        # Initializing all the scores to 0
        r2_train = 0
        r2_val = 0
        rmse_train = 0
        rmse_val = 0

        # Running K-fold Cross-validation on every model
        for fold in range(folds):
            train_df = df.loc[df.kfold != fold].reset_index(drop=True)
            val_df = df.loc[df.kfold == fold].reset_index(drop=True)

            train_X = train_df[features]
            train_Y = train_df[label]
            val_X = val_df[features]
            val_Y = val_df[label]

            if useStandardization:
                # The scaler returns new arrays; they must be assigned back or
                # the models silently train on the unscaled features. Fit on the
                # training part only and reuse those statistics for validation.
                ss = StandardScaler()
                train_X = pd.DataFrame(ss.fit_transform(train_X), columns=train_X.columns, index=train_X.index)
                val_X = pd.DataFrame(ss.transform(val_X), columns=val_X.columns, index=val_X.index)

            # CatBoost logs every iteration unless told otherwise
            if name == 'CatBoostRegressor':
                model.fit(train_X, train_Y, verbose=False)
            else:
                model.fit(train_X, train_Y)

            Y_train_preds = model.predict(train_X)
            Y_val_preds = model.predict(val_X)

            # Collecting the scores
            r2_train += r2_score(train_Y, Y_train_preds)
            r2_val += r2_score(val_Y, Y_val_preds)

            rmse_train += rmse_score(train_Y, Y_train_preds)
            rmse_val += rmse_score(val_Y, Y_val_preds)

        # Pushing the scores and the Model names
        summary["Model"].append(name)
        summary["Avg R2 Train Score"].append(r2_train/folds)
        summary["Avg R2 Val Score"].append(r2_val/folds)
        summary["Avg RSME Train Score"].append(rmse_train/folds)
        summary["Avg RSME Val Score"].append(rmse_val/folds)

    # Finally returning the summary dictionary as a dataframe, best model first:
    # lower RMSE is better (ascending), higher R2 is better (descending).
    summary_df = pd.DataFrame(summary)
    if sortByRMSE:
        summary_df = summary_df.sort_values(["Avg RSME Val Score", "Avg R2 Val Score"], ascending=[True, False])
    else:
        summary_df = summary_df.sort_values(["Avg R2 Val Score", "Avg RSME Val Score"], ascending=[False, True])
    return summary_df


def train2Test(train_df: pd.DataFrame, test_df: pd.DataFrame, useStandardization: bool, model_name: str, features: list, label: str, submission_name = "submission_df"):
    """
    Fit one named regressor on the training data and predict `label` for the test data.

    Steps:
    > Train the given model on `train_df[features]` / `train_df[label]`
    > Standardize the features first if `useStandardization` is True
      (scaler fitted on the training features, then applied to the test features)
    > Run inference on `test_df[features]`
    > Write the predictions into `test_df[label]` -- this modifies the caller's
      frame in place, and the notebook reads the predictions back from it
    > Save the test data as a CSV in the `Inference Results` folder

    Args:
        submission_name: output file name, with or without a ".csv" suffix.

    Returns:
        `test_df` with the prediction column added, or None when `model_name`
        is not one of the available models (the valid names are printed).
    """
    regModels = {
        "LinearRegression": LinearRegression(),
        "KNeighborsRegressor": KNeighborsRegressor(n_neighbors=2),
        "AdaBoostRegressor": AdaBoostRegressor(random_state=0, n_estimators=100),
        "LGBMRegressor": LGBMRegressor(),
        "Ridge": Ridge(alpha=1.0),
        "ElasticNet": ElasticNet(random_state=0),
        "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
        "DecisionTreeRegressor": DecisionTreeRegressor(),
        "ExtraTreesRegressor": ExtraTreesRegressor(n_jobs=-1),
        "RandomForestRegressor": RandomForestRegressor(n_jobs=-1),
        "XGBRegressor": XGBRegressor(n_jobs=-1),
        "CatBoostRegressor": CatBoostRegressor(iterations=900, depth=5, learning_rate=0.05, loss_function='RMSE'),
    }

    if model_name not in regModels:
        print("[INFO] Please select among the available models are the follows: ")
        pprint(list(regModels.keys()))
        return None

    model = regModels[model_name]
    train_X = train_df[features]
    train_Y = train_df[label]
    test_features = test_df[features]
    if useStandardization:
        # The scaler returns new arrays; assign them back, otherwise the model
        # is trained and evaluated on the unscaled features.
        ss = StandardScaler()
        train_X = pd.DataFrame(ss.fit_transform(train_X), columns=train_X.columns, index=train_X.index)
        test_features = pd.DataFrame(ss.transform(test_features), columns=test_features.columns, index=test_features.index)

    # CatBoost logs every iteration unless told otherwise
    if model_name == 'CatBoostRegressor':
        model.fit(train_X, train_Y, verbose=False)
    else:
        model.fit(train_X, train_Y)

    #  Performing Inference
    test_preds = model.predict(test_features)
    test_df[label] = list(test_preds)

    # Saving the test preds
    out_dir = "./Inference Results"
    os.makedirs(out_dir, exist_ok=True)

    # Accept names given with or without the extension, so a caller passing
    # "foo.csv" does not end up with "foo.csv.csv".
    if submission_name.lower().endswith(".csv"):
        file_name = submission_name
    else:
        file_name = f"{submission_name}.csv"

    test_df.to_csv(os.path.join(out_dir, file_name), index = False)
    return test_df
    

# Loading Data

In [5]:
# Training frame: all feature columns followed by the three clear-sky targets
# (DNI, GHI, DHI -- in that column order).
data_df = pd.read_csv(Config.TRAIN_CSV.value)[
    Config.FEATURES.value + ['Clearsky DNI', 'Clearsky GHI', 'Clearsky DHI']
]
test_df = pd.read_csv(Config.TEST_CSV.value)
sample_df = pd.read_csv(Config.SAMPLE_CSV.value)

# Row counts of the train / test / sample-submission frames
print(*(len(frame) for frame in (data_df, test_df, sample_df)))
data_df.head()

In [6]:
# Show the first rows of the sample submission frame loaded above.
sample_df.head()

# Building Model

In [30]:
%%time
for label in Config.LABEL.value:
    tmp_df = train2Test(train_df = data_df,
                        test_df = test_df,
                        useStandardization = True,
                        model_name = 'XGBRegressor',
                        features = Config.FEATURES.value,
                        label = label,
                        submission_name = f"{label}_submission_df.csv"
                      )
    

# Creating submission csv

In [31]:
# Keep only the predicted target columns and write them out as the submission file.
submission_df = test_df.loc[:, Config.LABEL.value]
submission_df.to_csv('submission_df.csv', index = False)
submission_df