In [None]:
import os
import json
import pickle
import warnings
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import combinations

warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.base import BaseEstimator, ClassifierMixin

In [None]:
from flask import Flask
from flask_mail import Mail
from flask_restful import Api
from apispec import APISpec
from flask_cors import CORS
from apispec.ext.marshmallow import MarshmallowPlugin
from flask_apispec.extension import FlaskApiSpec
from flask_restful import Resource, fields
from flask_apispec import marshal_with, doc, use_kwargs
from marshmallow import Schema, fields
from flask_apispec.views import MethodResource
from pymongo.mongo_client import MongoClient
from flask_cors import  cross_origin


In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Conv1D, LSTM, Dense, Flatten
from keras.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    ReduceLROnPlateau,
    TensorBoard,
)

In [None]:
# Report the TensorFlow build and the GPUs it can see.
print("tensorflow version:", tf.__version__)
physicalDevices = tf.config.list_physical_devices("GPU")
print(physicalDevices)

# Enable on-demand memory allocation on EVERY visible GPU: TensorFlow requires
# the memory-growth setting to be identical across GPUs, so configuring only
# the first device fails on multi-GPU machines.
for gpuDevice in physicalDevices:
    try:
        tf.config.experimental.set_memory_growth(gpuDevice, True)
    except RuntimeError as error:
        # Raised when the GPUs were already initialised (e.g. the cell is
        # re-run in a live kernel); the setting can no longer be changed.
        print("Could not set memory growth for", gpuDevice, "-", error)

In [None]:
# Hour-stamped output directory for this run, e.g. ./Models/20240131-17/
ModelDir = "./Models/" + datetime.datetime.now().strftime("%Y%m%d-%H") + "/"
# makedirs also creates the parent ./Models/ folder on a fresh checkout, and
# exist_ok makes re-running the cell within the same hour a no-op.
os.makedirs(ModelDir, exist_ok=True)

In [None]:
%reload_ext tensorboard
# Base path for the checkpoint file and the hour-stamped TensorBoard log directory.
modelPath = ModelDir + "model1.sav"
logsDir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H")
# TensorBoard logging with histograms computed every epoch.
tensorboardCBK = TensorBoard(log_dir=logsDir, histogram_freq=1)
# Stop training after 10 epochs without a val_loss improvement.
earlyStoppingCBK = EarlyStopping(
    monitor='val_loss', patience=10, verbose=0, mode='min')
# Keep only the checkpoint with the lowest val_loss.
modelCBK = ModelCheckpoint(
    modelPath+'.mcp.hdf5', save_best_only=True, monitor='val_loss', mode='min')
# Multiply the learning rate by 0.1 after 7 epochs without a val_loss improvement.
reduceLRPlateauCBK = ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=7, verbose=1, mode='min')
# NOTE(review): modelCBK is created above but is not part of this list — confirm
# that leaving checkpointing out is intentional.
callbacks = [earlyStoppingCBK, 
             reduceLRPlateauCBK, tensorboardCBK]


In [None]:
# from metadata import BICYCLEMETADATA

In [None]:
class BicycleDataset:
    """Loads the bicycle-count CSV files listed in a metadata JSON file.

    Each metadata entry supplies a ``filename``, a ``renameColumns`` mapping
    and a ``Zipcode``. Every file is resampled to 1-hour sums, reduced to
    ``columnsRetain``, renamed, and outer-merged with the others on
    ``["day", "Zipcode"]``. Rows with missing values are dropped and a
    ``BestDirections`` column is added. The result is stored in
    ``self.bicycleDataFrame``.

    Args:
        bicycleFolderPath: Directory containing the CSV files. A trailing
            slash is optional.
        metaDataFilepath: JSON file whose ``"BICYCLEMETADATA"`` key holds the
            list of per-file metadata entries.
        columnsRetain: Columns kept from every resampled file before renaming.

    Raises:
        NotADirectoryError: If ``bicycleFolderPath`` is not a directory.
        ValueError: If the metadata lists no files.
    """

    def __init__(
        self,
        bicycleFolderPath: str,
        metaDataFilepath: str,
        columnsRetain: list = ["day", "Total"],
    ):
        self.folderPath = bicycleFolderPath
        # Copy so an instance never aliases the shared default list.
        self.columnsRetain = list(columnsRetain)
        # Context manager closes the metadata file instead of leaking the handle.
        with open(metaDataFilepath, "r") as metaDataFile:
            self.metaData = json.load(metaDataFile)["BICYCLEMETADATA"]
        # Integer label used for each direction column in BestDirections.
        self.directionsMapping = {
            "NorthBound": 1,
            "SouthBound": 2,
            "WestBound": 3,
            "EastBound": 4,
        }
        self.bicycleDataFrame = self.LoadDataSet()

    def DropColumns(
        self,
        dataFrame: pd.DataFrame,
        renameColumns: dict,
    ):
        """Keep only ``self.columnsRetain`` and rename them via ``renameColumns``."""
        dataFrame = dataFrame[self.columnsRetain]
        dataFrame = dataFrame.rename(columns=renameColumns)
        return dataFrame

    def ConvertDaytoDateTime(self, dataFrame: pd.DataFrame):
        """Parse the ``day`` column as datetimes and return the frame sorted by it.

        The ``day`` column of the passed frame is overwritten in place.
        """
        dataFrame["day"] = pd.to_datetime(dataFrame["day"])
        dataFrame = dataFrame.sort_values(by="day")
        return dataFrame

    def Get1HrIntervals(self, dataFrame: pd.DataFrame, columnName: str):
        """Sum the frame into 1-hour bins keyed on the datetime column ``columnName``."""
        # "1h" replaces the "1H" alias, which recent pandas releases deprecate.
        dataFrame = dataFrame.resample("1h", on=columnName).sum().reset_index()
        return dataFrame

    def ConcatDataFrames(self, dataFrames: list):
        """Concatenate frames column-wise, keeping the first of any duplicated column name."""
        dataFrame = pd.concat(dataFrames, axis=1)
        retainColumns = ~dataFrame.columns.duplicated()
        dataFrame = dataFrame.loc[:, retainColumns]
        return dataFrame

    def FindBestDirections(self, row: pd.Series):
        """Return the mapped labels of every direction that ties for the row maximum.

        Args:
            row: Series indexed by direction names present in
                ``self.directionsMapping``.

        Returns:
            List of integer direction labels, in the order of ``row.index``.
        """
        maxValue = row.max()
        return [
            self.directionsMapping[direction]
            for direction in row.index
            if row[direction] == maxValue
        ]

    def LoadDataSet(self):
        """Read, resample and merge all files from the metadata into one frame.

        Returns:
            DataFrame with ``day``, ``Zipcode``, the renamed count columns and
            ``BestDirections``.

        Raises:
            NotADirectoryError: If ``self.folderPath`` is not a directory.
            ValueError: If ``self.metaData`` is empty.
        """
        # Fail loudly; the previous version printed a message and then crashed
        # later on an unbound local variable.
        if not os.path.isdir(self.folderPath):
            raise NotADirectoryError(
                "enter a valid folderPath: " + str(self.folderPath)
            )

        bicycleDataFrame = None
        for data in self.metaData:
            print("Reading DataSet from", data["filename"])
            dataFrame = pd.read_csv(
                # os.path.join works with or without a trailing slash.
                os.path.join(self.folderPath, data["filename"]),
                index_col=None,
                header=0,
            )
            dataFrame = self.ConvertDaytoDateTime(dataFrame)
            dataFrame = self.Get1HrIntervals(dataFrame, "day")
            dataFrame = self.DropColumns(
                dataFrame, renameColumns=data["renameColumns"]
            )
            dataFrame["Zipcode"] = data["Zipcode"]
            # Move the just-appended Zipcode column to sit right after the first column.
            DFColumns = list(dataFrame.columns)
            columnsRearrange = [DFColumns[0], DFColumns[-1]] + DFColumns[1:-1]
            dataFrame = dataFrame[columnsRearrange]
            if bicycleDataFrame is None:
                bicycleDataFrame = dataFrame
            else:
                bicycleDataFrame = pd.merge(
                    bicycleDataFrame, dataFrame, on=["day", "Zipcode"], how="outer"
                )

        if bicycleDataFrame is None:
            raise ValueError("metadata lists no bicycle data files")

        # Copy after dropna so the column assignments below act on an
        # independent frame rather than a possible view.
        bicycleDataFrame = bicycleDataFrame.dropna().copy()
        bicycleDataFrame["day"] = pd.to_datetime(bicycleDataFrame["day"])
        # bicycleDataFrame["EastBound"] = 0
        bicycleDataFrame["BestDirections"] = bicycleDataFrame[
            ["NorthBound", "SouthBound", "WestBound"]
        ].apply(self.FindBestDirections, axis=1)
        return bicycleDataFrame

In [None]:
# Load the bicycle counts: BicycleDataset reads the files named in the metadata
# JSON and exposes the merged result as .bicycleDataFrame.
bicycleDatasetFolderPath = "Dataset/Bicycle Dataset/"
bicycleMetaDataFilepath = "Dataset/Bicycle Dataset/metadata/metadata.json"
bicycleData = BicycleDataset(bicycleDatasetFolderPath, bicycleMetaDataFilepath)
bicycleDataFrame = bicycleData.bicycleDataFrame
print("bicycleDataFrame Shape", bicycleDataFrame.shape)
# Last expression: preview of the first rows.
bicycleDataFrame.head()

In [None]:
class WeatherDataset:
    """Loads per-date weather JSON files into one label-encoded DataFrame.

    Every file in the folder is parsed as a JSON object mapping a date string
    to a dict of weather fields. Each date becomes one row. Columns whose name
    contains ``"windDir"`` or ``"weather"`` are integer-encoded with
    LabelEncoders, which stay available as ``self.windDirectionEncoder`` and
    ``self.climateEncoder`` so they can be persisted and reused.

    Args:
        weatherDatasetFolderPath: Directory holding the JSON files.
        zipcode: Value written to the ``Zipcode`` column of every row.
    """

    def __init__(self, weatherDatasetFolderPath, zipcode=80309):
        self.folderPath = weatherDatasetFolderPath
        self.zipcode = zipcode
        # Collapse three-letter compass points onto their leading cardinal
        # direction (a duplicated "NNE" entry was removed).
        self.replaceDirection = {
            "ESE": "E",
            "SSE": "S",
            "WSW": "W",
            "NNE": "N",
            "ENE": "E",
            "SSW": "S",
            "WNW": "W",
            "NNW": "N",
        }
        self.windDirectionEncoder = None
        self.climateEncoder = None
        self.weatherDataFrame = self.LoadDataSet()

    def LoadDataSet(self):
        """Read every JSON file in the folder and return the preprocessed frame."""
        weatherRecords = []
        # Sorted for a reproducible row order; os.listdir order is arbitrary.
        for fileName in tqdm(sorted(os.listdir(self.folderPath))):
            filePath = os.path.join(self.folderPath, fileName)
            # Skip sub-directories (e.g. .ipynb_checkpoints) instead of
            # crashing when trying to open them.
            if not os.path.isfile(filePath):
                continue
            with open(filePath, "r") as jsonFile:
                fileData = json.load(jsonFile)
            for date, weather in fileData.items():
                # Separate name: the loaded dict is no longer rebound while
                # it is being iterated.
                record = {"day": date}
                record.update(weather)
                weatherRecords.append(record)

        weatherDataFrame = pd.DataFrame(weatherRecords)
        weatherDataFrame["Zipcode"] = self.zipcode
        weatherDataFrame["day"] = pd.to_datetime(weatherDataFrame["day"])
        weatherDataFrame = self.PreprocessDataset(weatherDataFrame)
        return weatherDataFrame

    def _FitEncoder(self, weatherDataFrame: pd.DataFrame, columnNames: list):
        """Fit a LabelEncoder on the unique values gathered from ``columnNames``."""
        uniqueValues = []
        for column in columnNames:
            uniqueValues += list(weatherDataFrame[column].unique())
        return LabelEncoder().fit(uniqueValues)

    def PreprocessDataset(self, weatherDataFrame: pd.DataFrame):
        """Normalise compass points and integer-encode categorical columns.

        The passed frame is modified in place and also returned. Encoders are
        fitted only when not already set on the instance, so an instance with
        fitted encoders reuses them.

        Args:
            weatherDataFrame: Frame with one row per date.

        Returns:
            The same frame with ``windDir*`` and ``weather*`` columns encoded.
        """
        weatherDataFrame.replace(self.replaceDirection, inplace=True)

        windDirectionColumns = [
            columnName
            for columnName in weatherDataFrame.columns
            if "windDir" in columnName
        ]
        if self.windDirectionEncoder is None:
            self.windDirectionEncoder = self._FitEncoder(
                weatherDataFrame, windDirectionColumns
            )
        for column in windDirectionColumns:
            weatherDataFrame[column] = self.windDirectionEncoder.transform(
                weatherDataFrame[column]
            )

        climateColumns = [
            columnName
            for columnName in weatherDataFrame.columns
            if "weather" in columnName
        ]
        if self.climateEncoder is None:
            self.climateEncoder = self._FitEncoder(weatherDataFrame, climateColumns)
        for column in climateColumns:
            weatherDataFrame[column] = self.climateEncoder.transform(
                weatherDataFrame[column]
            )

        return weatherDataFrame

In [None]:
# Load the weather data: WeatherDataset reads every file in the folder and
# exposes the encoded result as .weatherDataFrame.
weatherDatasetFolderPath = "Dataset/Weather Dataset/JsonFiles/"
weatherData = WeatherDataset(weatherDatasetFolderPath)
weatherDataFrame = weatherData.weatherDataFrame
print("weatherDataFrame Shape", weatherDataFrame.shape)
# Last expression: preview of the first rows.
weatherDataFrame.head()

In [None]:
# Join bicycle counts with weather on timestamp and zipcode. The outer join
# keeps unmatched rows from both sides, with NaN in the other side's columns.
finalDataFrame = pd.merge(
    bicycleDataFrame, weatherDataFrame, on=["day", "Zipcode"], how="outer"
)

In [None]:
# Drop rows with any missing value, then remove the join keys so they are not
# used as features.
finalDataFrame = finalDataFrame.dropna()
finalDataFrame = finalDataFrame.drop(columns=["day", "Zipcode"])
# Last expression: list the remaining columns.
finalDataFrame.columns

In [None]:
# Preview the merged frame.
finalDataFrame.head()

In [None]:
# Target: the BestDirections label lists. The raw direction counts are dropped
# together with the target so they cannot leak into the features.
# NOTE: this cell reassigns finalDataFrame, so re-running it without re-running
# the cells above raises a KeyError on the already-dropped columns.
y = finalDataFrame["BestDirections"]
finalDataFrame = finalDataFrame.drop(
    columns=["NorthBound", "SouthBound", "WestBound", "BestDirections"]
)

In [None]:
# Turn each list of direction labels into a multi-hot indicator row.
MlBinarizer = MultiLabelBinarizer()
MlBinarizer = MlBinarizer.fit(y)
y = MlBinarizer.transform(y)

In [None]:
# Hold out 20% of the rows for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    finalDataFrame, y, test_size=0.2, random_state=42
)

In [None]:
# Sanity-check the shapes of the split.
print("X_train shape", X_train.shape)
print("X_test shape", X_test.shape)
print("y_train shape", y_train.shape)
print("y_test shape", y_test.shape)

In [None]:
# Convert features to NumPy arrays; the neural classifiers below call
# .reshape on their input.
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

In [None]:
def CreateXGBClassifier(
    parameters: dict = {"tree_method": "hist", "device": "cuda", "verbosity": 1}
):
    """Build an XGBClassifier from a dict of keyword arguments.

    Args:
        parameters: Keyword arguments forwarded unchanged to ``XGBClassifier``.
            The dict is only unpacked, never modified.

    Returns:
        A new, unfitted ``XGBClassifier``.
    """
    return XGBClassifier(**parameters)

In [None]:
def CreateLGBClassifier(parameters: dict = {"device": "gpu", "verbosity": 1}):
    """Build an LGBMClassifier from a dict of keyword arguments.

    Args:
        parameters: Keyword arguments forwarded unchanged to ``LGBMClassifier``.
            The dict is only unpacked, never modified.

    Returns:
        A new, unfitted ``LGBMClassifier``.
    """
    return LGBMClassifier(**parameters)

In [None]:
def CreateCBClassifier(
    parameters: dict = {
        "task_type": "GPU",
        "devices": "0:1",
        "verbose": 1,
        "iterations": 100,
    }
):
    """Build a CatBoostClassifier from a dict of keyword arguments.

    Args:
        parameters: Keyword arguments forwarded unchanged to
            ``CatBoostClassifier``. The dict is only unpacked, never modified.

    Returns:
        A new, unfitted ``CatBoostClassifier``.
    """
    return CatBoostClassifier(**parameters)

In [None]:
def CreateLRClassifier(
    parameters: dict = {
        # Was misspelled "n_jops", which made the default call raise
        # TypeError (unexpected keyword argument).
        "n_jobs": -1,
    }
):
    """Build a LogisticRegression model from a dict of keyword arguments.

    Args:
        parameters: Keyword arguments forwarded unchanged to
            ``LogisticRegression``. The dict is only unpacked, never modified.

    Returns:
        A new, unfitted ``LogisticRegression``.
    """
    LRModel = LogisticRegression(**parameters)
    return LRModel

In [None]:
class CNNClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper around a small Conv1D Keras network.

    The network is Conv1D(64, kernel 3) -> Flatten -> Dense(50, relu) ->
    Dense(numClasses, sigmoid). It is compiled in the constructor. Inputs are
    reshaped to ``(samples, inputShape[0], inputShape[1])`` before every call
    into Keras.

    Args:
        inputShape: ``(steps, channels)`` shape of one sample.
        numClasses: Number of sigmoid output units.
        epochs: Training epochs used by ``fit``.
        batchSize: Batch size used by ``fit``.
        lossFunction: Loss passed to ``model.compile``.
        optimizer: Optimizer passed to ``model.compile``.
        metrics: Metrics passed to ``model.compile``.
        verbose: Keras verbosity used by ``fit``.
    """

    def __init__(
        self,
        inputShape,
        numClasses,
        epochs,
        batchSize,
        lossFunction,
        optimizer,
        metrics,
        verbose,
    ):
        self.verbose = verbose
        self.lossFunction = lossFunction
        self.optimizer = optimizer
        self.metrics = metrics
        self.inputShape = inputShape
        self.numClasses = numClasses
        self.epochs = epochs
        self.batchSize = batchSize
        self.classes_ = np.arange(self.numClasses)
        self.model = self.CreateCNNModel()

    def _ReshapeInput(self, X):
        """Return X as an ndarray shaped (samples, inputShape[0], inputShape[1])."""
        # asarray lets DataFrames and lists through as well as ndarrays.
        X = np.asarray(X)
        return X.reshape((X.shape[0], self.inputShape[0], self.inputShape[1]))

    def fit(self, X, y):
        """Train the network on X and indicator targets y; returns self."""
        self.model.fit(
            self._ReshapeInput(X),
            y,
            epochs=self.epochs,
            batch_size=self.batchSize,
            verbose=self.verbose,
        )
        return self

    def predict(self, X):
        """Return 0/1 int32 predictions by thresholding the sigmoid outputs at 0.5."""
        predictions = self.model.predict(self._ReshapeInput(X))
        return (predictions > 0.5).astype("int32")

    def predict_proba(self, X):
        """Return the raw sigmoid outputs for X."""
        # Reshape here too: fit and predict already do, and the network is
        # built for 3-D input.
        return self.model.predict(self._ReshapeInput(X))

    def CreateCNNModel(self):
        """Build and compile the Conv1D network described in the class docstring."""
        model = Sequential()
        model.add(
            Conv1D(
                filters=64,
                kernel_size=3,
                activation="relu",
                input_shape=self.inputShape,
            )
        )
        model.add(Flatten())
        model.add(Dense(50, activation="relu"))
        model.add(Dense(self.numClasses, activation="sigmoid"))
        model.compile(
            loss=self.lossFunction,
            optimizer=self.optimizer,
            metrics=self.metrics,
        )
        return model

In [None]:
class LSTMClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper around a two-layer LSTM Keras network.

    The network is LSTM(50, return_sequences) -> LSTM(50) -> Dense(50, relu)
    -> Dense(numClasses, sigmoid). It is compiled in the constructor. Inputs
    are reshaped to ``(samples, inputShape[0], inputShape[1])`` before every
    call into Keras.

    Args:
        inputShape: ``(steps, channels)`` shape of one sample.
        numClasses: Number of sigmoid output units.
        epochs: Training epochs used by ``fit``.
        batchSize: Batch size used by ``fit``.
        lossFunction: Loss passed to ``model.compile``.
        optimizer: Optimizer passed to ``model.compile``.
        metrics: Metrics passed to ``model.compile``.
        verbose: Keras verbosity used by ``fit``.
    """

    def __init__(
        self,
        inputShape,
        numClasses,
        epochs,
        batchSize,
        lossFunction,
        optimizer,
        metrics,
        verbose,
    ):
        self.verbose = verbose
        self.lossFunction = lossFunction
        self.optimizer = optimizer
        self.metrics = metrics
        self.inputShape = inputShape
        self.numClasses = numClasses
        self.epochs = epochs
        self.batchSize = batchSize
        self.classes_ = np.arange(self.numClasses)
        self.model = self.CreateLSTMModel()

    def _ReshapeInput(self, X):
        """Return X as an ndarray shaped (samples, inputShape[0], inputShape[1])."""
        # asarray lets DataFrames and lists through as well as ndarrays.
        X = np.asarray(X)
        return X.reshape((X.shape[0], self.inputShape[0], self.inputShape[1]))

    def fit(self, X, y):
        """Train the network on X and indicator targets y; returns self."""
        self.model.fit(
            self._ReshapeInput(X),
            y,
            epochs=self.epochs,
            batch_size=self.batchSize,
            verbose=self.verbose,
        )
        return self

    def predict(self, X):
        """Return 0/1 int32 predictions by thresholding the sigmoid outputs at 0.5."""
        predictions = self.model.predict(self._ReshapeInput(X))
        return (predictions > 0.5).astype("int32")

    def predict_proba(self, X):
        """Return the raw sigmoid outputs for X."""
        # Reshape here too: fit and predict already do, and the LSTM layers
        # are built for 3-D input.
        return self.model.predict(self._ReshapeInput(X))

    def CreateLSTMModel(self):
        """Build and compile the LSTM network described in the class docstring."""
        model = Sequential()
        model.add(LSTM(50, return_sequences=True, input_shape=self.inputShape))
        model.add(LSTM(50))
        model.add(Dense(50, activation="relu"))
        model.add(Dense(self.numClasses, activation="sigmoid"))
        model.compile(
            loss=self.lossFunction,
            optimizer=self.optimizer,
            metrics=self.metrics,
        )
        return model

In [None]:
def CreateEnsembleCombinations(MLModelsNames: list, minimumModels: int):
    """List every subset of model names that has at least ``minimumModels`` members.

    Args:
        MLModelsNames: Names to combine.
        minimumModels: Smallest subset size to include.

    Returns:
        List of lists, ordered by subset size and then in the order produced
        by ``itertools.combinations``. Empty when ``minimumModels`` exceeds
        the number of names.
    """
    subsetSizes = range(minimumModels, len(MLModelsNames) + 1)
    return [
        list(subset)
        for size in subsetSizes
        for subset in combinations(MLModelsNames, size)
    ]

In [None]:
# Shared compile settings for the neural classifiers.
lossFunction = "binary_crossentropy"
optimizer = "adam"
metrics = ["accuracy"]
# Each sample is presented as (number of feature columns, 1 channel).
inputShape = (X_train.shape[1], 1)
# One output unit per column of the binarized target.
numClasses = y_train.shape[1]
print("inputShape:", inputShape)
print("numClasses:", numClasses)

In [None]:
# Gradient-boosted models, each wrapped in MultiOutputClassifier so one
# classifier is fitted per target column.
XGBModel = MultiOutputClassifier(CreateXGBClassifier())
LGBModel = MultiOutputClassifier(CreateLGBClassifier())
CBModel = MultiOutputClassifier(CreateCBClassifier())
# Neural models share the same input shape, class count and compile settings.
CNNModel = CNNClassifier(
    inputShape=inputShape,
    numClasses=numClasses,
    epochs=100,
    batchSize=32,
    lossFunction=lossFunction,
    optimizer=optimizer,
    metrics=metrics,
    verbose=1,
)
LSTMModel = LSTMClassifier(
    inputShape=inputShape,
    numClasses=numClasses,
    epochs=100,
    batchSize=32,
    lossFunction=lossFunction,
    optimizer=optimizer,
    metrics=metrics,
    verbose=1,
)
# Candidate base estimators for stacking, keyed by name. The boosted models are
# currently commented out, so only the two neural models are candidates.
MLModels = {
    # "XGBModel": XGBModel,
    # "LGBModel": LGBModel,
    # "CBModel": CBModel,
    "CNNModel": CNNModel,
    "LSTMModel": LSTMModel,
}

In [None]:
# Every combination of at least two candidate models; reversed so the largest
# combination comes first.
EnsembleCombinations = CreateEnsembleCombinations(list(MLModels.keys()), 2)
EnsembleCombinations.reverse()
# finalEstimator = MultiOutputClassifier(LogisticRegression(n_jobs=-1))
# The multi-output XGBoost model serves as the stacking meta-learner.
finalEstimator = XGBModel
for EnsembleCombination in EnsembleCombinations:
    print(EnsembleCombination)

In [None]:
# Train, evaluate and pickle a stacking ensemble. The trailing `break` means
# only the first combination in the list is processed.
for EnsembleCombination in EnsembleCombinations:
    # Pair every selected model name with its estimator instance.
    estimators = [
        (modelName, MLModels[modelName]) for modelName in EnsembleCombination
    ]
    print("EnsembleClassifer combination:", EnsembleCombination)
    EnsembleClassifer = StackingClassifier(
        final_estimator=finalEstimator, estimators=estimators, verbose=1
    )
    EnsembleClassifer.fit(X_train, y_train)

    y_pred = EnsembleClassifer.predict(X_test)

    # Exact-match accuracy over the full indicator rows.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Ensemble model accuracy: {accuracy}")

    # Relaxed accuracy: a sample counts as a hit when at least one predicted
    # label is among its true labels.
    predictedLabels = MlBinarizer.inverse_transform(y_pred)
    actualLabels = MlBinarizer.inverse_transform(y_test)
    c = sum(
        1
        for ypred, yacc in zip(predictedLabels, actualLabels)
        # print( ypred, yacc)
        if any(label in yacc for label in ypred)
    )
    customAcc = c / len(y_test)
    print(customAcc)

    # Persist the fitted ensemble under a name built from its member models.
    pklFileName = "-".join(EnsembleCombination)
    with open(f"{pklFileName}.pkl", "wb") as pklFile:
        pickle.dump(EnsembleClassifer, pklFile)
    break

In [None]:
# Persist the fitted label binarizer so predictions can be decoded later.
with open("MlBinarizer.pkl", "wb") as pklFile:
    pickle.dump(MlBinarizer, pklFile)

In [None]:
# Persist the encoder fitted on the "weather" columns.
with open("climateEncoder.pkl", "wb") as pklFile:
    pickle.dump(weatherData.climateEncoder, pklFile)

In [None]:
# Persist the encoder fitted on the "windDir" columns.
with open("windDirectionEncoder.pkl", "wb") as pklFile:
    pickle.dump(weatherData.windDirectionEncoder, pklFile)

In [None]:
# Best Params:
# XGBModel params: {'device': 'cuda', 'tree_method': 'hist', 'verbosity': 0, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0}
# LGBModel params: {'device': 'gpu', 'verbosity': 0, 'num_leaves': 31, 'max_depth': -1, 'learning_rate': 0.01, 'n_estimators': 100, 'reg_lambda': 0, 'reg_alpha': 0}
# CBModel params: {'task_type': 'GPU', 'devices': '0:1', 'verbose': 0, 'depth': 5, 'learning_rate': 0.1, 'iterations': 1000}
# LRModel params: {'max_iter': 100, 'n_jobs': -1}

In [None]:
# hyperParametersRanges = {
#     "XGB:device": ["cuda"],
#     # "XGB:objective": ["multi:softmax"],
#     "XGB:tree_method": ["hist"],
#     "XGB:verbosity": [0],
#     "XGB:max_depth": [3, 4, 5],
#     "XGB:learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5],
#     "XGB:gamma": [0, 0.1],
#     "LGB:device": ["gpu"],
#     "LGB:verbosity": [0],
#     "LGB:num_leaves": [31, 50, 100, 150],
#     "LGB:max_depth": [-1, 5, 15, 20],
#     "LGB:learning_rate": [0.01, 0.05, 0.1, 0.2],
#     "LGB:n_estimators": [100, 500, 1000],
#     "LGB:reg_lambda": [0, 0.01, 0.1],
#     "LGB:reg_alpha": [0, 0.01, 0.1],
#     "CB:task_type": ["GPU"],
#     "CB:devices": ["0:1"],
#     "CB:verbose": [0],
#     "CB:depth": [5, 10],
#     "CB:learning_rate": [0.01, 0.1],
#     "CB:iterations": [100, 500, 1000],
#     # "LR:penalty": ["l1", "l2", "elasticnet", "none"],
#     # "LR:C": [0.001, 0.01, 0.1, 1],
#     # "LR:solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
#     "LR:max_iter": [100, 500],
#     # "LR:l1_ratio": [0, 0.5, 1],
#     "LR:n_jobs": [-1],
# }

In [None]:
# paramGrids = [
#     dict(zip(hyperParametersRanges.keys(), values))
#     for values in product(*hyperParametersRanges.values())
# ]

In [None]:
# XGBParams = {key.split(':')[1]: value for key, value in paramGrid.items() if key.startswith('XGB:')}
# LGBParams = {key.split(':')[1]: value for key, value in paramGrid.items() if key.startswith('LGB:')}
# CBParams = {key.split(':')[1]: value for key, value in paramGrid.items() if key.startswith('CB:')}

In [None]:
# for paramGrid in paramGrids:
#     XGBParams = {
#         key.split(":")[1]: value
#         for key, value in paramGrid.items()
#         if key.startswith("XGB:")
#     }
#     LGBParams = {
#         key.split(":")[1]: value
#         for key, value in paramGrid.items()
#         if key.startswith("LGB:")
#     }
#     CBParams = {
#         key.split(":")[1]: value
#         for key, value in paramGrid.items()
#         if key.startswith("CB:")
#     }
#     LRParams = {
#         key.split(":")[1]: value
#         for key, value in paramGrid.items()
#         if key.startswith("LR:")
#     }
#     XGBModel = MultiOutputClassifier(CreateXGBClassifier(XGBParams))
#     LGBModel = MultiOutputClassifier(CreateLGBClassifier(LGBParams))
#     CBModel = MultiOutputClassifier(CreateCBClassifier(CBParams))
#     LRModel = MultiOutputClassifier(CreateLRClassifier(LRParams))
#     MLModels = {
#         "XGBModel": XGBModel,
#         "LGBModel": LGBModel,
#         "CBModel": CBModel,
#     }
#     # EnsembleCombinations = CreateEnsembleCombinations(list(MLModels.keys()), 2)
#     # for EnsembleCombination in EnsembleCombinations:
#     # print(EnsembleCombination)
#     # for EnsembleCombination in EnsembleCombinations:
#     print("-"*30)
#     print("XGBModel params:", XGBParams)
#     print("LGBModel params:", LGBParams)
#     print("CBModel params:", CBParams)
#     print("LRModel params:", LRParams)
#     estimators = []

#     for key in MLModels.keys():
#         estimators.append((key, MLModels[key]))

#     EnsembleClassifer = StackingClassifier(
#         estimators=estimators, verbose=1, final_estimator=LRModel
#     )
#     EnsembleClassifer.fit(X_train, y_train)

#     y_pred = EnsembleClassifer.predict(X_test)

#     accuracy = accuracy_score(y_test, y_pred)
#     print(f"Ensemble model accuracy: {accuracy}")
#     c = 0
#     for ypred, yacc in zip(
#         MlBinarizer.inverse_transform(y_pred), MlBinarizer.inverse_transform(y_test)
#     ):
#         if any(label in yacc for label in ypred):
#             c += 1
#     print(f"Ensemble model custom accuracy: {c / len(y_test)}")

In [None]:
# import requests

# emailConfig = {
#     "sendersEmailId": "99kalitkar@gmail.com",
#     "sendersMessage": "BigData Project executed",
#     "sendersSubject": f"Accuracy: {accuracy}, {customAcc}",
# }
# response = requests.post(
#     url="https://www.restapi.99kalitkar.in/email",
#     json=emailConfig,
#     headers={"Content-Type": "application/json"},
# )
# print(response.json())
# if response.json().get("success", False):
#     print("Thanks for your Email. I will respond as soon as possible!")