# Importing Libraries and Packages

In [None]:
import pandas as pd
import numpy as np
import os
import csv
import datetime
from datetime import date
import calendar
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix , mean_squared_error
from sklearn.linear_model import LogisticRegression 
from sklearn.datasets import make_regression
#from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import linear_model
from pandas.plotting import scatter_matrix

# Reading the file

In [None]:
# Switch the working directory to the project folder so the relative file name
# below resolves, then load the training data into `df`.
# NOTE(review): the folder is an absolute path on one specific machine; the
# notebook cannot run elsewhere until this is adjusted.
os.getcwd()
project_dir = "C:/Users/Ayushri Bhargava/Desktop/Spring '18/Machine Learning/MSA8150 Projects/Amusement Park"
os.chdir(project_dir)
df = pd.read_csv("Train.csv")

# Feature Engineering

### Creating new date, new time, day of the week, month, US holidays and seasons

In [None]:
# Parse the raw timestamps once, then derive every calendar feature from them
# with the vectorised `.dt` accessor instead of Python-level loops.
df['TimeStamp'] = pd.to_datetime(df['TimeStamp'])
timestamps = df['TimeStamp']

# Calendar date as a datetime64 column (time part dropped) and the time of day
# as datetime.time objects.
df['new_date'] = pd.to_datetime(timestamps.dt.date)
df['new_time'] = timestamps.dt.time

# Integer weekday, Monday=0 ... Sunday=6.
# The earlier `.dt.weekday_name` assignment is gone: that accessor was removed
# in pandas 1.0 (the cell raised AttributeError) and its result was overwritten
# by the integer weekday on the very next line anyway.
df['day_of_week'] = df['new_date'].dt.weekday
df['month'] = df['new_date'].dt.month
df['hour'] = timestamps.dt.hour

In [None]:
# Flag rows whose calendar date is a US federal holiday.
# NOTE(review): the lookup window is hard-coded to 2066-2068 — presumably the
# span covered by the data set; confirm against df['new_date'].
dr = pd.date_range(start='2066-01-01', end='2068-12-31')
cal = calendar()  # `calendar` is the USFederalHolidayCalendar alias from the import cell
holidays = cal.holidays(start=dr.min(), end=dr.max())

# Build the 0/1 indicator with a single cast. Writing 1 and 0 into the boolean
# column through two successive `.loc` assignments upcasts it to object dtype
# (and triggers an incompatible-dtype warning on recent pandas), which then
# trips up numeric summaries and plots downstream.
df['Holiday'] = df['new_date'].isin(holidays).astype(int)

In [None]:
def put_season(row):
    """Return the season label for a row based on its ``'month'`` entry.

    Months are grouped by calendar quarter:
    1-3 -> 'Winter', 4-6 -> 'Spring', 7-9 -> 'Summer', 10-12 -> 'Fall'.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['month']`` (a DataFrame row, a dict, ...).

    Returns
    -------
    str
        One of 'Winter', 'Spring', 'Summer', 'Fall', or 'Other' when the month
        is not a value in 1..12 (for example NaN).
    """
    month = row['month']
    if month in (1, 2, 3):
        return 'Winter'
    if month in (4, 5, 6):
        return 'Spring'
    if month in (7, 8, 9):
        return 'Summer'
    if month in (10, 11, 12):
        # This branch used to return 'Winter' (a copy of the first branch),
        # which merged October-December into the January-March group.
        return 'Fall'
    return 'Other'

# Label every row with its season by running put_season over the rows.
df['season'] = df.apply(put_season, axis=1)

## Creating dummy variables

In [None]:
# One-hot encode season and weekday, then replace the generated
# "<column>_<value>" headers with plain names.
weekday_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dummy_renames = {
    'day_of_week_{0}'.format(position): label
    for position, label in enumerate(weekday_labels)
}
for season_label in ('Spring', 'Winter', 'Summer'):
    dummy_renames['season_' + season_label] = season_label

df1 = pd.get_dummies(df, columns=['season', 'day_of_week']).rename(columns=dummy_renames)

In [None]:
# Columns kept for analysis and modelling: the two ticket columns first,
# followed by the weather, calendar and dummy columns.
model_columns = [
    'Ticket1', 'Ticket2',
    'StandardTemperature', 'Humidity', 'Wind',
    'month', 'Holiday',
    'Spring', 'Summer', 'Winter',
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
]
Final_data = df1[model_columns]

# Exploratory Data Analysis

Summary of the Data

In [None]:
# Display pandas' descriptive summary of Final_data as the cell output.
Final_data.describe()

Creating a histogram for all the features

In [None]:
# Draw one histogram per column of Final_data using pandas' default layout.
Final_data.hist()

Number of tickets sold per month

In [None]:
%matplotlib inline
a =  Final_data[["Ticket1","Ticket2","month"]]
b = a.groupby(['month']).count().reset_index()
b.head()
b.plot.bar(x = 'month', y=['Ticket1','Ticket2'])

Forming scatter matrix

In [None]:
# Pairwise scatter plots of all Final_data columns, with a kernel density
# estimate of each column on the diagonal.
scatter_matrix(Final_data, alpha=0.2, figsize=(6, 6), diagonal='kde')

Creating correlation matrix and heatmap

In [None]:
# Pairwise correlations between the columns, shown as a heatmap whose ticks
# carry the column names.
corr = Final_data.corr()
axis_labels = corr.columns.values
sns.heatmap(corr, xticklabels=axis_labels, yticklabels=axis_labels)

# Building the model

In [None]:
# The first two columns of Final_data are used as the targets (as a NumPy
# array); every remaining column is a predictor.
n_targets = 2
X = Final_data.iloc[:, n_targets:]
y = Final_data.iloc[:, :n_targets].values

# Hold out 33% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
X

### LASSO REGRESSION

In [None]:
# Lasso regression fitted on both target columns at once; the cell output is
# the model's `.score` on the held-out split.
lasso = linear_model.Lasso(alpha=0.45, tol=0.01)
lasmodel1 = lasso.fit(X_train, y_train)
lasmodel1.score(X_test, y_test)

### RIDGE REGRESSION

In [None]:
# Ridge regression with the same alpha as the Lasso cell; the cell output is
# the model's `.score` on the held-out split.
# (The result is stored as `lasmodel2` even though the estimator is Ridge.)
ridge = linear_model.Ridge(alpha=0.45, tol=0.00001)
lasmodel2 = ridge.fit(X_train, y_train)
lasmodel2.score(X_test, y_test)

### MULTIOUTPUT GRADIENT BOOSTING REGRESSOR

In [None]:
# Gradient boosting wrapped in MultiOutputRegressor so that both target
# columns can be fitted in one call.
gbr_base = GradientBoostingRegressor(random_state=40)
multi_output = MultiOutputRegressor(gbr_base)
multi_output_fit = multi_output.fit(X_train, y_train)

# Mean squared error of the predictions on the held-out split.
pred = multi_output_fit.predict(X_test)
multi_MSE = mean_squared_error(y_test, pred)
multi_MSE

### ADA BOOST REGRESSOR

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
# AdaBoost wrapped in MultiOutputRegressor so that both target columns can be
# fitted in one call. (The fitted model is stored as `rfr` although it is AdaBoost.)
ada_base = AdaBoostRegressor(n_estimators=70, random_state=42, learning_rate=0.4)
rfr = MultiOutputRegressor(ada_base).fit(X_train, y_train)

# Mean squared error of the predictions on the held-out split.
y_pred = rfr.predict(X_test)
rfr_MSE = mean_squared_error(y_test, y_pred)
rfr_MSE

### RANDOM FOREST

### Randomized search CV using K-fold

In [None]:
# Candidate values for each random-forest hyper-parameter.
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 (use all features) replaces the string 'auto': for RandomForestRegressor
# 'auto' meant exactly that, but the string was deprecated in scikit-learn 1.1
# and removed in 1.3, so every candidate using it would fail to fit.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = unlimited depth)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune; it is seeded so that a re-run of the
# search scores every candidate on identically grown forests.
rf = RandomForestRegressor(random_state=42)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Finding the rank of parameters

In [None]:
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide the entries 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params', indexed by candidate position
        (the layout of a fitted search object's ``cv_results_``).
    n_top : int, default 3
        Ranks 1 through ``n_top`` are printed; candidates tied on a rank are
        all printed.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # Coerce to an array so the `== i` comparison is element-wise even when a
    # plain list of ranks is supplied.
    ranks = np.asarray(results['rank_test_score'])
    for i in range(1, n_top + 1):
        for candidate in np.flatnonzero(ranks == i):
            print("Model with rank: {0}".format(i))
            # The label previously contained a stray pasted fragment
            # ("validreport(random_search.cv_results_)ation").
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
# Print the candidates ranked 1-3 (the default n_top) by the randomized search.
report(rf_random.cv_results_)

### Fitting the model

In [None]:
# Fit a random forest with fixed hyper-parameters.
# NOTE(review): these values are presumably the best setting reported by the
# randomized search — confirm against rf_random.best_params_.
# max_features=1.0 (all features) replaces 'auto', which meant the same thing
# for regressors but was removed in scikit-learn 1.3 and now raises.
# random_state is set so the reported metrics are repeatable.
regr_rf = RandomForestRegressor(bootstrap = True, min_samples_leaf = 2, n_estimators = 2000, max_features = 1.0,
                                min_samples_split = 2, max_depth= 90, random_state = 42)
fit = regr_rf.fit(X_train, y_train)

# The score used to be computed and silently discarded (only the last
# expression of a cell is displayed); keep it and print it.
rf_r2 = fit.score(X_test, y_test)
print("Test score: {0:.3f}".format(rf_r2))

# Mean squared error on the held-out split, shown as the cell output.
y_rf = fit.predict(X_test)
MSE = mean_squared_error(y_test, y_rf)
MSE