# In [0]:
"""
##################################### Creating Dataset #################################################
"""
import pandas as pd
import numpy as np
from datetime import date
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
from itertools import product
from catboost import CatBoostRegressor

""" Functions """

def extract_hour(duration):
    """Return the hour component of a duration string such as '2h 50m'.

    The hour value is read from the first space-separated token with the
    'h' characters stripped. A string that contains no 'h' at all (for
    example '45m') has no hour component and yields 0.
    """
    if 'h' not in duration:
        return 0
    hour_token = duration.split(' ')[0]
    return int(hour_token.strip('h'))
    
def extract_minute(duration):
    """Return the minute component of a duration string such as '2h 50m'.

    Parameters
    ----------
    duration : str
        Duration text made of an optional hour token and an optional minute
        token, e.g. '2h 50m', '19h' or '45m'.

    Returns
    -------
    int
        The minutes, or 0 when the string carries no minute token.

    The string is tokenised on any run of whitespace, so a trailing or
    doubled space ('2h ', '2h  50m') no longer produces an empty token that
    makes ``int('')`` raise.
    """
    tokens = duration.split()
    if not tokens:
        return 0
    if len(tokens) == 1:
        # A lone token is either minutes ('45m') or hours only ('19h').
        only = tokens[0]
        return int(only.strip('m')) if 'm' in only else 0
    # With two or more tokens the second one is the minute part.
    return int(tokens[1].strip('m'))
    
def Num_stops(Total_stops):
    """Convert a stops label to a count.

    'non-stop' maps to 0; any other label is expected to start with the
    count followed by a space (e.g. '2 stops'), and that leading number is
    returned as an int.
    """
    return 0 if Total_stops == 'non-stop' else int(Total_stops.split(' ')[0])
    
def stop(route,n):
    """Return element ``n`` of a ' → '-separated route, or '0' if absent.

    The element is only returned when the route has at least ``n + 2``
    elements, i.e. when position ``n`` is followed by at least one more
    airport; otherwise the placeholder string '0' is returned.
    """
    legs = route.split(' → ')
    return legs[n] if len(legs) >= n + 2 else '0'

def get_booking_day(journey_date, ref_date=date(2019, 3, 1)):
    """Return the day offset of every journey date from a reference date.

    Parameters
    ----------
    journey_date : iterable of datetime.date
        Journey dates (a pandas Series of ``date`` objects, a list, ...).
    ref_date : datetime.date, optional
        Day counted as offset 0. Defaults to 1 March 2019, the value that
        used to be hard-coded here.

    Returns
    -------
    list of int
        ``(d - ref_date).days`` for each date, in input order.
    """
    # Subtract element by element so any iterable of dates works, rather
    # than relying on pandas arithmetic over an object-dtype Series.
    return [(day - ref_date).days for day in journey_date]

def get_total_duration(minutes,hours):
    """Return the total duration in minutes.

    Parameters
    ----------
    minutes, hours : numeric scalars or pandas Series / NumPy arrays
        Minute and hour components of the duration.

    Returns
    -------
    Same kind as the inputs: ``minutes + hours * 60``.

    The hours used to be converted through a replace-map that only covered
    0..34, so any duration of 35 hours or more was added unconverted, as if
    the hours were minutes. Plain multiplication has no such limit.
    """
    return minutes + hours * 60

def price_flag(Price):
    """Bucket a price into a band code.

    Returns 0 for 0 < Price <= 8000, 1 for 8000 < Price <= 15000 and 2 for
    Price > 15000. Anything else (zero, negative, or a value that fails
    every comparison such as NaN) returns None.
    """
    # Test the bands from the top down so each needs only its lower bound.
    if Price > 15000:
        return 2
    if Price > 8000:
        return 1
    if Price > 0:
        return 0
    return None

def my_custom_loss_func(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Both inputs are shifted by 1 and log-transformed, the mean squared error
    of the transformed values is taken with ``sklearn.metrics``, and its
    square root is returned.
    """
    log_true = np.log(y_true+1)
    log_pred = np.log(y_pred+1)
    return np.sqrt(metrics.mean_squared_error(log_true, log_pred))


""" External Data """

# Holiday dates kept as text; preprocess() matches them against the raw
# 'Date_of_Journey' strings with isin(), so the formatting must agree.
National_Holidays = [
    '21/03/2019',
    '17/04/2019',
    '19/04/2019',
]
Restricted_Holidays = [
    '4/03/2019',
    '13/04/2019',
    '18/05/2019',
    '5/06/2019',
]
# Combined list: national holidays first, then restricted ones.
Holidays = [*National_Holidays, *Restricted_Holidays]

"""
Data Preparation and Preprocessing
"""

def preprocess(TrainData):
    """Build the modelling feature frame from a raw flight-fare sheet.

    Parameters
    ----------
    TrainData : pandas.DataFrame
        Raw sheet. The columns read here are 'Destination',
        'Date_of_Journey', 'Dep_Time', 'Arrival_Time', 'Duration',
        'Total_Stops', 'Source', 'Additional_Info', 'Airline' and 'Route'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived numeric columns, the raw 'Stop_<i>'
        columns and the 'Encoded_*' one-hot columns appended. Rows whose
        'Total_Stops' is missing or equals 'Not Avialable' are dropped.

    The input frame is not modified: the work is done on a copy, which also
    avoids assigning columns to a filtered slice of the caller's frame.
    """
    TrainData = TrainData.copy()

    # Assign the result instead of calling replace(inplace=True) on a column
    # selection, which is not guaranteed to write back into the frame.
    TrainData['Destination'] = TrainData['Destination'].replace({'New Delhi':'Delhi'})

    # Calendar features from the 'D/M/YYYY' journey date text.
    TrainData['Journey_date'] = TrainData['Date_of_Journey'].apply(lambda x: date(*map(int, reversed(x.split("/")))))
    TrainData['Journey_weekday'] = TrainData['Journey_date'].apply(lambda x: x.weekday())
    TrainData['Journey_month'] = TrainData['Journey_date'].apply(lambda x: x.month)
    TrainData['Journey_day'] = TrainData['Journey_date'].apply(lambda x: x.day)

    # Clock hours; the arrival text may carry a date suffix after a space.
    TrainData['Dep_Time_hour'] = TrainData['Dep_Time'].apply(lambda x: int(x.split(":")[0]))
    TrainData['Arrival_Time_hour'] = TrainData['Arrival_Time'].apply(lambda x: int(x.split(" ")[0].split(":")[0]))
    TrainData['Duration_hour'] = TrainData['Duration'].apply(extract_hour)
    TrainData['Duration_minute'] = TrainData['Duration'].apply(extract_minute)

    # Drop rows without a usable stop count; copy so the column assignments
    # below target an independent frame rather than a view of the old one.
    stops_known = (TrainData['Total_Stops'] != 'Not Avialable') & TrainData['Total_Stops'].notna()
    TrainData = TrainData[stops_known].copy()

    TrainData['Num_stops'] = TrainData['Total_Stops'].apply(Num_stops)
    TrainData['Booking_day'] = get_booking_day(TrainData['Journey_date'])
    TrainData['Total_Duration'] = get_total_duration(TrainData['Duration_minute'],TrainData['Duration_hour'])
    # An arrival string longer than 'HH:MM' is flagged as overnight.
    TrainData['Overnight'] = np.where([len(x)>5 for x in TrainData['Arrival_Time']],1,0)
    TrainData['Holiday'] = np.where(TrainData['Date_of_Journey'].isin(Holidays),1,0)
    TrainData['Dep_slot'] = TrainData['Dep_Time_hour']
    TrainData['Arrival_slot'] = TrainData['Arrival_Time_hour']
    TrainData['SourceDestCombo'] = TrainData['Source']
    TrainData['Additional_Info'] = TrainData['Additional_Info'].replace('No Info','No info')
    # date.weekday() counts Monday as 0, so 4 and 6 are Friday and Sunday.
    TrainData['Weekday_Flag'] = TrainData['Journey_weekday'].apply(lambda x:1 if x in [4,6] else 0)
    TrainData['luxury_flag'] = TrainData['Airline'].apply(lambda x:1 if x in ['Jet Airways Business'] else 0)

    # Raw intermediate-stop columns, computed once. An empty frame has no
    # maximum, so it simply gets no stop columns.
    max_stops = int(TrainData['Num_stops'].max()) if len(TrainData) else 0
    stop_numbers = range(1, max_stops + 1)
    for i in stop_numbers:
        # Bind i as a default so the lambda does not depend on the loop variable.
        TrainData['Stop_'+str(i)] = TrainData['Route'].apply(lambda x, n=i: stop(x, n))

    # One-hot encoding of the categorical columns.
    for col in ['Airline', 'SourceDestCombo', 'Additional_Info','Dep_slot','Arrival_slot','Journey_weekday']:
        dfDummies = pd.get_dummies(TrainData[col], prefix = 'Encoded_'+col)
        TrainData = pd.concat([TrainData, dfDummies], axis=1)

    # One-hot encoding of each stop column; the first level is dropped.
    for i in stop_numbers:
        dfDummies = pd.get_dummies(TrainData['Stop_'+str(i)],prefix = 'Encoded_Stop_'+str(i),drop_first=True)
        TrainData = pd.concat([TrainData, dfDummies], axis=1)

    return TrainData

"""
##################################### Model Training ###################################################
"""

""" Loading Data """
# Read both workbooks and run each through the same feature pipeline.
TrainData = preprocess(pd.read_excel('file:///C:/Users/vadaga/Desktop/machine hack/Data_Train.xlsx'))
TestData = preprocess(pd.read_excel('file:///C:/Users/vadaga/Desktop/machine hack/Test_set.xlsx'))

""" Subsetting Data """
# Every dummy column built by preprocess() carries the 'Encoded_' prefix.
OneHotEncodedColumns = [name for name in TrainData.columns if name.startswith('Encoded_')]
# Model inputs: the one-hot columns followed by the numeric features.
independent = [
    *OneHotEncodedColumns,
    'Booking_day', 'Num_stops', 'Total_Duration', 'Journey_day',
    'Journey_month', 'Journey_weekday', 'Dep_slot', 'Arrival_slot',
]
dependent = ['Price']
X_train, y_train = TrainData[independent], TrainData[dependent]



"""RF"""
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 2888,max_depth=24,max_features=0.646, random_state = 42)
# y_train is a one-column frame; flatten it to the 1-D target that fit() expects.
rf.fit(X_train, y_train.values.ravel())
# A training feature absent from the test sheet is a one-hot category that
# never occurs there, so its indicator is 0. Filling with NaN would leave
# NaN columns in TestData for every later section as well, including
# estimators that cannot score NaN inputs.
for col in [col for col in independent if col not in TestData.columns]:
    TestData[col] = 0
y_scored = rf.predict(TestData[independent])

submission = pd.DataFrame({'Price':y_scored})
submission.to_excel('C:/Users/vadaga/Desktop/machine hack/rf_pt_1.xlsx',index=False)


""" XGBoost """
# NOTE: the section label says "XGBoost", but the estimator used here is
# scikit-learn's GradientBoostingRegressor.
# ``loss`` is left at its default, which is least squares in every
# scikit-learn release; the explicit spelling 'ls' is rejected by current ones.
est = GradientBoostingRegressor(max_depth=7,n_estimators=1000,max_features=0.55,
                                learning_rate=0.1,subsample=1.0,random_state=0)
# y_train is a one-column frame; flatten it to the 1-D target that fit() expects.
est.fit(X_train, y_train.values.ravel())
y_pred = est.predict(X_train)

# In-sample errors (predictions are made on the training rows).
mse = metrics.mean_squared_error(y_train, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
print('Root Mean Squared logrithmic Error:', np.sqrt(metrics.mean_squared_error(np.log(y_train+1), np.log(y_pred+1))))

"""
Model Scoring
"""

# One-hot columns for categories that never occur in the test sheet are all zero.
for col in [col for col in independent if col not in TestData.columns]:
    TestData[col] = 0
y_scored = est.predict(TestData[independent])

submission = pd.DataFrame({'Price':y_scored})
submission.to_excel('C:/Users/vadaga/Desktop/machine hack/Base3_v4.xlsx',index=False)

###########################################################################################################3
"""
LIGHTGBM
"""

#params = {'max_depth':6,'min_data_in_leaf':1,'learning_rate':0.0664,'subsample':0.585,'colsample_bytree':0.45695,'max_bin':417,'n_estimators':1083}
# 'colsample_bytree' was misspelled 'col_sample_bytree', which LightGBM does
# not recognise, so the column-sampling value was never applied.
# NOTE(review): LightGBM documents that 'subsample' (bagging_fraction) only
# takes effect when bagging_freq is non-zero - confirm whether bagging was intended.
params = {'num_leaves':50,'learning_rate':0.051134,'n_estimators':1150,'subsample':0.79467,'colsample_bytree':0.44483,'min_data_in_leaf':1,'max_bin':445}
train_data = lgb.Dataset(X_train, label=y_train, feature_name=independent)
lgbm = lgb.train(params, train_data)

"""
Model Scoring
"""
# One-hot columns for categories that never occur in the test sheet are all
# zero, the same fill value the gradient-boosting and MLP sections use.
for col in [col for col in independent if col not in TestData.columns]:
    TestData[col] = 0
y_scored = lgbm.predict(TestData[independent])


submission = pd.DataFrame({'Price':y_scored})
# index=False keeps the file layout identical to every other submission.
submission.to_excel('C:/Users/vadaga/Desktop/machine hack/lgbm_train.xlsx',index=False)


### Tune n_estimators for gradient boosting by trying 20 to 80 in steps of 10
# GridSearchCV was never imported, and the search used a classifier with
# 'roc_auc' on the continuous Price target, fitted on undefined names. It now
# uses the regressor already imported by this file, the training matrices
# built above, and a regression score.
from sklearn.model_selection import GridSearchCV
param_test1 = {'n_estimators':np.arange(20,81,10)}
gsearch1 = GridSearchCV(
    estimator = GradientBoostingRegressor(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=8,max_features='sqrt',subsample=0.8,random_state=10),
    param_grid = param_test1, scoring='neg_mean_squared_error',n_jobs=4, cv=5)
# y_train is a one-column frame; flatten it to the 1-D target that fit() expects.
gsearch1.fit(X_train, y_train.values.ravel())
# cv_results_ replaces the removed grid_scores_ attribute.
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

# Candidate values for the manual LightGBM grid (one value each at present).
Learning_Rate = np.array([0.1])
max_depth = np.array([8])
max_bin = np.array([400])
min_data_in_leaf = np.array([1])
feature_fraction = np.array([1])

# Loop-invariant: add all-zero columns for one-hot categories that never
# occur in the test sheet, once, before any model is scored.
for col in [col for col in independent if col not in TestData.columns]:
    TestData[col] = 0

count = 0
for count, (i,j,k,l,m) in enumerate(product(min_data_in_leaf,Learning_Rate,max_depth,feature_fraction,max_bin), start=1):
    # The learning rate j was iterated and written into the file name but
    # never handed to LightGBM; pass it so each file matches its name.
    params = {'feature_fraction': l, 'max_bin': m, 'max_depth': k, 'min_data_in_leaf': i,
              'learning_rate': j, 'bagging_fraction':1,'num_iterations':1000}
    lgbm = lgb.train(params, train_data)
    y_scored = lgbm.predict(TestData[independent])
    submission = pd.DataFrame({'Price':y_scored})
    file_name = 'Base'+ 'x' + str(l) + 'x' + str(m) + 'x' + str(k) + 'x' + str(i) + 'x' + str(j)
    # Progress counter: number of grid points finished so far.
    print(count)
    submission.to_excel('C:/Users/vadaga/Desktop/machine hack/grid_submissions_1/' + file_name + '.xlsx',index=False)

#######################################################################################################################3
# Reference prices used as the evaluation target while CatBoost trains.
# NOTE(review): presumably an earlier averaged submission with a single
# 'Price' column aligned row-for-row with the test sheet - confirm.
y_test = pd.read_excel('C:/Users/vadaga/Desktop/machine hack/Mean_V3.xlsx')

"""CATBOOST"""

# The test feature matrix has to exist before fit() because it is used as
# the evaluation set; X_test was previously referenced without being defined.
# One-hot columns for categories that never occur in the test sheet are all zero.
for col in [col for col in independent if col not in TestData.columns]:
    TestData[col] = 0
X_test = TestData[independent]

model=CatBoostRegressor(iterations=2000, depth=8, learning_rate=0.01, loss_function = 'RMSE',l2_leaf_reg = 3,border_count=5)
model.fit(X_train, y_train,eval_set=(X_test,y_test),plot=True)

y_scored = model.predict(X_test)

submission = pd.DataFrame({'Price':y_scored})
submission.to_excel('C:/Users/vadaga/Desktop/machine hack/Base4_V3.xlsx',index=False)

""" MLPRegressor """

# MLPRegressor was used without ever being imported.
from sklearn.neural_network import MLPRegressor

model = MLPRegressor(hidden_layer_sizes=(75,75,75),max_iter=1500,learning_rate_init = 0.01)
# y_train is a one-column frame; flatten it to the 1-D target that fit() expects.
model.fit(X_train,y_train.values.ravel())

# One-hot columns for categories that never occur in the test sheet are all zero.
for col in [col for col in independent if col not in TestData.columns]:
    TestData[col] = 0
y_scored = model.predict(TestData[independent])
submission = pd.DataFrame({'Price':y_scored})
submission.to_excel('Submissions/MLPR_14.xlsx',index=False)

################################################ Parameter Tuning ##################################################
""" RandomizedSearchCV """
# Randomised search for algorithm tuning
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV

""" Custom Scorer """
from sklearn.metrics import make_scorer
# my_custom_loss_func (RMSLE) is the helper defined with the other functions
# at the top of this file. It is a loss, hence greater_is_better=False.
score = make_scorer(my_custom_loss_func, greater_is_better=False)

""" Hyperparam grid """
num_leaves = randint(10,1000)
learning_rate = uniform(loc=0.01, scale=0.3)
n_estimators = randint(100,2000)
subsample = uniform(loc=0.5, scale=0.5)
colsample_bytree = uniform(loc=0.5, scale=0.5)
max_depth = randint(2,10)
#feature_fraction = uniform(0,1)
min_data_in_leaf = randint(1,50)
# NOTE(review): max_bin is defined but not part of the search space below -
# confirm whether it was meant to be included.
max_bin = randint(20,500)
hyperparameters = dict(num_leaves=num_leaves,learning_rate=learning_rate,n_estimators=n_estimators,subsample=subsample,
                       colsample_bytree=colsample_bytree,max_depth = max_depth,min_data_in_leaf=min_data_in_leaf)

# Create and fit a LightGBM model, sampling each parameter
model = lgb.LGBMRegressor()
randgrid = RandomizedSearchCV(model, hyperparameters, random_state=1, n_iter=2, cv=3, n_jobs=-1,scoring=score)
randgrid.fit(X_train, y_train)

# summarize the results of the random search
# Parameters that were never set on the estimator are reported as such rather
# than raising KeyError.
best_estimator_params = randgrid.best_estimator_.get_params()
for param_name in ('num_leaves', 'learning_rate', 'n_estimators', 'subsample', 'colsample_bytree',
                   'max_depth', 'feature_fraction', 'min_data_in_leaf', 'max_bin'):
    print('Best ' + param_name + ':', best_estimator_params.get(param_name, 'not searched'))
print(randgrid)

def report(results, n_top=3):
    """Print the candidates ranked 1..n_top from a search-results mapping.

    ``results`` is indexed with the keys 'rank_test_score',
    'mean_test_score', 'std_test_score' and 'params'. For every rank, each
    candidate holding that rank is printed with its mean and standard
    deviation of the validation score and its parameter set.
    """
    ranks = results['rank_test_score']
    means = results['mean_test_score']
    stds = results['std_test_score']
    for rank in range(1, n_top + 1):
        # Ties share a rank, so a rank can match several candidates or none.
        for idx in np.flatnonzero(ranks == rank):
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(means[idx], stds[idx]))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

# Print the top-ranked candidates of the search fitted above.
report(randgrid.cv_results_)

###################################### LGBM TUNING ##########################################################
""" RandomSearchCV """
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV

""" Custom Scorer """
from sklearn.metrics import make_scorer
# my_custom_loss_func (RMSLE) is the helper defined with the other functions
# at the top of this file. It is a loss, hence greater_is_better=False.
score = make_scorer(my_custom_loss_func, greater_is_better=False)

""" Hyperparam grid """
n_estimators = randint(800,1300)
min_data_in_leaf = randint(1,3)
max_bin = randint(20,600)
feature_fraction = uniform(loc=0.01,scale=0.9)
max_depth = randint(1,10)
subsample = uniform(loc=0.1,scale=0.8)
# uniform(loc, scale) samples from [loc, loc + scale]; the previous scale of
# 0.7 reached 1.15, but a column-sampling fraction cannot exceed 1.0.
colsample_bytree = uniform(loc=0.45,scale=0.55)
# NOTE(review): LightGBM lists colsample_bytree as an alias of
# feature_fraction, so the two entries below tune the same setting - confirm
# which one should stay.
hyperparameters = dict(n_estimators=n_estimators,subsample=subsample,
                       colsample_bytree=colsample_bytree,min_data_in_leaf=min_data_in_leaf,max_bin=max_bin,feature_fraction=feature_fraction,
                       max_depth = max_depth
                      )

""" Create and fit the model, testing each parameter """
model = lgb.LGBMRegressor()
randgrid = RandomizedSearchCV(model, hyperparameters, random_state=1, n_iter=300, cv=3, n_jobs=-1, scoring=score)
randgrid.fit(X_train, y_train)

"""
################################## Summarize the results of the random search #########################################
"""

best_estimator_params = randgrid.best_estimator_.get_params()
for param_name in ('num_leaves', 'learning_rate', 'n_estimators', 'subsample', 'colsample_bytree',
                   'min_data_in_leaf', 'max_bin', 'max_depth', 'feature_fraction'):
    print('Best ' + param_name + ':', best_estimator_params.get(param_name, 'not searched'))
print(randgrid)

def report(results, n_top=3):
    """Print the candidates ranked 1..n_top from a search-results mapping.

    ``results`` is indexed with the keys 'rank_test_score',
    'mean_test_score', 'std_test_score' and 'params'. For every rank, each
    candidate holding that rank is printed with its mean and standard
    deviation of the validation score and its parameter set.
    """
    ranks = results['rank_test_score']
    means = results['mean_test_score']
    stds = results['std_test_score']
    for rank in range(1, n_top + 1):
        # Ties share a rank, so a rank can match several candidates or none.
        for idx in np.flatnonzero(ranks == rank):
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(means[idx], stds[idx]))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

report(randgrid.cv_results_)
# np.save needs the object to store as its second argument; it was missing.
# cv_results_ is a dict, so it is stored as a pickled object array and must be
# read back with np.load(..., allow_pickle=True).
np.save('RandomSearchCVResultsLGBMR.npy', randgrid.cv_results_)


###############################################################################################
"""STACKING & ENSEMBLING MODEL PREDICTIONS BASED ON CORRELATION & ACCURACY"""

##CORRELATION##
pred_1 = pd.read_excel('file:///C:/Users/vadaga/Desktop/machine hack/mean/Mean_2_V2.xlsx')
pred_2 = pd.read_excel('file:///C:/Users/vadaga/Desktop/machine hack/mean/RF.xlsx')

# Correlation between two candidate submissions (displayed when run in a notebook cell).
np.corrcoef(pred_1['Price'],pred_2['Price'])

##STACKING
import os
path = 'C:/Users/vadaga/Desktop/machine hack/mean/'
# Keep only Excel workbooks, skip Excel's '~$' lock files, and sort so the
# column numbering does not depend on the arbitrary order of os.listdir().
files = sorted(
    entry for entry in os.listdir(path)
    if entry.lower().endswith(('.xlsx', '.xls')) and not entry.startswith('~$')
)
file_names = [file.split('.')[0] for file in files]
df = pd.DataFrame()
# One 'Price<i>' column per submission file.
for i, file in enumerate(files):
    df['Price'+str(i)] = pd.read_excel(path + file)['Price']
# Plain average across all submissions.
df['mean'] = df.mean(axis=1)

mean_price = pd.DataFrame({'Price':df['mean']})
mean_price.to_excel('C:/Users/vadaga/Desktop/machine hack/best_stack_5.xlsx',index=False)
############################################################################################################
"""Weighted Stacking"""

import os
path = 'C:/Users/vadaga/Desktop/machine hack/mean/'
# Keep only Excel workbooks, skip Excel's '~$' lock files, and sort so each
# weight is paired with the same file on every run (os.listdir() order is
# arbitrary).
files = sorted(
    entry for entry in os.listdir(path)
    if entry.lower().endswith(('.xlsx', '.xls')) and not entry.startswith('~$')
)
file_names = [file.split('.')[0] for file in files]
df = pd.DataFrame()
# One weight per submission file, matched to the sorted file order.
w = [2,1,1]
if len(files) != len(w):
    raise ValueError('expected %d submission files to match the weights, found %d' % (len(w), len(files)))
for i, (file, weight) in enumerate(zip(files, w)):
    df['Price'+str(i)] = pd.read_excel(path + file)['Price']*weight
# A weighted mean divides the weighted sum by the total weight. Averaging the
# weighted columns divided by the number of files instead, which inflated
# every price by sum(w) / len(w).
df['mean'] = df.sum(axis=1) / sum(w)
mean_price = pd.DataFrame({'Price':df['mean']})
mean_price.to_excel('C:/Users/vadaga/Desktop/machine hack/best_stack_655.xlsx',index=False)

###########################################################################################################3
