In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import CategoricalEncoder
import category_encoders as ce
import collections as col
from cvxpy import *

In [None]:
# Load the raw CSV inputs used throughout the notebook.
air_store = pd.read_csv('data/air_store_info.csv')

# Calendar table: drop the weekday-name column and rename the first remaining
# column to 'visit_date' so it matches the merge key used further down.
date_info = pd.read_csv('data/date_info.csv')
date_info_drop = date_info.drop('day_of_week', axis=1)
# Rename through the public API. Writing into `columns.values` mutates the
# Index's backing array directly and bypasses any lookup state the Index has
# cached, so label-based access to the new name is not guaranteed to work.
date_info_drop = date_info_drop.rename(
    columns={date_info_drop.columns[0]: 'visit_date'})

store_id = pd.read_csv('data/store_id_relation.csv', index_col='air_store_id')
air_visit1 = pd.read_csv('data/air_visit_data.csv')
air_reserve1 = pd.read_csv('data/air_reserve.csv')

# Keep only the join keys and the four weather measurements used as features.
weather_data1 = pd.read_csv('data/merged_rest_data_weather_measurements.csv')
weather_data = weather_data1[['air_store_id',
                              'visit_date',
                              'avg_temperature1',
                              'high_temperature1',
                              'low_temperature1',
                              'hours_sunlight1']]

In [None]:
# Drop exact duplicate reservation rows, truncate the visit timestamp to its
# first 10 characters (presumably the 'YYYY-MM-DD' date part — verify against
# the CSV), and total the reserved visitors per store and visit date.
# .copy() gives an independent frame so the column assignment below does not
# raise SettingWithCopyWarning.
air_reserve = air_reserve1.drop_duplicates().copy()
air_reserve['visit_datetime'] = air_reserve['visit_datetime'].str[:10]
air_reserve = (
    air_reserve.drop(['reserve_datetime'], axis=1)
    .groupby(['air_store_id', 'visit_datetime'])['reserve_visitors']
    .sum()
    .reset_index()
    .rename(columns={'visit_datetime': 'visit_date'})
)

In [None]:
# Build per-store lag features from the raw visit log.
# sorted() fixes the store order (and therefore the row order of `df`);
# iterating a bare set of strings can differ from one interpreter run to the
# next because of hash randomisation.
id_pool = sorted(set(air_visit1['air_store_id']))
dictionary = {}
for storeid in id_pool:
    # .copy() detaches the per-store slice from air_visit1 so the column
    # assignments below do not trigger SettingWithCopyWarning.
    store_visits = air_visit1.loc[air_visit1['air_store_id'] == storeid].copy()
    # Lags are row shifts, not calendar-aware.
    # NOTE(review): assumes each store's rows are in date order — confirm.
    store_visits['minus7days'] = store_visits['visitors'].shift(7)
    store_visits['minus1days'] = store_visits['visitors'].shift(1)
    # 7-row rolling mean of the 1-row lag.
    store_visits['MA'] = store_visits['minus1days'].rolling(window=7).mean()
    dictionary[storeid] = store_visits
frames = [dictionary[storeid] for storeid in id_pool]
# Rows without a complete lag / rolling history contain NaN and are dropped.
df = pd.concat(frames).dropna(axis=0, how='any')

# Left-join, in order: calendar flags, reservation totals, store attributes
# and weather measurements onto the lagged visit rows.
tmp0 = df.merge(date_info_drop, how='left', on='visit_date')
tmp0['holiday_flg'] = tmp0['holiday_flg'].astype('category')
tmp1 = tmp0.merge(air_reserve, how='left', on=['air_store_id', 'visit_date'])
tmp2 = tmp1.merge(air_store, how='left', on='air_store_id')
tmp3 = tmp2.merge(weather_data, how='left', on=['air_store_id', 'visit_date'])
# Keep only fully populated, unique rows. Because the joins are left joins,
# any row without a match in one of the joined tables has NaN and is removed.
air_visit_reserve_merge = tmp3.dropna(how='any').drop_duplicates()

# Parse the visit date once, then derive day-of-month and weekday from it as
# categorical columns.
visit_ts = pd.to_datetime(air_visit_reserve_merge['visit_date'])
air_visit_reserve_merge['visit_date'] = visit_ts
air_visit_reserve_merge['day'] = visit_ts.dt.day.astype('category')
air_visit_reserve_merge['weekday'] = visit_ts.dt.weekday.astype('category')

# One-hot encoding of the two calendar categoricals, kept as a separate frame.
onehot_day_weekday = pd.get_dummies(air_visit_reserve_merge[['day', 'weekday']])

In [None]:
# Total visitors per weekday (rows) and area (columns).
geoweekday = pd.pivot_table(air_visit_reserve_merge[['weekday', 'visitors', 'air_area_name']],
                            index='weekday',
                            columns=['air_area_name'],
                            aggfunc=np.sum).fillna(0)

# Hand-picked column positions of the areas to flag. Each pivot column label
# is a tuple whose second element is the area name.
geo_columns = geoweekday.columns.tolist()
area_list = [geo_columns[i][1] for i in [0, 6, 9, 12, 20, 37]]

# geotemp is 1 for rows that are in one of the selected areas AND fall on
# weekday 5 (Saturday under pandas' Monday=0 convention), otherwise 0.
# Computed with vectorised pandas operations instead of per-row Python lists.
in_selected_area = air_visit_reserve_merge['air_area_name'].isin(area_list)
is_weekday_5 = air_visit_reserve_merge['weekday'] == 5
air_visit_reserve_merge['geotemp'] = (in_selected_area & is_weekday_5).astype('int64')

In [None]:
# Summary statistics for the categorical features:
# holiday_flg, day, weekday, genre_label_encoding, area_label_encoding.
# NOTE(review): df_with_label_encoding is only assigned in a later cell of
# this notebook, so this cell fails on a fresh kernel run from top to bottom.
df_with_label_encoding[['holiday_flg', 'day', 'weekday', 'genre_label_encoding', 'area_label_encoding']].describe()

In [None]:
# Print the frequency table of every categorical feature.
# NOTE(review): df_with_label_encoding is only assigned in a later cell of
# this notebook; this cell needs that cell to have run first.
categorical_columns = ('holiday_flg', 'day', 'weekday',
                       'genre_label_encoding', 'area_label_encoding', 'geotemp')
for name in categorical_columns:
    print(df_with_label_encoding[name].value_counts())

In [None]:
import seaborn as sns
%matplotlib inline
# Pairwise correlations between the target, its lag features, the reservation
# total and the weather measurements, shown as a heatmap.
corr_columns = [
    'visitors',
    'minus7days',
    'minus1days',
    'MA',
    'reserve_visitors',
    'avg_temperature1',
    'high_temperature1',
    'low_temperature1',
    'hours_sunlight1',
]
corr_df = air_visit_reserve_merge[corr_columns].corr()

sns.heatmap(corr_df)

In [None]:
# Total visitors per visit date (rows) and area (columns); combinations with
# no data are filled with 0.
geodate = (
    air_visit_reserve_merge[['visit_date', 'visitors', 'air_area_name']]
    .pivot_table(index='visit_date', columns=['air_area_name'], aggfunc=np.sum)
    .fillna(0)
)

In [None]:
import plotly
import plotly.graph_objs as go
import os

# Read the plotting-service credentials from the environment rather than
# writing them into the notebook. The empty-string fallbacks match the
# placeholders that were here before.
plotly.tools.set_credentials_file(username=os.environ.get('PLOTLY_USERNAME', ''),
                                  api_key=os.environ.get('PLOTLY_API_KEY', ''))

# 3-D surface of the date-by-area visitor totals.
data = [go.Surface(z=geodate.values.tolist(), colorscale='Viridis')]

# Styling shared by all three axes; each axis receives its own copy.
axis_style = dict(
    gridcolor='rgb(255, 255, 255)',
    zerolinecolor='rgb(255, 255, 255)',
    showbackground=True,
    backgroundcolor='rgb(230, 230,230)',
)

layout = go.Layout(
    width=800,
    height=700,
    autosize=False,
    # The plotted frame and the output filename are both `geodate`; the title
    # previously read 'geo_weekday'.
    title='geodate',
    scene=dict(
        xaxis=dict(axis_style),
        yaxis=dict(axis_style),
        zaxis=dict(axis_style),
        aspectratio=dict(x=1, y=1, z=0.7),
        aspectmode='manual'
    )
)

fig = dict(data=data, layout=layout)

plotly.plotly.iplot(fig, filename='geodate')

In [None]:
# Total visitors per day-of-month (rows) and area (columns); combinations
# with no data are filled with 0.
geoday = (
    air_visit_reserve_merge[['day', 'visitors', 'air_area_name']]
    .pivot_table(index='day', columns=['air_area_name'], aggfunc=np.sum)
    .fillna(0)
)

In [None]:
# Store the genre and area names as pandas categoricals.
for cat_col in ('air_genre_name', 'air_area_name'):
    air_visit_reserve_merge[cat_col] = air_visit_reserve_merge[cat_col].astype('category')

In [None]:
# Integer-encode the genre and area names. Each encoder is fitted and applied
# on the same column, and stays available afterwards as le_genre / le_area.
le_genre = LabelEncoder()
genre_codes = le_genre.fit_transform(air_visit_reserve_merge['air_genre_name'])
df_genre_label_encoding = pd.DataFrame(genre_codes, columns=['genre_label_encoding'])

le_area = LabelEncoder()
area_codes = le_area.fit_transform(air_visit_reserve_merge['air_area_name'])
df_area_label_encoding = pd.DataFrame(area_codes, columns=['area_label_encoding'])

# Put the two code columns side by side (joined on their positional index)
# and mark both as categorical.
df_label_encoding = pd.merge(df_genre_label_encoding,
                             df_area_label_encoding,
                             left_index=True,
                             right_index=True)
df_label_encoding = df_label_encoding.astype({'genre_label_encoding': 'category',
                                              'area_label_encoding': 'category'})

In [None]:
# The encoded columns were built with a fresh positional index; give them the
# index of the frame they were derived from so the index join lines up.
df_label_encoding.index = air_visit_reserve_merge.index

# Attach the label-encoded columns, then drop the raw name columns and the
# coordinates.
df_with_label_encoding = (
    air_visit_reserve_merge
    .merge(df_label_encoding, left_index=True, right_index=True)
    .dropna(how='any')
    .drop(['air_genre_name', 'air_area_name', 'latitude', 'longitude'], axis=1)
)

In [None]:
# One-hot encode genre and area names, then drop the coordinates.
df_with_onehot1 = (
    pd.get_dummies(air_visit_reserve_merge,
                   columns=['air_genre_name', 'air_area_name'])
    .dropna(how='any')
    .drop(['latitude', 'longitude'], axis=1)
)

In [None]:
# Add the one-hot day / weekday columns by index, then drop the categorical
# originals they replace.
df_with_onehot = (
    df_with_onehot1
    .merge(onehot_day_weekday, how='outer', left_index=True, right_index=True)
    .drop(['day', 'weekday'], axis=1)
)

In [None]:
# Target-encode genre and area names against the visitor count.
# NOTE(review): the encoder is fitted on every row of air_visit_reserve_merge,
# including the 2017-03 / 2017-04 rows that designed_train_test_split later
# holds out for validation and test, so the encoding has seen those targets.
target_cols = ['air_genre_name', 'air_area_name']
encoder_target = ce.TargetEncoder(cols=target_cols)
tmp_encoder_target = encoder_target.fit_transform(air_visit_reserve_merge[target_cols],
                                                  air_visit_reserve_merge['visitors'])
# index=str turns the index labels into strings; the columns get an
# '_encoding' suffix.
tmp_encoder_target = tmp_encoder_target.rename(
    index=str,
    columns={'air_genre_name': 'air_genre_name_encoding',
             'air_area_name': 'air_area_name_encoding'})
for encoded_col in ('air_genre_name_encoding', 'air_area_name_encoding'):
    tmp_encoder_target[encoded_col] = tmp_encoder_target[encoded_col].astype('category')

In [None]:
# Convert the index labels to integers so they align with
# air_visit_reserve_merge in the column-wise concat below.
tmp_encoder_target.index = tmp_encoder_target.index.map(int)

# Attach the target-encoded columns, then drop the raw name columns and the
# coordinates.
df_with_target_encoding = (
    pd.concat([air_visit_reserve_merge, tmp_encoder_target], axis=1)
    .dropna(how='any')
    .drop(['air_genre_name', 'air_area_name', 'latitude', 'longitude'], axis=1)
)

In [None]:
# Baseline feature set: the 7-row lag plus a constant intercept column.
# .assign returns a new frame, so the intercept is not written into a slice
# of air_visit_reserve_merge (which raised SettingWithCopyWarning).
df_baseline = air_visit_reserve_merge[
    ['air_store_id', 'visit_date', 'visitors', 'minus7days']
].assign(intercept=1)

In [None]:
# Dates from row position 206 of the date-indexed pivot onwards, as strings.
new_dates = geodate[206:].index.astype(str).tolist()
# Restrict three of the feature frames to those dates.
name_list = [df_baseline, df_with_onehot, df_with_label_encoding]
new_df_list = [frame[frame['visit_date'].isin(new_dates)] for frame in name_list]

In [None]:
def designed_train_test_split(df, val_prefix="2017-03", test_prefix="2017-04"):
    """Split a feature frame into train / validation / test sets by date prefix.

    Rows whose ``visit_date`` string starts with ``test_prefix`` form the test
    set, rows starting with ``val_prefix`` form the validation set, and all
    remaining rows form the training set. The ``air_store_id`` and
    ``visit_date`` columns are removed from every split, and ``visitors`` is
    separated out as the target.

    Note: ``df['visit_date']`` is converted to ``str`` in place on the frame
    that is passed in; this side effect is kept from the original version.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``air_store_id``, ``visit_date`` and ``visitors``.
    val_prefix : str, default "2017-03"
        Date prefix selecting the validation rows.
    test_prefix : str, default "2017-04"
        Date prefix selecting the test rows.

    Returns
    -------
    tuple
        ``(df_train_X, df_train_y, df_val_X, df_val_y, df_test_X, df_test_y)``.
    """
    df['visit_date'] = df['visit_date'].astype(str)
    test_date_selector = df["visit_date"].str.startswith(test_prefix)
    val_date_selector = df["visit_date"].str.startswith(val_prefix)
    train_date_selector = ~(test_date_selector | val_date_selector)

    def _features_and_target(selector):
        # Select the rows, drop the identifier columns, split off the target.
        part = df[selector].drop(['air_store_id', 'visit_date'], axis=1)
        return part.drop(['visitors'], axis=1), part['visitors']

    df_train_X, df_train_y = _features_and_target(train_date_selector)
    df_val_X, df_val_y = _features_and_target(val_date_selector)
    df_test_X, df_test_y = _features_and_target(test_date_selector)

    return df_train_X, df_train_y, df_val_X, df_val_y, df_test_X, df_test_y

In [None]:
# One list per split; position k in every list belongs to the k-th frame of
# df_name_list. The key order matches the return order of
# designed_train_test_split.
split_names = ("df_train_X", "df_train_y",
               "df_val_X", "df_val_y",
               "df_test_X", "df_test_y")
TrainValTest_dict = col.OrderedDict((split_name, []) for split_name in split_names)

df_name_list = [df_baseline, df_with_target_encoding, df_with_onehot, df_with_label_encoding]
for frame in df_name_list:
    for split_name, part in zip(split_names, designed_train_test_split(frame)):
        TrainValTest_dict[split_name].append(part)

In [None]:
def score_func(prediction, true):
    """Return the mean absolute percentage error, ``mean(|prediction - true| / |true|)``.

    Both inputs are flattened to 1-D float arrays first, so a column vector
    paired with a flat vector is compared element by element instead of being
    broadcast into a matrix, and pandas inputs are compared by position.

    Parameters
    ----------
    prediction : array-like
        Predicted values.
    true : array-like
        Observed values, with the same number of elements as ``prediction``.
        A zero entry yields an infinite or NaN score, as before.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of elements.
    """
    pred_arr = np.asarray(prediction, dtype=float).ravel()
    true_arr = np.asarray(true, dtype=float).ravel()
    if pred_arr.shape != true_arr.shape:
        raise ValueError(
            "prediction and true must have the same number of elements, got %d and %d"
            % (pred_arr.size, true_arr.size))
    evaluation = np.mean(np.abs((pred_arr - true_arr) / true_arr))
    return evaluation

In [None]:
import lightgbm as lgb

# Train LightGBM on the label-encoded feature set and score it on the test
# split. Position 3 is df_with_label_encoding in df_name_list.
# NOTE(review): run this before the cell that converts TrainValTest_dict to
# NumPy arrays; categorical_feature='auto' presumably relies on the pandas
# categorical dtypes — confirm.
LABEL_ENCODING_IDX = 3

lgb_train = lgb.Dataset(TrainValTest_dict['df_train_X'][LABEL_ENCODING_IDX],
                        TrainValTest_dict['df_train_y'][LABEL_ENCODING_IDX])
lgb_eval = lgb.Dataset(TrainValTest_dict['df_val_X'][LABEL_ENCODING_IDX],
                       TrainValTest_dict['df_val_y'][LABEL_ENCODING_IDX],
                       reference=lgb_train)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'mape',
    'num_leaves': 29,
    'learning_rate': 0.05,
    'max_depth': 4,
    'min_data_in_leaf': 380,
}

# Early stopping stays disabled, as before. The `early_stopping_rounds=None`
# keyword is omitted because newer LightGBM releases no longer accept that
# argument in `train()`; leaving it out is the same as passing None on
# releases that still have it.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=[lgb_eval],
                feature_name='auto',
                categorical_feature='auto')

y_pred = gbm.predict(TrainValTest_dict['df_test_X'][LABEL_ENCODING_IDX],
                     num_iteration=gbm.best_iteration)

score_func(y_pred, TrainValTest_dict['df_test_y'][LABEL_ENCODING_IDX])

In [None]:
# Convert every stored split from pandas to a plain NumPy array.
# np.asarray leaves existing arrays untouched, so re-running this cell is
# safe (the previous `.values` calls raised AttributeError on a second run),
# and walking the dict's own lists removes the hard-coded count of four
# feature sets.
for split_name, split_parts in TrainValTest_dict.items():
    for i, part in enumerate(split_parts):
        split_parts[i] = np.asarray(part)

In [None]:
class DesignedLinearModel:
    """Linear model fitted by minimising the mean absolute percentage error.

    The weights minimise ``(1/m) * ||(X w - y) / y||_1`` plus an optional
    penalty: ``lam * ||w||_1`` for ``reg='l1'`` or ``lam * ||w||_2^2`` for
    ``reg='l2'``. The problem is written with cvxpy names that the notebook
    brings in through ``from cvxpy import *``.

    NOTE(review): ``mul_elemwise``, ``Variable(n, 1)`` and ``X*w`` used as a
    matrix product look like the pre-1.0 cvxpy API — confirm the installed
    version before upgrading cvxpy.

    Parameters
    ----------
    reg : {None, "l1", "l2"}
        Penalty type; ``None`` fits without regularisation.
    lam : float or None
        Penalty weight. Must be ``None`` when ``reg`` is ``None`` and must be
        given when ``reg`` is set.
    """

    def __init__(self, reg=None, lam=None):
        if reg is None:
            assert lam is None, "lam must be None when reg is None"
        else:
            assert reg in ("l1", "l2"), "reg must be None, 'l1' or 'l2'"
            # Without this check a missing lam only surfaced inside fit(), as
            # a TypeError from multiplying None by the penalty term.
            assert lam is not None, "lam is required when reg is set"
        self.reg = reg
        self.lam = lam

        self.w = None        # fitted weight vector, set by fit()
        self.fitted = False  # True once fit() has stored a solution

    @classmethod
    def _objective_func_wo_reg(cls, X, y, w):
        """Build the unregularised objective ``||(X w - y) / y||_1 / m``."""
        m = X.shape[0]
        # inv_pos(y) supplies the element-wise 1/y factor.
        # NOTE(review): presumably requires every y to be strictly positive — confirm.
        obj_func = norm(mul_elemwise(inv_pos(y), X*w-y), 1)/m
        return obj_func

    def fit(self, X, y):
        """Solve for the weights on ``(X, y)`` and return ``self``.

        Raises
        ------
        KeyError
            If ``self.reg`` was changed to an unsupported value after
            construction.
        RuntimeError
            If the solver finishes without producing a value for the weights.
        """
        n = X.shape[1]
        w = Variable(n, 1)
        if self.reg == 'l1':
            prob = Problem(Minimize(self._objective_func_wo_reg(X, y, w)+self.lam*norm(w,1)))
        elif self.reg == 'l2':
            prob = Problem(Minimize(self._objective_func_wo_reg(X, y, w)+self.lam*sum_squares(w)))
        elif self.reg is None:
            prob = Problem(Minimize(self._objective_func_wo_reg(X, y, w)))
        else:
            raise KeyError(self.reg)

        prob.solve()
        # A failed solve leaves w.value as None. Previously that was wrapped
        # into an object array and the model was still marked as fitted.
        if w.value is None:
            raise RuntimeError(
                "solver returned no solution (status: %s)" % prob.status)
        self.w = np.array(w.value).reshape(-1)
        self.fitted = True
        return self

    def predict(self, X):
        """Return ``X @ w`` for a fitted model."""
        assert self.fitted, "call fit() before predict()"
        result = X @ self.w
        return result

In [None]:
# Fit the unregularised model on feature set 0 (df_baseline, the first entry
# of df_name_list) and print its validation score, then its test score.
model = DesignedLinearModel()
model.fit(X=TrainValTest_dict['df_train_X'][0],
          y=TrainValTest_dict['df_train_y'][0])

pred_val = model.predict(X=TrainValTest_dict['df_val_X'][0])
pred_test = model.predict(X=TrainValTest_dict['df_test_X'][0])

score_val = score_func(pred_val, TrainValTest_dict['df_val_y'][0])
score_test = score_func(pred_test, TrainValTest_dict['df_test_y'][0])

print(score_val)
print(score_test)

In [None]:
# Fit the L1-regularised model (lam = 1e-5) on entry 2 of `tmp_dict` and print
# its validation score, then its test score.
# NOTE(review): `tmp_dict` is not assigned in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel. It presumably
# mirrors the layout of TrainValTest_dict — confirm where it is built.
model=DesignedLinearModel(reg='l1', lam=0.00001)
model.fit(X=tmp_dict['df_train_X'][2], y=tmp_dict['df_train_y'][2])
pred_val = model.predict(X=tmp_dict['df_val_X'][2])
pred_test = model.predict(X=tmp_dict['df_test_X'][2])
score_val = score_func(pred_val, tmp_dict['df_val_y'][2])
score_test = score_func(pred_test, tmp_dict['df_test_y'][2])
print(score_val); print(score_test)

In [None]:
def l1_model(lmd, X_train, y_train, X_val, y_val):
    """Fit an L1-regularised DesignedLinearModel and return its validation score.

    ``lmd`` is the penalty weight passed as ``lam``. The model is fitted on
    ``(X_train, y_train)`` and its predictions for ``X_val`` are scored
    against ``y_val`` with ``score_func``.
    """
    estimator = DesignedLinearModel(lam=lmd, reg='l1')
    estimator.fit(X_train, y_train)
    return score_func(estimator.predict(X_val), y_val)

In [None]:
def create_test(store_id):
    """Return the feature matrix and target vector of one store.

    Rows of the module-level ``df_with_onehot`` whose ``air_store_id`` equals
    ``store_id`` are selected. The feature matrix holds every column except
    ``air_store_id``, ``visit_date`` and ``visitors``; the target vector is
    the ``visitors`` column.

    Parameters
    ----------
    store_id : str
        Store identifier to select.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_test, y_test)``; both are empty when no row matches.
    """
    store_rows = df_with_onehot[df_with_onehot['air_store_id'] == store_id]
    # `.values` replaces `.as_matrix()`, and `axis=1` is passed by keyword:
    # newer pandas removed both `as_matrix` and the positional axis argument.
    X_test = store_rows.drop(['air_store_id', 'visit_date', 'visitors'], axis=1).values
    y_test = store_rows['visitors'].values
    return X_test, y_test

def predict_result(model_use, X_test, y_test):
    """Score an already-trained model on one test set.

    Calls ``model_use.predict(X_test)`` and returns
    ``score_func(predictions, y_test)``.
    """
    return score_func(model_use.predict(X_test), y_test)

# Score the current `model` separately on each store's rows.
# NOTE(review): `store_id_list` is not assigned in any cell visible in this
# notebook — confirm where it is built.
score_store = []
# The loop variable is `sid` rather than `id`, which shadowed the builtin.
for sid in store_id_list:
    x_tmp, y_tmp = create_test(sid)
    score = predict_result(model, x_tmp, y_tmp)
    score_store.append(score)
    print("running for store %s, test score = %.6f" %(sid, score))
print('finished')