In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import CategoricalEncoder
import category_encoders as ce
import collections as col
from cvxpy import *

In [2]:
#Load air data
air_store = pd.read_csv('data/air_store_info.csv')
#Load other data
date_info = pd.read_csv('data/date_info.csv')
# drop'day of week' in date_info; rename its column name
date_info_drop = date_info.drop('day_of_week', axis=1)
date_info_drop.columns.values[0]='visit_date'

store_id = pd.read_csv('data/store_id_relation.csv', index_col='air_store_id')
air_visit1 = pd.read_csv('data/air_visit_data.csv')
air_reserve1 = pd.read_csv('data/air_reserve.csv')

In [3]:
weather_data1 = pd.read_csv('data/merged_rest_data_weather_measurements.csv')

In [4]:
weather_data = weather_data1[['air_store_id', 
                              'visit_date', 
                              'avg_temperature1', 
                              'high_temperature1', 
                              'low_temperature1',
                              'hours_sunlight1']]

In [7]:
weather_data.isnull().sum()

air_store_id             0
visit_date               0
avg_temperature1     29329
high_temperature1    30393
low_temperature1     30393
hours_sunlight1      47092
dtype: int64

In [8]:
# drop duplicates in air_reserve data, and sum according to dates
air_reserve = air_reserve1[air_reserve1.duplicated()==False]
#air_reserve.duplicated().sum()
air_reserve['visit_datetime'] = air_reserve['visit_datetime'].apply(lambda _: _[:10])
air_reserve = air_reserve.drop(['reserve_datetime'], axis=1)\
              .groupby(['air_store_id', 'visit_datetime'])['reserve_visitors']\
              .sum().reset_index().rename(columns = {'visit_datetime':'visit_date'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [9]:
id_pool = list(set(air_visit1['air_store_id']))
dictionary = {}
for storeid in id_pool:
    dictionary[storeid] = air_visit1.loc[air_visit1['air_store_id'] == storeid]
    dictionary[storeid]['minus7days'] = dictionary[storeid]['visitors'].shift(7)
    dictionary[storeid]['MA'] = dictionary[storeid].rolling(window=7)['visitors'].mean()
frames = []
for storeid in id_pool:
    frames.append(dictionary[storeid])
df = pd.concat(frames).dropna(axis=0, how='any')

tmp0 = pd.merge(df,
                date_info_drop,
                on='visit_date',
                how = 'left')
tmp1 = pd.merge(tmp0,
                air_reserve,
                on=['air_store_id', 'visit_date'],
                how = 'left')
tmp2 = pd.merge(tmp1,
                air_store,
                on='air_store_id',
                how = 'left')
tmp3 = pd.merge(tmp2,
               weather_data,
               on=['air_store_id', 'visit_date'],
               how = 'left')
air_visit_reserve_merge = tmp3.dropna(axis=0, how='any').drop_duplicates()

air_visit_reserve_merge['visit_date'] = pd.to_datetime(air_visit_reserve_merge['visit_date'])
air_visit_reserve_merge['month'] = air_visit_reserve_merge['visit_date'].dt.month
air_visit_reserve_merge['day'] = air_visit_reserve_merge['visit_date'].dt.day
air_visit_reserve_merge['weekday'] = air_visit_reserve_merge['visit_date'].dt.weekday

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [11]:
air_visit_reserve_merge['air_genre_name'] =\
    air_visit_reserve_merge['air_genre_name'].astype('category')
air_visit_reserve_merge['air_area_name'] =\
    air_visit_reserve_merge['air_area_name'].astype('category')

### label encoding

In [12]:
le_genre = LabelEncoder()
le_genre.fit(air_visit_reserve_merge['air_genre_name'])
df_genre_label_encoding = pd.DataFrame(le_genre.transform(air_visit_reserve_merge['air_genre_name']),
                                       columns=['genre_label_encoding'])

le_area = LabelEncoder()
le_area.fit(air_visit_reserve_merge['air_area_name'])
df_area_label_encoding = pd.DataFrame(le_area.transform(air_visit_reserve_merge['air_area_name']),
                                       columns=['area_label_encoding'])

df_label_encoding = pd.merge(df_genre_label_encoding, df_area_label_encoding, left_index=True, right_index=True)

In [13]:
df_label_encoding.index = air_visit_reserve_merge.index

df_with_label_encoding = pd.merge(air_visit_reserve_merge, 
                                  df_label_encoding, 
                                  right_index=True, left_index=True)\
                            .dropna(axis=0, how='any')\
                            .drop(['air_genre_name',
                                    'air_area_name',
                                    'latitude',
                                    'longitude'], axis=1)

### onehot dense encoding

In [14]:
#T: pd.get_dummies
encoder_onehot_dense = CategoricalEncoder(encoding='onehot-dense')
tmp_encoder_onehot_dense = encoder_onehot_dense.fit_transform(air_visit_reserve_merge[['air_genre_name', 
                                                                                       'air_area_name']])
tmp_encoder_onehot_categories = encoder_onehot_dense.categories_

df_onehot_encoding = pd.DataFrame(tmp_encoder_onehot_dense, 
                                  columns=np.append(tmp_encoder_onehot_categories[0], 
                                                    tmp_encoder_onehot_categories[1]))

In [15]:
df_with_onehot = pd.get_dummies(air_visit_reserve_merge, 
                               columns=['air_genre_name','air_area_name'])\
                    .dropna(axis=0, how='any')\
                    .drop(['latitude',
                            'longitude'], axis=1)

### target encoding

In [16]:
encoder_target = ce.TargetEncoder(cols = ['air_genre_name', 'air_area_name'])
tmp_encoder_target = encoder_target.fit_transform(air_visit_reserve_merge[['air_genre_name', 
                                                                           'air_area_name']],
                                                  air_visit_reserve_merge['visitors'])
tmp_encoder_target = tmp_encoder_target.rename(index=str, 
                                               columns={'air_genre_name':'air_genre_name_encoding',
                                                        'air_area_name':'air_area_name_encoding'})

In [17]:
tmp_encoder_target.index = tmp_encoder_target.index.map(int)
df_with_target_encoding = pd.concat([air_visit_reserve_merge, tmp_encoder_target], axis=1)\
                            .dropna(axis=0, how='any')\
                            .drop(['air_genre_name',
                                    'air_area_name',
                                    'latitude',
                                    'longitude'], axis=1)

### baseline data

In [18]:
df_baseline = air_visit_reserve_merge[['air_store_id', 'visit_date', 'visitors', 'minus7days']]
df_baseline['intercept'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### test val train split

In [19]:
def designed_train_test_split(df):
    df['visit_date'] = df['visit_date'].astype(str)
    test_date_selector = df["visit_date"].str.startswith("2017-04") | df["visit_date"].str.startswith("2017-03")
    val_date_selector = df["visit_date"].str.startswith("2017-02")
    df_test = df[test_date_selector].drop(['air_store_id', 'visit_date'], axis=1)
    df_val = df[val_date_selector].drop(['air_store_id', 'visit_date'], axis=1)
    df_train = df[~(test_date_selector | val_date_selector)].drop(['air_store_id', 'visit_date'], axis=1)
    
    df_train_y = df_train['visitors']
    df_train_X = df_train.drop(['visitors'], axis=1)
    df_val_y = df_val['visitors']
    df_val_X = df_val.drop(['visitors'], axis=1)
    df_test_y = df_test['visitors'] #true y
    df_test_X = df_test.drop(['visitors'], axis=1)     

    return df_train_X, df_train_y, df_val_X, df_val_y, df_test_X, df_test_y

In [20]:
TrainValTest_dict = col.OrderedDict([
    ("df_train_X", []),
    ("df_train_y", []),
    ("df_val_X", []),
    ("df_val_y", []),
    ("df_test_X", []),
    ("df_test_y", []),
])
df_name_list = [df_baseline, df_with_target_encoding, df_with_onehot, df_with_label_encoding]
for name in df_name_list:
    tmp_train_X, tmp_train_y, tmp_val_X, tmp_val_y, tmp_test_X, tmp_test_y = designed_train_test_split(name)
    TrainValTest_dict['df_train_X'].append(tmp_train_X)
    TrainValTest_dict['df_train_y'].append(tmp_train_y)
    TrainValTest_dict['df_val_X'].append(tmp_val_X)
    TrainValTest_dict['df_val_y'].append(tmp_val_y)
    TrainValTest_dict['df_test_X'].append(tmp_test_X)
    TrainValTest_dict['df_test_y'].append(tmp_test_y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [21]:
tmp_dict = TrainValTest_dict.copy()
for i in range(4):
    tmp_dict['df_train_X'][i] = np.array(tmp_dict['df_train_X'][i])
    tmp_dict['df_val_X'][i] = np.array(tmp_dict['df_val_X'][i])
    tmp_dict['df_test_X'][i] = np.array(tmp_dict['df_test_X'][i])
    tmp_dict['df_train_y'][i] = tmp_dict['df_train_y'][i].values
    tmp_dict['df_val_y'][i] = tmp_dict['df_val_y'][i].values
    tmp_dict['df_test_y'][i] = tmp_dict['df_test_y'][i].values

In [26]:
class DesignedLinearModel:
    def __init__(self, reg=None, lam=None):
        if reg is None:
            assert lam is None
        else:
            assert reg in ("l1", "l2")
        self.reg = reg
        self.lam = lam

        self.w = None
        self.fitted=False

    @classmethod
    def _objective_func_wo_reg(cls, X, y, w):
        m = X.shape[0]
        obj_func = norm(mul_elemwise(inv_pos(y), X*w-y), 1)/m
        return obj_func  
    
    def fit(self, X, y):
        n = X.shape[1]
        w = Variable(n, 1)
        if self.reg == 'l1':
            prob = Problem(Minimize(self._objective_func_wo_reg(X, y, w)+self.lam*norm(w,1)))
        elif self.reg == 'l2':
            prob = Problem(Minimize(self._objective_func_wo_reg(X, y, w)+self.lam*sum_squares(w)))
        elif self.reg is None:
            prob = Problem(Minimize(self._objective_func_wo_reg(X, y, w)))
        else:
            raise KeyError()

        prob.solve()
#       print (prob.value)
#       print (prob.solver_stats)
        self.w = w.value
        self.fitted=True
        return self

    def predict(self, X):
        assert self.fitted==True
        return X*self.w

In [27]:
def score_func(prediction, true):
    num_samples = true.shape[0]
    evaluation = np.linalg.norm(np.divide(prediction-true, true), ord=1)/num_samples
    return evaluation

In [24]:
%matplotlib inline
def hyperparamtune(df_train_X, df_train_y, df_val_X, df_val_y, params, reg):
    train_eval = []
    val_eval = []
    for param in params:
        model=DesignedLinearModel(reg=reg, lam=param)
        model.fit(X=df_train_X, y=df_train_y)
        pred_train = model.predict(X=df_train_X)
        pred_val = model.predict(X=df_val_X)
        train_eval.append(score_func(pred_train, df_train_y))
        val_eval.append(score_func(pred, df_val_y))
    
    plt.plot(train_eval)
    plt.plot(val_eval)
    plt.xlabel('params')
    plt.ylabel('evaluation')
    
    return val_eval

In [None]:
hyperparamtune

In [None]:
model=DesignedLinearModel()
model.fit(X=tmp_dict['df_train_X'][0], y=tmp_dict['df_train_y'][0])
pred_val = model.predict(X=tmp_dict['df_train_X'][0])
pred_test = model.predict(X=tmp_dict['df_test_X'][0])
score_val = score_func(pred_val, tmp_dict['df_train_y'][0])
score_test = score_func(pred_test, tmp_dict['df_test_y'][0])
print(score_val); print(score_test)

In [29]:
from sklearn.linear_model import LinearRegression
regr = LinearRegression()
regr.fit(tmp_dict['df_train_X'][0], tmp_dict['df_train_y'][0])
pred_val = regr.predict(tmp_dict['df_val_X'][0])
score_val = score_func(pred_val, tmp_dict['df_val_y'][0])
pred_test = regr.predict(tmp_dict['df_test_X'][0])
score_test = score_func(pred_test, tmp_dict['df_test_y'][0])
print(score_val); print(score_test)

0.79823879656
0.784903923274


### toy example

In [None]:
# dX = np.array(TrainValTest_dict['df_train_X'][0])
# dy = TrainValTest_dict['df_train_y'][0].values
# X1 = np.array(TrainValTest_dict['df_test_X'][0])
# y1 = TrainValTest_dict['df_test_y'][0].values
X=np.random.rand(2,3)
y=np.random.rand(2)
y1 = np.random.rand(4)
X1 = np.random.rand(4,3)

In [None]:
w = Variable(3,1)
tmp_size = X.shape[0]
prob = Problem(Minimize(norm(mul_elemwise(inv_pos(y), 
                                          X*w-y), 1)/tmp_size+norm(w,1)))
prob.solve()
w=w.value
pred = X1*w

In [None]:
evaluation=np.linalg.norm(np.divide(pred-y1, y1), ord=1)
evaluation

In [None]:
a = [2,3,4,5,6]
b = [1,2,3,9,10]

In [None]:
%matplotlib inline
plt.plot(a)
plt.plot(b)
plt.xlabel('params')
plt.ylabel('score')
plt.axis([0, 7, 0, 20])