In [19]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the training series and the key file from the working directory.
train, key = (pd.read_csv(csv_name) for csv_name in ("train_2.csv", "key_2.csv"))

In [3]:
# Load the sample submission CSV from the working directory.
submission = pd.read_csv("sample_submission_2.csv")

In [56]:
# Show the (rows, columns) dimensions of the raw training frame.
train.shape

(145063, 804)

In [4]:
# Work on a zero-filled copy; fillna returns a new frame, so `train` itself is untouched.
df_train = train.fillna(0)

In [5]:
# Page values look like "<title>_<project>_<access>_<agent>". The title may itself
# contain underscores, so the three trailing fields are read from the end.
page_tokens = df_train['Page'].str.split('_')
for offset, field in enumerate(('agent', 'access', 'project'), start=1):
    df_train[field] = page_tokens.str[-offset]

# The language code is the first dot-separated label of the project host name.
df_train['language'] = df_train['project'].str.split('.').str[0]

In [9]:
# Show the underscore-separated tokens of the Page value stored under index label 0.
df_train['Page'].str.split('_')[0]

['2NE1', 'zh.wikipedia.org', 'all-access', 'spider']

In [7]:
# Preview the first rows, including the derived agent/access/project/language columns.
df_train.head()

Unnamed: 0,Page,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,...,2017-09-05,2017-09-06,2017-09-07,2017-09-08,2017-09-09,2017-09-10,agent,access,project,language
0,2NE1_zh.wikipedia.org_all-access_spider,18.0,11.0,5.0,13.0,14.0,9.0,9.0,22.0,26.0,...,16.0,27.0,29.0,23.0,54.0,38.0,spider,all-access,zh.wikipedia.org,zh
1,2PM_zh.wikipedia.org_all-access_spider,11.0,14.0,15.0,18.0,11.0,13.0,22.0,11.0,10.0,...,54.0,25.0,26.0,23.0,13.0,81.0,spider,all-access,zh.wikipedia.org,zh
2,3C_zh.wikipedia.org_all-access_spider,1.0,0.0,1.0,1.0,0.0,4.0,0.0,3.0,4.0,...,4.0,7.0,3.0,4.0,7.0,6.0,spider,all-access,zh.wikipedia.org,zh
3,4minute_zh.wikipedia.org_all-access_spider,35.0,13.0,10.0,94.0,4.0,26.0,14.0,9.0,11.0,...,6.0,16.0,19.0,30.0,38.0,4.0,spider,all-access,zh.wikipedia.org,zh
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20.0,23.0,28.0,14.0,8.0,7.0,spider,all-access,zh.wikipedia.org,zh


In [15]:
# Distinct values of the derived 'agent' column.
df_train['agent'].unique()

array(['spider', 'all-agents'], dtype=object)

In [16]:
# Distinct values of the derived 'access' column.
df_train['access'].unique()

array(['all-access', 'desktop', 'mobile-web'], dtype=object)

In [17]:
# Distinct values of the derived 'project' column.
df_train['project'].unique()

array(['zh.wikipedia.org', 'fr.wikipedia.org', 'en.wikipedia.org',
       'commons.wikimedia.org', 'ru.wikipedia.org', 'www.mediawiki.org',
       'de.wikipedia.org', 'ja.wikipedia.org', 'es.wikipedia.org'], dtype=object)

In [27]:
# Distinct values of the derived 'language' column (first label of the project host).
df_train['language'].unique()

array(['zh', 'fr', 'en', 'commons', 'ru', 'www', 'de', 'ja', 'es'], dtype=object)

### Define evaluation metric

In [90]:
# Approximated differentiable SMAPE for one prediction
def differentiable_smape(true, predicted):
    """Smoothed SMAPE between actual and forecast values.

    Computes ``2 * |predicted - true| / max(|true| + |predicted| + eps, 0.5 + eps)``
    with ``eps = 0.1``. The floor on the denominator keeps the ratio finite
    when both values are zero or close to it.

    Args:
        true: actual value(s); a scalar or numpy array.
        predicted: forecast value(s); a scalar or numpy array that broadcasts
            against ``true``.
    Returns:
        The element-wise smoothed SMAPE, with the broadcast shape of the inputs.
    """
    smoothing = 0.1
    abs_error = np.abs(predicted - true)
    denominator = np.maximum(
        np.abs(true) + np.abs(predicted) + smoothing,
        0.5 + smoothing,
    )
    return (abs_error / denominator) * 2

In [40]:
def feature_normalization(train, test):
    """Min-max scale features using training statistics and append a bias column.

    Each feature (column) whose training values are not all identical is
    mapped to ``(x - min) / (max - min)``, where ``min`` and ``max`` come from
    the training set only. The same transformation is applied to the test set,
    so test values may fall outside [0, 1]. Features that are constant in the
    training set are dropped from both outputs, because their range is zero.
    A final column of ones (bias term) is appended to both outputs.

    If every feature is constant, the outputs contain only the bias column.

    Args:
        train - training set, a 2D numpy array of size (num_instances, num_features)
        test  - test set, a 2D numpy array of size (num_instances, num_features)
    Returns:
        (train_norm_bias, test_norm_bias) - the scaled training and test sets,
        each of size (num_instances, num_kept_features + 1)
    """
    train = np.asarray(train)
    test = np.asarray(test)

    # Per-column statistics computed on the training set only.
    col_min = train.min(axis=0)
    col_max = train.max(axis=0)

    # Keep only the features that actually vary in the training data.
    keep = col_max > col_min
    span = col_max[keep] - col_min[keep]

    # Boolean column selection keeps the arrays 2D even when nothing is kept,
    # so the hstack below still works and yields a bias-only matrix.
    train_normalized = (train[:, keep] - col_min[keep]) / span
    test_normalized = (test[:, keep] - col_min[keep]) / span

    train_norm_bias = np.hstack((train_normalized, np.ones((train.shape[0], 1))))
    test_norm_bias = np.hstack((test_normalized, np.ones((test.shape[0], 1))))
    return (train_norm_bias, test_norm_bias)

### Flatten train into page, date, visit & Feature Engineering

In [7]:
# Reshape from wide (one column per date) to long (one row per Page/date pair),
# treating missing visit counts as zero.
train = train.fillna(0)
train_flattened = train.melt(id_vars='Page', var_name='date', value_name='Visits')
train_flattened['date'] = train_flattened['date'].astype('datetime64[ns]')

# dayofweek runs Monday=0 .. Sunday=6, so 5 and 6 mark the weekend (stored as 0.0/1.0).
train_flattened['weekend'] = (train_flattened['date'].dt.dayofweek >= 5).astype(float)


In [8]:
# Per-page statistics of the visit counts, each as a one-column frame indexed by Page.
visits_by_page = train_flattened.groupby(['Page'])['Visits']
df_median = visits_by_page.median().to_frame('median')
df_mean = visits_by_page.mean().to_frame('mean')

# Attach both statistics to every row; the result is indexed by Page.
train_flattened = train_flattened.set_index('Page').join(df_mean).join(df_median)


In [11]:
# Move the Page index back into a regular column and restore a default integer index.
train_flattened = train_flattened.reset_index()

In [10]:
# Day of week (Monday=0 .. Sunday=6). The vectorised .dt accessor replaces a
# per-row Python lambda, which is far slower on the melted frame (one row per
# Page/date pair), and matches how the other date features are derived.
train_flattened['weekday'] = train_flattened['date'].dt.weekday

In [12]:
# Feature engineering with the date
train_flattened['year']=train_flattened.date.dt.year 
train_flattened['month']=train_flattened.date.dt.month 
train_flattened['day']=train_flattened.date.dt.day

In [13]:
# The raw timestamp is no longer needed once its parts have been extracted.
train_dropped = train_flattened.drop('date', axis='columns')

In [14]:
# Take every row of a single page: the page of the first row.
# Slicing the first 803 rows only works if the join happened to leave each
# page's rows contiguous; whether it does depends on the pandas version, and
# 803 hard-codes the number of dates. Selecting by Page value works for either
# row order. NOTE(review): assumes Page values are unique per series - confirm.
first_page = train_dropped['Page'].iloc[0]
one_page = train_dropped[train_dropped['Page'] == first_page].reset_index(drop=True)

In [15]:
# The page name is identical on every remaining row, so drop it before modelling.
one_page = one_page.drop('Page', axis='columns')

In [47]:
# Separate the regression target (Visits) from the remaining feature columns.
one_page_target = one_page['Visits']
one_page_ftr = one_page.drop('Visits', axis='columns')

In [25]:
# one_page_ftr.shape

(803, 5)

In [179]:
# Positional split of the rows: [0, 563) train, [563, 743) validation, [743, end) test.
train_end, val_end = 563, 743
X_train, y_train = one_page_ftr[:train_end], one_page_target[:train_end]
X_val, y_val = one_page_ftr[train_end:val_end], one_page_target[train_end:val_end]
X_test, y_test = one_page_ftr[val_end:], one_page_target[val_end:]

### Normalize features

In [180]:
# Scale train/validation features with min-max statistics taken from the training
# split; feature_normalization also appends a bias column of ones to each.
X_tr_norm,X_val_norm = feature_normalization(X_train.values,X_val.values)

In [181]:
# Min-max scale the target using the training target's minimum and maximum only,
# and shape both results as (n, 1) column vectors.
min_t = np.asarray(y_train).min()
max_t = np.asarray(y_train).max()

# A constant training target has zero range. Divide by 1 instead, which maps it
# to zeros, so the outputs are never silently left empty.
target_range = (max_t - min_t) if max_t > min_t else 1.0

y_tr_norm = ((np.asarray(y_train) - min_t) / target_range).reshape(-1, 1)
y_val_norm = ((np.asarray(y_val) - min_t) / target_range).reshape(-1, 1)

In [126]:
# Locate the largest value in the training target.
np.argmax(y_train)

275

In [66]:
# Look up the Visits value stored under index label 567.
one_page_target[567]

1160760.0

In [89]:
# Inspect every field of the row at position 571.
one_page.iloc[571]

Visits       20.0
weekend       1.0
median        3.0
year       2017.0
month         1.0
day          22.0
Name: 571, dtype: float64

## Ridge Regression Model for one page

In [91]:
# Baseline ridge regressor with scikit-learn's default settings.
ridge = Ridge()

In [92]:
# Fit the baseline on the raw (un-normalised) training features and target.
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [93]:
# Baseline forecasts for the held-out test rows.
pred = ridge.predict(X_test)

In [105]:
def compute_mean_smape(pred,y_test):
    """Mean smoothed SMAPE over paired forecasts and actual values.

    Both inputs are converted to numpy arrays and paired by position. A pandas
    Series whose index does not start at 0 (for example a slice of a longer
    series) is therefore handled correctly, instead of being looked up by
    label as plain ``y_test[i]`` would do.

    Args:
        pred: forecasts; array-like with one entry (or row) per instance.
        y_test: actual values; array-like with at least ``len(pred)`` entries.
    Returns:
        The mean of ``differentiable_smape`` over all instances.
    """
    forecasts = np.asarray(pred)
    actuals = np.asarray(y_test)
    smape = [
        differentiable_smape(actuals[i], forecasts[i])
        for i in range(len(forecasts))
    ]
    mean_sm = np.asarray(smape).mean()
    return mean_sm

### Tuning hyperparameter alpha

In [171]:
# Candidate regularisation strengths: powers of ten from 1e-9 up to 1e4.
alpha = [10**exponent for exponent in range(-9, 5)]

In [182]:
# Fit one ridge model per candidate alpha on the normalised training data and
# record its mean smoothed SMAPE on the normalised validation data.
loss_hist = []
for a in alpha:
    model = Ridge(alpha=a).fit(X_tr_norm, y_tr_norm)
    pred = model.predict(X_val_norm)
    loss = compute_mean_smape(pred, y_val_norm)
    loss_hist += [loss]

In [183]:
# Position in `alpha` of the candidate with the lowest validation loss.
np.argmin(loss_hist)

0

### Retrain model using train+val set

In [191]:
# Stack the training and validation rows into a single fitting set.
X_trv = np.vstack((X_train, X_val))

# Series.reshape is deprecated in pandas and removed in later releases, so
# convert to ndarray before reshaping to (n, 1) columns. np.asarray also
# accepts arrays that are already reshaped, so this cell can be re-run safely.
y_train = np.asarray(y_train).reshape(-1, 1)
y_val = np.asarray(y_val).reshape(-1, 1)
y_trv = np.vstack((y_train, y_val))

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [195]:
# Scale train+validation and test features with min-max statistics taken from the
# train+validation rows; feature_normalization also appends a bias column of ones.
X_trv_norm,X_test_norm = feature_normalization(X_trv,X_test.values)

In [200]:
# Use the alpha that achieved the lowest validation loss rather than a
# hard-coded copy of it, so this cell stays correct if the sweep changes.
best_alpha = alpha[int(np.argmin(loss_hist))]

# Refit on train+validation and score on the held-out test rows.
# NOTE(review): the sweep was run on a normalised target, but this fit uses the
# raw target - confirm that is intended.
model = Ridge(alpha=best_alpha)
model.fit(X_trv_norm,y_trv)
pred = model.predict(X_test_norm)
loss = compute_mean_smape(pred,y_test.values)

In [201]:
# Mean smoothed SMAPE of the retrained model on the held-out test rows.
loss

1.9860510352708083