# Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import json
import pandas as pd
import numpy as np
import typing as tp
import time
from contextlib import contextmanager
from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV

# Utils

In [None]:
# Directory the stage-1 prediction table is read from (DATADIR / 'stage1.csv').
DATADIR = Path('stage1')
# Directory the submission CSV is written to.
OUTDIR = Path('output')
# Row position that splits the stage-1 table: rows [:N_FIT_ROWS] are used to
# fit the stacking model, rows [N_FIT_ROWS:] are the ones it predicts for.
N_FIT_ROWS = 15117
# Short alias kept because the later cells slice with `t`.
t = N_FIT_ROWS

In [None]:
@contextmanager
def timer(logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None):
    """Measure the wall-clock duration of a ``with`` block and report it.

    Parameters
    ----------
    logger : object with an ``info(str)`` method, optional
        Receives the message when truthy; otherwise the message is printed.
    format_str : str
        ``str.format`` template that receives the elapsed seconds as its only
        positional argument.
    prefix, suffix : optional
        Converted with ``str()`` and placed before / after the formatted
        duration. They are treated as literal text, so braces are safe.

    The message is emitted even when the body raises; the exception then
    propagates unchanged.
    """
    # perf_counter is monotonic, so the measurement cannot go negative or jump
    # if the system clock is adjusted while the block runs.
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - start
        # Format the duration first and only then attach prefix/suffix, so that
        # '{' or '}' inside them is never interpreted as a format field.
        out_str = format_str.format(elapsed)
        if prefix:
            out_str = str(prefix) + out_str
        if suffix:
            out_str = out_str + str(suffix)
        if logger:
            logger.info(out_str)
        else:
            print(out_str)

# Data Loading

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd "drive/My Drive/probspace_citations"

Mounted at /content/drive
/content/drive/My Drive/probspace_citations


In [None]:
# Read the stage-1 prediction table from the configured data directory.
stage1_csv = DATADIR / 'stage1.csv'
stage1 = pd.read_csv(stage1_csv)

In [None]:
# Keep the untransformed target in its own column the first time this cell
# runs, so that re-running the cell recomputes log1p from the raw values
# instead of applying it to an already-transformed column.
if 'cites_raw' not in stage1.columns:
    stage1['cites_raw'] = stage1['cites']
# The regression target used from here on is log(1 + cites).
stage1['cites'] = np.log1p(stage1['cites_raw'])
stage1

Unnamed: 0,cites,pred_888feat_lgb1,pred_888feat_lgb2,pred_888feat_cat2,pred_888feat_cat1,pred_888feat_lgb3,pred_888feat_cat3,pred_888feat_xgb2,pred_888feat_lgb4,pred_888feat_lgb5,pred_888feat_lgb6
0,2.079442,2.278676,2.304112,2.288384,2.281731,2.292609,2.282578,2.288522,2.284216,2.294980,2.297394
1,5.241747,4.858338,4.859591,4.925864,4.951648,4.859817,4.951661,4.859949,4.877359,4.889964,4.870561
2,2.197225,2.209237,2.212626,2.228004,2.194563,2.194233,2.203873,2.205231,2.197676,2.217161,2.213708
3,1.098612,2.523394,2.532064,2.771250,2.715540,2.502572,2.721692,2.614595,2.538510,2.586742,2.481297
4,2.708050,2.669004,2.692888,2.629841,2.653717,2.699101,2.627605,2.667642,2.712665,2.691833,2.685774
...,...,...,...,...,...,...,...,...,...,...,...
74196,,3.295104,3.293745,3.312605,3.303358,3.280196,3.309491,3.312790,3.296582,3.286474,3.282828
74197,,1.790466,1.793509,1.802541,1.799175,1.795068,1.798720,1.829115,1.801530,1.801121,1.794718
74198,,2.490395,2.486748,2.400805,2.399785,2.489409,2.397732,2.454541,2.483456,2.490948,2.489340
74199,,1.744697,1.745020,1.763783,1.764632,1.740299,1.759669,1.802432,1.743573,1.761053,1.741536


In [None]:
# Stage-1 prediction columns used as stacker inputs (lgb1-6 and cat1-3).
feature_cols = [
    'pred_888feat_lgb1', 'pred_888feat_lgb2', 'pred_888feat_lgb3',
    'pred_888feat_lgb4', 'pred_888feat_lgb5', 'pred_888feat_lgb6',
    'pred_888feat_cat1', 'pred_888feat_cat2', 'pred_888feat_cat3',
]

# First `t` rows form the fitting set: a 2-D feature matrix and a column-vector target.
X_stage1 = stage1[feature_cols].iloc[:t].to_numpy().reshape(-1, len(feature_cols))
y_stage1 = stage1['cites'].iloc[:t].to_numpy().reshape(-1, 1)

# Ridge

In [None]:
# Candidate regularisation strengths searched by RidgeCV.
alphas_list = [
    0.001, 0.005,
    0.01, 0.05,
    0.1, 0.5,
    1, 5,
    10, 50,
    100,
]

# Fit with 5-fold cross-validation over the candidates; fit() returns the
# estimator itself, and the trailing expression displays its repr.
clf = RidgeCV(alphas=alphas_list, cv=5).fit(X_stage1, y_stage1)
clf

RidgeCV(alphas=array([1.e-03, 5.e-03, 1.e-02, 5.e-02, 1.e-01, 5.e-01, 1.e+00, 5.e+00,
       1.e+01, 5.e+01, 1.e+02]),
        cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
        store_cv_values=False)

In [None]:
# Same stacker input columns, in the same order, as the model was fitted on.
feature_cols = [
    'pred_888feat_lgb1', 'pred_888feat_lgb2', 'pred_888feat_lgb3',
    'pred_888feat_lgb4', 'pred_888feat_lgb5', 'pred_888feat_lgb6',
    'pred_888feat_cat1', 'pred_888feat_cat2', 'pred_888feat_cat3',
]

# Predictions on the rows the model was fitted on, and their RMSE.
# This is an in-sample (training) error, not a held-out score.
pred = clf.predict(X_stage1)
rmse = np.sqrt(mean_squared_error(y_stage1, pred))
print(rmse)

# Predictions for the rows from position `t` onward, on the log1p scale of the target.
remaining_X = stage1[feature_cols].iloc[t:].to_numpy().reshape(-1, len(feature_cols))
output = clf.predict(remaining_X)

0.48273134622685787


In [None]:
# The test records (JSON Lines) supply the ids for the submission.
df_test = pd.read_json('input/test_data.json', lines=True)

# Undo the log1p target transform. The result is stored under a new name
# instead of overwriting `output`, so re-running this cell cannot apply expm1
# a second time. ravel() flattens the (n, 1) prediction array to one value per row.
cites_pred = np.expm1(np.asarray(output)).ravel()
# Fail with a clear message if predictions and test ids do not line up one-to-one.
assert len(cites_pred) == len(df_test), (len(cites_pred), len(df_test))

df_sub = pd.DataFrame({"id": df_test["id"], "cites": cites_pred})

# to_csv does not create missing directories, so make sure the target exists.
OUTDIR.mkdir(parents=True, exist_ok=True)
df_sub.to_csv(OUTDIR / 'submission.csv', index=False)