In [None]:
!wget https://lodmedia.hb.bizmrg.com/case_files/768812/train_dataset_train.csv

In [None]:
!wget https://lodmedia.hb.bizmrg.com/case_files/768812/test_dataset_test.csv

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [None]:
# Load the test set, parse `publish_date` into datetimes and order rows chronologically.
test_df = (
    pd.read_csv("test_dataset_test.csv")
    .assign(
        publish_date=lambda frame: pd.to_datetime(
            frame["publish_date"], format="%Y-%m-%d %H:%M:%S"
        )
    )
    .sort_values(by="publish_date")
)
test_df

In [None]:
# Load the training set, parse `publish_date` into datetimes and order rows chronologically.
train_df = (
    pd.read_csv("train_dataset_train.csv")
    .assign(
        publish_date=lambda frame: pd.to_datetime(
            frame["publish_date"], format="%Y-%m-%d %H:%M:%S"
        )
    )
    .sort_values(by="publish_date")
)
train_df

In [None]:
# Integer timestamp feature: the raw int64 value of `publish_date` floor-divided by 10**9.
# NOTE(review): presumably nanoseconds since the epoch -> whole seconds; confirm the
# datetime resolution produced by pd.to_datetime in the installed pandas version.
train_df["ts"] = np.floor_divide(train_df["publish_date"].values.astype(np.int64), 10 ** 9)
test_df["ts"] = np.floor_divide(test_df["publish_date"].values.astype(np.int64), 10 ** 9)

In [None]:
def split_array(val):
    """Split a bracketed, comma-separated list string into its items.

    The first and last characters of ``val`` (the brackets) are dropped, the
    remainder is split on ``","`` and each piece is cleaned up: surrounding
    whitespace is removed and, when the piece is wrapped in a matching pair of
    single or double quotes, that one pair is removed as well.

    Parameters
    ----------
    val : str
        Text such as ``"['a','b']"`` or ``"['a', 'b']"``.

    Returns
    -------
    list of str
        The items in order. An empty list literal (``"[]"``) yields ``[""]``,
        i.e. a single empty-string item, which callers use as the
        "no author / no tag" key.

    Notes
    -----
    Commas inside a quoted item are not supported; the string is split on
    every comma.
    """
    inner = val.strip()[1:-1]
    items = []
    for piece in inner.split(","):
        piece = piece.strip()
        # Remove exactly one pair of matching quotes; unquoted pieces are kept whole.
        if len(piece) >= 2 and piece[0] == piece[-1] and piece[0] in "'\"":
            piece = piece[1:-1]
        items.append(piece)
    return items

In [None]:
class _FallbackDict(dict):
    """Plain dict whose missing keys resolve to a fixed fallback value.

    The fallback is returned without being stored, so ``key in d``, ``len(d)``
    and iteration still reflect only the keys that were actually observed.
    """

    def __init__(self, data, fallback):
        super().__init__(data)
        self._fallback = fallback

    def __missing__(self, key):
        return self._fallback


def _mean_std(values):
    """Return ``{"mean": ..., "std": ...}`` of a non-empty sequence (NumPy's default std)."""
    arr = np.array(values)
    return {"mean": arr.mean(), "std": arr.std()}


def _group_target_stats(keys_per_row, views, depth, reads):
    """Aggregate the three targets per group key.

    ``keys_per_row[i]`` lists the group keys row ``i`` belongs to; the row's
    targets are added once for every key listed (duplicates included).

    Returns ``(views_agg, depth_agg, reads_agg, counts)`` where the ``*_agg``
    dicts map key -> ``{"mean", "std"}`` and ``counts`` maps key -> number of
    contributing rows.
    """
    buckets = {}
    for keys, v, d, f in zip(keys_per_row, views, depth, reads):
        for key in keys:
            per_key = buckets.setdefault(key, ([], [], []))
            per_key[0].append(v)
            per_key[1].append(d)
            per_key[2].append(f)
    views_agg = {key: _mean_std(b[0]) for key, b in buckets.items()}
    depth_agg = {key: _mean_std(b[1]) for key, b in buckets.items()}
    reads_agg = {key: _mean_std(b[2]) for key, b in buckets.items()}
    counts = {key: len(b[0]) for key, b in buckets.items()}
    return views_agg, depth_agg, reads_agg, counts


def compute_stats(df):
    """Compute global and per-group target statistics from ``df`` only.

    Every statistic, including the per-author, per-tag and per-category
    aggregates, is derived exclusively from the rows of ``df``. No other frame
    is consulted, so statistics computed on a subset never contain targets of
    rows outside that subset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``ts``, ``ctr``, ``views``, ``depth``,
        ``full_reads_percent``, ``authors``, ``tags`` and ``category``.
        ``authors`` and ``tags`` are parsed with ``split_array``.

    Returns
    -------
    dict
        * ``ts_*``, ``ctr_*``, ``views_*``, ``depth_*``, ``reads_*``
          (``_mean`` / ``_std``): statistics over all rows of ``df``.
        * ``autors_dict_{v,d,f}_agg`` / ``tags_dict_{v,d,f}_agg`` /
          ``cat_dict_{v,d,f}_agg``: key -> ``{"mean", "std"}`` of views (v),
          depth (d) and full_reads_percent (f) for that author / tag / category.
        * ``autors_dict_count`` / ``tags_dict_count`` / ``cat_dict_count``:
          key -> number of rows that contributed.

        The four ``cat_dict_*`` mappings answer lookups of categories absent
        from ``df`` with the global mean/std (and a count of 0) instead of
        raising ``KeyError``, because callers index categories without a
        membership check. ``in`` tests on them still report only observed
        categories.
    """
    views = df["views"].values
    depth = df["depth"].values
    reads = df["full_reads_percent"].values

    ts_overall = _mean_std(df["ts"].values)
    ctr_overall = _mean_std(df["ctr"].values)
    views_overall = _mean_std(views)
    depth_overall = _mean_std(depth)
    reads_overall = _mean_std(reads)

    autors_v, autors_d, autors_f, autors_count = _group_target_stats(
        [split_array(a) for a in df["authors"].values], views, depth, reads
    )
    tags_v, tags_d, tags_f, tags_count = _group_target_stats(
        [split_array(t) for t in df["tags"].values], views, depth, reads
    )
    # Each row belongs to exactly one category, hence the single-item key lists.
    cat_v, cat_d, cat_f, cat_count = _group_target_stats(
        [[c] for c in df["category"].values], views, depth, reads
    )

    return {
        "ts_mean": ts_overall["mean"],
        "ts_std": ts_overall["std"],
        "ctr_mean": ctr_overall["mean"],
        "ctr_std": ctr_overall["std"],
        "views_mean": views_overall["mean"],
        "views_std": views_overall["std"],
        "depth_mean": depth_overall["mean"],
        "depth_std": depth_overall["std"],
        "reads_mean": reads_overall["mean"],
        "reads_std": reads_overall["std"],
        "autors_dict_v_agg": autors_v,
        "autors_dict_d_agg": autors_d,
        "autors_dict_f_agg": autors_f,
        "autors_dict_count": autors_count,
        "tags_dict_v_agg": tags_v,
        "tags_dict_d_agg": tags_d,
        "tags_dict_f_agg": tags_f,
        "tags_dict_count": tags_count,
        "cat_dict_v_agg": _FallbackDict(cat_v, views_overall),
        "cat_dict_d_agg": _FallbackDict(cat_d, depth_overall),
        "cat_dict_f_agg": _FallbackDict(cat_f, reads_overall),
        "cat_dict_count": _FallbackDict(cat_count, 0),
    }

In [None]:
def _group_features(stats, prefix, key):
    """Return the seven features of ``key`` within the group ``prefix``.

    ``prefix`` is ``"cat"``, ``"autors"`` or ``"tags"``. The result is
    ``[views mean, depth mean, reads mean, views std, depth std, reads std, count]``.
    When ``key`` is not present in ``stats[prefix + "_dict_count"]`` the global
    means/stds from ``stats`` and a count of 0 are returned instead.
    """
    counts = stats[prefix + "_dict_count"]
    if key in counts:
        v = stats[prefix + "_dict_v_agg"][key]
        d = stats[prefix + "_dict_d_agg"][key]
        f = stats[prefix + "_dict_f_agg"][key]
        return [v["mean"], d["mean"], f["mean"], v["std"], d["std"], f["std"], counts[key]]
    return [
        stats["views_mean"], stats["depth_mean"], stats["reads_mean"],
        stats["views_std"], stats["depth_std"], stats["reads_std"],
        0,
    ]


def collect_data_train(df, stats):
    """Build feature rows and target rows for every (author, tag) pair of each article.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to turn into samples; needs ``ts``, ``ctr``, ``category``,
        ``authors``, ``tags``, ``views``, ``depth`` and ``full_reads_percent``.
    stats : dict
        Mapping with the keys produced by ``compute_stats``.

    Returns
    -------
    (all_data, all_targ) : tuple of lists
        ``all_data[i]`` holds 23 features: ``ts``, ``ctr``, then seven
        statistics each for the category, the author and the tag.
        ``all_targ[i]`` is ``[views, depth, full_reads_percent]`` of the source
        row. One article with A authors and T tags contributes A * T samples.

    Notes
    -----
    A category, author or tag that is unknown to ``stats`` falls back to the
    global mean/std with a count of 0, so unseen categories no longer raise
    ``KeyError``.
    """
    all_data = []
    all_targ = []
    # total= lets the progress bar show a length; iterrows() alone has none.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        autors_set = split_array(row.authors) or [""]
        tags_set = split_array(row.tags) or [""]

        # Category features are identical for every pair of this row.
        category_part = _group_features(stats, "cat", row.category)

        for autor in autors_set:
            autor_part = _group_features(stats, "autors", autor)
            for tag in tags_set:
                all_targ.append([row.views, row.depth, row.full_reads_percent])
                all_data.append(
                    [row.ts, row.ctr]
                    + category_part
                    + autor_part
                    + _group_features(stats, "tags", tag)
                )
    return all_data, all_targ

In [None]:
# Build the training matrix from repeated random resplits of train_df.
# In every round the group statistics are computed from `local_stats_dataframe`
# (rows with rnd < 0.8) and feature rows are generated for `local_train_dataframe`
# (the remaining rows); the samples of all rounds are concatenated.
features_train = []
targets_train = []
COLLENTS_COUNT = 20  # number of random resplit rounds
SPLIT_SEED = 42  # fixed seed so that Restart & Run All reproduces the same splits
split_rng = np.random.RandomState(SPLIT_SEED)  # private generator; global NumPy state is untouched
for i in range(COLLENTS_COUNT):
  print("PROCESS:", i)
  rnd = split_rng.rand(len(train_df))
  STATS_MASK = rnd < 0.8
  TRAIN_MASK = ~STATS_MASK  # exact complement of STATS_MASK, i.e. rnd >= 0.8
  local_stats_dataframe = train_df[STATS_MASK]
  local_train_dataframe = train_df[TRAIN_MASK]
  stat = compute_stats(local_stats_dataframe)
  f, t = collect_data_train(local_train_dataframe, stat)
  features_train.extend(f)
  targets_train.extend(t)
features_train = np.array(features_train)
targets_train = np.array(targets_train)

In [None]:
from catboost import CatBoostRegressor, Pool

In [None]:
# Hyper-parameter candidates handed to CatBoost's grid search.
grid = dict(
    iterations=[100, 150, 200],
    learning_rate=[0.03, 0.1],
    depth=[2, 4, 6, 8],
    l2_leaf_reg=[0.2, 0.5, 1, 3],
)

# Column 0 of targets_train is `views`.
train_pool = Pool(data=features_train, label=targets_train[:, 0])

# NOTE(review): the grid also lists `iterations`; presumably those values take
# precedence over the constructor's 500 during the search — confirm.
model_v = CatBoostRegressor(iterations=500)
model_v.grid_search(param_grid=grid, X=train_pool)

In [None]:
# Hyper-parameter candidates handed to CatBoost's grid search.
grid = dict(
    iterations=[100, 150, 200],
    learning_rate=[0.03, 0.1],
    depth=[2, 4, 6, 8],
    l2_leaf_reg=[0.2, 0.5, 1, 3],
)

# Column 1 of targets_train is `depth`.
train_pool = Pool(data=features_train, label=targets_train[:, 1])

# NOTE(review): the grid also lists `iterations`; presumably those values take
# precedence over the constructor's 500 during the search — confirm.
model_d = CatBoostRegressor(iterations=500)
model_d.grid_search(param_grid=grid, X=train_pool)

In [None]:
# Hyper-parameter candidates handed to CatBoost's grid search.
grid = dict(
    iterations=[100, 150, 200],
    learning_rate=[0.03, 0.1],
    depth=[2, 4, 6, 8],
    l2_leaf_reg=[0.2, 0.5, 1, 3],
)

# Column 2 of targets_train is `full_reads_percent`.
train_pool = Pool(data=features_train, label=targets_train[:, 2])

# NOTE(review): the grid also lists `iterations`; presumably those values take
# precedence over the constructor's 500 during the search — confirm.
model_f = CatBoostRegressor(iterations=500)
model_f.grid_search(param_grid=grid, X=train_pool)

In [None]:
# Statistics computed with the whole training frame as input; the prediction
# cell reads them to build the test-time features.
stats = compute_stats(df=train_df)

In [None]:
def _features_for_key(stats, prefix, key):
    """Return the seven features of ``key`` within the group ``prefix``.

    ``prefix`` is ``"cat"``, ``"autors"`` or ``"tags"``. The result is
    ``[views mean, depth mean, reads mean, views std, depth std, reads std, count]``;
    keys unknown to ``stats`` get the global means/stds and a count of 0.
    """
    counts = stats[prefix + "_dict_count"]
    if key in counts:
        v = stats[prefix + "_dict_v_agg"][key]
        d = stats[prefix + "_dict_d_agg"][key]
        f = stats[prefix + "_dict_f_agg"][key]
        return [v["mean"], d["mean"], f["mean"], v["std"], d["std"], f["std"], counts[key]]
    return [
        stats["views_mean"], stats["depth_mean"], stats["reads_mean"],
        stats["views_std"], stats["depth_std"], stats["reads_std"],
        0,
    ]


# Predict every test article: build one feature row per (author, tag) pair,
# predict all pairs in a single call per model and average the predictions.
result_dict = {"document_id":[], "views":[], "depth":[], "full_reads_percent":[]}
for i, row in tqdm(test_df.iterrows(), total=len(test_df)):
    # Keep only authors/tags that have statistics; if none remain, use the ""
    # placeholder key (which falls back to global stats when "" is unknown too).
    autors_set = [a for a in split_array(row.authors) if a in stats["autors_dict_count"]] or [""]
    tags_set = [t for t in split_array(row.tags) if t in stats["tags_dict_count"]] or [""]

    # A category missing from the training statistics falls back to the global
    # stats instead of raising KeyError.
    category_part = _features_for_key(stats, "cat", row.category)

    feature_rows = []
    for autor in autors_set:
        autor_part = _features_for_key(stats, "autors", autor)
        for tag in tags_set:
            feature_rows.append(
                [row.ts, row.ctr]
                + category_part
                + autor_part
                + _features_for_key(stats, "tags", tag)
            )
    features = np.array(feature_rows)  # shape: (number of pairs, 23)

    result_dict["document_id"].append(row.document_id)
    result_dict["views"].append(model_v.predict(features).mean())
    result_dict["depth"].append(model_d.predict(features).mean())
    result_dict["full_reads_percent"].append(model_f.predict(features).mean())

result_df = pd.DataFrame(result_dict)
result_df

In [None]:
# Write the predictions to the submission file, without the index column.
result_df.to_csv(path_or_buf="submit.csv", index=False)