In [134]:
import numpy as np
import warnings
warnings.simplefilter("ignore", UserWarning)

import pandas as pd
import pickle
from collections import Counter
from typing import Type, List
import optuna
from datetime import datetime
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import wordpunct_tokenize

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [135]:
DATA_PATH = "./scrapped_data/scrapped_train_emb.csv"
SUBMISSION_DATA_PATH = "./scrapped_data/scrapped_test_emb.csv"
MEAN_TARGET_ENCODING_PATH = "./mean_target_encoding"
TEST_RATIO = 0.2

In [136]:
data = pd.read_csv(DATA_PATH, index_col=0)
submission_data = pd.read_csv(SUBMISSION_DATA_PATH, index_col=0)
train_data, test_data = train_test_split(data, test_size=TEST_RATIO)
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)
submission_data.reset_index(drop=True, inplace=True)

# assert data.isna().sum().sum() == 0
# assert train_data.isna().sum().sum() == 0
print(train_data.shape, test_data.shape, submission_data.shape)

(3004, 332) (752, 332) (500, 331)


In [137]:
def estimate_mean_target_regularization(category, global_mean: float, category_means: pd.Series,
                                        category_counts: pd.Series, alpha: float = 10) -> float:
    try:
        return (category_counts[category] * category_means[category] + global_mean * alpha) / (category_counts[category] * alpha)
    except:
        return global_mean

In [138]:
def mean_target_regularization(dataset: pd.DataFrame, column: str, target_name: str,
                               train: bool = True, alpha: float = 10) -> pd.Series:
    encoding_file_path = MEAN_TARGET_ENCODING_PATH + "_" + column + ".pickle"
    if train:
        encoding = {
            "global_mean": dataset[target_name].mean(),
            "counts": dataset.groupby(dataset[column]).count()[target_name],
            "category_means": dataset.groupby(dataset[column]).mean()[target_name]
        }
        with open(encoding_file_path, "wb") as output_file:
            pickle.dump(encoding, output_file)
    else:
        with open(encoding_file_path, "rb") as input_file:
            encoding = pickle.load(input_file)
    
    global_mean = encoding["global_mean"]
    counts = encoding["counts"]
    category_means = encoding["category_means"]
    
    target_mean = dataset[column].apply(
        lambda category: estimate_mean_target_regularization(
            category,
            global_mean=global_mean,
            category_means=category_means,
            category_counts=counts,
            alpha=alpha
        )
    )
    
    return target_mean

In [210]:
def preprocess_data(data: pd.DataFrame, train: bool = True):
    MOST_COMMON_COUNT = 50
    result_data = data.copy()
    
    # Mean target encoding with regularization
    result_data["author_mean_target"] = mean_target_regularization(result_data, "author", "target", train=train)
    result_data["publication_mean_target"] = mean_target_regularization(result_data, "publication", "target", train=train)
    
    
    # Add title len as a feature
    result_data["title_char_len"] = result_data["title"].apply(len)
    result_data["title_token_len"] = result_data["title"].apply(lambda title: len(wordpunct_tokenize(title)))
    
    # Encode dates
    dates = result_data["date"].apply(lambda date: pd.Timestamp(date))
    result_data["day"] = dates.dt.day
    result_data["month"] = dates.dt.month
    result_data["year"] = dates.dt.year
    result_data["ago"] = abs(result_data["date"].apply(lambda date: pd.Timestamp(date).date()) - datetime.today().date())
    result_data["ago"] = pd.to_numeric(result_data['ago'].dt.days, downcast='integer')
    # Season encoding
    season_dict = {
        1: 'winter',
        2: 'winter',
        3: 'spring',
        4: 'spring',
        5: 'spring',
        6: 'summer',
        7: 'summer',
        8: 'summer',
        9: 'fall',
        10: 'fall',
        11: 'fall',
        12: 'winter'
    }
    result_data['season'] = result_data['month'].map(lambda x: season_dict[x])
    result_data['winter'] = [1 if season=='winter' else 0 for season in result_data['season']]
    result_data['summer'] = [1 if season=='summer' else 0 for season in result_data['season']]
    result_data['spring'] = [1 if season=='spring' else 0 for season in result_data['season']]
    result_data['fall'] = [1 if season=='fall' else 0 for season in result_data['season']]
    
    keyword_columns = [f"key_word_{idx}" for idx in range(1, 11)]

    num_cols = ['followers', 'reading_time', 'n_words', 'n_code_chunks', 'n_images', 'bold_text_count',
       'italic_text_count', 'mean_image_width', 'mean_image_height',
       'n_images', 'n_lists', 'n_vids', 'n_links']
    for col in num_cols:
        result_data[col] = result_data[col].astype('int64')

    # Filling nans in keyword_1, ..., keyword_10 columns
    for keyword_column in keyword_columns:
        result_data[keyword_column].fillna("<unk>", inplace=True)
        
    # Concat topics in one column
    all_topics = result_data[keyword_columns[0]]
    for keyword_column in keyword_columns[1:]:
        all_topics += " " + result_data[keyword_column]
    
    assert all_topics.isna().sum() == 0
    result_data["all_topics"] = all_topics
    
    # Lemmatize keyword columns
    lemmatizer = WordNetLemmatizer()
    result_data[keyword_columns] = result_data[keyword_columns].applymap(lemmatizer.lemmatize)
    
    if train:
        # Count topics for keyword_columns
        topics = []
        for keyword_column in keyword_columns:
            topics += result_data[keyword_column].tolist()

        topics_counter = Counter(topics)

        most_common_topics = set([topic for topic, _ in topics_counter.most_common()[:MOST_COMMON_COUNT]])
        
        with open("most_common_topics.pickle", "wb") as output_file:
            pickle.dump(most_common_topics, output_file)
            
        # Use CountVectorizer on topics
        vectorizer = CountVectorizer(vocabulary=most_common_topics)
        vectorizer.fit(result_data["all_topics"])
        with open("topics_vectorizer.pickle", "wb") as output_file:
            pickle.dump(vectorizer, output_file)
            
        # One-hot encode language, analytic, polarity, emotion
        one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
        one_hot_encoder.fit(result_data[["language", "analytic", "polarity", "emotion"]])
        with open("one_hot_encoder.pickle", "wb") as output_file:
            pickle.dump(one_hot_encoder, output_file)
    else:
        with open("most_common_topics.pickle", "rb") as input_file:
            most_common_topics = pickle.load(input_file)
        with open("topics_vectorizer.pickle", "rb") as input_file:
            vectorizer = pickle.load(input_file)
        with open("one_hot_encoder.pickle", "rb") as input_file:
            one_hot_encoder = pickle.load(input_file)

    topics_vectorized = pd.DataFrame(vectorizer.transform(result_data["all_topics"]).toarray())
    result_data = pd.concat([result_data, topics_vectorized.add_suffix("_vectorizer")], axis=1)
    assert result_data.isna().sum().sum() == 0
    
    categorials_encoded = pd.DataFrame(one_hot_encoder.transform(result_data[["language"]]).toarray())
    result_data = pd.concat([result_data, categorials_encoded.add_suffix("_one_hot")], axis=1)
    assert result_data.isna().sum().sum() == 0
    
    assert len(most_common_topics) == MOST_COMMON_COUNT
    assert result_data[keyword_columns].isin(most_common_topics).sum().sum() > 0
    assert result_data.isna().sum().sum() == 0
    
    drop_columns = ["date", "author", "publication", "season", "pure_text", "title", "all_topics", "language"] + keyword_columns
    result_data.drop(drop_columns, inplace=True, axis=1)
    
    return result_data

In [211]:
train_data_preprocessed = preprocess_data(train_data, train=True)
test_data_preprocessed = preprocess_data(test_data, train=False)
submission_preprocessed = preprocess_data(submission_data, train=False)

In [157]:
set(train_data_preprocessed.columns) - set(submission_preprocessed.columns)

{'target'}

In [158]:
set(submission_preprocessed.columns) - set(train_data_preprocessed.columns)

set()

In [159]:
class Regressor:
    def train(self, X: np.ndarray, y: np.ndarray) -> None:
        raise NotImplementedError
        
    def predict(self, X: np.ndarray) -> np.ndarray:
        raise NotImplementedError

In [160]:
class MyCB(Regressor):
    def __init__(self):
        self.regressor = CatBoostRegressor(verbose=False, loss_function="MAE")
        
    def train(self, X: np.ndarray, y: np.ndarray) -> None:
        self.regressor.fit(X, y)
        
    def predict(self, X: np.ndarray) -> np.ndarray:
        return self.regressor.predict(X)
    
class MyLGBM(Regressor):
    def __init__(self):
        self.regressor = LGBMRegressor(objective="mae")
        
    def train(self, X: np.ndarray, y: np.ndarray) -> None:
        self.regressor.fit(X, y)
        
    def predict(self, X: np.ndarray) -> np.ndarray:
        return self.regressor.predict(X)
    
class MyXGB(Regressor):
    def __init__(self):
        self.regressor = XGBRegressor()
        
    def train(self, X: np.ndarray, y: np.ndarray) -> None:
        self.regressor.fit(X, y)
        
    def predict(self, X: np.ndarray) -> np.ndarray:
        return self.regressor.predict(X)

In [161]:
def cross_validate(trial, data: pd.DataFrame, RegressorClasses: List[Type[Regressor]]):
    NUM_FOLDS = 3
    kfold = KFold(NUM_FOLDS)
    
    # Optimize weights for model predictions
    weights = np.zeros(len(RegressorClasses))
    for idx in range(len(RegressorClasses) - 1):
        weights[idx] = trial.suggest_float(f"x_{idx}", 0, 1 - weights.sum())
    weights[-1] = 1 - weights.sum() # num_models x 1
    print("Current weights:", weights)
    
    mae_history = []
    for fold_idx, (train_idx, test_idx) in enumerate(kfold.split(data)):
        train_df = data.iloc[train_idx].reset_index(drop=True)
        test_df = data.iloc[test_idx].reset_index(drop=True)
        
        train_df_preprocessed = preprocess_data(train_df, train=True)
        test_df_preprocessed = preprocess_data(test_df, train=False)
        
        X_train = train_df_preprocessed.drop(["target"], axis=1)
        X_test = test_df_preprocessed.drop(["target"], axis=1)
        
        y_train = train_df_preprocessed["target"].to_numpy()
        y_test = test_df_preprocessed["target"].to_numpy()
        
        regressors = []
        for RegressorClass in RegressorClasses:
            current_regressor = RegressorClass()
            current_regressor.train(X_train, y_train)
            regressors.append(current_regressor)
        
        predictions = []
        for regressor in regressors:
            predictions.append(regressor.predict(X_test))
        predictions = np.vstack(predictions) # num_models x test_size
        
        final_predictions = np.sum(predictions * weights[:, None], axis=0)
        current_loss = mean_absolute_error(y_test, final_predictions)
        mae_history.append(current_loss)
        print(f"Fold number: {fold_idx + 1}, loss: {current_loss}")
        
    return np.mean(mae_history)

In [162]:
X_train = train_data_preprocessed.drop(["target"], axis=1).to_numpy()
X_test = test_data_preprocessed.drop(["target"], axis=1).to_numpy()

y_train = train_data_preprocessed["target"].to_numpy()
y_test = test_data_preprocessed["target"].to_numpy()

In [153]:
study = optuna.create_study()

objective_wrapper = lambda trial: cross_validate(trial, data, [MyCB, MyLGBM, MyXGB])
study.optimize(objective_wrapper, timeout=60*40)

[32m[I 2022-04-21 17:56:52,426][0m A new study created in memory with name: no-name-d9bc36ac-49ad-4637-ba59-099cc86a22f3[0m


Current weights: [0.5974105  0.05598622 0.34660328]
Fold number: 1, loss: 705.0918402334522
Fold number: 2, loss: 814.9163511149744


[32m[I 2022-04-21 17:57:17,346][0m Trial 0 finished with value: 773.1765043720076 and parameters: {'x_0': 0.5974105015642539, 'x_1': 0.05598622197469578}. Best is trial 0 with value: 773.1765043720076.[0m


Fold number: 3, loss: 799.5213217675962
Current weights: [0.55103208 0.07643921 0.37252871]
Fold number: 1, loss: 702.4294306473214
Fold number: 2, loss: 810.1182029659999


[32m[I 2022-04-21 17:57:42,414][0m Trial 1 finished with value: 770.3672716053898 and parameters: {'x_0': 0.5510320755042853, 'x_1': 0.07643921119285382}. Best is trial 1 with value: 770.3672716053898.[0m


Fold number: 3, loss: 798.554181202848
Current weights: [0.20708381 0.39666257 0.39625363]
Fold number: 1, loss: 730.8756404310446
Fold number: 2, loss: 814.2433356675651


[32m[I 2022-04-21 17:58:07,218][0m Trial 2 finished with value: 791.7143406949398 and parameters: {'x_0': 0.20708380846108487, 'x_1': 0.3966625663604391}. Best is trial 1 with value: 770.3672716053898.[0m


Fold number: 3, loss: 830.0240459862093
Current weights: [0.07242704 0.72566566 0.2019073 ]
Fold number: 1, loss: 803.0633348970796
Fold number: 2, loss: 868.1316018711457


[32m[I 2022-04-21 17:58:32,490][0m Trial 3 finished with value: 853.9721990782709 and parameters: {'x_0': 0.07242703935797068, 'x_1': 0.7256656625430156}. Best is trial 1 with value: 770.3672716053898.[0m


Fold number: 3, loss: 890.7216604665877
Current weights: [0.10144008 0.60696481 0.29159511]
Fold number: 1, loss: 773.1383528547428
Fold number: 2, loss: 843.7759141753742


[32m[I 2022-04-21 17:58:58,543][0m Trial 4 finished with value: 827.3542710842452 and parameters: {'x_0': 0.10144007983774816, 'x_1': 0.6069648098869286}. Best is trial 1 with value: 770.3672716053898.[0m


Fold number: 3, loss: 865.1485462226184
Current weights: [0.18503552 0.50232318 0.31264129]
Fold number: 1, loss: 755.1807611776098
Fold number: 2, loss: 833.4166432065925


[32m[I 2022-04-21 17:59:23,926][0m Trial 5 finished with value: 812.6101268789909 and parameters: {'x_0': 0.18503552403768642, 'x_1': 0.5023231811090868}. Best is trial 1 with value: 770.3672716053898.[0m


Fold number: 3, loss: 849.2329762527703
Current weights: [0.62890678 0.08371914 0.28737408]
Fold number: 1, loss: 716.3169354262229
Fold number: 2, loss: 826.6016848853735


[32m[I 2022-04-21 17:59:50,613][0m Trial 6 finished with value: 783.2824132582349 and parameters: {'x_0': 0.6289067808368878, 'x_1': 0.08371913861521872}. Best is trial 1 with value: 770.3672716053898.[0m


Fold number: 3, loss: 806.9286194631082
Current weights: [0.59131501 0.34185432 0.06683067]


KeyboardInterrupt: 

In [212]:
data_preprocessed = preprocess_data(data, train=True)
data_preprocessed

Unnamed: 0,followers,reading_time,n_words,n_code_chunks,bold_text_count,italic_text_count,mean_image_width,mean_image_height,n_images,n_lists,...,29_one_hot,30_one_hot,31_one_hot,32_one_hot,33_one_hot,34_one_hot,35_one_hot,36_one_hot,37_one_hot,38_one_hot
0,3700,23,4892,0,35,1,1150,426,30,3,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,256,6,889,9,6,1,516,304,13,1,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,4700,11,2217,0,5,27,994,582,6,2,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,239,4,523,4,5,0,753,512,2,2,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,173,10,1433,2,96,15,657,291,14,4,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3751,1800,1,153,0,9,28,1400,716,1,0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3752,124,6,1009,0,21,0,859,631,4,3,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3753,317,4,632,7,1,2,700,405,1,1,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3754,2700,7,399,10,88,48,826,457,17,0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [213]:
data_preprocessed.isna().sum().sum()

0

In [214]:
X = data_preprocessed.drop(["target"], axis=1).to_numpy()
y = data_preprocessed["target"].to_numpy()

In [215]:
cb = CatBoostRegressor(loss_function="MAE")
cb.fit(X, y)
lgb = LGBMRegressor(objective="mae")
lgb.fit(X, y)
xgb = XGBRegressor()
xgb.fit(X, y)

0:	learn: 1368.5560936	total: 8.33ms	remaining: 8.32s
1:	learn: 1351.6027915	total: 18.2ms	remaining: 9.06s
2:	learn: 1338.1233855	total: 27.1ms	remaining: 9.01s
3:	learn: 1327.6505554	total: 34.4ms	remaining: 8.56s
4:	learn: 1320.7919976	total: 42.9ms	remaining: 8.53s
5:	learn: 1310.2458421	total: 50.9ms	remaining: 8.43s
6:	learn: 1300.4786153	total: 58.5ms	remaining: 8.3s
7:	learn: 1288.2007581	total: 65.6ms	remaining: 8.14s
8:	learn: 1282.4446802	total: 72.1ms	remaining: 7.94s
9:	learn: 1277.3569293	total: 79.1ms	remaining: 7.83s
10:	learn: 1272.1344998	total: 85.9ms	remaining: 7.72s
11:	learn: 1263.6620942	total: 92.9ms	remaining: 7.65s
12:	learn: 1254.5252916	total: 100ms	remaining: 7.59s
13:	learn: 1246.9781741	total: 106ms	remaining: 7.49s
14:	learn: 1239.4542072	total: 113ms	remaining: 7.39s
15:	learn: 1232.4602357	total: 118ms	remaining: 7.27s
16:	learn: 1225.4798650	total: 124ms	remaining: 7.17s
17:	learn: 1217.8781719	total: 130ms	remaining: 7.08s
18:	learn: 1207.9358701	tot

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [167]:
study.best_params

{'x_0': 0.5510320755042853, 'x_1': 0.07643921119285382}

In [190]:
cb_weight = best_params["x_0"]#study.best_params["x_0"]
lgb_weight = best_params["x_1"]#study.best_params["x_1"]
xgb_weight = 1 - cb_weight - lgb_weight

In [191]:
sub_idx = pd.read_csv(SUBMISSION_DATA_PATH)["idx"]
submission_data_preprocessed = preprocess_data(submission_data, train=False)

In [192]:
X = submission_data_preprocessed.to_numpy()
# X.drop(columns=embeddings_cols)
cb_pred = cb.predict(X)
lgb_pred = lgb.predict(X)
xgb_pred = xgb.predict(X)

final_pred = cb_weight * cb_pred + lgb_weight * lgb_pred + xgb_weight * xgb_pred
final_pred[final_pred < 0] = 0
final_pred = np.round(final_pred)

In [193]:
pd.DataFrame({"id": sub_idx, "claps": final_pred.astype(int)}).to_csv("submission.csv", index=False)

In [120]:
embeddings_cols = data_preprocessed.columns[data_preprocessed.columns.get_loc('target')+1:data_preprocessed.columns.get_loc('author_mean_target')]

In [121]:
data_preprocessed = data_preprocessed.drop(columns=embeddings_cols)

In [93]:
i = range(1,38)

In [95]:
cols = [str(i_) + '_one_hot' for i_ in i]

In [107]:
data_preprocessed = data_preprocessed.drop(columns=cols)