In [None]:
# Basic libraries
import numpy as np
import pandas as pd
import pickle

# Sklearn tools
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

# XGBoost's gradient boosting regressor
import xgboost

In [None]:
# Reading competition data csv's
# Column dtypes are enforced at parse time (the mapping was previously defined but never
# passed to read_csv). Integer columns use an explicit 64-bit type so that large values
# such as timestamps parse identically on every platform.
dtypes = { 'id': np.int64, 'timestamp': np.int64, 'user_verified': bool, 'user_statuses_count': np.int64,
          'user_followers_count': np.int64, 'user_friends_count': np.int64, 'user_mentions': str, 'urls': str,
          'hashtags': str, 'text': str }
train_df = pd.read_csv('data/train.csv', dtype=dtypes)
eval_df = pd.read_csv('data/evaluation.csv', dtype=dtypes)

# Numerical user features shared by the train and evaluation matrices.
# The order matters: train_predict reads columns 0-2 as counts and column 3 as user_verified.
numeric_columns = ['user_followers_count', 'user_statuses_count', 'user_friends_count', 'user_verified']

# Create numpy array of data
train_y = np.array(train_df['retweet_count'])
train_features = np.array(train_df[numeric_columns]).astype(np.float32)
eval_features = np.array(eval_df[numeric_columns]).astype(np.float32)

In [None]:
# Load embeddings
def _load_embeddings(path):
    """Unpickle the object stored at `path` and return the result of its `.numpy()` call."""
    # NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle).numpy()

train_tweets = _load_embeddings('data/train.pkl')
eval_tweets = _load_embeddings('data/evaluation.pkl')

In [None]:
# PCA of tweets embeddings
# The projection is fitted on train and evaluation embeddings together, then each set is
# reduced to 2 components. random_state is pinned so the projection is reproducible across
# runs when scikit-learn selects a randomized SVD solver.
# NOTE: this cell overwrites train_tweets / eval_tweets with their projections; re-run the
# embedding-loading cell before re-running this one.
pca = PCA(n_components=2, random_state=0)
pca.fit(np.concatenate([train_tweets, eval_tweets]))
train_tweets = pca.transform(train_tweets)
eval_tweets = pca.transform(eval_tweets)

In [None]:
# Combine numerical and textual features
# Columns 0-3 hold the user features, the remaining columns hold the tweet embedding components.
train_matrix = np.concatenate((train_features, train_tweets), axis=-1)
eval_matrix = np.concatenate((eval_features, eval_tweets), axis=-1)
train_x = pd.DataFrame(train_matrix)
eval_x = pd.DataFrame(eval_matrix)

In [None]:
def train_predict(train_x, train_y, test_x):
    """
    Predicts a target value for every row of 'test_x' with a two-stage scheme:
    1. For each element, find 'n1' nearest training neighbors on the first three feature columns,
       re-weight their distances using user_verified (column 3) and keep the 'n2' closest.
    2. Sort those 'n2' neighbors by their target value and train an XGBRegressor to predict the
       position (0..n2-1) of the neighbor whose target is closest to the element's own target.
    The prediction for a test element is the target of the neighbor at the predicted position.
    @param train_x DataFrame of training features; columns 0-2 are used for the neighbor search,
                   column 3 is read as user_verified, all columns are fed to the regressor.
    @param train_y Array of training targets (retweet_count), aligned with 'train_x'.
    @param test_x DataFrame of test features with the same column layout as 'train_x'.
    @return pred Array with one predicted target per row of 'test_x'.
    """
    # Parameters tuned in grid cross-validation
    # NOTE(review): alpha1/alpha2 were tuned before the verified-mask fix below; consider re-tuning.
    n1, n2, alpha1, alpha2 = 40, 10, 0.4, 0.8
    n_estimators, learning_rate, max_depth = 200, 0.08, 10

    train_y = np.asarray(train_y)

    # user_verified as boolean vectors, so they can be used as masks (not as integer indices)
    is_verified_train = np.asarray(train_x.iloc[:, 3]).astype(bool)
    is_verified_test = np.asarray(test_x.iloc[:, 3]).astype(bool)

    def filter_neighbors(neighbors_idx, neighbors_dist, kind):
        """
        Filters neighbors based on user_verified, by lowering the distance to those neighbors which have 
        user_verified equal to the element (factor alpha1 when the element is verified, alpha2 otherwise).
        For kind 'train' the element itself, when present among its neighbors, is forced to the first
        position so the caller can drop it by discarding column 0.
        @param neighbors_idx Index of the 'n1' neighbors for each element.
        @param neighbors_dist Distance of the 'n1' neighbors for each element (not modified).
        @param kind 'train' or 'test'
        @return new_neighbors_idx Index of the 'n2+1' neighbors for each element.
        """
        if kind == 'train':
            element_verified = is_verified_train
        elif kind == 'test':
            element_verified = is_verified_test
        else:
            raise ValueError("kind must be 'train' or 'test'")

        # Boolean mask of the neighbors sharing the element's user_verified value.
        # (Indexing with a 0/1 integer array would select columns 0 and 1 instead of masking.)
        neighbors_verified = is_verified_train[neighbors_idx]
        same_status = neighbors_verified == element_verified[:, None]

        # Lowers the distance by factors alpha1 or alpha2
        factor = np.where(element_verified, alpha1, alpha2)[:, None]
        scaled_dist = np.where(same_status, neighbors_dist * factor, neighbors_dist)

        if kind == 'train':
            # Rows with identical features tie at distance 0, so the element is not guaranteed
            # to be its own first neighbor; make it sort strictly first.
            is_self = neighbors_idx == np.arange(len(neighbors_idx))[:, None]
            scaled_dist[is_self] = -np.inf

        # Sort neighbors with new distances and keep the n2+1 closest
        order = np.argsort(scaled_dist, axis=1)[:, :n2+1]
        return np.take_along_axis(neighbors_idx, order, axis=1)

    def sort_by_target(neighbors_idx):
        """Reorders each row of neighbor indices by ascending target value (retweet_count)."""
        order = np.argsort(train_y[neighbors_idx], axis=1)
        return np.take_along_axis(neighbors_idx, order, axis=1)

    # First KNN, of n1 neighbors with 3 features: user_followers_count, user_statuses_count and user_friends_count
    nn = NearestNeighbors(n_neighbors=n1)
    nn.fit(train_x.iloc[:, :3])

    # Calculate neighbors for training set. This will be used to train the XGBRegressor
    dist, train_neighbors_idx = nn.kneighbors(train_x.iloc[:, :3])

    # Filter neighbors based on user_verified value, selecting only n2+1 neighbors
    train_neighbors_idx = filter_neighbors(train_neighbors_idx, dist, 'train')

    # Sort neighbors based on retweet_count, filtering itself (first column)
    train_neighbors_idx = sort_by_target(train_neighbors_idx[:, 1:])

    # Calculate neighbors for testing set
    dist, test_neighbors_idx = nn.kneighbors(test_x.iloc[:, :3])
    test_neighbors_idx = filter_neighbors(test_neighbors_idx, dist, 'test')

    # Sort neighbors based on retweet count, remove furthest one to keep only n2 neighbors
    test_neighbors_idx = sort_by_target(test_neighbors_idx[:, :-1])

    # Calculate prediction values for XGBRegressor, which are the index of the best neighbor of each element
    diff = np.abs(train_y[:, None] - train_y[train_neighbors_idx])
    train_best_neighbors_idx = np.argmin(diff, axis=1)

    # Train regressor
    # NOTE(review): tree_method='gpu_hist' needs a GPU-enabled xgboost; newer xgboost releases
    # deprecate it in favour of tree_method='hist' with device='cuda' -- confirm installed version.
    xgb = xgboost.XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, tree_method='gpu_hist', objective='reg:squarederror')
    xgb.fit(train_x, train_best_neighbors_idx)
    pred = xgb.predict(test_x)

    # Truncate result (is better than rounding) and keep it inside the valid neighbor positions
    pred = np.clip(pred.astype(int), 0, n2-1)

    # Convert back to retweet_count
    pred = train_y[test_neighbors_idx[np.arange(len(test_x)), pred]]
    return pred

In [None]:
# Fit on the full training set and predict retweet counts for the evaluation set
pred = train_predict(train_x=train_x, train_y=train_y, test_x=eval_x)

In [None]:
# Build the submission table (one row per evaluation tweet) and write it without the index column
submission_columns = {'TweetID': eval_df['id'], 'NoRetweets': pred}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)