In [1]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection
import sklearn.neighbors

In [2]:
# Read the training tweets and the evaluation tweets used for the submission.
train_df, test_df = (
    pd.read_csv('data/train.csv'),
    pd.read_csv('data/evaluation.csv'),
)

# Summary statistics of the numeric training columns.
train_df.describe()

Unnamed: 0,retweets_count,favorites_count,followers_count,statuses_count,friends_count,verified,timestamp,TweetID
count,353969.0,353969.0,353969.0,353969.0,353969.0,353969.0,353969.0,353969.0
mean,15.83181,46.655442,20215.48,48085.07,1459.289003,0.030005,1647004000000.0,687250.3
std,241.986723,852.044385,259871.5,113385.4,2502.933271,0.170602,4846468000.0,417579.3
min,0.0,0.0,0.0,1.0,0.0,0.0,1301178000000.0,3.0
25%,0.0,0.0,160.0,2972.0,214.0,0.0,1647068000000.0,319449.0
50%,1.0,0.0,726.0,12501.0,693.0,0.0,1647292000000.0,671973.0
75%,3.0,1.0,2283.0,43522.0,1804.0,0.0,1647532000000.0,1049644.0
max,63674.0,122591.0,14417100.0,8183508.0,237269.0,1.0,1647727000000.0,1434456.0


In [3]:
# Preview the first five training rows (5 is the default of DataFrame.head).
train_df.head(n=5)

Unnamed: 0,text,retweets_count,favorites_count,followers_count,statuses_count,friends_count,mentions,urls,verified,hashtags,timestamp,TweetID
0,rt refarcir macron ans nom prépare,3,0,3682,453535,3628,[],[],0,[],1646978048000,832509
1,populaire,0,0,86,1016,284,[],[],0,[],1647694288000,1388011
2,faut dégager cinglé,3,1,1944,28234,1995,[],[],0,[],1647370048000,63896
3,enseignants mettre prescriptions président rép...,0,0,1,1072,0,[],['https://t.co/rytlted08g'],0,[],1647256282000,979251
4,mafieuse oppressive macron,0,0,13957,25311,10841,[],[],0,[],1647258374000,1040049


In [4]:
# Build the training feature matrix:
#   * drop the identifier, the timestamp, the mentions, the raw text and the
#     target column (retweets_count);
#   * `urls` and `hashtags` are stored as string representations of Python
#     lists, so parse each one and keep only the number of items it contains.
train_df_input = (
    train_df
    .drop(columns=['TweetID', 'timestamp', 'mentions', 'retweets_count', 'text'])
    .assign(
        urls=lambda frame: frame['urls'].apply(lambda raw: len(ast.literal_eval(raw))),
        hashtags=lambda frame: frame['hashtags'].apply(lambda raw: len(ast.literal_eval(raw))),
    )
)

train_df_input.head()

Unnamed: 0,favorites_count,followers_count,statuses_count,friends_count,urls,verified,hashtags
0,0,3682,453535,3628,0,0,0
1,0,86,1016,284,0,0,0
2,1,1944,28234,1995,0,0,0
3,0,1,1072,0,1,0,0
4,0,13957,25311,10841,0,0,0


In [5]:
# Build the evaluation feature matrix: drop the identifier, the timestamp, the
# mentions and the raw text, then replace the stringified `urls` / `hashtags`
# lists by the number of items each one contains.
test_df_input = (
    test_df
    .drop(columns=['TweetID', 'timestamp', 'mentions', 'text'])
    .assign(
        urls=lambda frame: frame['urls'].apply(lambda raw: len(ast.literal_eval(raw))),
        hashtags=lambda frame: frame['hashtags'].apply(lambda raw: len(ast.literal_eval(raw))),
    )
)

In [6]:
def kfold_cv(X: pd.DataFrame,
             y: pd.DataFrame,
             n_splits: int = 10,
             method: str = 'knn',
             n_neighbors: int = 1):
    """Estimate train/validation error of a regressor with shuffled k-fold CV.

    For every fold the features are standardised with the mean and standard
    deviation of that fold's training part only, the model is fitted on the
    training part, and the mean absolute error (MAE) is measured on both the
    training part and the held-out part.

    Args:
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Targets aligned row-by-row with ``X`` (any pandas object supporting
            ``.iloc`` and ``.values``).
        n_splits: Number of folds.
        method: Model family. Only ``'knn'`` is implemented.
        n_neighbors: Number of neighbours for the k-NN regressor.

    Returns:
        A tuple ``(train_mae, val_mae)``: the MAE averaged over all folds on
        the training parts and on the validation parts respectively.

    Raises:
        ValueError: If ``method`` is not a supported model family.
    """
    # Fail early with a clear message instead of crashing later on `None.fit`.
    if method != 'knn':
        raise ValueError(f"Unsupported method {method!r}; only 'knn' is implemented.")

    model = sklearn.neighbors.KNeighborsRegressor(n_neighbors=n_neighbors)
    # Fixed seed so that repeated runs evaluate on identical folds.
    kf = sklearn.model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=123456)

    train_mae = []
    val_mae = []

    for k, (train_indexes, val_indexes) in enumerate(kf.split(X)):
        print('Training split', k)

        train_X = X.iloc[train_indexes].values
        # Standardisation statistics come from the training part only, so no
        # information from the validation part leaks into the model.
        mean = train_X.mean(0)
        std = train_X.std(0)
        # A feature that is constant within the fold has std == 0; dividing by
        # it would produce NaN/inf, so leave such a feature merely centred.
        std = np.where(std == 0, 1.0, std)
        train_X = (train_X - mean) / std
        train_y = y.iloc[train_indexes].values

        val_X = (X.iloc[val_indexes].values - mean) / std
        val_y = y.iloc[val_indexes].values

        model.fit(train_X, train_y)

        train_predictions = model.predict(train_X)
        val_predictions = model.predict(val_X)

        train_mae.append(np.abs(train_predictions - train_y).mean())
        val_mae.append(np.abs(val_predictions - val_y).mean())

    return (sum(train_mae) / len(train_mae), sum(val_mae) / len(val_mae))


In [11]:
# Grid search over the number of neighbours. The loop is left commented out;
# its output is recorded in the dictionary below.
# for n in range(7, 17, 2):
#     print(n, 'neighbors...')
#     print(kfold_cv(train_df_input, train_df['retweets_count'], n_splits=5, n_neighbors=n))

# Recorded output: n_neighbors -> (train error, validation error) as returned
# by kfold_cv. Despite the "mse" in the name, kfold_cv averages absolute
# errors, not squared errors.
# NOTE(review): the loop as written only covers n = 7..15; the entries for
# 1, 3 and 5 presumably come from an earlier run with a wider range - confirm.
knn_mse = {
    1: (0.0022840985090, 8.82755207099830),
    3: (5.2445855803790, 7.50757052066827),
    5: (5.9034562077925, 7.29391667684989),
    7: (6.2303256518832, 7.21462506305364),
    9: (6.4206208103262, 7.22025110296241),
    11: (6.5645450982183, 7.21998145087708),
    13: (6.6775331234898, 7.24705572946240),
    15: (6.7655345805517, 7.28415542343034),
}

In [12]:
# Fit the final model on the whole training set and predict the evaluation set.

# Standardisation statistics are computed on the training features only and
# reused unchanged for the evaluation features.
mean = train_df_input.values.mean(0)
std = train_df_input.values.std(0)
# A constant feature has std == 0; dividing by it would produce NaN/inf, so
# leave such a feature merely centred.
std = np.where(std == 0, 1.0, std)

train_X = (train_df_input.values - mean) / std
# Select the evaluation columns by the training column names: `.values` is
# purely positional, so a different column order in the evaluation frame would
# otherwise pair every feature with the wrong mean/std without any error.
test_X = (test_df_input[train_df_input.columns].values - mean) / std

test_ids = test_df[['TweetID']]
# n_neighbors=7 has the lowest validation error among the values recorded in knn_mse.
model = sklearn.neighbors.KNeighborsRegressor(n_neighbors=7)
model.fit(train_X, train_df['retweets_count'])
test_predictions = model.predict(test_X)

In [13]:
# Put the tweet ids next to their predicted retweet counts (column-wise
# concatenation, aligned on the row index) and write the submission file
# without the index column.
submission_df = pd.concat(
    [test_ids, pd.DataFrame({'retweets_count': test_predictions})],
    axis=1,
)
submission_df.to_csv('data/submission.csv', index=False)