# Data Visualization

In [None]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.neighbors
import sklearn.ensemble
import matplotlib.pyplot as plt
import ast
import gensim, logging
import xgboost
import lightgbm
import datetime

In [None]:
# Read the training split and the evaluation split from the data folder.
train_df, test_df = (pd.read_csv(csv_path)
                     for csv_path in ('data/train.csv', 'data/evaluation.csv'))

# Summary statistics of the numeric training columns.
train_df.describe()

In [None]:
# Show the raw training frame (last expression of the cell).
train_df

In [None]:
# Stem plot restricted to the tweets whose retweet count exceeds 2000.
retweets = train_df['retweets_count']
plt.stem(retweets[retweets > 2000])
plt.show()

In [None]:
def format_df(df: pd.DataFrame,
              type: str = 'train',
              url_len_only: bool = True,
              hashtag_len_only: bool = True,
              keep_time: bool = False,
              extra_fts: bool = False,
              tz: "datetime.tzinfo | None" = None):
    """Turn a raw tweet frame into a feature frame.

    The ``TweetID``, ``mentions`` and ``timestamp`` columns are dropped, and
    the ``urls`` / ``hashtags`` columns (string representations of Python
    literals) are parsed with ``ast.literal_eval``.  The input frame is not
    modified.

    Args:
        df: Raw frame; must contain ``TweetID``, ``mentions``, ``timestamp``,
            ``urls`` and ``hashtags`` columns.
        type: Accepted for compatibility with existing callers; not used by
            this function.
        url_len_only: If true, replace each parsed ``urls`` value by its
            length; otherwise keep the parsed value.
        hashtag_len_only: Same as ``url_len_only`` for ``hashtags``.
        keep_time: If true, append ``tm_hour``, ``tm_min``, ``tm_sec``,
            ``tm_wday`` and ``tm_yday`` columns derived from ``timestamp``
            (interpreted as milliseconds since the epoch).
        extra_fts: Accepted for compatibility with existing callers; not used
            by this function.
        tz: Timezone in which the time features are expressed.  ``None``
            (the default) keeps the historical behaviour of using the local
            timezone of the machine running the code; pass e.g.
            ``datetime.timezone.utc`` for results that do not depend on the
            host configuration.

    Returns:
        A new ``pd.DataFrame`` with the formatted features.
    """
    final_df = df.drop(['TweetID', 'mentions', 'timestamp'], axis=1)

    # Both list-like columns are stored as text in the CSV; parse them and
    # optionally collapse each one to its number of entries.
    for column, len_only in (('urls', url_len_only),
                             ('hashtags', hashtag_len_only)):
        parsed = final_df[column].apply(ast.literal_eval)
        final_df[column] = parsed.apply(len) if len_only else parsed

    if keep_time:
        # Year, month, day-of-month and the DST flag are deliberately left
        # out; only these struct_time fields are kept as features.
        kept_fields = ['tm_hour', 'tm_min', 'tm_sec', 'tm_wday', 'tm_yday']

        seconds = df['timestamp'] // 1000  # milliseconds -> whole seconds
        rows = []
        for stamp in seconds:
            parts = datetime.datetime.fromtimestamp(stamp, tz).timetuple()
            rows.append([getattr(parts, field) for field in kept_fields])

        time_df = pd.DataFrame(rows, index=df.index, columns=kept_fields)
        final_df = pd.concat([final_df, time_df], axis=1)

    return final_df

In [None]:
# Format the training split: URL / hashtag lists reduced to their lengths,
# no time-of-day features.
new_train_df = format_df(
    train_df,
    url_len_only=True,
    hashtag_len_only=True,
    keep_time=False,
)

In [None]:
# Show the formatted training frame (last expression of the cell).
new_train_df

In [None]:
# Retweets against favorites, both axes on a logarithmic scale.
fig = plt.figure()
ax = fig.add_subplot()

retweet_counts = new_train_df['retweets_count']
favorite_counts = new_train_df['favorites_count']
ax.scatter(retweet_counts, favorite_counts)

ax.set_xscale('log')
ax.set_yscale('log')
plt.show()

In [None]:
# Model inputs for training: everything except the target and the raw text.
train_df_input = new_train_df.drop(columns=['retweets_count', 'text'])
train_df_input

In [None]:
# Apply the same formatting to the evaluation split, then drop the raw text
# to obtain the model inputs.
new_test_df = format_df(
    test_df,
    type='test',
    hashtag_len_only=True,
    url_len_only=True,
)
test_df_input = new_test_df.drop(columns=['text'])
test_df_input

In [None]:
# Correlation heat-map between the target (first row/column) and every input
# feature.
input_with_gts = pd.concat([train_df['retweets_count'], train_df_input], axis=1)

num_fts = input_with_gts.shape[1]
# np.corrcoef treats rows as variables, hence the transpose.
corr = np.corrcoef(input_with_gts.values.T)

fig, ax = plt.subplots(figsize=(9, 9))
# Pin the colour range to [-1, 1] so that the white midpoint of the diverging
# 'bwr' map corresponds to zero correlation; with the default auto-scaling the
# midpoint would sit at the middle of the observed range instead.
im = ax.imshow(corr, cmap='bwr', vmin=-1, vmax=1)
tick_positions = np.arange(num_fts)
ax.set_xticks(tick_positions, labels=input_with_gts.columns)
ax.set_yticks(tick_positions, labels=input_with_gts.columns)
plt.colorbar(im)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
# Annotate every cell with its coefficient (x is the column, y is the row).
for (row, col), value in np.ndenumerate(corr):
    ax.text(col, row, '%.2f' % value, ha="center", va="center", color="k")
fig.tight_layout()
plt.show()

# NLP

In [None]:
# Load the Word2Vec model stored on disk.
# NOTE(review): gensim's load is, as far as I know, pickle-based — only load
# model files from a trusted source; confirm against the gensim docs.
nlp_model = gensim.models.Word2Vec.load('models/word2vec.model')

In [None]:
# Tokenise one training tweet on single spaces and look up its word vectors.
text = train_df['text'].iat[11].split(' ')
text_vecs = nlp_model.wv[text]
# Product of the transposed vectors with themselves, shown as an image.
text_img = np.matmul(text_vecs.T, text_vecs)

print(text)
plt.imshow(text_img)
plt.show()

# Trees, Boosting

In [None]:
# Standardise the tabular features.  Mean and standard deviation are taken
# from the training split only and reused for the evaluation split so both
# are scaled identically.
train_features = train_df_input.values
feature_mean = train_features.mean(0)
feature_std = train_features.std(0)
# A constant column has zero spread; divide by 1 so it maps to 0 instead of
# producing NaN/inf.
feature_std = np.where(feature_std == 0, 1.0, feature_std)

full_train_X = (train_features - feature_mean) / feature_std
full_train_y = train_df['retweets_count'].values
full_test_X = (test_df_input.values - feature_mean) / feature_std

use_text = False

if use_text:
    def _mean_word_vector(sentence):
        """Average the vectors of the words of `sentence` known to nlp_model.

        Words are obtained by splitting on single spaces.  A zero vector is
        returned when no word is in the vocabulary.
        """
        known_words = [word for word in sentence.split(' ') if word in nlp_model.wv]
        if not known_words:
            return np.zeros((nlp_model.vector_size,))
        return nlp_model.wv[known_words].mean(0)

    def _append_text_vectors(features, texts):
        """Return `features` with each row extended by its mean word vector."""
        num_rows, num_cols = features.shape
        # Sized from `features` itself, so each split gets its own row count.
        extended = np.zeros((num_rows, num_cols + nlp_model.vector_size))
        extended[:, :num_cols] = features
        for row, sentence in enumerate(texts):
            extended[row, num_cols:] = _mean_word_vector(sentence)
        return extended

    # Both splits go through the same out-of-vocabulary-safe path.
    full_train_X = _append_text_vectors(full_train_X, train_df['text'])
    full_test_X = _append_text_vectors(full_test_X, test_df['text'])

In [None]:
# Optional 5-fold cross-validation of an XGBoost regressor, scored by mean
# absolute error.
perform_cv = False

if perform_cv:
    # Imported explicitly: `import sklearn` does not by itself guarantee that
    # the `model_selection` submodule has been loaded.
    from sklearn.model_selection import cross_validate

    scores_xgboost = cross_validate(xgboost.XGBRegressor(verbosity=0,
                                                         max_depth=10),
                                    full_train_X,
                                    full_train_y,
                                    cv=5,
                                    scoring='neg_mean_absolute_error',
                                    verbose=2)

    # The scorer returns the negated MAE; flip the sign to report the MAE.
    print(- scores_xgboost['test_score'].mean())

# Make submission

In [None]:
# Optional: fit on the full training set and write the predictions for the
# evaluation split to disk.
make_submission = False

if make_submission:
    # NOTE(review): this model uses the default depth, whereas the
    # cross-validation cell uses max_depth=10 — confirm this is intended.
    model = xgboost.XGBRegressor(verbosity=2)
    model.fit(full_train_X, full_train_y)
    test_predictions = model.predict(full_test_X)

    # Pair each TweetID with its prediction (aligned on the row index).
    test_ids = test_df[['TweetID']]
    predicted = pd.Series(test_predictions, name='retweets_count')
    submission_df = pd.concat([test_ids, predicted], axis=1)
    submission_df.to_csv('data/submission.csv', index=False)