In [1]:
import requests
import warnings
import string
import joblib
import multiprocessing
import torch
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import defaultdict
from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer
from transformers import BertModel
from torch.nn import functional as F


# Suppress every warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides deprecation/future warnings
# raised by the libraries used below — consider narrowing it by category.
warnings.filterwarnings("ignore")

In [2]:
def load_tweets(tweets_file="../data/preprocessed_tweet_20201619.csv",
                from_date="2017-01-01",
                to_date="2020-06-01",
                count=10):
    """
    Load the preprocessed tweets CSV, keep a date window, and drop duplicates
    and short tweets.

    Parameters:
        tweets_file: str, path to the CSV file; it must contain the columns
            listed in `cols` below
        from_date: str, first date label to keep (inclusive)
        to_date: str, last date label to keep (inclusive)
        count: int, a tweet is kept only when its `clean_tweet` contains
            MORE than `count` runs of whitespace

    Returns:
        DataFrame with `date` as the first column. The index is not reset
        after de-duplication/filtering, so it may contain gaps.

    Notes:
        Dates are not parsed (no `parse_dates`), so the window is a label
        slice over the values as read from the file — assumes ISO
        `YYYY-MM-DD` strings that sort chronologically; TODO confirm.
    """
    cols = ["date", "time", "username", "tweet", "clean_tweet", "hashtags",
            "likes_count", "replies_count", "retweets_count", "slang_count"]
    df = pd.read_csv(tweets_file, usecols=cols)
    print("# of total tweets: {}".format(df.shape[0]))

    df = (
        df.sort_values(by="date", ascending=True)
        # Label slicing with bounds that may be absent needs a sorted index.
        .set_index("date")
        .loc[from_date:to_date]
        .reset_index(drop=False)
        # Exact duplicate rows first, then repeated raw / cleaned text.
        .drop_duplicates()
        .drop_duplicates(subset="tweet")
        .drop_duplicates(subset="clean_tweet")
    )

    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    # Rows whose clean_tweet is missing compare as False and are dropped.
    df = df[df.clean_tweet.str.count(r"\s+").gt(count)]
    print("There are {} tweets we get.".format(df.shape[0]))
    return df

In [3]:
def transform_df(df, by="finance", k=10):
    """
    Pivot tweets into one row per date with the top-k tweets as columns.

    Parameters:
        df: DataFrame with at least the columns "date", "clean_tweet" and
            the column named by `by`
        by: str, name of the column used to rank tweets within each date,
            highest value first (e.g. "forex", "finance", "politics")
        k: int, number of "Top i Tweet" columns to produce

    Returns:
        DataFrame with a "date" column followed by "Top 1 Tweet" ...
        "Top k Tweet". Dates with fewer than k tweets are padded with NaN.
        The input frame is not modified.

    Raises:
        KeyError: if "date", "clean_tweet" or `by` is not a column of `df`.
    """
    new_cols = ["Top {} Tweet".format(i + 1) for i in range(k)]

    # Rank by the requested column; previously "finance" was hard-coded and
    # `by` was silently ignored.
    ranked = df.sort_values(["date", by], ascending=False)

    # groupby sorts the dates ascending and keeps the ranked order within
    # each date, so each list is already ordered best-first.
    per_date = ranked.groupby("date")["clean_tweet"].agg(list)

    # Build every row in one go instead of writing cell-by-cell; pad short
    # days with NaN so all rows have exactly k entries.
    padded = [
        tweets[:k] + [np.nan] * max(k - len(tweets), 0)
        for tweets in per_date
    ]
    wide = pd.DataFrame(padded, columns=new_cols, index=per_date.index)

    return wide.reset_index(drop=False)
        
# Keep the loaded frame under its own name so this cell can be re-run:
# transform_df returns only "date" and the "Top i Tweet" columns, so feeding
# its own output back in would fail on the missing ranking column.
# NOTE(review): joblib.load unpickles the file — only load trusted artifacts.
df_final_raw = joblib.load("../data/df_final_v3.gzip")
df_final = transform_df(df_final_raw, by="finance", k=20)
df_final.head()

HBox(children=(FloatProgress(value=0.0, max=1208.0), HTML(value='')))




Unnamed: 0,date,Top 1 Tweet,Top 2 Tweet,Top 3 Tweet,Top 4 Tweet,Top 5 Tweet,Top 6 Tweet,Top 7 Tweet,Top 8 Tweet,Top 9 Tweet,...,Top 11 Tweet,Top 12 Tweet,Top 13 Tweet,Top 14 Tweet,Top 15 Tweet,Top 16 Tweet,Top 17 Tweet,Top 18 Tweet,Top 19 Tweet,Top 20 Tweet
0,2017-01-01,Forex Of A Basis Of Financial Literacy Of Torr...,Forex trading Dollar| url fx forex fb fx forex...,"RichestLearn Forex Trading Crash Course, FX Tr...",trading forex binaryoptions Introduction to Fi...,Chart: The Most Traded Currencies in 2016 url...,url url Business Analyst Insurance - Kelly...,An insider is guide to CFD is and forex tradin...,trading forex binaryoptions USD/CHF Long on br...,How To trade the Forex Market| url fx forex fb...,...,"Sterling squeezes higher, cad rate decision ne...",Forex Trading Signals| url fx forex fb fx fore...,download Stoclye High Low Middle forex trading...,"""How High Can The Price Of Bitcoin Rise?"" by T...",News: The final forex closing levels for 2016 ...,Forex Trump Trading| url fx forex fb fx forex ...,"Maximum Profits, Minimum Time - binary Options...",Learn about forex trading| url fx forex fb fx ...,News: China Said to Boost Scrutiny of Foreign ...,MYREALESTATEBRAINDOTCOM SWFL REALESTATE ECONOM...
1,2017-01-03,"On Forbes' list of Most Powerful People,\nObam...",New Glaeser paper on real estate bubbles sugge...,,,,,,,,...,,,,,,,,,,
2,2017-01-04,China leading financial reformer Peoples Bank ...,,,,,,,,,...,,,,,,,,,,
3,2017-01-05,Zhou Xiaochuan: Life for dogs !! - Sign the Pe...,,,,,,,,,...,,,,,,,,,,
4,2017-01-06,Haruhiko Kuroda: A new phase of the global eco...,bis Haruhiko Kuroda: A new phase of the global...,Zhou Xiaochuan: Life for dogs !! - Sign the Pe...,,,,,,,...,,,,,,,,,,
