# Preprocessed data to hashtag analysis datasets
Takes the preprocessed data as input and outputs the source-target files necessary for hashtag analysis in network analysis. Also outputs all the hashtags per date - for the line plots over time.

In [87]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
import re
import datetime
import itertools

In [88]:
# Load both preprocessed exports and stack them into a single frame
# whose index runs 0..n-1 across the two files.
data = pd.read_csv("data/english_preprocessed.csv")
data2 = pd.read_csv("data/english_preprocessed_08_26.csv")
data = pd.concat([data, data2], ignore_index=True)

data.head()

Unnamed: 0.1,Unnamed: 0,created_at,user,verified,id_str,text,retweet_count,favorite_count,lang,follower_count,hashtags_bytwitter,tokens,text_clean,n_sent,token,lemma,upos,xpos,dependency relation,ner
0,0.0,2020-05-19 10:59:48,ChinaEUMission,1,1262699470946516992,All parties have reached consensus on a draft ...,0,0,en,14658,,"{'n_sent': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0...",all parties have reached consensus on a draft ...,,,,,,,
1,1.0,2020-05-19 11:22:28,ChinaEUMission,1,1262705175489363968,China's railway system is expected to reach 14...,0,0,en,14659,,"{'n_sent': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0...",china s railway system is expected to reach 14...,,,,,,,
2,2.0,2020-05-19 14:09:59,ChinaEUMission,1,1262747333747343366,China has exported over 50 billion masks since...,0,0,en,14665,StrongerTogether,"{'n_sent': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0...",china has exported over 50 billion masks since...,,,,,,,
3,3.0,2020-05-19 14:34:35,zlj517,1,1262753525655719938,RT @SpokespersonCHN: Not the first time to bas...,0,0,en,621491,,"{'n_sent': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0...",rt spokespersonchn not the first time to bas...,,,,,,,
4,4.0,2020-05-19 14:34:39,zlj517,1,1262753542101581830,RT @SpokespersonCHN: That what does not kill y...,0,0,en,621491,,"{'n_sent': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0...",rt spokespersonchn that what does not kill y...,,,,,,,


In [89]:
def extract_hashtags(row):
    r"""Return the distinct hashtags in ``row["text"]``, in order of first appearance.

    A hashtag is any match of ``#\S*\w``: a ``#`` followed by non-whitespace
    characters, ending on a word character (so trailing punctuation is cut off).
    Hashtags written back to back without whitespace (``#a#b``) are captured
    as a single token by this pattern.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"text"`` (e.g. a DataFrame row).

    Returns
    -------
    list of str
        Unique hashtags; empty when the text holds none or is not a string
        (e.g. NaN for a missing value).
    """
    text = row["text"]
    if not isinstance(text, str):
        # Missing text cannot contain hashtags; avoid a TypeError in re.findall.
        return []
    # dict.fromkeys de-duplicates while keeping insertion order, so the result
    # is reproducible between runs (set ordering of strings is not).
    return list(dict.fromkeys(re.findall(r'#\S*\w', text)))

In [90]:
# Attach a "hashtags" column holding each tweet's list of unique hashtags
data["hashtags"] = data.apply(extract_hashtags, axis=1)
data.head()

Unnamed: 0.1,Unnamed: 0,created_at,user,verified,id_str,text,retweet_count,favorite_count,lang,follower_count,...,tokens,text_clean,n_sent,token,lemma,upos,xpos,dependency relation,ner,hashtags
0,0.0,2020-05-19 10:59:48,ChinaEUMission,1,1262699470946516992,All parties have reached consensus on a draft ...,0,0,en,14658,...,"{'n_sent': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0...",all parties have reached consensus on a draft ...,,,,,,,,[]
1,1.0,2020-05-19 11:22:28,ChinaEUMission,1,1262705175489363968,China's railway system is expected to reach 14...,0,0,en,14659,...,"{'n_sent': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0...",china s railway system is expected to reach 14...,,,,,,,,[]
2,2.0,2020-05-19 14:09:59,ChinaEUMission,1,1262747333747343366,China has exported over 50 billion masks since...,0,0,en,14665,...,"{'n_sent': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0...",china has exported over 50 billion masks since...,,,,,,,,[#StrongerTogether]
3,3.0,2020-05-19 14:34:35,zlj517,1,1262753525655719938,RT @SpokespersonCHN: Not the first time to bas...,0,0,en,621491,...,"{'n_sent': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0...",rt spokespersonchn not the first time to bas...,,,,,,,,[]
4,4.0,2020-05-19 14:34:39,zlj517,1,1262753542101581830,RT @SpokespersonCHN: That what does not kill y...,0,0,en,621491,...,"{'n_sent': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0...",rt spokespersonchn that what does not kill y...,,,,,,,,[]


In [91]:
# Keep only the timestamp and the hashtag list, and expose the row index
# as an explicit "id" column identifying each tweet.
subset = data[["created_at", "hashtags"]]
df = subset.reset_index().rename(columns={"index": "id"})
df.head()

Unnamed: 0,id,created_at,hashtags
0,0,2020-05-19 10:59:48,[]
1,1,2020-05-19 11:22:28,[]
2,2,2020-05-19 14:09:59,[#StrongerTogether]
3,3,2020-05-19 14:34:35,[]
4,4,2020-05-19 14:34:39,[]


In [92]:
# Co-occurrence needs at least two hashtags, so drop tweets with fewer
hashtag_counts = df["hashtags"].map(len)
df = df.loc[hashtag_counts > 1].reset_index(drop=True)
df.head()

Unnamed: 0,id,created_at,hashtags
0,11,2020-05-20 07:46:38,"[#Hubei, #COVID19]"
1,12,2020-05-20 07:47:09,"[#Hubei, #COVID19]"
2,13,2020-05-20 07:54:16,"[#ShanghaiInternationalFilmFestival, #COVID19]"
3,15,2020-05-20 07:57:18,"[#TwoSessions, #COVID19]"
4,16,2020-05-20 08:00:01,"[#coronavirus, #BackToWork, #COVID_19]"


In [93]:
# One row per (tweet, hashtag): explode the list column so every hashtag
# gets its own row, keeping created_at and id alongside it.
df = (
    df.set_index(["created_at", "id"])["hashtags"]
    .explode()
    .reset_index()
    .rename(columns={"hashtags": "hashtag"})
)
# Lowercase so differently-cased spellings of a tag collapse together
df["hashtag"] = df["hashtag"].str.lower()
# Strip an apostrophe (straight, then curly) together with the character
# that follows it, e.g. "#china's" -> "#china". regex=True is spelled out
# because "." is meant as a wildcard and str.replace defaults to literal
# matching from pandas 2.0 on.
df["hashtag"] = df["hashtag"].str.replace("'.", "", regex=True)
df["hashtag"] = df["hashtag"].str.replace("’.", "", regex=True)

df

Unnamed: 0,created_at,id,hashtag
0,2020-05-20 07:46:38,11,#hubei
1,2020-05-20 07:46:38,11,#covid19
2,2020-05-20 07:47:09,12,#hubei
3,2020-05-20 07:47:09,12,#covid19
4,2020-05-20 07:54:16,13,#shanghaiinternationalfilmfestival
...,...,...,...
28478,2020-08-25 23:20:03,26162,#china
28479,2020-08-25 23:20:03,26162,#opinion
28480,2020-08-25 23:20:03,26162,#italy
28481,2020-08-25 23:40:03,26163,#covid19


In [95]:
# Merge spelling variants of the same topic into one canonical hashtag
# for better illustrations
_HASHTAG_ALIASES = {
    "#covid19": "#coronavirus",
    "#covid_19": "#coronavirus",
    "#twosessions": "#twosessions2020",
    "#hk": "#hongkong",
    "#香港": "#hongkong",
}


def replace_hashtag(row):
    """Return the canonical form of ``row["hashtag"]``.

    Hashtags listed in ``_HASHTAG_ALIASES`` are mapped to their canonical
    spelling; every other value is returned unchanged.
    """
    tag = row["hashtag"]
    return _HASHTAG_ALIASES.get(tag, tag)

# Rewrite every row's hashtag to its canonical spelling
df["hashtag"] = df.apply(replace_hashtag, axis=1)
df

Unnamed: 0,created_at,id,hashtag
0,2020-05-20 07:46:38,11,#hubei
1,2020-05-20 07:46:38,11,#coronavirus
2,2020-05-20 07:47:09,12,#hubei
3,2020-05-20 07:47:09,12,#coronavirus
4,2020-05-20 07:54:16,13,#shanghaiinternationalfilmfestival
...,...,...,...
28478,2020-08-25 23:20:03,26162,#china
28479,2020-08-25 23:20:03,26162,#opinion
28480,2020-08-25 23:20:03,26162,#italy
28481,2020-08-25 23:40:03,26163,#coronavirus


In [96]:
# Fix dates
# Write the per-tweet hashtag rows to disk and read them straight back in;
# the reload is what lets the datetime fix below work.
hashtags_csv = "data/chinese_hashtags.csv"
df.to_csv(hashtags_csv, index=False)
df = pd.read_csv(hashtags_csv)

# Fix datetime: normalize created_at to calendar dates.
# NOTE: for an unknown reason this only avoids NaT values when the data has been read in from CSV.
def _parse_calendar_date(line):
    """Return ``(year, month, day)`` as ints for one ``created_at`` string.

    Two layouts are recognised: the Twitter API layout
    (``Tue Mar 03 20:44:19 +0000 2020``) when the string starts with a
    letter, and the ISO layout (``2020-02-28 ...``) otherwise.

    Raises
    ------
    ValueError
        If the string matches neither layout.
    """
    if re.match(r'[A-Za-z]', line):
        # Weekday, month name, 2-digit day, 16 characters of time + offset, year
        found = re.search(r'[A-Za-z]{3}\s([A-Za-z]{3})\s(\d{2}).{16}(\d{4})', line)
        if found is None:
            raise ValueError(f"Unrecognised created_at value: {line!r}")
        month_name, day, year = found.groups()
        # NOTE: %b follows the process locale; English abbreviations are expected.
        month = datetime.datetime.strptime(month_name, "%b").month
        return int(year), month, int(day)

    # Any four-digit year is accepted, not only 2020.
    found = re.search(r'(\d{4})-(\d{1,2})-(\d{1,2})', line)
    if found is None:
        raise ValueError(f"Unrecognised created_at value: {line!r}")
    year, month, day = found.groups()
    return int(year), int(month), int(day)


def clean_dates(dd):
    """Replace ``dd["created_at"]`` with midnight-normalised datetimes.

    Parameters
    ----------
    dd : pandas.DataFrame
        Frame with a ``created_at`` column of date strings in either the
        Twitter API layout or the ISO layout.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the column is overwritten in
        place, so the time-of-day part is discarded.

    Raises
    ------
    ValueError
        If a ``created_at`` value matches neither supported layout.
    """
    parsed = [_parse_calendar_date(line) for line in dd.created_at]
    parts = pd.DataFrame(parsed, columns=["year", "month", "day"])
    # Assign by position: going through to_numpy() avoids index alignment,
    # which would yield NaT / misplaced dates if dd's index is not 0..n-1.
    dd["created_at"] = pd.to_datetime(parts).to_numpy()
    return dd

# clean_dates overwrites created_at on the frame it receives and returns that
# same frame, so dd and df refer to one object from here on.
dd = clean_dates(df)

In [79]:
dd

Unnamed: 0,created_at,id,hashtag
0,2020-05-20,11,#hubei
1,2020-05-20,11,#covid19
2,2020-05-20,12,#hubei
3,2020-05-20,12,#covid19
4,2020-05-20,13,#shanghaiinternationalfilmfestival
...,...,...,...
28478,2020-08-25,26162,#china
28479,2020-08-25,26162,#opinion
28480,2020-08-25,26162,#italy
28481,2020-08-25,26163,#covid19


In [81]:
def df_to_timebins(id_hashtag):
    """Return ``id_hashtag`` with a ``whole_freq`` column appended.

    ``whole_freq`` is the number of rows of ``id_hashtag`` that share the
    row's ``hashtag`` value, i.e. the hashtag's frequency over the whole
    input. Despite the name, no date binning happens in this function.

    Parameters
    ----------
    id_hashtag : pandas.DataFrame
        Frame with (at least) a ``hashtag`` column.

    Returns
    -------
    pandas.DataFrame
        Left-merge of the input with the per-hashtag counts; input row
        order is kept.
    """
    occurrences = id_hashtag.groupby("hashtag").size()
    totals = occurrences.reset_index(name="whole_freq")
    return id_hashtag.merge(totals, how="left", on="hashtag")

In [82]:
# Start conversion to timebins based on month.
# This step only adds the whole-dataset frequency of each hashtag
# (whole_freq); the date windows themselves are cut from f afterwards.
f = df_to_timebins(dd)
f

Unnamed: 0,created_at,id,hashtag,whole_freq
0,2020-05-20,11,#hubei,93
1,2020-05-20,11,#coronavirus,4151
2,2020-05-20,12,#hubei,93
3,2020-05-20,12,#coronavirus,4151
4,2020-05-20,13,#shanghaiinternationalfilmfestival,2
...,...,...,...,...
28478,2020-08-25,26162,#china,2224
28479,2020-08-25,26162,#opinion,461
28480,2020-08-25,26162,#italy,23
28481,2020-08-25,26163,#coronavirus,4151


In [83]:
# Cut f into four consecutive date windows; both endpoints are inclusive.
bin_windows = [
    ("2020-05-20", "2020-05-31"),
    ("2020-06-01", "2020-06-30"),
    ("2020-07-01", "2020-07-31"),
    ("2020-08-01", "2020-08-25"),
]
window_frames = []
for start_date, end_date in bin_windows:
    mask = (f['created_at'] >= start_date) & (f['created_at'] <= end_date)
    window_frames.append(f.loc[mask].reset_index(drop=True))
bin0, bin1, bin2, bin3 = window_frames

In [84]:
# Convert timebins to source-target
def timebin(timebin_dataframe, with_count_1 = True):
    """Build the hashtag co-occurrence edge list for one time bin.

    Two hashtags co-occur when they appear in the same tweet (same ``id``).
    Pairs are unordered: each pair is stored once, with ``source`` sorting
    before ``target``, so a co-occurrence is never split between (a, b) and
    (b, a). A hashtag repeated within one tweet does not pair with itself.

    Parameters
    ----------
    timebin_dataframe : pandas.DataFrame
        One row per (tweet, hashtag) with columns ``id``, ``created_at``,
        ``hashtag`` and ``whole_freq`` (``bin_freq`` is picked up too when
        present).
    with_count_1 : bool, default True
        When false, edges seen in exactly one tweet are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``target``, ``count`` followed by the per-hashtag
        attributes prefixed with ``source_`` / ``target_``; sorted by
        (source, target). Empty (zero rows) when no tweet in the bin has two
        distinct hashtags.
    """
    edges = _co_occurrence_edges(timebin_dataframe)

    if not with_count_1:
        edges = edges[edges["count"] != 1]

    # Attach the hashtag frequencies once for the source and once for the target
    with_source = edges.merge(
        _hashtag_attributes(timebin_dataframe, "source"), how="left", on="source"
    )
    with_both = with_source.merge(
        _hashtag_attributes(timebin_dataframe, "target"), how="left", on="target"
    )

    # Drop duplicates if any!
    return with_both.drop_duplicates().reset_index(drop=True)


def _co_occurrence_edges(timebin_dataframe):
    """Count, per unordered hashtag pair, the tweets in which both appear."""
    pairs = []
    for _, tags in timebin_dataframe.groupby("id")["hashtag"]:
        # sorted(set(...)) removes in-tweet repeats and fixes the pair direction
        distinct_tags = sorted(set(tags.dropna()))
        pairs.extend(itertools.combinations(distinct_tags, 2))

    if not pairs:
        # Keep the expected columns so the later merges still work
        return pd.DataFrame(columns=["source", "target", "count"])

    pair_frame = pd.DataFrame(pairs, columns=["source", "target"])
    return pair_frame.groupby(["source", "target"]).size().reset_index(name="count")


def _hashtag_attributes(timebin_dataframe, role):
    """Return one row per distinct hashtag with its columns prefixed by ``role``."""
    renamed = timebin_dataframe.rename(
        columns={
            "hashtag": role,
            "whole_freq": f"{role}_whole_freq",
            "bin_freq": f"{role}_bin_freq",
        }
    )
    # De-duplicating before the merge keeps the join from multiplying every
    # edge by the number of times its hashtags occur in the bin.
    return renamed.drop(columns=["id", "created_at"]).drop_duplicates()

In [85]:
# Make the timebins: one co-occurrence edge list per date window
time0, time1, time2, time3 = [
    timebin(window) for window in (bin0, bin1, bin2, bin3)
]

In [86]:
# Chinese
# Write each bin's edge list to its own CSV file, without the row index
exports = (
    (time0, 'data/hashtag_data/timebin_0_co_hashtags_rmv.csv'),
    (time1, 'data/hashtag_data/timebin_1_co_hashtags_rmv.csv'),
    (time2, 'data/hashtag_data/timebin_2_co_hashtags_rmv.csv'),
    (time3, 'data/hashtag_data/timebin_3_co_hashtags_rmv.csv'),
)
for edge_list, csv_path in exports:
    edge_list.to_csv(csv_path, index=False)