# Import Library

In [None]:
import pandas as pd
import pycountry
import json 
import os

In [None]:
# Quick check: resolve a two-letter (alpha_2) code to the country's name
ag_record = pycountry.countries.get(alpha_2='AG')
country_name = ag_record.name
country_name

# Add the row count to each file name

In [None]:
def change_name(path, dictionary):
    """Prefix each per-country CSV under `path` with its row count.

    For every (country code, count) pair in `dictionary`, the file
    `path + <country name> + '.csv'` is renamed to
    `path + <count> + '_' + <country name> + '.csv'`.

    Args:
        path: Directory prefix that is concatenated directly with the file name.
        dictionary: Mapping of alpha_2 country code to row count.
    """
    for code, row_count in dictionary.items():
        # Both the old and the new name share the '<country name>.csv' stem
        stem = pycountry.countries.get(alpha_2=code).name + '.csv'
        os.rename(path + stem, path + str(row_count) + '_' + stem)

def write_json(file_name, dictionary):
    """Write `dictionary` to `file_name` as JSON, replacing any existing file.

    The data is serialised before the file is opened, so a value that cannot
    be encoded raises without truncating a file that already exists.

    Args:
        file_name: Path of the JSON file to write.
        dictionary: JSON-serialisable object to store.

    Raises:
        TypeError: If `dictionary` contains a value json cannot serialise.
    """
    payload = json.dumps(dictionary)
    with open(file_name, "w", encoding="utf-8") as outfile:
        outfile.write(payload)

# Early: Kaggle 2020 1~8
- Kaggle 3&4: https://www.kaggle.com/datasets/smid80/coronavirus-covid19-tweets-early-april
- Kaggle 7&8: URL missing

In [None]:
# One '<YYYY-MM-DD>.csv' per day from 2020-03-29 through 2020-04-15
early_files_03 = [day.strftime('%Y-%m-%d') + '.csv'
                  for day in pd.date_range(start='2020-03-29', end='2020-04-15', freq='D')]

early_file_07 = '2020-07.csv'


early_original_path = 'Data/Early/'            # Directory prefix of the original data
early_processed_path = 'Data/Early_Processed/' # Directory prefix of the processed data

early_num = {}                          # country_code -> number of rows written

### Kaggle 3 & 4

In [None]:
def process_early_3_file(original_path, processed_path, files):
    """Split the Kaggle 3 & 4 daily tweet CSVs into one CSV per country.

    Each file is read from `original_path + file_name`, restricted to rows
    that have a `country_code` and whose `lang` is 'en', renamed to the shared
    column layout and written to `processed_path + <country name> + '.csv'`.
    The first write for a country (its code is not yet in the module-level
    `early_num`) creates the file with a header; later writes append without
    one. `early_num` accumulates the number of rows written per country code.

    Args:
        original_path: Directory prefix (concatenated directly) of the raw CSVs.
        processed_path: Directory prefix (concatenated directly) of the output CSVs.
        files: Iterable of raw CSV file names, processed in order.
    """
    # Source column name -> shared column name
    column_names = {
        "place_full_name": "country_name",
        "screen_name": "user_name",
        "source": "user_source",
        "account_created_at": "user_created_at",
        "favourites_count": "user_favourites_count",
        "followers_count": "user_followers_count",
        "friends_count": "user_friends_count",
        "verified": "user_verified",
        "created_at": "tweet_created_at",
        "is_retweet": "tweet_is_retweet",
        "text": "tweet",
    }
    output_columns = ['tweet_created_at', 'country_code', 'country_name',
                      'user_id', 'user_created_at', 'user_name', 'user_source', 'user_verified',
                      'user_favourites_count', 'user_followers_count', 'user_friends_count',
                      'tweet_is_retweet', 'tweet']

    for file_name in files:
        # `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines='warn'`
        # keeps the same "report and skip malformed rows" behaviour.
        # NOTE(review): pandas' default NA markers include the string 'NA', so
        # Namibia's code is presumably read as missing and dropped below — confirm.
        file = pd.read_csv(original_path + file_name, index_col=False, on_bad_lines='warn', engine='python')

        # Drop some useless columns
        file = file.drop(columns=['status_id', 'reply_to_status_id', 'reply_to_user_id', 'reply_to_screen_name',
                                  'is_quote', 'place_type', 'retweet_count'])

        # Remove the rows where 'country_code' is NaN or 'lang' is not 'en'
        file = file.dropna(subset=['country_code'])
        file = file.drop(file[file.lang != 'en'].index)
        file = file.drop(columns=['lang'])

        file = file.rename(columns=column_names)

        file['tweet_created_at'] = pd.to_datetime(file['tweet_created_at'])
        file['user_created_at'] = pd.to_datetime(file['user_created_at'])

        file = file.reindex(columns=output_columns)

        # Sort each row by country_code
        file = file.sort_values(by=['country_code'])

        # One pass over the frame instead of re-filtering it once per country
        for country, temp in file.groupby('country_code'):
            try:
                record = pycountry.countries.get(alpha_2=country)
            except LookupError:
                # Some pycountry releases raise for unknown codes instead of returning None
                record = None
            if record is None:
                print("In [", file_name, "] Can not recognize this country code:", country)
                continue

            # The output file name of csv
            output_path = processed_path + record.name + '.csv'

            # Create the file with a header on first sight of a country, append afterwards
            is_first_write = country not in early_num
            temp.to_csv(output_path, mode='w' if is_first_write else 'a',
                        header=is_first_write, index=False)

            early_num[country] = early_num.get(country, 0) + len(temp)

In [None]:
# Split the daily March/April files into per-country CSVs
process_early_3_file(original_path=early_original_path,
                     processed_path=early_processed_path,
                     files=early_files_03)

### Kaggle 7 & 8

In [None]:
def process_early_7_file(original_path, processed_path, file_name):
    """Split the Kaggle 7 & 8 tweet CSV into one CSV per country.

    The file at `original_path + file_name` is renamed to the shared column
    layout. Rows are dropped when `user_favourites_count` is missing or
    'None', when `user_created_at` is 'False', or when `country_code` is
    missing or 'None'. The remainder is written to
    `processed_path + <country name> + '.csv'`: created with a header the
    first time a country code is seen (not yet in the module-level
    `early_num`), appended otherwise. `early_num` accumulates the number of
    rows written per country code.

    Args:
        original_path: Directory prefix (concatenated directly) of the raw CSV.
        processed_path: Directory prefix (concatenated directly) of the output CSVs.
        file_name: Name of the raw CSV file.
    """
    # `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines='warn'`
    # keeps the same "report and skip malformed rows" behaviour.
    file = pd.read_csv(original_path + file_name, index_col=False, on_bad_lines='warn', engine='python')

    # Source column name -> shared column name
    file = file.rename(columns={
        "user_created": "user_created_at",
        "user_favourites": "user_favourites_count",
        "user_friends": "user_friends_count",
        "user_followers": "user_followers_count",
        "source": "user_source",
        "date": "tweet_created_at",
        "is_retweet": "tweet_is_retweet",
        "text": "tweet",
    })

    file['tweet_created_at'] = pd.to_datetime(file['tweet_created_at'])

    # Discard rows whose favourites count is missing or the literal 'None' before casting
    file = file.dropna(subset=['user_favourites_count'])
    file = file.drop(file[file.user_favourites_count == 'None'].index)
    file['user_favourites_count'] = file['user_favourites_count'].astype(int)

    # Rows whose creation date holds the literal 'False' are removed before parsing
    file = file.drop(file[file.user_created_at == 'False'].index)
    file['user_created_at'] = pd.to_datetime(file['user_created_at'])

    file = file.dropna(subset=['country_code'])
    file = file.drop(file[file.country_code == 'None'].index)

    file = file.reindex(columns=['tweet_created_at', 'country_code', 'country_name',
                                 'user_id', 'user_created_at', 'user_name', 'user_source', 'user_verified',
                                 'user_favourites_count', 'user_followers_count', 'user_friends_count',
                                 'tweet_is_retweet', 'tweet'])

    # Sort each row by country_code
    file = file.sort_values(by=['country_code'])

    # One pass over the frame instead of re-filtering it once per country
    for country, temp in file.groupby('country_code'):
        try:
            record = pycountry.countries.get(alpha_2=country)
        except LookupError:
            # Some pycountry releases raise for unknown codes instead of returning None
            record = None
        if record is None:
            print("In [", file_name, "] Can not recognize this country code:", country)
            continue

        # The output file name of csv
        output_path = processed_path + record.name + '.csv'

        # Create the file with a header on first sight of a country, append afterwards
        is_first_write = country not in early_num
        temp.to_csv(output_path, mode='w' if is_first_write else 'a',
                    header=is_first_write, index=False)

        early_num[country] = early_num.get(country, 0) + len(temp)

In [None]:
# Split the July file into per-country CSVs
process_early_7_file(original_path=early_original_path,
                     processed_path=early_processed_path,
                     file_name=early_file_07)

### Add the row count to each file name

In [None]:
# Prefix every early CSV with its row count, then store the counts as JSON
change_name(path=early_processed_path, dictionary=early_num)
write_json(file_name=early_processed_path + "early_country_num.json", dictionary=early_num)

# Mid: Kaggle 2020 9~2021 4

- TA Code 11:   https://hkustconnect-my.sharepoint.com/personal/euhaq_connect_ust_hk/_layouts/15/onedrive.aspx?id=%2Fpersonal%2Feuhaq%5Fconnect%5Fust%5Fhk%2FDocuments%2FSNA%5Fdata%2FCOVID%5F&ga=1
- TA Code 12:   https://hkustconnect-my.sharepoint.com/personal/euhaq_connect_ust_hk/_layouts/15/onedrive.aspx?ga=1&id=%2Fpersonal%2Feuhaq%5Fconnect%5Fust%5Fhk%2FDocuments%2FSNA%5Fdata%2Fcovid%2D11%2Ddec%2Dvaccine

In [None]:
# Numeric stems shared by the 'user_<stem>.csv' / 'tweet_<stem>.csv' input pairs
mid_files = [stem + '.csv'
             for stem in ('10039', '10043', '10044', '10047', '10383', '10385', '10386')]

mid_original_path = 'Data/Mid/'            # Directory prefix of the original data
mid_processed_path = 'Data/Mid_Processed/' # Directory prefix of the processed data

mid_num = {}                          # country_code -> number of rows written

In [None]:
def process_mid_11_file(original_path, processed_path, files):
    """Merge the user/tweet CSV pairs and split them into one CSV per country.

    For each name in `files`, `original_path + 'user_' + file_name` and
    `original_path + 'tweet_' + file_name` are outer-merged on `user_name`,
    renamed to the shared column layout, and stripped of rows whose follower
    count is not numeric or whose `country_code` is missing. The result is
    written to `processed_path + <country name> + '.csv'`: created with a
    header the first time a country code is seen (not yet in the module-level
    `mid_num`), appended otherwise. `mid_num` accumulates the number of rows
    written per country code.

    Args:
        original_path: Directory prefix (concatenated directly) of the raw CSVs.
        processed_path: Directory prefix (concatenated directly) of the output CSVs.
        files: Iterable of file-name suffixes shared by each user/tweet pair.
    """
    for file_name in files:
        # `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines='warn'`
        # keeps the same "report and skip malformed rows" behaviour.
        user = pd.read_csv(original_path + 'user_' + file_name, index_col=False, on_bad_lines='warn', engine='python')
        user = user.rename(columns={"screen_name": "user_name"})

        tweet = pd.read_csv(original_path + 'tweet_' + file_name, index_col=False, on_bad_lines='warn', engine='python')
        tweet = tweet.rename(columns={"user": "user_name"})

        file = pd.merge(user, tweet, on="user_name", how="outer")

        # Source column name -> shared column name
        file = file.rename(columns={
            "date": "tweet_created_at",
            "id": "user_id",
            "created_at": "user_created_at",
            "followers_count": "user_followers_count",
            "verified": "user_verified",
            "is_retweet": "tweet_is_retweet",
            "text": "tweet",
        })

        # Non-numeric follower counts become NaN and are dropped before the int cast
        file['user_followers_count'] = pd.to_numeric(file['user_followers_count'], errors='coerce')
        file = file.dropna(subset=['user_followers_count'])
        file['user_followers_count'] = file['user_followers_count'].astype(int)

        file = file.dropna(subset=['country_code'])

        file = file.reindex(columns=['tweet_created_at', 'country_code', 'country_name',
                                     'user_id', 'user_created_at', 'user_name', 'user_source', 'user_verified',
                                     'user_favourites_count', 'user_followers_count', 'user_friends_count',
                                     'tweet_is_retweet', 'tweet'])

        # Sort each row by country_code
        file = file.sort_values(by=['country_code'])

        # One pass over the frame instead of re-filtering it once per country
        for country, temp in file.groupby('country_code'):
            try:
                record = pycountry.countries.get(alpha_2=country)
            except LookupError:
                # Some pycountry releases raise for unknown codes instead of returning None
                record = None
            if record is None:
                print("In [", file_name, "] Can not recognize this country code:", country)
                continue

            # The output file name of csv
            output_path = processed_path + record.name + '.csv'

            # Create the file with a header on first sight of a country, append afterwards
            is_first_write = country not in mid_num
            temp.to_csv(output_path, mode='w' if is_first_write else 'a',
                        header=is_first_write, index=False)

            mid_num[country] = mid_num.get(country, 0) + len(temp)

In [None]:
# Merge and split every mid-period user/tweet pair into per-country CSVs
process_mid_11_file(original_path=mid_original_path,
                    processed_path=mid_processed_path,
                    files=mid_files)

In [None]:
# Prefix every mid CSV with its row count, then store the counts as JSON
change_name(path=mid_processed_path, dictionary=mid_num)
write_json(file_name=mid_processed_path + "mid_country_num.json", dictionary=mid_num)

# Late: Kaggle 2021 5~12

- Kaggle 11&12: URL missing
- Kaggle 2&4: URL missing

In [None]:
late_original_path = 'Data/Late/'            # Directory prefix of the original data
late_processed_path = 'Data/Late_Processed/' # Directory prefix of the processed data

# Raw input files for the late period
late_file_11, late_file_02 = '2021-11 & 12.csv', '2022-02 & 03.csv'

late_num = dict()                      # country_code -> number of rows written

### Kaggle 11 & 12

In [None]:
def process_late_11_file(original_path, processed_path, file_name):
    """Split the Kaggle 11 & 12 tweet CSV into one CSV per country.

    The file at `original_path + file_name` is renamed to the shared column
    layout, rows without a `country_code` are dropped (as the other
    processing functions in this notebook do), and the remainder is written
    to `processed_path + <country name> + '.csv'`: created with a header the
    first time a country code is seen (not yet in the module-level
    `late_num`), appended otherwise. `late_num` accumulates the number of
    rows written per country code.

    Args:
        original_path: Directory prefix (concatenated directly) of the raw CSV.
        processed_path: Directory prefix (concatenated directly) of the output CSVs.
        file_name: Name of the raw CSV file.
    """
    # `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines='warn'`
    # keeps the same "report and skip malformed rows" behaviour.
    file = pd.read_csv(original_path + file_name, index_col=False, on_bad_lines='warn', engine='python')

    # Source column name -> shared column name
    file = file.rename(columns={
        "user_created": "user_created_at",
        "source": "user_source",
        "user_favourites": "user_favourites_count",
        "user_friends": "user_friends_count",
        "user_followers": "user_followers_count",
        "date": "tweet_created_at",
        "is_retweet": "tweet_is_retweet",
        "text": "tweet",
    })

    # Rows without a country cannot be routed to any output file
    file = file.dropna(subset=['country_code'])

    file['user_followers_count'] = file['user_followers_count'].astype(int)

    file['tweet_created_at'] = pd.to_datetime(file['tweet_created_at'])
    file['user_created_at'] = pd.to_datetime(file['user_created_at'])

    file = file.reindex(columns=['tweet_created_at', 'country_code', 'country_name',
                                 'user_id', 'user_created_at', 'user_name', 'user_source', 'user_verified',
                                 'user_favourites_count', 'user_followers_count', 'user_friends_count',
                                 'tweet_is_retweet', 'tweet'])

    # Sort each row by country_code
    file = file.sort_values(by=['country_code'])

    # One pass over the frame instead of re-filtering it once per country
    for country, temp in file.groupby('country_code'):
        try:
            record = pycountry.countries.get(alpha_2=country)
        except LookupError:
            # Some pycountry releases raise for unknown codes instead of returning None
            record = None
        if record is None:
            print("In [", file_name, "] Can not recognize this country code:", country)
            continue

        # The output file name of csv
        output_path = processed_path + record.name + '.csv'

        # Create the file with a header on first sight of a country, append afterwards
        is_first_write = country not in late_num
        temp.to_csv(output_path, mode='w' if is_first_write else 'a',
                    header=is_first_write, index=False)

        late_num[country] = late_num.get(country, 0) + len(temp)

In [None]:
# Split the November/December file into per-country CSVs
process_late_11_file(original_path=late_original_path,
                     processed_path=late_processed_path,
                     file_name=late_file_11)

In [None]:
def process_late_02_file(original_path, processed_path, file_name):
    """Split the late-period 02 & 03 tweet CSV into one CSV per country.

    The file at `original_path + file_name` is renamed to the shared column
    layout. Rows without a `country_code`, and rows whose `user_created_at`
    is the literal '1', are dropped. The remainder is written to
    `processed_path + <country name> + '.csv'`: created with a header the
    first time a country code is seen (not yet in the module-level
    `late_num`), appended otherwise. `late_num` accumulates the number of
    rows written per country code.

    Args:
        original_path: Directory prefix (concatenated directly) of the raw CSV.
        processed_path: Directory prefix (concatenated directly) of the output CSVs.
        file_name: Name of the raw CSV file.
    """
    # Read the file the caller asked for instead of a hard-coded path.
    # `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines='warn'`
    # keeps the same "report and skip malformed rows" behaviour.
    file = pd.read_csv(original_path + file_name, index_col=False, on_bad_lines='warn', engine='python')

    # Source column name -> shared column name
    file = file.rename(columns={
        "user_created": "user_created_at",
        "id": "user_id",
        "source": "user_source",
        "user_favourites": "user_favourites_count",
        "user_friends": "user_friends_count",
        "user_followers": "user_followers_count",
        "date": "tweet_created_at",
        "is_retweet": "tweet_is_retweet",
        "text": "tweet",
    })

    file = file.dropna(subset=['country_code'])
    file['tweet_created_at'] = pd.to_datetime(file['tweet_created_at'])
    # Rows whose creation date holds the literal '1' are removed before parsing
    file = file.drop(file[file.user_created_at == '1'].index)
    file['user_created_at'] = pd.to_datetime(file['user_created_at'])

    file = file.reindex(columns=['tweet_created_at', 'country_code', 'country_name',
                                 'user_id', 'user_created_at', 'user_name', 'user_source', 'user_verified',
                                 'user_favourites_count', 'user_followers_count', 'user_friends_count',
                                 'tweet_is_retweet', 'tweet'])

    # Sort each row by country_code
    file = file.sort_values(by=['country_code'])

    # One pass over the frame instead of re-filtering it once per country
    for country, temp in file.groupby('country_code'):
        try:
            record = pycountry.countries.get(alpha_2=country)
        except LookupError:
            # Some pycountry releases raise for unknown codes instead of returning None
            record = None
        if record is None:
            print("In [", file_name, "] Can not recognize this country code:", country)
            continue

        # The output file name of csv
        output_path = processed_path + record.name + '.csv'

        # Create the file with a header on first sight of a country, append afterwards
        is_first_write = country not in late_num
        temp.to_csv(output_path, mode='w' if is_first_write else 'a',
                    header=is_first_write, index=False)

        late_num[country] = late_num.get(country, 0) + len(temp)

In [None]:
# Split the February/March file into per-country CSVs
process_late_02_file(original_path=late_original_path,
                     processed_path=late_processed_path,
                     file_name=late_file_02)

### Add the row count to each file name

In [None]:
# Prefix every late CSV with its row count, then store the counts as JSON
change_name(path=late_processed_path, dictionary=late_num)
write_json(file_name=late_processed_path + "late_country_num.json", dictionary=late_num)

In [None]:
# Scratch cell: apply the late 02 & 03 column normalisation at top level and preview the result.
# `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines='warn'`
# keeps the same "report and skip malformed rows" behaviour.
file = pd.read_csv("Data/Late/2022-02 & 03.csv", index_col=False, on_bad_lines='warn', engine='python')

# Source column name -> shared column name
file = file.rename(columns={
    "user_created": "user_created_at",
    "id": "user_id",
    "source": "user_source",
    "user_favourites": "user_favourites_count",
    "user_friends": "user_friends_count",
    "user_followers": "user_followers_count",
    "date": "tweet_created_at",
    "is_retweet": "tweet_is_retweet",
    "text": "tweet",
})

file['tweet_created_at'] = pd.to_datetime(file['tweet_created_at'])
# Rows whose creation date holds the literal '1' are removed before parsing
file = file.drop(file[file.user_created_at == '1'].index)
file['user_created_at'] = pd.to_datetime(file['user_created_at'])

file = file.reindex(columns=['tweet_created_at', 'country_code', 'country_name',
                             'user_id', 'user_created_at', 'user_name', 'user_source', 'user_verified',
                             'user_favourites_count', 'user_followers_count', 'user_friends_count',
                             'tweet_is_retweet', 'tweet'])

file.head()

In [None]:
# Scratch cell: load the raw late 02 & 03 file untouched for inspection.
# `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines='warn'`
# keeps the same "report and skip malformed rows" behaviour.
b = pd.read_csv("Data/Late/2022-02 & 03.csv", index_col=False, on_bad_lines='warn', engine='python')
b

In [None]:
# Scratch cell: print the first 'user_created' value that pandas cannot parse as a date.
# Only parsing failures are caught, so a missing column or an interrupt is no longer hidden.
for each in b['user_created']:
    try:
        pd.to_datetime(each)
    except (ValueError, TypeError, OverflowError):
        print(each)
        break

In [None]:
# Scratch cell: rebuild the user/tweet merge for one mid-period pair.
# `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines='warn'`
# keeps the same "report and skip malformed rows" behaviour.
a = pd.read_csv("Data/Mid/user_10039.csv", index_col=False, on_bad_lines='warn', engine='python')
a.rename(columns={"screen_name": "user_name"}, inplace=True)
b = pd.read_csv("Data/Mid/tweet_10039.csv", index_col=False, on_bad_lines='warn', engine='python')
b.rename(columns={"user": "user_name"}, inplace=True)
# Outer merge keeps rows from both frames even when a user_name has no match on the other side
a = pd.merge(a, b, on="user_name", how="outer")

In [None]:
import numpy as np

# Scratch cell: bring the merged frame `a` to the shared column layout.
# Source column name -> shared column name
a.rename(columns={
    "date": "tweet_created_at",
    "id": "user_id",
    "created_at": "user_created_at",
    "followers_count": "user_followers_count",
    "verified": "user_verified",
    "is_retweet": "tweet_is_retweet",
    "text": "tweet",
}, inplace=True)

# Non-numeric follower counts become NaN and are dropped before the int cast
followers = pd.to_numeric(a['user_followers_count'], errors='coerce')
a['user_followers_count'] = followers
a.dropna(subset=['user_followers_count'], inplace= True)
a['user_followers_count'] = a['user_followers_count'].astype(int)

a = a.dropna(subset=['country_code'])

shared_columns = ['tweet_created_at', 'country_code', 'country_name',
                  'user_id', 'user_created_at', 'user_name', 'user_source', 'user_verified',
                  'user_favourites_count', 'user_followers_count', 'user_friends_count',
                  'tweet_is_retweet', 'tweet']
a = a.reindex(columns=shared_columns)
a

In [None]:
# Scratch cell: display the 'date' column of `a`.
# NOTE(review): the cell above renames 'date' to 'tweet_created_at' and reindexes `a`,
# so this lookup presumably raises KeyError once that cell has run — confirm the intended cell order.
a['date']

In [None]:
# Scratch cell: load one mid-period user file and coerce follower counts to numbers
df = pd.read_csv("Data/Mid/user_10039.csv", index_col=False, engine='python')
# Values that are not numeric turn into NaN instead of raising
df = df.assign(followers_count=pd.to_numeric(df['followers_count'], errors='coerce'))

In [None]:
# Scratch cell operating on a frame `c`.
# NOTE(review): `c` is not assigned anywhere in the visible notebook — presumably left over
# from an earlier session; confirm before running.
# NOTE(review): drop() receives a boolean-filtered DataFrame here rather than its `.index`
# (the pattern used by the processing functions in this notebook) — verify this is intended.
c = c.drop(c[c['followers_count'].apply(lambda x: isinstance(x, int))])
c 

In [None]:
# Scratch cell: keep only the rows whose follower count is present
has_followers = df['followers_count'].notna()
df = df[has_followers]
df