## Combine 2015 and 2017 datasets

In [12]:
import json
import os
import re

import numpy as np
import pandas as pd

In [13]:
# Directory that holds the input files and receives the combined CSV.
# Set the BT4012_DATA_DIR environment variable to run the notebook on another
# machine; the original location is kept as the default.
# os.path.join(..., "") guarantees the trailing separator that the
# `base + "<file name>"` concatenations in the later cells rely on.
base = os.path.join(os.environ.get("BT4012_DATA_DIR", "/Users/bandy/Downloads/BT4012/"), "")

In [14]:
# Load the 2017 dataset from the Excel workbook stored under `base`.
df_2017 = pd.read_excel(base + "data.xlsx")
# Last expression of the cell: displays (rows, columns) of the loaded frame.
df_2017.shape

(14368, 43)

In [15]:
# Load the 2015 dataset from the CSV file stored under `base`.
df_2015 = pd.read_csv(base + "all_users_2015.csv")
# Last expression of the cell: lists the column names of the 2015 frame.
df_2015.columns

Index(['id', 'name', 'screen_name', 'statuses_count', 'followers_count',
       'friends_count', 'favourites_count', 'listed_count', 'created_at',
       'url', 'lang', 'time_zone', 'location', 'default_profile',
       'default_profile_image', 'geo_enabled', 'profile_image_url',
       'profile_banner_url', 'profile_use_background_image',
       'profile_background_image_url_https', 'profile_text_color',
       'profile_image_url_https', 'profile_sidebar_border_color',
       'profile_background_tile', 'profile_sidebar_fill_color',
       'profile_background_image_url', 'profile_background_color',
       'profile_link_color', 'utc_offset', 'protected', 'verified',
       'description', 'updated', 'dataset', 'account_type'],
      dtype='object')

## Choosing which columns to remove

In [17]:
# Names of the columns to drop from each dataset (constant or entirely null
# columns first; later cells add further names to these lists)
to_remove_2017, to_remove_2015 = [], []

def remove_columns(df, to_remove_list):
  """Record and print every column of ``df`` that carries no information.

  A column is flagged when it holds exactly one distinct value or when every
  one of its entries is null. Despite its name, the function does not drop
  anything: each flagged column name is appended to ``to_remove_list`` (the
  caller's list is modified in place) and printed on its own line.

  Args:
    df: DataFrame whose columns are inspected.
    to_remove_list: list that receives the names of the flagged columns.

  Returns:
    None.
  """
  for column_name in df.columns:
    column = df[column_name]
    # NaN entries may not collapse into a single set element, which is why
    # all-null columns are caught by the separate null check.
    has_single_value = len(set(column)) == 1
    if not (has_single_value or column.isnull().all()):
      continue
    to_remove_list.append(column_name)
    print(column_name)

# Run the check on both datasets; remove_columns prints each flagged column
# underneath the heading of the dataset it belongs to.
column_reports = [
    ("2017 df: Columns with the same values in all of its rows are:", df_2017, to_remove_2017),
    ("2015 df: Columns with the same values in all of its rows are:", df_2015, to_remove_2015),
]
for report_position, (heading, frame, flagged_columns) in enumerate(column_reports):
    if report_position > 0:
        # Separator line between the two reports
        print("--------------------------------------------------------------")
    print(heading)
    remove_columns(frame, flagged_columns)

2017 df: Columns with the same values in all of its rows are:
follow_request_sent
notifications
contributors_enabled
following
--------------------------------------------------------------
2015 df: Columns with the same values in all of its rows are:
protected
verified


In [18]:
# Take 'protected' and 'verified' back out of the removal list for the 2015 dataset.
# Every value in these columns is null in the 2015 dataset, but they are not null in the 2017 dataset.
# A null entry might therefore represent the absence of the attribute, meaning that
# all accounts in the 2015 data are neither protected nor verified.
for kept_column in ('protected', 'verified'):
    to_remove_2015.remove(kept_column)

In [19]:
# Flag irrelevant columns for removal:
# 'test_set_1' and 'test_set_2' were used for the previous researcher's own testing
to_remove_2017 += ['test_set_1', 'test_set_2']

In [20]:
# Only 1 of the 14368 rows has 'is_translator' == 1 (displayed below).
# Flag 'is_translator' for removal: positive instances are far too rare and the
# data source gives no further information about the attribute.
is_translator_mask = df_2017['is_translator'] == 1
display(df_2017.loc[is_translator_mask, :])
to_remove_2017.append('is_translator')

Unnamed: 0,id,name,screen_name,statuses_count,followers_count,friends_count,favourites_count,listed_count,url,lang,...,description,contributors_enabled,following,created_at,timestamp,crawled_at,updated,test_set_1,test_set_2,account_type
163,110928853,Ayumi,Ayu,41271,15171,210,7721,174,http://t.co/XQBf9jn3u8,ja,...,[è‹±èªžï¼‹æ—¥æœ¬èªž] åˆ_x009d_ã‚_x0081_ã_x0081...,,,Wed Feb 03 07:01:41 +0000 2010,2010-02-03 08:01:41,2015-05-02 06:34:44,2016-03-15 15:54:37,0.0,0.0,real


In [21]:
# 'timestamp' and 'created_at' appear to hold the same date, with different times and in different formats.
# 'created_at' is kept to stay consistent with df_2015; 'timestamp' and 'crawled_at' are flagged
# for removal because they have missing entries.
display(df_2017.loc[:, ['timestamp', 'created_at', 'crawled_at']])
for date_column in ('created_at', 'timestamp', 'crawled_at'):
    # Count the null entries of the column
    missing_count = int(df_2017[date_column].isnull().sum())
    print("Number of missing entries in " + date_column + " column:", missing_count)

to_remove_2017 += ['timestamp', 'crawled_at']

Unnamed: 0,timestamp,created_at,crawled_at
0,2013-06-11 13:20:35,Tue Jun 11 11:20:35 +0000 2013,2015-05-02 06:41:46
1,2014-05-13 12:37:57,Tue May 13 10:37:57 +0000 2014,2015-05-01 17:20:27
2,2011-05-05 01:30:37,Wed May 04 23:30:37 +0000 2011,2015-05-01 18:48:28
3,2010-09-17 16:02:10,Fri Sep 17 14:02:10 +0000 2010,2015-05-01 13:55:16
4,2015-02-06 05:10:49,Fri Feb 06 04:10:49 +0000 2015,2015-05-02 01:17:32
...,...,...,...
14363,NaT,Tue Apr 30 08:23:57 +0000 2013,NaT
14364,NaT,Tue Apr 30 08:34:49 +0000 2013,NaT
14365,NaT,Tue Apr 30 09:21:12 +0000 2013,NaT
14366,NaT,Tue Apr 30 11:25:11 +0000 2013,NaT


Number of missing entries in created_at column: 0
Number of missing entries in timestamp column: 3351
Number of missing entries in crawled_at column: 3351


In [22]:
# Flag the 'dataset' column of the 2015 dataset for removal: it is no longer relevant now that the data has been labelled
dataset_values = set(df_2015['dataset'].values)
print("Unique values in dataset column", dataset_values)
to_remove_2015.append('dataset')

Unique values in dataset column {'TWT', 'E13', 'INT', 'TFP', 'FSF'}


In [24]:
# Drop the flagged columns; afterwards both datasets have the same number of columns
df_2017 = df_2017.drop(columns=to_remove_2017)
print(df_2017.shape)

df_2015 = df_2015.drop(columns=to_remove_2015)
print(df_2015.shape)

(14368, 34)
(5301, 34)


In [26]:
# All columns in the 2015 and 2017 datasets should have the same names, though not necessarily in the same order
same_column_names = set(df_2015.columns) == set(df_2017.columns)
print(same_column_names)
# Enforce the check instead of only printing it: selecting the 2017 column list
# from df_2015 would otherwise silently discard any column that exists only in
# the 2015 dataset.
assert same_column_names, (
    "Column names differ between the datasets: "
    + str(sorted(set(df_2015.columns) ^ set(df_2017.columns), key=str))
)

# Rearrange the 2015 columns into the 2017 order for merging
cols_list = df_2017.columns.tolist()
df_2015 = df_2015[cols_list]

# Same names in the same order: the datasets are ready to be merged
print(list(df_2015.columns) == list(df_2017.columns))

True
True


In [27]:
# Stack the two datasets (2015 rows first, fresh 0..n-1 index) and drop rows whose id was already seen
combined_df = (
    pd.concat([df_2015, df_2017], axis=0, ignore_index=True)
    .drop_duplicates(subset=['id'])
)
combined_df.shape

(16318, 34)

## Remove rows whose profile picture could not be scraped

In [29]:
# Load the ids whose profile picture could not be scraped (one JSON file per batch)
with open(base + 'failed_id_new_batch_2015.json') as failed_2015_file:
    failed_id_1 = json.load(failed_2015_file)
with open(base + 'failed_id_new_batch_2017.json') as failed_2017_file:
    failed_id_2 = json.load(failed_2017_file)

# Merge both batches and collapse ids that occur more than once
failed_id = list(set(failed_id_1 + failed_id_2))

In [30]:
# Number of distinct ids whose profile picture could not be scraped
len(failed_id)

5205

In [31]:
# Keep only the accounts whose profile picture was scraped successfully.
# Series.isin performs the membership test in one vectorised pass instead of
# scanning the failed_id list once per row. Boolean indexing followed by .copy()
# gives cleaned_df its own data, so combined_df keeps all of its rows and this
# cell can be re-run safely (the earlier `cleaned_df = combined_df` was only an
# alias, so the in-place drop also removed the rows from combined_df).
scrape_failed = combined_df['id'].isin(failed_id)
cleaned_df = combined_df.loc[~scrape_failed].copy()
cleaned_df.shape


(11113, 34)

In [32]:
# Save the cleaned, combined dataset as a CSV file under `base`.
# NOTE(review): to_csv writes the row index as an unnamed first column by default;
# confirm that downstream readers expect it before changing this.
cleaned_df.to_csv(base + 'combined_twitter_data.csv')