# Get All Tweets with #podrevday

In [1]:
#!mkdir data
!GetOldTweets3 --querysearch "podrevday" --since 2020-01-01 --until 2020-08-11 --output "data/jan-aug-2020.csv"

Downloading tweets...
Saved 1481
Done. Output file generated "data/jan-aug-2020.csv".


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from geotext import GeoText

import nest_asyncio
nest_asyncio.apply()

import twint

# Load the CSV produced by the GetOldTweets3 cell; the 'date' column is parsed into datetimes.
df_tweets = pd.read_csv('data/jan-aug-2020.csv', parse_dates=['date'])

# Get User Data

In [6]:
# Unique usernames appearing in the scraped tweets (set() removes duplicates;
# the resulting order is arbitrary).
pod_rev_users = list(set(df_tweets.username))

# A single twint config is reused for every lookup; only Username changes per iteration.
c = twint.Config()
c.Store_object = True
c.Pandas = True  # presumably tells twint to collect results in twint.storage.panda -- confirm against twint docs

# Look up each user's profile in turn.
# NOTE(review): there is no error handling here, so any exception raised by a
# single lookup aborts the whole loop.
for user in pod_rev_users: 
    c.Username = user
    twint.run.Lookup(c)
# Profile rows collected by twint during the lookups above.
Users_df = twint.storage.panda.User_df

# Drop exact duplicate rows and persist the profiles so the cleaning cells can
# start from the CSV instead of repeating the lookups.
users_df = Users_df.drop_duplicates()
users_df.to_csv('data/user_data.csv')

1228314814021357568 | The BackTracker History Show | @BackTrackerUK | Private: 0 | Verified: 0 | Bio: The new show on Bradley Stoke Radio in Bristol & also a podcast all about local history as well as genealogy & facts! Views expressed are entirely my own. | Location: United Kingdom | Url: https://www.bradleystokeradio.com/ | Joined: 14 Feb 2020 5:47 AM | Tweets: 900 | Following: 685 | Followers: 418 | Likes: 1089 | Media: 92 | Avatar: https://pbs.twimg.com/profile_images/1252269813843591169/k9CCFYnO_400x400.jpg
944815878 | Berny | @myostaff | Private: 0 | Verified: 0 | Bio: Learn HOW to create a workshop/course/mastermind to deliver virtually OR in-person | Location: Sydney, Australia | Url: https://www.knowledgebroker.online | Joined: 12 Nov 2012 3:42 PM | Tweets: 8388 | Following: 184 | Followers: 202 | Likes: 264 | Media: 149 | Avatar: https://pbs.twimg.com/profile_images/1230085313080393728/v7F2u0RH_400x400.jpg
433185027 | Anne with an E | @Annelinda_c | Private: 0 | Verified: 0 |

# Clean User Data

In [2]:
# Reload the user profiles from data/user_data.csv (written by the lookup cell).
df_users = pd.read_csv('data/user_data.csv')

In [3]:
# Profile fields kept for the geographic analysis; everything else is dropped.
user_columns = [
    'id', 'username', 'name', 'location', 'join_date',
    'followers', 'following', 'likes', 'url', 'verified',
]
df_users = df_users.loc[:, user_columns]

In [4]:
def location_extraction(df):
    '''Parse the free-text ``location`` column with GeoText.

    Missing locations are replaced with the string "blank", then three
    columns are written onto ``df`` itself:

    * ``geotext`` -- the GeoText object built from each location string
    * ``city``    -- that object's ``cities`` attribute
    * ``country`` -- that object's ``countries`` attribute

    The same (mutated) DataFrame is returned.
    '''
    def _cities(parsed_location):
        return parsed_location.cities

    def _countries(parsed_location):
        return parsed_location.countries

    # GeoText is applied to every row, so fill the gaps with a placeholder first.
    filled_locations = df.loc[:, "location"].fillna("blank")
    df.loc[:, "location"] = filled_locations

    df.loc[:, "geotext"] = df.loc[:, "location"].apply(GeoText)

    parsed = df.loc[:, "geotext"]
    df.loc[:, 'city'] = parsed.apply(_cities)
    df.loc[:, 'country'] = parsed.apply(_countries)

    return df

# Run the GeoText parsing on the user table; this adds the geotext, city and country columns.
df_users = location_extraction(df_users)

In [5]:
from geonamescache import GeonamesCache
# Build a country lookup table from geonamescache: one row per country.
gc = GeonamesCache()
countries = gc.get_countries()
country_info = (
    pd.DataFrame(countries)
    .T                          # transpose so each country becomes a row
    .set_index('geonameid')
    .reset_index()              # round-trip moves geonameid to the first column
)
# Only the country name and its ISO-3 code are needed for the later merge.
name_code = country_info.loc[:, ["name", "iso3"]]

# Two-letter codes (states plus DC) that mark a location as being in the United States.
_state_codes = (
    "AL AK AZ AR CA CO CT DC DE FL GA "
    "HI ID IL IN IA KS KY LA ME MD MA "
    "MI MN MS MO MT NE NV NH NJ NM NY "
    "NC ND OH OK OR PA RI SC SD TN TX "
    "UT VT VA WA WV WI WY"
).split()

# Extra US markers: country spellings and a handful of cities.
us_states = _state_codes + [
    "USA",
    "United States",
    'Seattle',
    "Los Angeles",
    "Houston",
    "Atlanta",
    "Pittsburgh",
]

# Full names of US states, DC and territories; a location containing any of
# these is labelled "United States".
# "District of Columbia" is a single entry: it was previously split into
# "District " and "of Columbia", so any location containing "District "
# was tagged as United States.
us_state_names = ["Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona", "California", "Colorado", "Connecticut", 
               "District of Columbia", "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho", "Illinois", 
               "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", 
               "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey", 
               "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico", "Rhode Island", 
               "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington", 
               "Wisconsin", "West Virginia", "Wyoming"]

# Canadian provinces and territories: full name -> postal abbreviation.
can_prov_abbrev = {
    'Alberta': 'AB',
    'British Columbia': 'BC',
    'Manitoba': 'MB',
    'New Brunswick': 'NB',
    'Newfoundland and Labrador': 'NL',
    'Northwest Territories': 'NT',
    'Nova Scotia': 'NS',
    'Nunavut': 'NU',
    'Ontario': 'ON',
    'Prince Edward Island': 'PE',
    'Quebec': 'QC',
    'Saskatchewan': 'SK',
    'Yukon': 'YT',
}

# Parallel tuples of the names and the abbreviations, in dict order.
# Note the similar spelling: `can_prov_abbrev` is the dict, `can_prov_abbr`
# is the tuple of abbreviations.
can_prov_names = tuple(can_prov_abbrev.keys())
can_prov_abbr = tuple(can_prov_abbrev.values())

# Keywords that mark a location as United Kingdom. The entries are joined with
# "|" and used as a regular expression, so the last one can carry look-arounds:
# a bare "Jersey" also matched "New Jersey" and "Jersey City", which re-labelled
# those US locations as UK because the UK rule runs after the US rule.
uk = ["England", 'Wales', "Scotland", 'London', "Manchester", "Isle of Wight", "Northern Ireland",
      "United Kingdom", 'Bailiwick of Guernsey', "UK", "Hoxton", r"(?<!New )Jersey(?! City)"]

# Keyword lists for the remaining countries that get a manual override.
# Each list is matched as a substring of the user's location text.

india_city = [
    "Bangalore",
    "Delhi",
    "Hyderabad",
    "Bengaluru",
]

# "eisgau" is a word fragment rather than a city name -- presumably so that
# place names ending in "...eisgau" match; confirm with the author.
german_city = [
    "Munich",
    "Berlin",
    "eisgau",
    "Hamburg",
    "Dortmund",
]

south_africa = [
    "South Africa",
    "Durban",
    "Johannesburg",
]

uae = [
    'UAE',
    'Dubai',
    'Abu Dhabi',
]

def replacer(area, name, df=None):
    '''Set ``country`` to ``name`` for every row whose location mentions a keyword.

    Parameters
    ----------
    area : iterable of str
        Keywords; they are joined with "|" and matched against the
        ``location`` column as a (case-sensitive) regular expression, so a
        keyword matches anywhere inside the location text.
    name : str
        Country label written to the ``country`` column of matching rows.
    df : pandas.DataFrame, optional
        Frame to update in place. Defaults to the notebook-level ``df_users``
        so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The frame that was updated (the same object, not a copy).
    '''
    if df is None:
        # Backward-compatible default: operate on the notebook's global frame.
        df = df_users

    pattern = '|'.join(area)
    if not pattern:
        # An empty pattern matches every string and would overwrite every row.
        return df

    # na=False: rows without a location never match (and keep the mask boolean).
    matches = df["location"].str.contains(pattern, na=False)
    df.loc[matches, "country"] = name
    return df

def list_to_string(df, sep=""):
    '''Collapse the list-valued ``city`` and ``country`` columns into strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``city`` and ``country`` columns whose cells are lists of
        names or already-flattened strings. Updated in place.
    sep : str, optional
        Text placed between the items of a multi-item cell. The default ""
        keeps the historical behaviour (e.g. ["Poland", "Serbia"] becomes
        "PolandSerbia"); pass e.g. ", " to keep multi-match values readable.

    Returns
    -------
    pandas.DataFrame
        The same frame, with both columns holding plain strings.
    '''
    def _flatten(value):
        # Cells already overwritten with a country name are plain strings;
        # leave them alone so a non-empty separator is not inserted between
        # their characters.
        if isinstance(value, str):
            return value
        return sep.join(map(str, value))

    df["city"] = df["city"].apply(_flatten)
    df["country"] = df["country"].apply(_flatten)

    return df

In [6]:
def location_cleaner(df):
    '''Fill in / correct the ``country`` column from keywords in ``location``.

    The frame passed in is updated in place and returned. Steps:

    1. Apply the keyword rules below in order; a later rule overrides an
       earlier one when a location matches both.
    2. Apply two exact-match fixes ("Italia", "Belgrade").
    3. Flatten the list-valued ``city`` / ``country`` cells to strings.
    4. Collapse the flattened "PolandSerbia" label to "Poland".

    Keywords are matched as case-sensitive substrings, so short codes can
    produce false positives; rule order is what resolves most of them.
    '''
    # (keywords, country label) pairs, applied top to bottom.
    keyword_rules = [
        (can_prov_names, "Canada"),
        # The abbreviations tuple, not the `can_prov_abbrev` dict: iterating
        # the dict yields province names again, so codes were never matched.
        (can_prov_abbr, "Canada"),
        (us_state_names, "United States"),
        (us_states, "United States"),
        (uk, "United Kingdom"),
        (german_city, "Germany"),
        (south_africa, "South Africa"),
        (india_city, "India"),
        (uae, 'United Arab Emirates'),
    ]

    # Work on the frame that was passed in rather than on a notebook global.
    for keywords, country in keyword_rules:
        hits = df["location"].str.contains('|'.join(keywords), na=False)
        df.loc[hits, "country"] = country

    df.loc[(df.location == "Italia"), "country"] = "Italy"
    df.loc[(df.location == "Belgrade"), "country"] = "Serbia"

    df = list_to_string(df)

    # Must run after list_to_string: before flattening, the cell holds a list,
    # which never compares equal to the string "PolandSerbia".
    df.loc[(df.country == "PolandSerbia"), "country"] = "Poland"

    return df

In [7]:
# Apply the keyword-based country overrides and flatten city/country to strings.
df_users = location_cleaner(df_users)
# Collapse the concatenated "PolandSerbia" label to Poland. Done here, after the
# country values have been joined into plain strings, so the comparison can match.
df_users.loc[(df_users.country == "PolandSerbia"), "country"] = "Poland" 

Unnamed: 0,id,username,name,location,join_date,followers,following,likes,url,verified,geotext,city,country
0,1228314814021357568,BackTrackerUK,The BackTracker History Show,United Kingdom,14 Feb 2020,418,685,1089,https://www.bradleystokeradio.com/,0,<geotext.geotext.GeoText object at 0x7f9021883...,,United Kingdom
1,944815878,myostaff,Berny,"Sydney, Australia",12 Nov 2012,202,184,264,https://www.knowledgebroker.online,0,<geotext.geotext.GeoText object at 0x7f9021883...,Sydney,Australia
2,433185027,Annelinda_c,Anne with an E,South Africa,9 Dec 2011,880,838,3881,https://linktr.ee/RootofSciencePodcasts,0,<geotext.geotext.GeoText object at 0x7f9021883...,,South Africa
3,1101617456945287169,StarWarsSession,Star Wars Sessions Podcast,"Essex, UK",1 Mar 2019,1288,1607,16529,http://patreon.com/starwarssessions,0,<geotext.geotext.GeoText object at 0x7f9021883...,Essex,United Kingdom
4,821846,kelake,Clark MacLeod （克拉克）,"Stratford, Prince Edward Island",8 Mar 2007,674,558,7373,http://clarkmacleod.com,0,<geotext.geotext.GeoText object at 0x7f9021883...,StratfordPrince Edward,Canada


In [8]:
# Users per resolved country; the empty-string bucket holds users whose
# location could not be mapped to any country.
df_users["country"].value_counts()

                        162
United States           101
United Kingdom           30
Canada                    7
Germany                   5
New Zealand               3
Australia                 3
South Africa              3
India                     3
Malaysia                  2
Poland                    2
France                    2
United Arab Emirates      1
Italy                     1
Sweden                    1
Ecuador                   1
Serbia                    1
Nigeria                   1
Name: country, dtype: int64

In [9]:
# Attach the ISO-3 code to each user by matching the cleaned country name
# against the geonamescache country names. A left join keeps users whose
# country is empty or unknown (their iso3 stays missing).
df_users_full = df_users.merge(
    name_code,
    how='left',
    left_on='country',
    right_on='name',
)

In [10]:
# 'name_y' is the country name brought in by the merge; it duplicates 'country'.
df_users_full = df_users_full.drop(columns='name_y')

In [11]:
# Preview the merged user table; the iso3 column comes from the country lookup.
df_users_full.head()

Unnamed: 0,id,username,name_x,location,join_date,followers,following,likes,url,verified,geotext,city,country,iso3
0,1228314814021357568,BackTrackerUK,The BackTracker History Show,United Kingdom,14 Feb 2020,418,685,1089,https://www.bradleystokeradio.com/,0,<geotext.geotext.GeoText object at 0x7f9021883...,,United Kingdom,GBR
1,944815878,myostaff,Berny,"Sydney, Australia",12 Nov 2012,202,184,264,https://www.knowledgebroker.online,0,<geotext.geotext.GeoText object at 0x7f9021883...,Sydney,Australia,AUS
2,433185027,Annelinda_c,Anne with an E,South Africa,9 Dec 2011,880,838,3881,https://linktr.ee/RootofSciencePodcasts,0,<geotext.geotext.GeoText object at 0x7f9021883...,,South Africa,ZAF
3,1101617456945287169,StarWarsSession,Star Wars Sessions Podcast,"Essex, UK",1 Mar 2019,1288,1607,16529,http://patreon.com/starwarssessions,0,<geotext.geotext.GeoText object at 0x7f9021883...,Essex,United Kingdom,GBR
4,821846,kelake,Clark MacLeod （克拉克）,"Stratford, Prince Edward Island",8 Mar 2007,674,558,7373,http://clarkmacleod.com,0,<geotext.geotext.GeoText object at 0x7f9021883...,StratfordPrince Edward,Canada,CAN


# Merge User's Geographic Data with Tweets

In [12]:
# Attach each author's profile and country data to their tweets. A left join
# keeps every tweet, including those whose author has no profile row.
# NOTE(review): the 'geotext' column holds GeoText objects, so the CSV stores
# their repr strings -- consider dropping that column before saving.
full_df = df_tweets.merge(
    df_users_full,
    how='left',
    on='username',
)

full_df.to_csv('data/tweets_users_august.csv', index=False)