# Analyze user account geo location and visualize it on world map 

In [2]:

import pandas as pd
import numpy as np
import os

## Load IRA users 

In [9]:
filename = './data/ira_users_csv_hashed.csv'

In [10]:
users = pd.read_csv(filename, dtype = {
    "tweetid": str,
    "userid": str,
    "user_display_name": str,
    "user_screen_name": str,
    "user_reported_location": str,
    "user_profile_description": str,
    "user_profile_url": str,
    "follower_count": str,
    "following_count": str,
    "account_creation_date": str,
    "account_language": str,
    "tweet_text": str,
    "tweet_time": str,
    "tweet_client_name": str,
    "in_reply_to_tweetid": str,
    "in_reply_to_userid": str,
    "quoted_tweet_tweetid": str,
    "is_retweet": bool,
    "retweet_userid": str,
    "retweet_tweetid": str,
    "latitude": str,
    "longitude": str,
    "quote_count": str,
    "reply_count": str,
    "like_count": str,
    "retweet_count": str,
    "hashtags": str,
    "urls": str,
    "user_mentions": str,
    "poll_choices": str,
})

In [None]:
# Number of user accounts
print(len(users))

In [6]:
# user account attributes
users.columns


Index(['userid', 'user_display_name', 'user_screen_name',
       'user_reported_location', 'user_profile_description',
       'user_profile_url', 'follower_count', 'following_count',
       'account_creation_date', 'account_language'],
      dtype='object')

## Raw user location count 

In [7]:

pd.crosstab(index=users['user_reported_location'],   columns="count").sort_values(by=['count'], ascending=False).head(10)

col_0,count
user_reported_location,Unnamed: 1_level_1
USA,670
Москва,206
United States,116
Atlanta,73
Санкт-Петербург,66
US,65
Питер,62
Россия,55
Moscow,48
"New York, USA",42


## Translate and reformat user location data

In [92]:

users['location'] = users.user_reported_location.replace({  'Волгоград':'Volgograd, Russia','Egypt':'N/A, Egypt',
                                                            'Ростов':'Rostov-on-Don, Russia','Baltimore, MD':'Baltimore, United States','London':'London, England','Jacksonville':'Jacksonville, United States',
                                                            'Moscow':'Moscow, Russia', 'Петербург':'Saint Petersburg, Russia','New York City':'New York, United States', 'Arizona':'Phoenix, United States',
                                                            'Germany':'N/A, Germany','Санкт-Петербург, Россия':'Saint Petersburg, Russia','Нижний Новгород':'Nizhny Novgorod, Russia','Тверь':'Tver, Russia','Киев':'Kiev, Ukraine',
                                                            'دمشق':'Damascus, Syria','spb':'SPB','Estados Unidos':'N/A, United States','Estados Unidos':'Volgograd, Russia',
                                                            'Казань':'Kazan, Russia','Екатеринбург':'Yekaterinburg, Russia', 'СПБ':'Saint Petersburg, Russia','СПб':'Saint Petersburg, Russia','Washington, D.C.': 'Washington DC, United States',
                                                            'Houston, TX':'Houston, United States','Miami, FL': 'Miami, United States','Texas, USA':'Texas, United States','Washington': 'Washington DC, United States',
                                                            'France': 'N/A, France','Phoenix':'Phoenix, United States','Chicago, IL':'Chicago, United States','Washington, DC': 'Washington D.C, United States',
                                                            'Philadelphia':'Philadelphia, United States'  ,'Chicago':'Chicago, United States','Washington, D.C': 'Washington DC, United States',
                                                            'Чебоксары':'Cheboksary, Russia', 'سوريا':'N/A, Syria','Syria':'N/A, Syria', 'New-York':'New York, United States', 
                                                            'Atlanta' :'Atlanta, United States','LA' :'Los Angeles, United States','Los Angeles' :'Los Angeles, United States','Los Angeles, CA' :'Los Angeles, United States',
                                                            'Boston':'Boston, United States','Омск':'Omsk, Russia','New York, NY':'New York, United States',
                                                            'New York':'New York, United States', 'NY':'New York, United States', 'Новгород':'Veliky Novgorod, Russia', 
                                                            'Москва, Россия':'Moscow, Russia','Новосибирск': 'Novosibirsk, Russia', 'Россия' :'N/A, Russia' ,'Russia' :'N/A, Russia' ,
                                                            'New York, USA' :'New York, United States'   ,'Atlanta, GA' :'Atlanta, United States',
                                                            'Lichfield\t':'Lichfield, England', 'Istanbul via Liverpool': 'Liverpool, England', 'United States':'N/A, United States', 
                                                            'US':'N/A, United States', 'USA':'N/A, United States', 'Москва':'Moscow, Russia',
                                                            'Моscow':'Moscow, Russia', 'Санкт-Петербург':'Saint Petersburg, Russia', 'Питер':'Saint Petersburg, Russia'})

In [94]:
# For accounts without city specified, assume to be the capital of the country
users['location'] = users.location.replace({'N/A, Egypt':'Cairo, Egypt', 'N/A, Syria':'Damascus, Syria', 'N/A, France':'Paris, France', 'N/A, Germany':'Berlin, Germany'    ,'N/A, Russia':'Moscow, Russia'  ,'N/A, United States':'Washington, D.C., United States','Washington DC, United States':'Washington, D.C., United States', 'Washington D.C, United States':'Washington, D.C., United States' })

## Location count by cities

In [95]:

location_count = pd.crosstab(index=users['location'],   columns="count").sort_values(by=['count'], ascending=False)

In [96]:
location_count.reset_index(level=0, inplace=True)


In [None]:
# Get the cities with more than 5 users and exclude invalid cities (MSK, SPB) 

In [97]:

most_frequent = location_count[(location_count['count'] > 5) & (location_count['location'] != 'МSK') & (location_count['location']!='SPB')]

### Parse country from location 

In [98]:

most_frequent['country'] = [str(x).split(',')[-1].strip() for x in most_frequent['location']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [99]:
# glimpse of the data
most_frequent

col_0,location,count,country
0,"Washington, D.C., United States",871,United States
1,"Moscow, Russia",373,Russia
2,"Saint Petersburg, Russia",178,Russia
3,"New York, United States",124,United States
4,"Atlanta, United States",89,United States
5,"Los Angeles, United States",51,United States
6,"Damascus, Syria",41,Syria
7,"Novosibirsk, Russia",38,Russia
9,"Veliky Novgorod, Russia",36,Russia
10,"Omsk, Russia",32,Russia


### Get the country distribution of user accounts

In [107]:
countries_cnt = most_frequent.groupby(['country']).sum().sort_values(by='count', ascending = False)

In [108]:
countries_cnt.reset_index(level=0, inplace=True)

## Map the country distribution on world map 

In [78]:

from geopy.geocoders import Nominatim

In [79]:
geolocator = Nominatim(user_agent="twitter_bot_analysis")

In [110]:
# Get the long and lat coordinates of countries
countries_cnt['long'] = [geolocator.geocode(str(x)).longitude for x in countries_cnt['country']]

In [111]:
countries_cnt['lat'] = [geolocator.geocode(str(x)).latitude for x in countries_cnt['country']]

In [112]:
# Glimpse at country distribution
countries_cnt

col_0,country,count,long,lat
0,United States,1225,-100.445882,39.78373
1,Russia,734,97.745306,64.686314
2,Syria,41,39.049411,34.640186
3,Germany,30,10.423447,51.08342
4,France,11,1.888334,46.603354
5,England,10,-0.54024,52.795479
6,Egypt,8,29.267547,26.254049
7,Ukraine,7,31.271832,49.487197


### Generate map

In [118]:

# import the library
import folium

# Make an empty map
m = folium.Map(location=[0,0], zoom_start=2)
 
# I can add marker one by one on the map
for i in range(0,len(countries_cnt)):
   folium.Circle( location=[countries_cnt.iloc[i]['lat'], countries_cnt.iloc[i]['long']],      popup=countries_cnt.iloc[i]['country'] + ' : ' + str(countries_cnt.iloc[i]['count']),
      radius=float(countries_cnt.iloc[i]['count']*1000),      color='crimson',    fill_color='crimson',  fill=True).add_to(m)
 
# Save it as html
m.save('mymap_countries.html')


## Map user account distribution by cities on world map 

In [100]:


# Get the long and lat coordinates for cities
most_frequent['long'] = [geolocator.geocode(str(x)).longitude for x in most_frequent['location']]
most_frequent['lat'] = [geolocator.geocode(str(x)).latitude for x in most_frequent['location']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### Glimpse at cities distribution

In [102]:

most_frequent

col_0,location,count,country,long,lat
0,"Washington, D.C., United States",871,United States,-77.036563,38.895009
1,"Moscow, Russia",373,Russia,37.617494,55.750446
2,"Saint Petersburg, Russia",178,Russia,30.316229,59.938732
3,"New York, United States",124,United States,-73.987156,40.730862
4,"Atlanta, United States",89,United States,-84.390185,33.749099
5,"Los Angeles, United States",51,United States,-118.242767,34.053683
6,"Damascus, Syria",41,Syria,36.309581,33.51307
7,"Novosibirsk, Russia",38,Russia,82.923451,55.028217
9,"Veliky Novgorod, Russia",36,Russia,31.275786,58.520986
10,"Omsk, Russia",32,Russia,73.371529,54.991375


## Generate map 

In [120]:

# import the library
import folium

# Make an empty map
m = folium.Map(location=[0,0], zoom_start=2)
 
# I can add marker one by one on the map
for i in range(0,len((most_frequent))):
   folium.Circle( location=[most_frequent.iloc[i]['lat'], most_frequent.iloc[i]['long']],      popup=most_frequent.iloc[i]['location'] + ' : ' + str(most_frequent.iloc[i]['count']),
      radius=float(most_frequent.iloc[i]['count']*500),      color='crimson',    fill_color='crimson',  fill=True).add_to(m)
 
# Save it as html
m.save('mymap.html')


In [None]:
## Future Question
####   From the location analysis, we can see that most accounts reported to be within US though they are from foreign information operations. 
####    In the future, we can further study the differences between accounts reported to be domestic and foreign.
####     1. Do accounts reported to be domestic have higher user interaction?
####     2. Do they target US issues? What languages do they use? 
####     3. Do they post according to US time zone? 

####   From the frequency analysis, we notice significant difference in the tweeting time accross countries
####   In the future, we want to know
####     1. Do tweets posted in a particular time have higher user interaction?
####     