In [1]:
#!pip install twython

In [2]:
from twython import Twython
import pandas as pd
import numpy as np
import datetime as dt
import time
import yaml
import dask.dataframe as dd
from dask.multiprocessing import get

### Setting up  twython

In [3]:
# Setting up twython

APP_KEY = '9vXK2iKIqDjwkI70ImVyUySyN'
APP_SECRET = "3qdL7Jp5MMXcjy96CBh0SWoAqmQxUZysanfscAMYxuJrb4YGe8"

twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
ACCESS_TOKEN = twitter.obtain_access_token()
twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)

### Locations and Keywords used

In [4]:
# Locations used to query tweets

locationsTemp=["chennai","delhi","india","ahmedabad","gujarat","tamil","jammu","srinagar","imphal","lucknow",
                "bengaluru","bangalore","bathinda","chandigarh","ludhiana","amritsar","punjab","arunachal","assam",
                "kolkata","westbengal","kerala","karnataka","andhra","telangana","hyderabad","madhya","uttar",
                "maharashtra","haryana",'Jammu','Srinagar','Delhi-NewDelhi','Bathinda','Dehradun','Chandigarh',
                'Ludhiana','Amritsar','Imphal','Lucknow','Jalandhar','Kolkata','Guwahati','Chennai','Patiala',
                'Bhubaneswar','Bengaluru','Patna','Jaipur','Coimbatore','Hyderabad','Ranchi','Thiruvananthapuram',
                'Shimla','Sangrur','Ahmedabad','Karnal','Pulwama','Puducherry','Gurgaon','Agartala','Madurai',
                'Tiruchirappalli','Bangkok','Salem']

locations = set()
for location in locationsTemp:
    locations.add(location.lower())
locations = pd.DataFrame(list(locations))
locations.columns = ["Locations"]
locations.to_csv("Locations of Interest.csv")

In [5]:
# Keywords used to query tweets

keywords = ['protest','protests','riot','riots','violence','unrest','clash','bandh','issue','rally','demonstration','election','polls','attac']
keywords = pd.DataFrame(keywords)
keywords.columns = ["Keywords"]
keywords.to_csv("Keywords.csv")

In [6]:
locations = pd.read_csv("Locations of Interest.csv", index_col=0)
keywords = pd.read_csv("Keywords.csv", index_col=0)

In [7]:
# Converting the keywords and locations to format suitable for Twitter API

keywordsQuery = ''.join(map(str, [keyword+" OR " for keyword in list(keywords['Keywords'])]))[:-4]
start = 0
to_return = []
loc_list =locations["Locations"].values.tolist()
len_loc = len(loc_list) // 5 + 1
for i in range(5):
    to_return.append(loc_list[start:start+len_loc])
    start+=len_loc
    
locationQueries=[]
for row in to_return:
    locationQueries.append(''.join(map(str, [location+" OR " for location in row]))[:-4])

In [8]:
print("Locations Queries\n\n", locationQueries,"\n")
print("Keywords Query\n\n", keywordsQuery)

Locations Queries

 ['karnataka OR madhya OR tiruchirappalli OR karnal OR jaipur OR jalandhar OR bangalore OR chandigarh OR agartala OR ludhiana OR imphal', 'delhi-newdelhi OR salem OR patna OR andhra OR hyderabad OR gurgaon OR sangrur OR madurai OR haryana OR ranchi OR shimla', 'bangkok OR westbengal OR kolkata OR lucknow OR india OR bhubaneswar OR tamil OR coimbatore OR kerala OR telangana OR bengaluru', 'patiala OR thiruvananthapuram OR amritsar OR assam OR arunachal OR maharashtra OR ahmedabad OR srinagar OR delhi OR punjab OR puducherry', 'guwahati OR pulwama OR dehradun OR chennai OR gujarat OR jammu OR uttar OR bathinda'] 

Keywords Query

 protest OR protests OR riot OR riots OR violence OR unrest OR clash OR bandh OR issue OR rally OR demonstration OR election OR polls OR attac


### Fetching data from Twitter

In [9]:
# Querying twitter API

date = dt.datetime.now()
cnt = 0
tweetList = []
for locationQuery in locationQueries:
    tweets = twitter.cursor(twitter.search, q=keywordsQuery + ' (' +locationQuery + ') -filter:retweets', count=100, tweet_mode = "extended")
    for item in tweets:
        if (cnt%10000 == 0):
            print("Progress: "+str(cnt))
        tweetList.append(item)
        cnt+=1
        if cnt > 40000:
            print("Thread sleep")
            time.sleep(60*15+5)
            cnt = 0
tweetsDF = pd.DataFrame(tweetList)

Progress: 0


  


Progress: 10000
Progress: 20000
Progress: 30000
Progress: 40000
Thread sleep


In [10]:
# Lookup to get full text for each tweet (first fetch gets only truncated text) 

cnt = 1
rowInd = 1
tempList = []
tweetList = []
tweetsDfs = []
tweets = tweetsDF
for index, row in tweets.iterrows():
    tempList.append(row["id"])
    if cnt%100 == 0:
        try:
            tweetList.extend(twitter.lookup_status(id=tempList, tweet_mode = "extended"))
            tempList = []
        except:
            print("Succesful till: "+cnt)
    if (cnt%10000 == 0):
        print("Progress: "+str(cnt))
        if cnt >= 40000:
            tweetsDfs.append(pd.DataFrame(tweetList))
#             tweetsDf.to_csv("Tweets3Weeks"+str(date.day)+str(date.strftime("%m"))+"_"+str(rowInd)+".csv")
            tweetList = []
            rowInd+=1
#             print("Thread sleep")
#             time.sleep(60*15+5)
            cnt = 0
    cnt+=1
tweetList.extend(twitter.lookup_status(id=tempList, tweet_mode = "extended"))
tweetsDfs.append(pd.DataFrame(tweetList))

In [11]:
tweetsDF = pd.concat(tweetsDfs)

In [12]:
tweetsDF.shape
tweetsDF.index = np.arange(len(tweetsDF))
date = dt.datetime.now()
tweetsDF.to_csv("tweets_collected_"+str(date.day)+str(date.strftime("%m"))+".csv")

### Pre-processing tweets to get useful features

In [13]:
# Get user related features

def get_features(row):
    row["userlocation"] = row["user"]["location"]
    row["username"] = row["user"]["name"]
    row["userscreen_name"] = row["user"]["screen_name"]
    row["userdescription"] = row["user"]["description"]
    row["userfollowers_count"] = row["user"]["followers_count"]
    row["userfriends_count"] = row["user"]["friends_count"]
    row["userlisted_count"] = row["user"]["listed_count"]
    row["userfavourites_count"] = row["user"]["favourites_count"]
    row["userverified"] = row["user"]["verified"]
    row["userstatuses_count"] = row["user"]["statuses_count"]
    row["userfollowing"] = row["user"]["following"]
    row["userfollow_request_sent"] = row["user"]["follow_request_sent"]
    row["usercontributors_enabled"] = row["user"]["contributors_enabled"]
    sepr =  "" 
    l = []
    for location in locations['Locations']: 
        if (location in str(row["full_text"]).lower()):
            l.append(location)
    row["extracted_location"] = ','.join(l)
    return row

In [14]:
processed_data = tweetsDF.apply(get_features, axis=1)
processed_data.index = np.arange(len(processed_data))

In [15]:
# Map tweets to acled locations

acledLocations = ['Jammu','Srinagar','Delhi-New Delhi','Bathinda','Dehradun','Chandigarh','Ludhiana','Amritsar',
                  'Imphal','Lucknow','Jalandhar','Kolkata','Guwahati','Chennai','Patiala','Bhubaneswar','Bengaluru',
                  'Patna','Jaipur','Coimbatore','Hyderabad','Ranchi','Thiruvananthapuram','Shimla','Sangrur',
                  'Ahmedabad','Karnal','Pulwama','Puducherry','Gurgaon','Agartala','Madurai','Tiruchirappalli',
                  'Bangkok','Salem','India']

def generate_locations(row):
    dfrow_list = []
    for location in acledLocations: 
        tempLocation = location
        tempLocation2 = location
        if (tempLocation == "Delhi-New Delhi"):
            tempLocation = "delhi"
        elif (tempLocation == "Bengaluru"):
            tempLocation = "bangalore"
            tempLocation2 = "karnataka"
        elif (tempLocation == "Kolkata"):
            tempLocation = "west bengal"
        elif (tempLocation == "Guwahati"):
            tempLocation = "assam"
        elif (tempLocation in ("Lucknow", "Dehradun")):
            tempLocation = "uttar"
        elif (tempLocation in ("Sangrur", "Ludhiana", "Amritsar", "Chandigarh", "Bathinda", "Jalandhar", "Patiala")):
            tempLocation = "punjab"
        elif (tempLocation in ("Karnal", "Gurgaon", "Chandigarh")):
            tempLocation2 = "haryana"
        elif (tempLocation == "Pulwama"):
            tempLocation2 = "jammu"
        elif (tempLocation == "Ahmedabad"):
            tempLocation = "gujarat"
        elif (tempLocation in ("Chennai", "Coimbatore", "Madurai", "Tiruchirappalli", "Salem")):
            tempLocation = "tamil"
        elif (tempLocation in ("Imphal", "Agartala")):
            tempLocation = "arunachal"
        elif (tempLocation == "Hyderabad"):
            tempLocation = "andhra"
            tempLocation2 = "telangana"
        elif (tempLocation == "Thiruvananthapuram"):
            tempLocation = "kerala"
        if (tempLocation.lower() in row["extracted_location"] or location.lower() in row["extracted_location"] or tempLocation2 in row["extracted_location"]):
            row["finallocation"] = location
            dfrow_list.append(row.copy())
    return pd.DataFrame(dfrow_list)

In [17]:
posssed_df = pd.concat(processed_data.apply(generate_locations, axis = 1).tolist())
posssed_df.index = np.arange(len(posssed_df))

### Saving tweets

In [22]:
try:
    df = pd.read_json("../../../data/TweetsConsolidted.json")
    df = pd.concat([posssed_df, df])
    df.index = np.arange(len(df))
except:
    df = posssed_df
df.to_json("../../../data/TweetsConsolidated.json")