### Use the Twitter API via Tweepy, a Python wrapper, to search for all mentions of bands on XL Recordings (`searchQuery`)

In [9]:
from __future__ import print_function
import requests
from requests_oauthlib import OAuth1
import tweepy, cnfg, sys, os, json
import pandas as pd
import pprint


In [2]:
# Read the Twitter API credentials from a local config file so that the
# key and secret never appear in the notebook itself.
config = cnfg.load("./.twitter_config")
consumer_key, consumer_secret = (
    config['consumer_key'],
    config['consumer_secret'],
)

In [3]:
# Build the auth handler from the consumer key/secret loaded earlier.
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)

# Create the API client; the two rate-limit flags ask Tweepy to wait (and
# report that it is waiting) when the rate limit is reached.
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

# Stop the notebook if no usable client came back.
# NOTE(review): a tweepy.API instance is presumably always truthy, so this
# guard may never fire -- confirm against the installed Tweepy version.
if not api:
    print("Can't Authenticate")
    sys.exit(-1)

In [5]:
# (search term, artist label) pairs. Each search term is sent to the Twitter
# search API as-is; the artist label is stored next to every tweet the term
# returns. Order is significant: terms are queried in this order.
searchQuery = [
    ('@adele', 'Adele'),
    ('#adele', 'Adele'),
    ('@radiohead', 'Radiohead'),
    ('#radiohead', 'Radiohead'),
    ('@TheAvalanches', 'The Avalanches'),
    ('@IbeyiOfficial', 'Ibeyi'),
    ('@XLRECORDINGS', 'XL Recordings'),
    ('#xlrecordings', 'XL Recordings'),
    ('@gloriousunseen', 'Jack White'),
    ('@GILA_____', 'GILA'),
    ('@jai_paul', 'Jai Paul'),
    ('#jaipaul', 'Jai Paul'),
    ('Jai AND Paul', 'Jai Paul'),
    ('@KAYTRANADA', 'Kay Tranada'),
    ('#kaytranada', 'Kay Tranada'),
    ('#kingkrule', 'Kingkrule'),
    ('King AND Krule', 'Kingkrule'),
    ('@MusicLapsley', 'Lapsley'),
    ('#Lapsley', 'Lapsley'),
    ('Radiohead', 'Radiohead'),
    ('@ratatatmusic', 'Ratatat'),
    ('Ratatat', 'Ratatat'),
    ('@matsoR', 'ROSTAM'),
    ('ROSTAM', 'ROSTAM'),
    ('@sigurros', 'Sigur Ros'),
    ('Sigur Ros', 'Sigur Ros'),
    ('#sigurros', 'Sigur Ros'),
    ('@PaulWoolford', 'Paul Woolford'),
    ('@thomyorke', 'Thom Yorke'),
    ('#thomyorke', 'Thom Yorke'),
    ('@vampireweekend', 'Vampire Weekend'),
    ('Thom Yorke', 'Thom Yorke'),
    ('#vampireweekend', 'Vampire Weekend'),
    ('@arzE', 'Ezra Koenig'),
    ('Ezra AND Koenig', 'Ezra Koenig'),
    ('@ZombyMusic', 'Zomby Music'),
    ('@qotsa', 'Queens of the Stone Age'),
    ('#qotsa', 'Queens of the Stone Age'),
    ('Queens of the Stone Age', 'Queens of the Stone Age'),
]

#### I modified code from the blog post linked below so that I could loop through multiple search queries and write the results to a CSV file
#### Blog: https://www.karambelkar.info/2015/01/how-to-use-twitters-search-rest-api-most-effectively./

In [1]:
import csv

# Download tweets for every (search term, artist) pair in `searchQuery` and
# append one CSV row per tweet to `fName`.
#
# Fixes relative to the earlier version of this cell:
#   * `tweet._json` was applied twice, so the second access raised
#     AttributeError on the already-converted dict.
#   * The "no more tweets" check only ran on follow-up pages, so an empty
#     first page fell through to `new_tweets[-1]` and raised IndexError.
#   * `max_id` is now reset for every search term, so a term never starts
#     paging from the ID where the previous term stopped (e.g. after an error).

maxTweets = 200000  # overall cap; tweetCount is shared across all search terms
tweetsPerQry = 100  # this is the max the API permits
fName = 'xl.csv' # save tweets to csv document

# Go as far back as the API allows (no lower bound on tweet ID).
sinceId = None

# If only results below a specific ID are wanted, set max_id to that ID;
# otherwise leave it at -1 for no upper limit, which starts from the most
# recent tweet matching the search query.
max_id = -1

tweetCount = 0
print("Downloading max {0} tweets".format(maxTweets))

with open(fName, 'w') as f:
    writer = csv.writer(f, delimiter=',')
    # Loop through the search queries and write each term's tweets to the CSV.
    for search_term, artist in searchQuery:
        # Every term pages independently, newest tweets first.
        max_id = -1

        while tweetCount < maxTweets:
            # Assemble the request once instead of repeating the call for
            # each combination of since_id / max_id.
            search_kwargs = {'q': search_term, 'count': tweetsPerQry}
            if sinceId:
                search_kwargs['since_id'] = sinceId
            if max_id > 0:
                # Ask only for tweets older than the oldest one already saved.
                search_kwargs['max_id'] = str(max_id - 1)

            try:
                new_tweets = api.search(**search_kwargs)
            except tweepy.TweepError as e:
                # Give up on this term and move on to the next one.
                print("some error : " + str(e))
                break

            # An empty page (including an empty first page) means this term
            # is exhausted.
            if not new_tweets:
                print('No more tweets for this search term')
                break

            for status in new_tweets:
                tweet = status._json  # raw JSON dict behind the Status object
                user = tweet['user']
                writer.writerow([
                    tweet['id'],
                    tweet['created_at'],
                    user['location'],
                    tweet['text'],
                    user['screen_name'],
                    tweet['retweet_count'],
                    tweet['favorite_count'],
                    user['followers_count'],
                    user['friends_count'],
                    tweet['entities']['hashtags'],
                    user['description'],
                    tweet['in_reply_to_status_id'],
                    search_term,
                    artist,
                ])

            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            # The last status on the page becomes the upper bound for the
            # next request.
            max_id = new_tweets[-1].id

print ("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))


In [161]:
# The CSV written by the download cell has no header row, so the column
# names are supplied here, in the same order the rows were written.
tweet_columns = [
    'id',
    'created_at',
    'location',
    'text',
    'screen_name',
    'retweet_count',
    'favorite_count',
    'followers_count',
    'friends_count',
    'hashtags_text',
    'description',
    'tweet_reply_to',
    'search_term',
    'artist',
]
df = pd.read_csv('xl.csv', header=None, names=tweet_columns)

In [171]:
# A tweet can be returned by more than one search term; keep only the last
# row seen for each tweet id and renumber the index from zero.
df = (
    df
    .drop_duplicates(subset=['id'], keep='last')
    .reset_index(drop=True)
)

In [173]:
# Number of distinct values in the location column; dropna=False counts a
# missing location as one value, as unique() does.
df['location'].nunique(dropna=False)

46879

In [175]:
import pickle

In [176]:
# with open('xl.pkl', 'wb') as picklefile:
#     pickle.dump(df, picklefile)