## Part 2 of the Data Science challenge.
#### Create a tool that visualises different metrics for different search results on Twitter

In [1]:
import json
import os

import pandas as pd
import requests
import tweepy

In [2]:
def authorize_twitter_api(base_path="/home/adrian/test/Kahoot-challenge/keys/"):
    '''Read the OAuth keys and secrets for the Twitter app and return an authorized tweepy API client.

    Parameters
    ----------
    base_path : str, optional
        Directory containing ``consumer_key.txt``, ``consumer_secret.txt``,
        ``access_token.txt`` and ``access_token_secret.txt``. Only the first line
        of each file is used, with surrounding whitespace stripped. Defaults to
        the previously hard-coded key directory so existing calls keep working.

    Returns
    -------
    tweepy.API
        Client built with ``wait_on_rate_limit=True`` and
        ``wait_on_rate_limit_notify=True``.

    Raises
    ------
    OSError
        If any of the four key files cannot be opened.
    '''
    def _read_first_line(filename):
        # os.path.join works whether or not base_path ends with a separator.
        with open(os.path.join(base_path, filename)) as f:
            return f.readline().strip()

    consumer_key = _read_first_line("consumer_key.txt")
    consumer_secret = _read_first_line("consumer_secret.txt")
    access_token = _read_first_line("access_token.txt")
    access_token_secret = _read_first_line("access_token_secret.txt")

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    return tweepy.API(auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True)
    

In [14]:
def get_bearer_token(base_path="/home/adrian/test/Kahoot-challenge/keys/"):
    '''Return the bearer token stored on the first line of ``bearer_token.txt``.

    Parameters
    ----------
    base_path : str, optional
        Directory containing ``bearer_token.txt``. Defaults to the previously
        hard-coded key directory so existing calls keep working.

    Returns
    -------
    str
        First line of the file with surrounding whitespace stripped.

    Raises
    ------
    OSError
        If the token file cannot be opened.
    '''
    # os.path.join works whether or not base_path ends with a separator.
    with open(os.path.join(base_path, "bearer_token.txt")) as f:
        return f.readline().strip()

In [38]:
# Module-level configuration shared by the request helpers below:
# the bearer token read from disk and the two Twitter API v2 endpoints that are queried.
bearer_token = get_bearer_token()
search_count_url = "https://api.twitter.com/2/tweets/counts/recent"
search_url = "https://api.twitter.com/2/tweets/search/recent"

In [42]:
def bearer_oauth(r):
    """
    Auth callback for bearer token authentication.

    Sets the ``Authorization`` and ``User-Agent`` headers on the request ``r``
    (using the module-level ``bearer_token``) and returns the same object.
    """
    headers = r.headers
    authorization_value = "Bearer {}".format(bearer_token)
    headers["Authorization"] = authorization_value
    headers["User-Agent"] = "v2RecentTweetCountsPython,v2RecentSearchPython"
    return r

def bearer_search_oauth(r):
    """
    Auth callback for bearer token authentication.

    Same as ``bearer_oauth`` except for the ``User-Agent`` value; sets both
    headers on the request ``r`` and returns the same object.
    """
    header_values = (
        ("Authorization", "Bearer {}".format(bearer_token)),
        ("User-Agent", "v2RecentTweetCountsPython"),
    )
    for header_name, header_value in header_values:
        r.headers[header_name] = header_value
    return r


def connect_to_endpoint(url, params, timeout=30):
    """
    Send an authenticated GET request and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Endpoint to query.
    params : dict
        Query-string parameters passed to the request.
    timeout : float or tuple, optional
        Passed to ``requests`` as the request timeout (seconds). Without it a
        stalled connection would block the notebook kernel indefinitely.

    Returns
    -------
    The parsed JSON payload of the response.

    Raises
    ------
    Exception
        With ``(status_code, response_text)`` as arguments when the response
        status is not 200.
    """
    response = requests.request("GET", url, auth=bearer_oauth, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()


In [77]:
# Request per-minute tweet counts for the query term from the recent-counts endpoint.
# Optional params: start_time,end_time,since_id,until_id,next_token,granularity
query_count_params = {
    'query': 'kahoot',
    'granularity': 'minute',
}
query_count_object = connect_to_endpoint(url=search_count_url, params=query_count_params)

Comment: I spent far too much time converting the timestamps into date objects that can be filtered; the localize "property" of the datetime object caused quite a few problems. I found that there is a two-hour difference between the timestamps in the data from Twitter and my current local time, so the 5-, 10- and 15-minute time windows have to account for that 2-hour offset when filtering.

In [78]:
# Make a DataFrame out of the query count numbers and convert the "start"/"end"
# strings to timezone-aware (UTC) timestamps.
# The columns are replaced via assign() rather than `df.loc[:, col] = ...`:
# the .loc form sets values in place, which on pandas >= 2.0 keeps the existing
# object dtype instead of producing datetime columns.
df_query_count = pd.DataFrame(
    query_count_object["data"], columns=["start", "end", "tweet_count"]
).assign(
    start=lambda frame: pd.to_datetime(frame["start"], utc=True),
    end=lambda frame: pd.to_datetime(frame["end"], utc=True),
)
 

In [79]:
# Get the search counts for the query over the last 5, 10 and 15 minutes, plus the total hits.
# "Now" is taken directly in UTC so it is comparable with the UTC timestamps in
# df_query_count; the previous "local time minus 120 minutes" offset was only
# correct on a machine whose clock is exactly two hours ahead of UTC.
query_count = {}
now_utc = pd.Timestamp.now(tz="UTC")
for minutes_ago in range(5, 16, 5):
    window_start = now_utc - pd.Timedelta(minutes=minutes_ago)
    in_window = df_query_count["start"] >= window_start
    query_count[str(minutes_ago) + " minutes ago"] = df_query_count.loc[in_window, "tweet_count"].sum()
query_count["total"] = query_count_object["meta"]["total_tweet_count"]

In [80]:
# Display the per-window counts and the total hit count.
query_count

{'5 minutes ago': 1, '10 minutes ago': 4, '15 minutes ago': 7, 'total': 6264}

In [307]:
# Twitter requires start_time as an RFC 3339 UTC timestamp (YYYY-MM-DDTHH:MM:SSZ).
# Take "now" directly in UTC instead of subtracting a hard-coded 120-minute offset
# from local time, which is only correct on a machine two hours ahead of UTC.
start_time = (pd.Timestamp.now(tz="UTC") - pd.Timedelta(minutes=15)).strftime("%Y-%m-%dT%H:%M:%SZ")

# Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
search_term = "great"
query_search_params = {'query': search_term,'tweet.fields': 'author_id,created_at','max_results':'100', 'start_time':start_time}


In [308]:
# Due to limitations in results for each request from the API, pagination needs
# to be implemented; next_token is used for this.
# Only the first 1000 results (10 pages of 100) are fetched, as this should be
# more than enough to illustrate the proof of concept.
tweets = []
# Work on a copy so that re-running this cell starts from the first page instead
# of resuming from a next_token left in query_search_params by a previous run.
page_params = dict(query_search_params)
for _ in range(10):
    query_search_object = connect_to_endpoint(search_url, page_params)
    # A page without matches may carry no "data" key; treat it as empty.
    tweets.extend(query_search_object.get("data", []))
    next_token = query_search_object.get("meta", {}).get("next_token")
    if next_token is None:
        # no more content, exit loop
        break
    page_params["next_token"] = next_token
    

In [309]:
# Number of tweets collected by the pagination loop.
len(tweets)

1000

In [310]:
# Preview only the first few tweets; displaying the whole list floods the
# notebook output and stores every tweet text and author id in the saved file.
tweets[:5]

[{'id': '1427757771857047561',
  'author_id': '941888552740315137',
  'text': 'Great workout but lol never taking 4 days off again.',
  'created_at': '2021-08-17T22:22:33.000Z'},
 {'id': '1427757771794096140',
  'author_id': '1222157494656688129',
  'text': 'RT @ScottHech: Okay great *but only 10!!!!!????? Should be thousands.',
  'created_at': '2021-08-17T22:22:33.000Z'},
 {'id': '1427757771764736001',
  'author_id': '1366096823132295168',
  'text': '@SkzHyunjinStay8 HAPPY BIRTHDAY SKY🥳❤️Hope you have a great day 🕺🏾🥳!',
  'created_at': '2021-08-17T22:22:33.000Z'},
 {'id': '1427757771739602950',
  'author_id': '2424097747',
  'text': '@Englshgrl61 Our country is doing so well!  Great management ….',
  'created_at': '2021-08-17T22:22:33.000Z'},
 {'id': '1427757771546632203',
  'author_id': '1414667046080684040',
  'text': '@ChampionsLeague Attract the defense to the left with Solomon, Marcos Antonio and Matvienko, and finishing right with Pedrinho and Tete... I could see some great mome

In [311]:
# Load the collected tweets into a DataFrame and parse "created_at" as UTC timestamps.
df_tweets = pd.DataFrame(tweets).assign(
    created_at=lambda frame: pd.to_datetime(frame["created_at"], utc=True)
)

In [312]:
from sklearn.feature_extraction.text import CountVectorizer

In [313]:
def get_most_frequent_words(query_word, corpus,n_words):
    '''
    Returns the most frequently used words from the input corpus.

    Removes punctuation and english stop words.
    Removing english stop words can be a weakness in some analytical tasks, and should be considered further.

    Parameters
    ----------
    query_word : str
        Search term to exclude from the result. Compared in lower case because
        the vectorizer's vocabulary is lower-cased.
    corpus : sequence of str
        Documents (tweet texts) to count words in.
    n_words : int
        Maximum number of (word, count) pairs to return.

    Returns
    -------
    list of (str, count) tuples
        Sorted by descending count; empty if ``corpus`` is empty.
    '''
    # An empty corpus would make the vectorizer fail while fitting; there is nothing to count.
    if len(corpus) == 0:
        return []

    vectorizer = CountVectorizer(stop_words='english')
    # Fit on the corpus that was passed in. Previously the global df_tweets was
    # used here, so every time window produced the same result.
    X = vectorizer.fit_transform(corpus)

    excluded_word = query_word.lower()
    sum_words = X.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items() if word != excluded_word]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n_words]

In [314]:
# Most frequent terms among tweets created within the last 5, 10 and 15 minutes.
# "Now" is taken directly in UTC so it is comparable with the UTC "created_at"
# column; the previous "local time minus 120 minutes" offset was only correct on
# a machine whose clock is exactly two hours ahead of UTC.
frequent_terms = {}
now_utc = pd.Timestamp.now(tz="UTC")
for minutes_ago in range(5, 16, 5):
    window_start = now_utc - pd.Timedelta(minutes=minutes_ago)
    recent_texts = df_tweets.loc[df_tweets["created_at"] >= window_start, "text"].values
    frequent_terms[str(minutes_ago) + " minutes ago"] = get_most_frequent_words(search_term, recent_texts, 10)

In [315]:
# Display the top terms for each time window.
frequent_terms

{'5 minutes ago': [('rt', 402),
  ('https', 275),
  ('day', 64),
  ('just', 63),
  ('like', 63),
  ('amp', 61),
  ('good', 51),
  ('people', 49),
  ('time', 43),
  ('hope', 39)],
 '10 minutes ago': [('rt', 402),
  ('https', 275),
  ('day', 64),
  ('just', 63),
  ('like', 63),
  ('amp', 61),
  ('good', 51),
  ('people', 49),
  ('time', 43),
  ('hope', 39)],
 '15 minutes ago': [('rt', 402),
  ('https', 275),
  ('day', 64),
  ('just', 63),
  ('like', 63),
  ('amp', 61),
  ('good', 51),
  ('people', 49),
  ('time', 43),
  ('hope', 39)]}

In [316]:
from textblob import TextBlob
from collections import Counter

# For each look-back window (5, 10, 15 minutes) collect the ten most active
# authors and the TextBlob sentiment of all tweet texts in the window joined together.
# "Now" is taken directly in UTC so it is comparable with the UTC "created_at"
# column; the previous "local time minus 120 minutes" offset was only correct on
# a machine whose clock is exactly two hours ahead of UTC.
top_tweepers = {}
sentiment = {}
now_utc = pd.Timestamp.now(tz="UTC")
for minutes_ago in range(5, 16, 5):
    window_start = now_utc - pd.Timedelta(minutes=minutes_ago)
    window_tweets = df_tweets[df_tweets["created_at"] >= window_start]
    window_label = str(minutes_ago) + " minutes ago"
    top_tweepers[window_label] = Counter(window_tweets.author_id).most_common(10)
    sentiment[window_label] = TextBlob(' '.join(window_tweets.text)).sentiment

In [317]:
# Display the most active author ids for each time window.
top_tweepers

{'5 minutes ago': [('1389055415963451398', 5),
  ('1373386713674829825', 4),
  ('822060714080636931', 4),
  ('817821464598609922', 3),
  ('1367494165035159561', 2),
  ('1219279638062608385', 2),
  ('1419812209404768262', 2),
  ('1141347536323506176', 2),
  ('1425894223610527747', 2),
  ('1404956102203084800', 2)],
 '10 minutes ago': [('1389055415963451398', 5),
  ('1373386713674829825', 4),
  ('822060714080636931', 4),
  ('817821464598609922', 3),
  ('1367494165035159561', 2),
  ('1219279638062608385', 2),
  ('1419812209404768262', 2),
  ('1141347536323506176', 2),
  ('1425894223610527747', 2),
  ('1404956102203084800', 2)],
 '15 minutes ago': [('1389055415963451398', 5),
  ('1373386713674829825', 4),
  ('822060714080636931', 4),
  ('817821464598609922', 3),
  ('1367494165035159561', 2),
  ('1219279638062608385', 2),
  ('1419812209404768262', 2),
  ('1141347536323506176', 2),
  ('1425894223610527747', 2),
  ('1404956102203084800', 2)]}

In [318]:
# Display the TextBlob sentiment for each time window.
sentiment

{'5 minutes ago': Sentiment(polarity=0.43207066238313274, subjectivity=0.6038670644765931),
 '10 minutes ago': Sentiment(polarity=0.43207066238313274, subjectivity=0.6038670644765931),
 '15 minutes ago': Sentiment(polarity=0.43207066238313274, subjectivity=0.6038670644765931)}

In [258]:
# Wrap the text of the first collected tweet in a TextBlob for single-tweet inspection.
text = TextBlob(df_tweets["text"].iloc[0])

In [261]:
# NOTE(review): `tokenizer` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh run. Define the intended tokenizer in an earlier
# cell or remove this cell — TODO confirm which tokenizer was meant.
tokenized_text = tokenizer.tokenize(df_tweets.iloc[0].text.lower())

NameError: name 'tokenizer' is not defined

In [259]:
# Display the sentiment of the single-tweet TextBlob created above.
text.sentiment

Sentiment(polarity=0.35, subjectivity=0.65)

In [3]:
# Build the authorized tweepy API client used by the streaming and search cells below.
api = authorize_twitter_api()

In [4]:
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that prints the text of every status it receives."""

    def on_status(self, status):
        """Print the text of the incoming status."""
        print(status.text)

    def on_error(self, status_code):
        """Return False for status code 420; return None for any other code."""
        # returning False in on_error disconnects the stream
        return False if status_code == 420 else None

In [18]:
# Create the listener and a stream bound to it, reusing the auth handler of the API client.
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth = api.auth, listener=myStreamListener)

In [21]:
# Collect the raw JSON of up to 10 pages of recent 'hello' search results.
tweets = []
for page in tweepy.Cursor(
    api.search,
    q='hello',
    count=100,
    result_type="recent",
    tweet_mode='extended',
).pages(10):
    for status in page:
        tweets.append(status._json)

Rate limit reached. Sleeping for: 256


KeyboardInterrupt: 

In [22]:
import requests
import os
import json