In [2]:
# !pip install pandarallel
# !pip install contractions
# !pip install --upgrade pip
# !pip install --upgrade azure-cognitiveservices-language-textanalytics

In [25]:
import json
import math
import nltk
import keys
import requests
import numpy as np
import contractions
import pandas as pd
    
from datetime import datetime
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from pandarallel import pandarallel
from msrest.authentication import CognitiveServicesCredentials
from azure.cognitiveservices.language.textanalytics import TextAnalyticsClient

#from keys import subscription_key, text_analytics_base_url, bing_spell_check_key, spell_check_url 

In [4]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
raw_tweets = pd.read_csv(
    "sample_tweets.csv",
    index_col=None
)

In [6]:
raw_tweets["id"] = raw_tweets.index
raw_tweets.rename(columns={"Body text": "text"}, inplace=True)
raw_tweets.head(2)

Unnamed: 0,Down ID,Posted time,text,Point latitiude,Point longitude,Bio location,Image url,id
0,"""tag:search.twitter.com,2005:1109081183105835008""",22/03/2019 13:15,"""Primeape was recently spotted trying to make ...",,,,"""https://pbs.twimg.com/profile_images/76017773...",0
1,"""tag:search.twitter.com,2005:1109080961277407234""",22/03/2019 13:14,"""@gianninewbon Meet me at London Bridge at hal...",,,"""Derbados""","""https://pbs.twimg.com/profile_images/10569615...",1


In [7]:
raw_tweets.count()

Down ID            44816
Posted time        44816
text               44816
Point latitiude      701
Point longitude      701
Bio location       31946
Image url          42978
id                 44816
dtype: int64

In [8]:
# ~51% duplicates
raw_tweets.drop_duplicates(subset=['text']).count()

Down ID            22022
Posted time        22022
text               22022
Point latitiude      678
Point longitude      678
Bio location       15688
Image url          20558
id                 22022
dtype: int64

In [10]:
tweets = raw_tweets.groupby(["text"]).id.count().reset_index()
tweets.columns = ["text", "count"]
tweets.sort_values(by="count", inplace=True, ascending=False)
tweets.head()

Unnamed: 0,text,count
11637,"""LOL I JUST HEARD THE TUBE COORDINATOR (or wha...",2470
15277,"""RT @EtniesJags: “London Bridge is falling dow...",1955
9621,"""For anyone believing that corruption is mainl...",654
20342,"""You mean like the 7/7 tube bombers did, the M...",608
15915,"""RT @kurtnysgvia: Tangina yung grades ko paran...",593


In [11]:
tweets["text"].replace(r'^\s*$', np.nan, regex=True).isna().sum()

0

In [12]:
def spell_check(sentence):
    data = {'text': sentence}
    params = {
        'mkt':'en-us',
        'mode':'proof'
        }
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Ocp-Apim-Subscription-Key': bing_spell_check_key,
        }
    response = requests.post(spell_check_url, headers=headers, params=params, data=data)
    json_response = response.json()

    for token in json_response["flaggedTokens"]:
        sentence = sentence.replace(str(token["token"]), token["suggestions"][0]["suggestion"])
    
    return sentence

In [13]:
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
def process_tweet(tweet):
    normalized_tweet = tweet.lower().strip()
    try:
        normalized_tweet = spell_check(str(normalized_tweet))        
        normalized_tweet = contractions.fix(normalized_tweet)
        tokenized = tokenizer.tokenize(normalized_tweet)
    except:
        tokenized = normalized_tweet.split(" ")
    isascii = lambda s: len(s) == len(s.encode())
    ascii_words = [word for word in tokenized[1:-1] if isascii(word)]
    normalized_tweet = " ".join(ascii_words)
    return normalized_tweet


In [14]:
pandarallel.initialize(nb_workers=32)
print("Process started at {}".format(datetime.now()))
tweets["processed_text"] = tweets["text"].parallel_apply(process_tweet)
print("Process ended at {}".format(datetime.now()))

New pandarallel memory created - Size: 2000 MB
Pandarallel will run on 32 workers
Process started at 2019-06-25 19:16:30.423452
Process ended at 2019-06-25 19:16:30.876383


### Key Phrases

In [16]:
tweets.reset_index(inplace=True)
tweets.head()

Unnamed: 0,index,text,count,processed_text
0,11637,"""LOL I JUST HEARD THE TUBE COORDINATOR (or wha...",2470,i just heard the tube coordinator (or whatever...
1,15277,"""RT @EtniesJags: “London Bridge is falling dow...",1955,"@etniesjags: bridge is falling down, my dear"
2,9621,"""For anyone believing that corruption is mainl...",654,anyone believing that corruption is mainly thi...
3,20342,"""You mean like the 7/7 tube bombers did, the M...",608,"mean like the 7/7 tube bombers did, the marche..."
4,15915,"""RT @kurtnysgvia: Tangina yung grades ko paran...",593,@kurtnysgvia: tangina yung grades ko parang lo...


In [21]:
tweet_list = tweets[["index", "processed_text"]].rename(
        columns={"index":"id", "processed_text": "text"}
    ).to_json(orient = "records")

json.loads(tweet_list)[0:5]

[{'id': 11637,
  'text': 'i just heard the tube coordinator (or whatever they are called) at london bridge just say into his all my guys are bolaz, please stand clear of the'},
 {'id': 15277, 'text': '@etniesjags: bridge is falling down, my dear'},
 {'id': 9621,
  'text': 'anyone believing that corruption is mainly third world this article detailing how of public money was spent on a non existent london bridge is a must read. at the centre of the scandal: @borisjohnson, the man planning to be next uk pm'},
 {'id': 20342,
  'text': 'mean like the 7/7 tube bombers did, the marchester arena bomber did and the london bridge and westminster killers did ... demonising anyone who didnt follow islam?? you are a hypocrite of mammoth propertions https://t.co/nttuukmi7d\n""when the flames of hatred are fanned, when people are demonised because of their faith, when we play on people\'s fears rather than addressing them, the consequences are deadly, as we have seen so sadly today.'},
 {'id': 15915,

In [26]:
headers = {"Ocp-Apim-Subscription-Key": subscription_key}
keyphrase_url = text_analytics_base_url + "keyPhrases"

key_phrases_df = pd.DataFrame()
tweets_chunks = np.array_split(json.loads(tweet_list), 
                               math.ceil(len(tweets)/1000))

for chunk in tweets_chunks:
    chunk_json = json.loads(json.dumps({"documents" : list(chunk)})) 
    response  = requests.post(keyphrase_url, 
                              headers=headers, 
                              json=chunk_json)
    key_phrases = response.json()
    key_phrases_df = key_phrases_df.append(
        pd.DataFrame.from_dict(
            key_phrases['documents']
        )
    )

In [27]:
key_phrases_df.head()

Unnamed: 0,id,keyPhrases
0,11637,"[bolaz, guys, tube coordinator, london bridge]"
1,15277,"[bridge, etniesjags]"
2,9621,"[man, article, scandal, borisjohnson, world, p..."
3,20342,"[london bridge, marchester arena bomber, westm..."
4,15915,[tangina yung grades ko parang london bridge n...


In [28]:
len(key_phrases_df)

21976

In [29]:
def drop_long_key_phrases(keyPhrases):
    for phrase in keyPhrases:
        if len(phrase.split(" ")) >  5:
            keyPhrases.remove(phrase)
    return keyPhrases
key_phrases_df["shortKeyPhrases"] = key_phrases_df["keyPhrases"].apply(drop_long_key_phrases)

In [30]:
key_phrases_df.head()

Unnamed: 0,id,keyPhrases,shortKeyPhrases
0,11637,"[bolaz, guys, tube coordinator, london bridge]","[bolaz, guys, tube coordinator, london bridge]"
1,15277,"[bridge, etniesjags]","[bridge, etniesjags]"
2,9621,"[man, article, scandal, borisjohnson, world, p...","[man, article, scandal, borisjohnson, world, p..."
3,20342,"[london bridge, marchester arena bomber, westm...","[london bridge, marchester arena bomber, westm..."
4,15915,[kurtnysgvia],[kurtnysgvia]


In [31]:
tweets.head()

Unnamed: 0,index,text,count,processed_text
0,11637,"""LOL I JUST HEARD THE TUBE COORDINATOR (or wha...",2470,i just heard the tube coordinator (or whatever...
1,15277,"""RT @EtniesJags: “London Bridge is falling dow...",1955,"@etniesjags: bridge is falling down, my dear"
2,9621,"""For anyone believing that corruption is mainl...",654,anyone believing that corruption is mainly thi...
3,20342,"""You mean like the 7/7 tube bombers did, the M...",608,"mean like the 7/7 tube bombers did, the marche..."
4,15915,"""RT @kurtnysgvia: Tangina yung grades ko paran...",593,@kurtnysgvia: tangina yung grades ko parang lo...


In [34]:
key_phrases_df.id = key_phrases_df.id.astype(int)
# key_phrases_df.keyPhrases = key_phrases_df.keyPhrases.astype(str)
# key_phrases_df.shortKeyPhrases = key_phrases_df.shortKeyPhrases.astype(str)

metadata_keyPhrases = pd.merge(
    key_phrases_df[["id", "keyPhrases", "shortKeyPhrases"]], 
    tweets[["index", "text", "processed_text", "count"]], 
    left_on="id", 
    right_on="index",
    how="inner")
metadata_keyPhrases.sort_values(by=["count"], ascending=False, inplace=True)
metadata_keyPhrases.to_csv("tweet_keyPhrases.csv")
metadata_keyPhrases[["keyPhrases", "text", "count"]].head(10)

Unnamed: 0,keyPhrases,text,count
0,"[bolaz, guys, tube coordinator, london bridge]","""LOL I JUST HEARD THE TUBE COORDINATOR (or wha...",2470
1,"[bridge, etniesjags]","""RT @EtniesJags: “London Bridge is falling dow...",1955
2,"[man, article, scandal, borisjohnson, world, p...","""For anyone believing that corruption is mainl...",654
3,"[london bridge, marchester arena bomber, westm...","""You mean like the 7/7 tube bombers did, the M...",608
4,[kurtnysgvia],"""RT @kurtnysgvia: Tangina yung grades ko paran...",593
5,"[non-muslims, paris shootings, london bridge a...","""Why are non-muslims, celebrities and influenc...",590
6,"[pm outside newscorps hq, london bridge, media...","""The time for tolerating the media whipping up...",565
7,"[disputed land hai, rahe hain, saare pakistani...","""Haan hum to saare pakistani hain jo yahan tax...",461
8,"[london bridge, pilots, tyler joseph's street ...","""From the Rock Sound Archive: Twenty One Pilot...",430
9,"[dass, stockholm, manchester, arabischen staat...","""Nach den Opfern in Nizza, St. Petersburg, Lon...",375


In [35]:
(
  key_phrases_df["keyPhrases"].apply(lambda x: len(x)).mean(),
  key_phrases_df["keyPhrases"].apply(lambda x: len(x)).max(),
  key_phrases_df["keyPhrases"].apply(lambda x: len(x)).min()
)

(5.261512559155443, 80, 0)

In [36]:
(
  key_phrases_df["shortKeyPhrases"].apply(lambda x: len(x)).mean(),
  key_phrases_df["shortKeyPhrases"].apply(lambda x: len(x)).max(),
  key_phrases_df["shortKeyPhrases"].apply(lambda x: len(x)).min()
)

(5.261512559155443, 80, 0)

In [37]:
not_covered_list = []
def no_key_phrases(row):
    if len(row.keyPhrases) == 0:
        not_covered_list.append(row.id)
        return 1
not_covered =  key_phrases_df.apply(no_key_phrases, axis=1).sum()

print("Number of posts not covered by Key Phrases: {} or {} %".format(
    not_covered, 
    (not_covered/len(key_phrases_df)*100)
    )
)

Number of posts not covered by Key Phrases: 203.0 or 0.9237349836184929 %


In [40]:
tweets[tweets.index.isin(not_covered_list)].processed_text.values

array(['@etniesjags: bridge is falling down, my dear',
       'anyone believing that corruption is mainly third world this article detailing how of public money was spent on a non existent london bridge is a must read. at the centre of the scandal: @borisjohnson, the man planning to be next uk pm',
       '....in no particular order e&oe 9/11, 7/11, bataclan, pulse nightclub, san bernardino, manchester, westminster, london bridge, bali, madrid, boston marathon, 2 backpackers decapitated in morocco, tunisia, rotherham et al, charlene downes (cut up into kebabs)',
       'part of @illuminatedriv, light artist leo design for london bridge will respond to the continuous stream of movement, colour and cultural activity in the area. find out more about the rich history #icebridgemonth #icebridges',
       'to have @pearlassistance with us on london bridge. sal and team have been a true and loyal friend to uphd members since the beginning! https://t.co/j8qwma36us\n""live from london bridge: p

In [43]:
key_phrases_df[key_phrases_df["shortKeyPhrases"].map(len)>10].count()

id                 769
keyPhrases         769
shortKeyPhrases    769
dtype: int64

In [44]:
not_covered_list = []
def no_key_phrases(row):
    if len(row.shortKeyPhrases) == 0:
        not_covered_list.append(row.id)
        return 1
not_covered =  key_phrases_df.apply(no_key_phrases, axis=1).sum()

print("Number of posts not covered by Shortened Key Phrases: {} or {} %".format(
    not_covered, 
    (not_covered/len(key_phrases_df)*100)
    )
)

Number of posts not covered by Shortened Key Phrases: 203.0 or 0.9237349836184929 %


In [46]:
tweets[tweets.index.isin(not_covered_list)].text.values

array(['"RT @EtniesJags: “London Bridge is falling down, my dear lady” 🔥🔥🔥🔥😂 https://t.co/sxUvrDtlrS"',
       '"For anyone believing that corruption is mainly ‘a third world problem’, this article detailing how £40m of public money was spent on a non existent London Bridge is a must read. At the centre of the scandal: @BorisJohnson, the man planning to be next UK PM https://t.co/vvfjPZzxyN\nhttps://www.architectsjournal.co.uk/opinion/heads-should-roll-over-garden-bridge-debacle-but-the-odds-are-they-wont/10040010.article"',
       '"@VictorEriceira ....in no particular order e&oe 9/11, 7/11, Bataclan, Pulse Nightclub, San Bernardino, Manchester, Westminster, London Bridge, Bali, Madrid, Boston Marathon, 2 backpackers decapitated in Morocco, Tunisia, Rotherham et al, Charlene Downes (cut up into kebabs) Sweden"',
       '"As part of @illuminatedriv, light artist Leo Villareal’s design for London Bridge will respond to the continuous stream of movement, colour and cultural activity in t

In [47]:
unique_key_phrases = set(sum(key_phrases_df["shortKeyPhrases"], []))

key_phrases_category = {}
for key in unique_key_phrases:
    key_phrases_category[key] = []

def map_categories(row):
    for phrase in row["shortKeyPhrases"]:
        key_phrases_category[phrase].append(row["id"])
    return None

key_phrases_df.apply(map_categories, axis=1)

key_phrases_category_count={}
for key, value in key_phrases_category.items():
    key_phrases_category_count[key] = len(value)

from collections import OrderedDict
key_phrases_category_count = OrderedDict(
                               sorted(key_phrases_category_count.items(), 
                                      key=lambda kv: kv[1], 
                                      reverse=True)
                            )

print(len(key_phrases_category_count))
# print(key_phrases_category_count)

38929


In [50]:
key_phrases_category_count

OrderedDict([('london bridge', 8290),
             ('tower bridge', 2129),
             ('bridge', 1413),
             ('london', 1388),
             ('train', 547),
             ('waterloo bridge', 516),
             ('time', 470),
             ('people', 452),
             ('london bridge station', 365),
             ('day', 363),
             ('manchester', 359),
             ('minutes', 325),
             ('london bridge attack', 286),
             ('trains', 277),
             ('morning', 265),
             ('uk', 263),
             ('week', 250),
             ('way', 247),
             ('west sacramento', 244),
             ('days', 222),
             ('tower', 220),
             ('westminster', 215),
             ('towerbridge', 212),
             ('eb', 206),
             ('muslims', 205),
             ('years', 200),
             ('fergie', 191),
             ('mins', 178),
             ('manchester arena', 171),
             ('thames', 163),
             ('kids', 161),
      

In [71]:
top_25_key_phrases = list(key_phrases_category_count.keys())[0:25]
metadata_keyPhrases['covered_top_25'] = metadata_keyPhrases["keyPhrases"].apply(
    lambda x: any(item in x for item in top_25_key_phrases)
)
metadata_keyPhrases['covered_top_25'].value_counts()/len(metadata_keyPhrases)*100

True     69.994539
False    30.005461
Name: covered_top_25, dtype: float64

In [88]:
top_200_key_phrases = list(key_phrases_category_count.keys())[0:200]
metadata_keyPhrases['covered_top_200'] = metadata_keyPhrases["keyPhrases"].apply(
    lambda x: any(item in x for item in top_200_key_phrases)
)
metadata_keyPhrases['covered_top_200'].value_counts()/len(metadata_keyPhrases)*100

True     80.32399
False    19.67601
Name: covered_top_200, dtype: float64

In [89]:
top_2000_key_phrases = list(key_phrases_category_count.keys())[0:2000]
metadata_keyPhrases['covered_top_2000'] = metadata_keyPhrases["keyPhrases"].apply(
    lambda x: any(item in x for item in top_2000_key_phrases)
)
metadata_keyPhrases['covered_top_2000'].value_counts()/len(metadata_keyPhrases)*100

True     90.262104
False     9.737896
Name: covered_top_2000, dtype: float64

In [96]:
list(key_phrases_category_count.keys())[0:2000]

['london bridge',
 'tower bridge',
 'bridge',
 'london',
 'train',
 'waterloo bridge',
 'time',
 'people',
 'london bridge station',
 'day',
 'manchester',
 'minutes',
 'london bridge attack',
 'trains',
 'morning',
 'uk',
 'week',
 'way',
 'west sacramento',
 'days',
 'tower',
 'westminster',
 'towerbridge',
 'eb',
 'muslims',
 'years',
 'fergie',
 'mins',
 'manchester arena',
 'thames',
 'kids',
 "i'm",
 'night',
 'tower bridge gtwy',
 'place',
 'world',
 'london bridge terrorist attack',
 'battersea',
 'lambeth bridge',
 'guys',
 'man',
 'platform',
 'year',
 'islamophobic',
 'londonbridge',
 'song',
 'interior department spokeswoman',
 'railway',
 'millenium bridge',
 'podcast episode',
 'anti-immigrant rhetoric',
 'oxford circus',
 'se',
 'england',
 'piccadilly circus',
 'thanks',
 'cathedral',
 'delays',
 'paddington station',
 'madame tussauds',
 'borough market',
 'service',
 'river',
 'city',
 'seconds',
 'pictures',
 'vs',
 'paris',
 'work',
 'protest',
 'london bridge attac

In [98]:
2000/len(tweets)*100

9.081827263645446

__________________________________________________________________________________________________________________