In [2]:
# !pip install pandarallel
# !pip install contractions
# !pip install --upgrade pip
# !pip install --upgrade azure-cognitiveservices-language-textanalytics

In [1]:
import json
import math
import nltk
import keys
import requests
import numpy as np
import contractions
import pandas as pd
    
from datetime import datetime
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from pandarallel import pandarallel
from msrest.authentication import CognitiveServicesCredentials
from azure.cognitiveservices.language.textanalytics import TextAnalyticsClient

#from keys import subscription_key, text_analytics_base_url, bing_spell_check_key, spell_check_url 

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
raw_tweets = pd.read_csv(
    "sample_tweets.csv",
    index_col=None
)

In [4]:
raw_tweets["id"] = raw_tweets.index
raw_tweets.rename(columns={"Body text": "text"}, inplace=True)
raw_tweets.head(2)

Unnamed: 0,Down ID,Posted time,text,Point latitiude,Point longitude,Bio location,Image url,id
0,"""tag:search.twitter.com,2005:1109081183105835008""",22/03/2019 13:15,"""Primeape was recently spotted trying to make ...",,,,"""https://pbs.twimg.com/profile_images/76017773...",0
1,"""tag:search.twitter.com,2005:1109080961277407234""",22/03/2019 13:14,"""@gianninewbon Meet me at London Bridge at hal...",,,"""Derbados""","""https://pbs.twimg.com/profile_images/10569615...",1


In [5]:
raw_tweets.count()

Down ID            44816
Posted time        44816
text               44816
Point latitiude      701
Point longitude      701
Bio location       31946
Image url          42978
id                 44816
dtype: int64

In [6]:
# ~51% duplicates
raw_tweets.drop_duplicates(subset=['text']).count()

Down ID            22022
Posted time        22022
text               22022
Point latitiude      678
Point longitude      678
Bio location       15688
Image url          20558
id                 22022
dtype: int64

In [7]:
tweets = raw_tweets.groupby(["text"]).id.count().reset_index()
tweets.columns = ["text", "count"]
tweets.sort_values(by="count", inplace=True, ascending=False)
tweets.head()

Unnamed: 0,text,count
11637,"""LOL I JUST HEARD THE TUBE COORDINATOR (or wha...",2470
15277,"""RT @EtniesJags: “London Bridge is falling dow...",1955
9621,"""For anyone believing that corruption is mainl...",654
20342,"""You mean like the 7/7 tube bombers did, the M...",608
15915,"""RT @kurtnysgvia: Tangina yung grades ko paran...",593


In [8]:
tweets["text"].replace(r'^\s*$', np.nan, regex=True).isna().sum()

0

In [9]:
def spell_check(sentence):
    data = {'text': sentence}
    params = {
        'mkt':'en-us',
        'mode':'proof'
        }
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Ocp-Apim-Subscription-Key': bing_spell_check_key,
        }
    response = requests.post(spell_check_url, headers=headers, params=params, data=data)
    json_response = response.json()

    for token in json_response["flaggedTokens"]:
        sentence = sentence.replace(str(token["token"]), token["suggestions"][0]["suggestion"])
    
    return sentence

In [10]:
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
def process_tweet(tweet):
    normalized_tweet = tweet.lower().strip()
    normalized_tweet = normalized_tweet.replace("“","")
    try:
        normalized_tweet = spell_check(str(normalized_tweet))        
        normalized_tweet = contractions.fix(normalized_tweet)
        tokenized = tokenizer.tokenize(normalized_tweet)
    except:
        tokenized = normalized_tweet.split(" ")
        
    isascii = lambda s: len(s) == len(s.encode())  
    ascii_words = []
    for word in tokenized:
        if isascii(word):
            if ("http" not in word):
                ascii_words.append(word)
    
    normalized_tweet = " ".join(ascii_words)
    normalized_tweet = normalized_tweet.lower().strip()
    normalized_tweet = normalized_tweet.replace("-","")
    normalized_tweet = normalized_tweet.replace("'","")
    normalized_tweet = normalized_tweet.replace("  ","")
    normalized_tweet = normalized_tweet.replace('"', '')
    normalized_tweet = normalized_tweet.strip()
    
    return normalized_tweet


In [11]:
process_tweet("London Bridge - London Bridge.")

'london bridgelondon bridge.'

In [12]:
pandarallel.initialize(nb_workers=32)
print("Process started at {}".format(datetime.now()))
tweets["processed_text"] = tweets["text"].parallel_apply(process_tweet)
print("Process ended at {}".format(datetime.now()))

New pandarallel memory created - Size: 2000 MB
Pandarallel will run on 32 workers
Process started at 2019-06-25 21:02:33.631348
Process ended at 2019-06-25 21:02:34.090019


### Key Phrases

In [13]:
tweets.reset_index(inplace=True)
tweets.head()

Unnamed: 0,index,text,count,processed_text
0,11637,"""LOL I JUST HEARD THE TUBE COORDINATOR (or wha...",2470,lol i just heard the tube coordinator (or what...
1,15277,"""RT @EtniesJags: “London Bridge is falling dow...",1955,"rt @etniesjags: london bridge is falling down,..."
2,9621,"""For anyone believing that corruption is mainl...",654,for anyone believing that corruption is mainly...
3,20342,"""You mean like the 7/7 tube bombers did, the M...",608,"you mean like the 7/7 tube bombers did, the ma..."
4,15915,"""RT @kurtnysgvia: Tangina yung grades ko paran...",593,rt @kurtnysgvia: tangina yung grades ko parang...


In [14]:
tweet_list = tweets[["index", "processed_text"]].rename(
        columns={"index":"id", "processed_text": "text"}
    ).to_json(orient = "records")

json.loads(tweet_list)[0:5]

[{'id': 11637,
  'text': 'lol i just heard the tube coordinator (or whatever they are called) at london bridge just say into his all my guys are bolaz, please stand clear of the'},
 {'id': 15277,
  'text': 'rt @etniesjags: london bridge is falling down, my dear'},
 {'id': 9621,
  'text': 'for anyone believing that corruption is mainly third world this article detailing how of public money was spent on a non existent london bridge is a must read. at the centre of the scandal: @borisjohnson, the man planning to be next uk pm'},
 {'id': 20342,
  'text': 'you mean like the 7/7 tube bombers did, the marchester arena bomber did and the london bridge and westminster killers did ... demonising anyone who didnt follow islam?? you are a hypocrite of mammoth propertions the flames of hatred are fanned, when people are demonised because of their faith, when we play on peoples fears rather than addressing them, the consequences are deadly, as we have seen so sadly today.'},
 {'id': 15915,
  'text':

In [18]:
headers = {"Ocp-Apim-Subscription-Key": subscription_key}
keyphrase_url = text_analytics_base_url + "keyPhrases"

key_phrases_df = pd.DataFrame()
tweets_chunks = np.array_split(json.loads(tweet_list), 
                               math.ceil(len(tweets)/1000))

for chunk in tweets_chunks:
    chunk_json = json.loads(json.dumps({"documents" : list(chunk)})) 
    response  = requests.post(keyphrase_url, 
                              headers=headers, 
                              json=chunk_json)
    key_phrases = response.json()
    key_phrases_df = key_phrases_df.append(
        pd.DataFrame.from_dict(
            key_phrases['documents']
        )
    )

In [19]:
key_phrases_df.head()

Unnamed: 0,id,keyPhrases
0,11637,"[guys, tube coordinator, bolaz, london bridge]"
1,15277,"[london bridge, rt, etniesjags]"
2,9621,"[man, article, scandal, borisjohnson, world, p..."
3,20342,"[london bridge, marchester arena bomber, flame..."
4,15915,[tangina yung grades ko parang london bridge n...


In [20]:
key_phrases_df.count()

id            22018
keyPhrases    22018
dtype: int64

In [21]:
def get_short_key_phrases(keyPhrases):
    shorter_phrases = []
    for phrase in keyPhrases:
        if len(phrase.split(" ")) <  5:
            shorter_phrases.append(phrase)
    return shorter_phrases
key_phrases_df["shortKeyPhrases"] = key_phrases_df["keyPhrases"].apply(get_short_key_phrases)

In [22]:
key_phrases_df.head()

Unnamed: 0,id,keyPhrases,shortKeyPhrases
0,11637,"[guys, tube coordinator, bolaz, london bridge]","[guys, tube coordinator, bolaz, london bridge]"
1,15277,"[london bridge, rt, etniesjags]","[london bridge, rt, etniesjags]"
2,9621,"[man, article, scandal, borisjohnson, world, p...","[man, article, scandal, borisjohnson, world, p..."
3,20342,"[london bridge, marchester arena bomber, flame...","[london bridge, marchester arena bomber, flame..."
4,15915,[tangina yung grades ko parang london bridge n...,[kurtnysgvia]


In [28]:
tweets.head()

Unnamed: 0,index,text,count,processed_text
0,11637,"""LOL I JUST HEARD THE TUBE COORDINATOR (or wha...",2470,lol i just heard the tube coordinator (or what...
1,15277,"""RT @EtniesJags: “London Bridge is falling dow...",1955,"rt @etniesjags: london bridge is falling down,..."
2,9621,"""For anyone believing that corruption is mainl...",654,for anyone believing that corruption is mainly...
3,20342,"""You mean like the 7/7 tube bombers did, the M...",608,"you mean like the 7/7 tube bombers did, the ma..."
4,15915,"""RT @kurtnysgvia: Tangina yung grades ko paran...",593,rt @kurtnysgvia: tangina yung grades ko parang...


In [29]:
key_phrases_df.id = key_phrases_df.id.astype(int)

metadata_keyPhrases = pd.merge(
    key_phrases_df[["id", "keyPhrases", "shortKeyPhrases"]], 
    tweets[["index", "text", "processed_text", "count"]], 
    left_on="id", 
    right_on="index",
    how="inner")
metadata_keyPhrases.sort_values(by=["count"], ascending=False, inplace=True)
metadata_keyPhrases.to_csv("tweet_keyPhrases.csv")
metadata_keyPhrases[["keyPhrases", "processed_text", "count"]].head(10)

Unnamed: 0,keyPhrases,processed_text,count
0,"[guys, tube coordinator, bolaz, london bridge]",lol i just heard the tube coordinator (or what...,2470
1,"[london bridge, rt, etniesjags]","rt @etniesjags: london bridge is falling down,...",1955
2,"[man, article, scandal, borisjohnson, world, p...",for anyone believing that corruption is mainly...,654
3,"[london bridge, marchester arena bomber, flame...","you mean like the 7/7 tube bombers did, the ma...",608
4,[tangina yung grades ko parang london bridge n...,rt @kurtnysgvia: tangina yung grades ko parang...,593
5,"[paris shootings, london bridge attacks, nonmu...","why are nonmuslims, celebrities and influencer...",590
6,"[pm outside newscorps hq, london bridge, 30pm8...",the time for tolerating the media whipping up ...,565
7,"[aa jata hai, bas tu hai asli hindustani jo lo...",haan hum to saare pakistani hain jo yahan tax ...,461
8,"[london bridge, pilots, tyler josephs street p...",from the rock sound archive: twenty one pilots...,430
9,[barcelona und sogar istanbul und lese ich heu...,"nach den opfern in nizza, st. petersburg, lond...",375


In [30]:
(
  key_phrases_df["keyPhrases"].apply(lambda x: len(x)).mean(),
  key_phrases_df["keyPhrases"].apply(lambda x: len(x)).max(),
  key_phrases_df["keyPhrases"].apply(lambda x: len(x)).min()
)

(5.804750658552094, 81, 0)

In [31]:
(
  key_phrases_df["shortKeyPhrases"].apply(lambda x: len(x)).mean(),
  key_phrases_df["shortKeyPhrases"].apply(lambda x: len(x)).max(),
  key_phrases_df["shortKeyPhrases"].apply(lambda x: len(x)).min()
)

(5.653374511763103, 81, 0)

In [32]:
not_covered_list = []
def no_key_phrases(row):
    if len(row.keyPhrases) == 0:
        not_covered_list.append(row.id)
        return 1
not_covered =  key_phrases_df.apply(no_key_phrases, axis=1).sum()

print("Number of posts not covered by Key Phrases: {} or {} %".format(
    not_covered, 
    (not_covered/len(key_phrases_df)*100)
    )
)

Number of posts not covered by Key Phrases: 1.0 or 0.004541738577527478 %


In [33]:
for text in tweets[tweets["index"].isin(not_covered_list)].text.values:
    print(">>>>>>>>>>>>>")
    print(text)
    print(process_tweet(text))

>>>>>>>>>>>>>
"抜き打ちで流してみんなを動かす効果のあるSBベスト3 1.「Seven Steps」 2.「London Bridge」 3.「Rock My Soul」 ※中の人主観です。＃ラボ_LABO"
3 1 . seven steps 2 . london bridge 3 . rock my soul


In [34]:
key_phrases_df[key_phrases_df["shortKeyPhrases"].map(len)>10].count()

id                 844
keyPhrases         844
shortKeyPhrases    844
dtype: int64

In [35]:
not_covered_list = []
def no_key_phrases(row):
    if len(row.shortKeyPhrases) == 0:
        not_covered_list.append(row.id)
        return 1
not_covered =  key_phrases_df.apply(no_key_phrases, axis=1).sum()

print("Number of posts not covered by Shortened Key Phrases: {} or {} %".format(
    not_covered, 
    (not_covered/len(key_phrases_df)*100)
    )
)

Number of posts not covered by Shortened Key Phrases: 283.0 or 1.2853120174402761 %


In [36]:
for text in tweets[tweets["index"].isin(not_covered_list)].text.values[0:20]:
    print(">>>>>>>>>>>>>")
    print(text)
    print(process_tweet(text))

>>>>>>>>>>>>>
"Londra Tamigi Tower Bridge città che trae origini romane Albione https://t.co/WFXSTVKYci
https://www.immaginienonsoloparole.it/londra-tamigi-tower-bridge-11-prospettive-inconsuete-2016-01-08"
londra tamigi tower bridge che trae origini romane albione
>>>>>>>>>>>>>
"#ラジオ深夜便 思い出のスクリーン・ミュージック 私の好きなラヴ・ストーリー作品集 https://t.co/qeSwa7G9si Waterloo Bridge (マイラ・レスター)Vivien Leigh Robert Taylor https://t.co/ti9RfBufMf 君の名は ♪織井茂子 https://t.co/1khSrgeQbE Auld Lang Syne https://t.co/sxEH1ZhwrM
https://www.youtube.com/watch?v=NEn0NjGI4Fs&feature=youtu.be
https://www.youtube.com/watch?v=6egm4SNGLbY&feature=youtu.be
https://www.youtube.com/watch?v=4HGcUPkCtyQ&feature=youtu.be"
waterloo bridge ( ) vivien leigh robert taylor auld lang syne
>>>>>>>>>>>>>
"The Memorial Art Gallery highlights Monet's ""Waterloo Bridge"" series https://t.co/r6bZn8AExk
https://www.wxxinews.org/post/memorial-art-gallery-highlights-monets-waterloo-bridge-series"
the memorial art gallery highlights monets   waterloo

In [37]:
unique_key_phrases = set(sum(key_phrases_df["shortKeyPhrases"], []))

key_phrases_category = {}
for key in unique_key_phrases:
    key_phrases_category[key] = []

def map_categories(row):
    for phrase in row["shortKeyPhrases"]:
        key_phrases_category[phrase].append(row["id"])
    return None

key_phrases_df.apply(map_categories, axis=1)

key_phrases_category_count={}
for key, value in key_phrases_category.items():
    key_phrases_category_count[key] = len(value)

from collections import OrderedDict
key_phrases_category_count = OrderedDict(
                               sorted(key_phrases_category_count.items(), 
                                      key=lambda kv: kv[1], 
                                      reverse=True)
                            )

print(len(key_phrases_category_count))
# print(key_phrases_category_count)

43408


In [38]:
key_phrases_category_count

OrderedDict([('london bridge', 9158),
             ('tower bridge', 2280),
             ('rt', 906),
             ('london', 893),
             ('railway', 664),
             ('se', 566),
             ('waterloo bridge', 542),
             ('train', 538),
             ('time', 472),
             ('people', 446),
             ('london bridge station', 389),
             ('day', 377),
             ('manchester', 357),
             ('london bridge attack', 352),
             ('minutes', 329),
             ('trains', 278),
             ('uk', 277),
             ('morning', 260),
             ('week', 259),
             ('west sacramento', 244),
             ('fergie', 240),
             ('days', 240),
             ('way', 236),
             ('bridge', 236),
             ('towerbridge', 220),
             ('inconvenience', 213),
             ('eb', 206),
             ('years', 204),
             ('westminster', 202),
             ('southernrailuk', 201),
             ('tlrailuk', 198),
    

In [39]:
top_25_key_phrases = list(key_phrases_category_count.keys())[0:25]
metadata_keyPhrases['covered_top_25'] = metadata_keyPhrases["keyPhrases"].apply(
    lambda x: any(item in x for item in top_25_key_phrases)
)
metadata_keyPhrases['covered_top_25'].value_counts()/len(metadata_keyPhrases)*100

True     68.484876
False    31.515124
Name: covered_top_200, dtype: float64

In [45]:
top_250_key_phrases = list(key_phrases_category_count.keys())[0:250]
metadata_keyPhrases['covered_top_250'] = metadata_keyPhrases["keyPhrases"].apply(
    lambda x: any(item in x for item in top_250_key_phrases)
)
metadata_keyPhrases['covered_top_250'].value_counts()/len(metadata_keyPhrases)*100

True     80.724861
False    19.275139
Name: covered_top_250, dtype: float64

In [49]:
top_2500_key_phrases = list(key_phrases_category_count.keys())[0:2500]
metadata_keyPhrases['covered_top_2500'] = metadata_keyPhrases["keyPhrases"].apply(
    lambda x: any(item in x for item in top_2500_key_phrases)
)
metadata_keyPhrases['covered_top_2500'].value_counts()/len(metadata_keyPhrases)*100

True     90.035426
False     9.964574
Name: covered_top_2500, dtype: float64

In [51]:
2500/len(tweets)

0.11352284079556807

In [50]:
list(key_phrases_category_count.keys())[0:2500]

['london bridge',
 'tower bridge',
 'rt',
 'london',
 'railway',
 'se',
 'waterloo bridge',
 'train',
 'time',
 'people',
 'london bridge station',
 'day',
 'manchester',
 'london bridge attack',
 'minutes',
 'trains',
 'uk',
 'morning',
 'week',
 'west sacramento',
 'fergie',
 'days',
 'way',
 'bridge',
 'towerbridge',
 'inconvenience',
 'eb',
 'years',
 'westminster',
 'southernrailuk',
 'tlrailuk',
 'kids',
 'im',
 'mins',
 'lambeth bridge',
 'muslims',
 'thanks',
 'thames',
 'dont',
 'night',
 'place',
 'london bridge terrorist attack',
 'manchester arena',
 'man',
 'londonbridge',
 'battersea',
 'world',
 'guys',
 'year',
 'platform',
 'islamophobic',
 'antiimmigrant rhetoric',
 'interior department spokeswoman',
 'podcast episode',
 'millenium bridge',
 'song',
 'oxford circus',
 'piccadilly circus',
 'bridge gtwy',
 'england',
 'madame tussauds',
 'paddington station',
 'cathedral',
 'tower bridge gtwy',
 'service',
 'delays',
 'seconds',
 'city',
 'pictures',
 'vs',
 'paris',
 

__________________________________________________________________________________________________________________