# Tweet Preprocessing
#### Basic text-preprocessing pipeline (in no particular order):
- Detect and translate tweets to English
- Tokenization
- Stopword removal & Lemmatization
- Remove URLs and reserved words (RTs)
- Lowercasing
- Remove # and @ symbols but keep values
- Spell Checker
- Remove punctuation (possibly, although useful for tweet fragmentation)

In [122]:
import os
import pandas as pd
import numpy as np
import preprocessor as prep
from os.path import join
from sqlite3 import connect
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from googletrans import Translator
from nltk.corpus import stopwords
from nltk.util import ngrams
from spellchecker import SpellChecker
import nltk
import string

pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Getting Data (tweets)

In [123]:
project_dir = join(os.getcwd(), os.pardir)
raw_dir = join(project_dir, 'data', 'raw')
interim_dir = join(project_dir, 'data', 'interim')

%config InlineBackend.figure_format = 'svg'

In [1]:
raw_fname = 'data_pull_sample.json'
df = pd.read_json(join(raw_dir, raw_fname), lines=True)

NameError: name 'pd' is not defined

In [125]:
df_users = pd.DataFrame(df['user'].tolist())

df_tweets = df.drop(columns=[
    'id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'user',
    'coordinates', 'place', 'quoted_status_id', 'favorited',
    'retweeted', 'retweeted_status', 'matching_rules', 'geo', 
    'filter_level', 'display_text_range', 'contributors',
    'quoted_status', 'quoted_status_id', 'quoted_status_permalink',
    'in_reply_to_screen_name', 'text', 'extended_tweet', 'truncated',
    'entities', 'extended_entities'
])
df_tweets['user_id_str'] = df['user'].apply(lambda x: x['id_str'])
df_tweets['full_text'] = df.apply(
    lambda row: 
        row['text'] 
        if not row['truncated'] 
        else row['extended_tweet']['full_text'], 
    axis=1
)

def get_retweet_id(row):
    """returns: is_retweet, original_tweet_id_str"""
    if type(row['retweeted_status']) == dict:
        return True, row['retweeted_status']['id_str']
    else:
        return False, np.nan

df_tweets['is_retweet'], df_tweets['original_tweet_id_str'] = zip(*df.apply(get_retweet_id, axis=1))
df_tweets['is_reply'] = ~df['in_reply_to_status_id'].isna()
df_tweets.drop_duplicates(subset='id_str', inplace=True)
df_tweets.loc[:,'is_original'] = ~df_tweets[['is_reply', 'is_retweet', 'is_quote_status']].sum(1).astype(bool)

In [126]:
print(df_tweets.shape)
df_tweets = df_tweets[df_tweets['is_retweet']==False]
df_tweets.shape

(20500, 19)


(5478, 19)

In [127]:
df_tweets.head()

Unnamed: 0,created_at,id_str,source,in_reply_to_status_id_str,in_reply_to_user_id_str,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,lang,possibly_sensitive,quoted_status_id_str,user_id_str,full_text,is_retweet,original_tweet_id_str,is_reply,is_original
0,2020-06-14 23:57:21+00:00,1272317232626888704,"<a href=""http://twitter.com/download/android"" ...",1.272317e+18,4844328000.0,False,0,1,0,0,ja,,,1065957356079476736,@Ampan_C そのコメント欄に必ず私が居た‼️🤔🙄🙄🙄\n自分で質問して自分が最初にコメ...,False,,True,False
1,2020-06-14 23:51:31+00:00,1272315765975183360,"<a href=""http://twitter.com/download/android"" ...",1.272312e+18,4844328000.0,False,0,1,0,0,ja,,,1065957356079476736,@Ampan_C X軸とY軸が有るグラフで虚数を掛けると90度回転する‼️🤔,False,,True,False
4,2020-06-14 23:49:53+00:00,1272315355700981760,"<a href=""http://twitter.com/download/iphone"" r...",1.272315e+18,4844328000.0,False,0,1,0,0,ja,,,906562306401755136,@Ampan_C あんぱんおぱよー！,False,,True,False
5,2020-06-14 23:46:30+00:00,1272314503699390464,"<a href=""http://twitter.com/download/android"" ...",1.272311e+18,4844328000.0,False,0,1,0,0,ja,,,1065957356079476736,@Ampan_C 今は無きヤフ~の公式アプリ「ネタりか」の中のコ〜ナ〜「意味の無い質問」‼️...,False,,True,False
9,2020-06-14 23:37:10+00:00,1272312154100654080,"<a href=""http://twitter.com/download/iphone"" r...",1.272312e+18,4844328000.0,False,0,1,0,0,ja,,,1082211825632964608,@Ampan_C むしろ見えるのか.....,False,,True,False


### Translate Tweets

In [128]:
def translate_tweet(text, lang):
    trans = Translator()
    return trans.translate(text).text

In [129]:
# %%time
# temp_df = df_tweets

# for i in temp_df.index:
#     if temp_df['lang'][i]!='en':
#         temp_df.loc[i,'full_text_processed'] = translate_tweet(temp_df['full_text'][i], temp_df['lang'][i])
        
# temp_df.head()

In [130]:
# %%time
# temp_df = df_tweets
# for row in temp_df.itertuples():    
#     if row.lang != 'en':
#         df_tweets.at[row.Index,'full_text_processed'] = translate_tweet(row.full_text, row.lang)

# temp_df.head()

In [131]:
# %%time
# temp_df = df_tweets

# def translate_func(x, text, lang,col):
    
#     if x[lang] != 'en':
#         x[col]= translate_tweet(x[text], x[lang])
#     else:
#         x[col]=x[text]
#     return x

# temp_df.apply(lambda x: translate_func(x, 'full_text', 'lang','full_text_processed'),axis=1)

# temp_df.head()

In [133]:
%%time

def translate_func(x, text, lang):
    
    if x[lang] != 'en':
        process = translate_tweet(x[text], x[lang])
    else:
        process = x[text]
    return process

df_tweets['full_text_processed'] = df_tweets.apply(lambda x: translate_func(x, 'full_text', 'lang'),axis=1)
df_tweets.head()

CPU times: user 28.4 s, sys: 1.11 s, total: 29.6 s
Wall time: 4min 37s


Unnamed: 0,created_at,id_str,source,in_reply_to_status_id_str,in_reply_to_user_id_str,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,lang,possibly_sensitive,quoted_status_id_str,user_id_str,full_text,is_retweet,original_tweet_id_str,is_reply,is_original,full_text_processed
0,2020-06-14 23:57:21+00:00,1272317232626888704,"<a href=""http://twitter.com/download/android"" ...",1.272317e+18,4844328000.0,False,0,1,0,0,ja,,,1065957356079476736,@Ampan_C そのコメント欄に必ず私が居た‼️🤔🙄🙄🙄\n自分で質問して自分が最初にコメ...,False,,True,False,@Ampan_C I was always in the comment section! ...
1,2020-06-14 23:51:31+00:00,1272315765975183360,"<a href=""http://twitter.com/download/android"" ...",1.272312e+18,4844328000.0,False,0,1,0,0,ja,,,1065957356079476736,@Ampan_C X軸とY軸が有るグラフで虚数を掛けると90度回転する‼️🤔,False,,True,False,@Ampan_C A graph with X and Y axes rotates 90 ...
4,2020-06-14 23:49:53+00:00,1272315355700981760,"<a href=""http://twitter.com/download/iphone"" r...",1.272315e+18,4844328000.0,False,0,1,0,0,ja,,,906562306401755136,@Ampan_C あんぱんおぱよー！,False,,True,False,@Ampan_C Anpanpayo!
5,2020-06-14 23:46:30+00:00,1272314503699390464,"<a href=""http://twitter.com/download/android"" ...",1.272311e+18,4844328000.0,False,0,1,0,0,ja,,,1065957356079476736,@Ampan_C 今は無きヤフ~の公式アプリ「ネタりか」の中のコ〜ナ〜「意味の無い質問」‼️...,False,,True,False,@Ampan_C There is no right now Yahoo ~ Ko in t...
9,2020-06-14 23:37:10+00:00,1272312154100654080,"<a href=""http://twitter.com/download/iphone"" r...",1.272312e+18,4844328000.0,False,0,1,0,0,ja,,,1082211825632964608,@Ampan_C むしろ見えるのか.....,False,,True,False,@Ampan_C Does it look rather.....


### Removing URLs and Reserved Words (RTs)

In [134]:
from preprocessor import api
df_tweets['full_text_processed'] = df_tweets['full_text_processed'].astype(str)

api.set_options('urls','reserved_words')


In [135]:
df_tweets['full_text_processed'] = df_tweets['full_text_processed'].apply(lambda x: api.clean(x))

### Lowercasing & Punctuation Removal

In [136]:
df_tweets['full_text_processed'] = df_tweets['full_text_processed'].apply(lambda x: x.lower())

In [137]:
def remove_punct(text):
    table = str.maketrans('','',string.punctuation)
    return text.translate(table)

In [138]:
df_tweets['full_text_processed'] = df_tweets['full_text_processed'].apply(lambda x: remove_punct(x))

### Lemmatization & Stopword removal

In [139]:
lemmatizer = nltk.stem.WordNetLemmatizer()
df_tweets['full_text_processed'] = df_tweets['full_text_processed'].apply(lambda x: ' '.join([lemmatizer.lemmatize(w) for w in x.split()]))

In [140]:
stop_words = set(stopwords.words('english'))
df_tweets['full_text_processed'] = df_tweets['full_text_processed'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

### Spell Checker

In [141]:

spell = SpellChecker()

def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [142]:
%%time

##Taking too much time to execute
# temp_df = df_tweets
# df_tweets['full_text_processed'] = df_tweets['full_text_processed'].apply(lambda x: correct_spellings(x))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.58 µs


In [143]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()
print(sid.polarity_scores('i love vader'))


{'neg': 0.0, 'neu': 0.192, 'pos': 0.808, 'compound': 0.6369}


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jaredross/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


"\ndef translate_func(x, text, lang,col):\n    \n    if x[lang] != 'en':\n        x[col]= translate_tweet(x[text], x[lang])\n    else:\n        x[col]=x[text]\n    return x\n\ndf_tweets.apply(lambda x: translate_func(x, 'full_text', 'lang','full_text_processed'),axis=1)\ndf_tweets.head()\n"

In [144]:
def create_sentiment(x,text):

    return sid.polarity_scores(text)['compound']

df_tweets['sentiment'] = df_tweets.apply(lambda x: create_sentiment(x, x['full_text_processed']),axis=1)
df_tweets.head()

Unnamed: 0,created_at,id_str,source,in_reply_to_status_id_str,in_reply_to_user_id_str,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,lang,possibly_sensitive,quoted_status_id_str,user_id_str,full_text,is_retweet,original_tweet_id_str,is_reply,is_original,full_text_processed,sentiment
0,2020-06-14 23:57:21+00:00,1272317232626888704,"<a href=""http://twitter.com/download/android"" ...",1.272317e+18,4844328000.0,False,0,1,0,0,ja,,,1065957356079476736,@Ampan_C そのコメント欄に必ず私が居た‼️🤔🙄🙄🙄\n自分で質問して自分が最初にコメ...,False,,True,False,ampanc wa always comment section ️🤔🙄🙄🙄 asked c...,0.0
1,2020-06-14 23:51:31+00:00,1272315765975183360,"<a href=""http://twitter.com/download/android"" ...",1.272312e+18,4844328000.0,False,0,1,0,0,ja,,,1065957356079476736,@Ampan_C X軸とY軸が有るグラフで虚数を掛けると90度回転する‼️🤔,False,,True,False,ampanc graph x ax rotates 90 degree multiplied...,0.0772
4,2020-06-14 23:49:53+00:00,1272315355700981760,"<a href=""http://twitter.com/download/iphone"" r...",1.272315e+18,4844328000.0,False,0,1,0,0,ja,,,906562306401755136,@Ampan_C あんぱんおぱよー！,False,,True,False,ampanc anpanpayo,0.0
5,2020-06-14 23:46:30+00:00,1272314503699390464,"<a href=""http://twitter.com/download/android"" ...",1.272311e+18,4844328000.0,False,0,1,0,0,ja,,,1065957356079476736,@Ampan_C 今は無きヤフ~の公式アプリ「ネタりか」の中のコ〜ナ〜「意味の無い質問」‼️...,False,,True,False,ampanc right yahoo ko official application net...,0.9042
9,2020-06-14 23:37:10+00:00,1272312154100654080,"<a href=""http://twitter.com/download/iphone"" r...",1.272312e+18,4844328000.0,False,0,1,0,0,ja,,,1082211825632964608,@Ampan_C むしろ見えるのか.....,False,,True,False,ampanc doe look rather,0.0


### Loading into Database

In [147]:
from src.data._load_es import load_es
from src.data._transform import merge_dataframes 
df_merge = merge_dataframes(df_users,df_tweets)
df_merge.head()


Unnamed: 0,id,user_id,name,screen_name,location,url,description,translator_type,derived,protected,verified,followers_count,friends_count,listed_count,favourites_count,statuses_count,user_created_at,utc_offset,time_zone,geo_enabled,lang_x,contributors_enabled,is_translator,profile_background_color,profile_background_image_url,profile_background_image_url_https,profile_background_tile,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,profile_image_url,profile_image_url_https,profile_banner_url,default_profile,default_profile_image,following,follow_request_sent,notifications,tweet_created_at,tweet_id,source,in_reply_to_status_id_str,in_reply_to_user_id_str,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,lang_y,possibly_sensitive,quoted_status_id_str,user_id_str,full_text,is_retweet,original_tweet_id_str,is_reply,is_original,full_text_processed,sentiment
0,1065957356079476736,1065957356079476736,大天使糖君…。❤️💢,torured13,北赤羽,,(北)赤羽の妖精‼️😫\n遂に帰宅はしたけれど‼️🤔😫🏠👣\nADHDに適応障害に自閉症スペ...,none,{'locations': [{'country': 'Equatorial Guinea'...,False,False,163,343,2,44864,28739,2018-11-23 13:16:53+00:00,,,True,,False,False,F5F8FA,,,False,1DA1F2,C0DEED,DDEEF6,333333,True,http://pbs.twimg.com/profile_images/1273009676...,https://pbs.twimg.com/profile_images/127300967...,https://pbs.twimg.com/profile_banners/10659573...,True,False,,,,2020-06-14 23:57:21+00:00,1272317232626888704,"<a href=""http://twitter.com/download/android"" ...",1.27232e+18,4844330000.0,False,0,1,0,0,ja,,,1065957356079476736,@Ampan_C そのコメント欄に必ず私が居た‼️🤔🙄🙄🙄\n自分で質問して自分が最初にコメ...,False,,True,False,ampanc wa always comment section ️🤔🙄🙄🙄 asked c...,0.0
1,1065957356079476736,1065957356079476736,大天使糖君…。❤️💢,torured13,北赤羽,,(北)赤羽の妖精‼️😫\n遂に帰宅はしたけれど‼️🤔😫🏠👣\nADHDに適応障害に自閉症スペ...,none,{'locations': [{'country': 'Equatorial Guinea'...,False,False,163,343,2,44864,28739,2018-11-23 13:16:53+00:00,,,True,,False,False,F5F8FA,,,False,1DA1F2,C0DEED,DDEEF6,333333,True,http://pbs.twimg.com/profile_images/1273009676...,https://pbs.twimg.com/profile_images/127300967...,https://pbs.twimg.com/profile_banners/10659573...,True,False,,,,2020-06-14 23:51:31+00:00,1272315765975183360,"<a href=""http://twitter.com/download/android"" ...",1.27231e+18,4844330000.0,False,0,1,0,0,ja,,,1065957356079476736,@Ampan_C X軸とY軸が有るグラフで虚数を掛けると90度回転する‼️🤔,False,,True,False,ampanc graph x ax rotates 90 degree multiplied...,0.0772
2,1065957356079476736,1065957356079476736,大天使糖君…。❤️💢,torured13,北赤羽,,(北)赤羽の妖精‼️😫\n遂に帰宅はしたけれど‼️🤔😫🏠👣\nADHDに適応障害に自閉症スペ...,none,{'locations': [{'country': 'Equatorial Guinea'...,False,False,163,343,2,44864,28739,2018-11-23 13:16:53+00:00,,,True,,False,False,F5F8FA,,,False,1DA1F2,C0DEED,DDEEF6,333333,True,http://pbs.twimg.com/profile_images/1273009676...,https://pbs.twimg.com/profile_images/127300967...,https://pbs.twimg.com/profile_banners/10659573...,True,False,,,,2020-06-14 23:46:30+00:00,1272314503699390464,"<a href=""http://twitter.com/download/android"" ...",1.27231e+18,4844330000.0,False,0,1,0,0,ja,,,1065957356079476736,@Ampan_C 今は無きヤフ~の公式アプリ「ネタりか」の中のコ〜ナ〜「意味の無い質問」‼️...,False,,True,False,ampanc right yahoo ko official application net...,0.9042
3,1065957356079476736,1065957356079476736,大天使糖君…。❤️💢,torured13,北赤羽,,(北)赤羽の妖精‼️😫\n遂に帰宅はしたけれど‼️🤔😫🏠👣\nADHDに適応障害に自閉症スペ...,none,{'locations': [{'country': 'Equatorial Guinea'...,False,False,163,343,2,44864,28739,2018-11-23 13:16:53+00:00,,,True,,False,False,F5F8FA,,,False,1DA1F2,C0DEED,DDEEF6,333333,True,http://pbs.twimg.com/profile_images/1273009676...,https://pbs.twimg.com/profile_images/127300967...,https://pbs.twimg.com/profile_banners/10659573...,True,False,,,,2020-06-14 18:35:41+00:00,1272236283230744576,"<a href=""http://twitter.com/download/android"" ...",1.27223e+18,4844330000.0,False,0,0,0,0,ja,,,1065957356079476736,@Ampan_C 外に出ると悪さするから軟禁みたいにしてるって言われた‼️🤔😵😵😵,False,,True,False,ampanc wa told feel like house arrest bad go o...,-0.5267
4,1065957356079476736,1065957356079476736,大天使糖君…。❤️💢,torured13,北赤羽,,(北)赤羽の妖精‼️😫\n遂に帰宅はしたけれど‼️🤔😫🏠👣\nADHDに適応障害に自閉症スペ...,none,{'locations': [{'country': 'Equatorial Guinea'...,False,False,163,343,2,44864,28739,2018-11-23 13:16:53+00:00,,,True,,False,False,F5F8FA,,,False,1DA1F2,C0DEED,DDEEF6,333333,True,http://pbs.twimg.com/profile_images/1273009676...,https://pbs.twimg.com/profile_images/127300967...,https://pbs.twimg.com/profile_banners/10659573...,True,False,,,,2020-06-14 18:35:05+00:00,1272236130117672960,"<a href=""http://twitter.com/download/android"" ...",1.27224e+18,1.06596e+18,False,0,1,0,0,ja,0.0,,1065957356079476736,@Ampan_C 次元がいっぱい (ハヤカワ文庫 NF 28 ―アシモフの科学エッセイ 8)...,False,,True,False,ampanc lot dimension hayakawa bunko nf 28 ― as...,0.0


In [148]:
load_es(df_merge)

(743127, [])