# News Keywords Extraction

In [None]:
import pandas as pd
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)
import plotly.express as px

import fasttext

import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import spacy
sp = spacy.load('en_core_web_sm')

import emoji
import string

## Reading Data

In [None]:
# Load the collected tweets (JSON dump) into a DataFrame.
df_tweets = pd.read_json('data/news/news_tweets/2022-03-16__2022-04-01/tweets.json')
df_tweets

In [None]:
# Load the user records stored alongside the tweet dump.
df_authors = pd.read_json('data/news/news_tweets/2022-03-16__2022-04-01/users.json')
df_authors

## Data Exploration

### Counts and IDs

In [None]:
# The CSV is read without a header row, so column names are supplied here;
# only the 'id' column is kept.
# NOTE(review): presumably these are the accounts that were requested for extraction — confirm.
ids = pd.read_csv('data/news/twitter_users_news.csv', header=None, names=['handle', 'id', 'website'])['id']
ids

In [None]:
# Number of tweets per author.
src_count = df_tweets.groupby('author_id')['id'].count()
src_count

In [None]:
# How many authors share each per-author tweet count.
src_count.value_counts()

Some users have fewer than 90 tweets.

In [None]:
# Authors with fewer than 90 tweets in the dump.
src_count[src_count < 90]

In [None]:
# Inspect a single author record by its ID.
# NOTE(review): presumably one of the low-count authors listed above — confirm.
df_authors[df_authors['id'] == 247723476]

Some users appear in the extracted data even though they were not in the list of requested accounts.

In [None]:
# Author records whose ID does not appear in `ids`.
df_authors[~df_authors['id'].isin(ids.values)]

In [None]:
# Distinct author IDs of tweets whose author is not in `ids`.
df_tweets[~df_tweets.author_id.isin(ids.values)].author_id.unique()

### Exploring Withheld

In [None]:
# First rows of the tweets whose 'withheld' field is populated.
df_tweets[~df_tweets.withheld.isna()].head()

In [None]:
# Author records of the accounts that wrote tweets with a populated 'withheld' field.
df_authors[df_authors['id'].isin(df_tweets[~df_tweets.withheld.isna()].author_id.unique())]

In [None]:
# Author records whose own 'withheld' field is populated.
df_authors[~df_authors.withheld.isna()]

### Data Language Statistics 

In [None]:
# Share of tweets per value of the 'lang' column, drawn as a pie chart.
lang_count = df_tweets.groupby('lang')[['id']].count().reset_index()
lang_count['id'] = lang_count['id'] / lang_count['id'].sum()

# Hover label: the share as a percentage string with two decimals.
hover_labels = (lang_count['id'] * 100).round(2).apply(str) + '%'

fig = px.pie(
    lang_count,
    names='lang',
    values='id',
    title='Language proportions'.title(),
    height=700,
    hover_name=hover_labels,
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', title_x=0.5)
fig.show()

### Language Verification

In [None]:
# Independent copy of the tweets whose 'lang' column says English.
df_english = df_tweets[df_tweets.lang == 'en'].copy()
df_english

In [None]:
# Peek at the raw text of the first five of those tweets.
df_english.text[:5].tolist()

## Language Identification

In [None]:
# Load the fastText model from 'lid.176.bin'; it is used below to predict each tweet's language.
model = fasttext.load_model('lid.176.bin')

Remove redundant spaces, links and emojis

In [None]:
# Normalise the tweet text before language prediction: collapse whitespace
# runs to one space, remove links, trim the ends, then strip emojis.
df_tweets['clean_text'] = (
    df_tweets.text
    .str.replace(r'\s+', ' ', regex=True)
    .str.replace(r'http\S+', '', regex=True)
    .str.strip()
    .apply(lambda text: emoji.replace_emoji(text, replace=''))
)
df_tweets['clean_text']

In [None]:
# Ask the model for two predictions (k=2) for every cleaned tweet.
langs = model.predict(df_tweets['clean_text'].tolist(), k=2)

In [None]:
# Transpose the prediction result into one row per tweet.
# NOTE(review): assumes predict() returned a (labels, confidences) pair of
# per-tweet sequences — confirm against the fastText documentation.
df = pd.DataFrame(list(zip(*langs)), columns=['pred_lang', 'pred_conf'])
df

In [None]:
# Each cell currently holds the k=2 predictions for one tweet; split them so
# pred_lang / pred_conf keep the first entry and the *_2 columns the second.
# Statement order matters: each source column is read before it is overwritten.
df[['pred_lang', 'pred_lang_2']] = pd.DataFrame(df.pred_lang.values.tolist(), index=df.index)
# Remove the '__label__' prefix from the predicted labels
# (presumably leaving a bare language code comparable with the 'lang' column).
df['pred_lang'] = df.pred_lang.str.replace('__label__', '')
df['pred_lang_2'] = df.pred_lang_2.str.replace('__label__', '')
df[['pred_conf', 'pred_conf_2']] = pd.DataFrame(df.pred_conf.values.tolist(), index=df.index)
df

In [None]:
# Distribution of the first prediction's confidence.
df.pred_conf.hist()

In [None]:
# Attach the predictions to the tweets. join() aligns on the index, so both
# frames must carry the same row labels for the predictions to line up.
df_tweets = df_tweets.join(df)
df_tweets

In [None]:
# Number of tweets where the 'lang' column agrees / disagrees with the predicted language.
(df_tweets.lang == df_tweets.pred_lang).value_counts(normalize=False)

In [None]:
# The same agreement check, expressed as proportions.
(df_tweets.lang == df_tweets.pred_lang).value_counts(normalize=True)

In [None]:
# Tweets whose first prediction has a confidence below 0.5.
df_tweets[df_tweets.pred_conf < 0.5]

In [None]:
# Keep only tweets with a confident first prediction (>= 0.5) whose cleaned
# text is not empty, then report how many remain.
keep_rows = (df_tweets.pred_conf >= 0.5) & (df_tweets.clean_text.apply(len) > 0)
df_tweets = df_tweets[keep_rows]
len(df_tweets)

## English Keywords

In [None]:
# Independent copy of the tweets whose predicted language is English.
df_english = df_tweets[df_tweets.pred_lang == 'en'].copy()
df_english

### Remove Retweets, Quotes and Replies

In [None]:
# Summarise each tweet's references as a sorted tuple of their 'type' values.
# Values that are not a list (e.g. missing entries) are passed through
# unchanged, so tweets without references can be found with isna() later.
df_english['tweet_type'] = df_english['referenced_tweets'].apply(
    lambda refs: tuple(sorted(ref['type'] for ref in refs)) if isinstance(refs, list) else refs
)
df_english.head()

In [None]:
# Frequency of each tweet_type value, including missing ones.
df_english.tweet_type.value_counts(dropna=False)

In [None]:
# Keep only tweets with a missing tweet_type, i.e. rows whose
# 'referenced_tweets' value was not a list.
df_english = df_english[df_english['tweet_type'].isna()]
df_english.head()

### Text Cleaning

In [None]:
# Start again from the raw tweet text of the remaining English tweets.
english_texts = df_english.text
english_texts

#### Remove redundant spaces, links and emojis

In [None]:
# Collapse whitespace runs to one space, remove links, trim the ends, then
# strip emojis.
english_texts = (
    english_texts
    .str.replace(r'\s+', ' ', regex=True)
    .str.replace(r'http\S+', '', regex=True)
    .str.strip()
    .apply(lambda text: emoji.replace_emoji(text, replace=''))
)
english_texts

#### Tokenize

In [None]:
# Tokenise every cleaned tweet with NLTK's word_tokenize, using pandarallel's parallel_apply.
english_tokens = english_texts.parallel_apply(word_tokenize)
english_tokens

#### Tag Tokens

In [None]:
# POS-tag every token list with NLTK's pos_tag; each tweet becomes a list of (token, tag) pairs.
english_annotated_tokens = english_tokens.parallel_apply(pos_tag)
english_annotated_tokens

#### Separate Text from Mentions and Hashtags

In [None]:
def separate(ls):
    """Split tagged tokens into plain text, hashtags and mentions.

    A '#' or '@' token is treated as a marker: it is not kept itself, and the
    token that follows it (whatever it is) is stored as '#<token>' or
    '@<token>' without its tag. Every other token is kept, with its tag, under
    'text'. A marker that is the last token is dropped.

    Args:
        ls: iterable of (token, tag) pairs.

    Returns:
        dict with the keys 'text' (list of (token, tag) pairs), 'hashtags'
        and 'mentions' (lists of strings).
    """
    text, hashtags, mentions = [], [], []
    # Maps a marker token to the list that receives the token following it.
    targets = {'#': hashtags, '@': mentions}
    pending = None  # marker seen on the previous iteration, if any

    for token, tag in ls:
        if pending is not None:
            targets[pending].append(pending + token)
            pending = None
        elif token in targets:
            pending = token
        else:
            text.append((token, tag))

    return {'text': text, 'hashtags': hashtags, 'mentions': mentions}

In [None]:
# Try separate() on the first tweet's tagged tokens.
separate(english_annotated_tokens.iloc[0])

In [None]:
# Apply separate() to every tweet and expand the resulting dicts into
# 'text', 'hashtags' and 'mentions' columns, keeping the tweets' index.
df_english_tokens = pd.DataFrame(english_annotated_tokens.apply(separate).tolist(), index=english_annotated_tokens.index)
df_english_tokens

#### Translate Tags

In [None]:
# One row per (token, tag) pair. explode() yields NaN for tweets whose
# 'text' list is empty; dropna() removes those rows.
df_english_token_texts = df_english_tokens.text.explode().dropna()
df_english_token_texts

In [None]:
# https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/
def pos_tagger(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    Only the first character of the tag is considered: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb.

    Args:
        nltk_tag: POS tag string.

    Returns:
        The corresponding ``wordnet`` constant, or None for any other tag.
    """
    initial = nltk_tag[:1]
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'N':
        return wordnet.NOUN
    if initial == 'R':
        return wordnet.ADV
    return None

In [None]:
# Lower-case each token and translate its tag with pos_tagger()
# (the tag becomes None when pos_tagger() has no mapping for it).
df_english_token_texts = df_english_token_texts.apply(
    lambda pair: (pair[0].lower(), pos_tagger(pair[1]))
)
df_english_token_texts

In [None]:
# Expand the (token, tag) tuples into a two-column frame, keeping the exploded index.
df_english_tagged_tokens = pd.DataFrame(df_english_token_texts.tolist(), index=df_english_token_texts.index, columns=['token', 'tag'])
df_english_tagged_tokens

#### Remove punctuation

In [None]:
# Matches a token consisting only of characters that are neither word
# characters nor whitespace, or a token that is a single underscore.
# Written as a raw string so that \w and \s reach the regex engine without
# Python treating them as (invalid) string escape sequences.
all_punct_regex = r'^([^\w\s]+|_)$'

In [None]:
# Drop tokens that are a single character from string.punctuation or that
# match all_punct_regex.
token_col = df_english_tagged_tokens.token
single_punct = token_col.isin(list(string.punctuation))
only_punct = token_col.str.match(all_punct_regex)
df_english_tagged_tokens = df_english_tagged_tokens[(~single_punct) & (~only_punct)]
df_english_tagged_tokens

Remove all numbers

In [None]:
# Drop tokens for which str.isnumeric() is true (all characters numeric).
df_english_tagged_tokens = df_english_tagged_tokens[(~df_english_tagged_tokens.token.str.isnumeric())]
df_english_tagged_tokens

Remove stopwords

In [None]:
# Use only NLTK's English stop-word list: stopwords.words() called without a
# language returns the lists of every language in the corpus, which would also
# discard English words that merely coincide with foreign stop words.
# The NLTK and spaCy lists are merged into one set, built once, for the lookup.
english_stopwords = set(stopwords.words('english')) | set(sp.Defaults.stop_words)
df_english_tagged_tokens = df_english_tagged_tokens[
    ~df_english_tagged_tokens.token.str.lower().isin(english_stopwords)
]
df_english_tagged_tokens

#### Lemmatize

In [None]:
# Shared lemmatizer instance used by lemmatize().
lemmatizer = WordNetLemmatizer()


def lemmatize(word, tag):
    """Lemmatize `word` using its WordNet POS `tag`.

    Args:
        word: token to lemmatize.
        tag: WordNet POS constant, or None when no tag is available.

    Returns:
        `word` unchanged when `tag` is None, otherwise the result of
        ``lemmatizer.lemmatize(word, tag)``.
    """
    return word if tag is None else lemmatizer.lemmatize(word, tag)

In [None]:
# Take an explicit copy before assigning a column
# (presumably to avoid writing into a filtered slice of the earlier frame).
df_english_tagged_tokens = df_english_tagged_tokens.copy()
# Lemmatize row by row with pandarallel, passing each token together with its tag.
df_english_tagged_tokens['token'] = df_english_tagged_tokens.parallel_apply(lambda row: lemmatize(row['token'], row['tag']), axis=1)
df_english_tagged_tokens

### Value Counts

In [None]:
# Combine token and tag into one (token, tag) tuple per row so that each
# pair can be counted as a single value.
df_english_tagged_tokens['tuple'] = list(
    df_english_tagged_tokens[['token', 'tag']].itertuples(index=False, name=None)
)
df_english_tagged_tokens

In [None]:
# Frequency of each (token, tag) pair; value_counts() sorts by count, highest first.
token_counts = df_english_tagged_tokens.tuple.value_counts()
token_counts

In [None]:
# Keep only the pairs that occur more than 3 times.
token_counts = token_counts[token_counts > 3]
token_counts

In [None]:
# Turn the counts into a frame with the columns 'token' and 'count'.
# The index and value names are set explicitly rather than renaming an
# auto-generated 'index' column: the label reset_index() gives the former
# index of a value_counts() result differs between pandas versions, and the
# next step reads the 'token' column.
df_token_counts = token_counts.rename_axis('token').reset_index(name='count')
df_token_counts

In [None]:
# Unpack the (token, tag) tuples held in 'token' into separate 'token' and 'tag' columns.
df_token_counts[['token', 'tag']] = pd.DataFrame(df_token_counts.token.tolist(), index=df_token_counts.index)
df_token_counts

In [None]:
# Write the counts to disk; to_csv() does not create the 'dump/' directory, so it must already exist.
df_token_counts.to_csv('dump/token_count.csv')

In [None]:
# The 15 most frequent tokens within each tag group.
# NOTE(review): groupby() skips missing keys by default, so tokens whose tag is None are presumably left out — confirm that is intended.
df_token_counts.groupby('tag').apply(lambda x: x.nlargest(15, ['count'])).reset_index(drop=True)

In [None]:
# Look up the counts recorded for the token 'breaking'.
df_token_counts[df_token_counts.token == 'breaking']