# News Filtering

In [None]:
import pandas as pd
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)

import os

import emoji
import string

import fasttext

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import spacy
sp = spacy.load('en_core_web_sm')

## Reading Data

### Tweets

In [None]:
# Load the tweets dataset from its parquet file and preview the first rows.
data = pd.read_parquet('data/tweets/_all_cleaned/tweets.parquet')
data.head()

In [None]:
# Wider preview: the first 20 rows.
data.head(20)

### Profane Vocabulary

In [None]:
# Directory holding the profane-vocabulary word lists.
path = 'data/bad_words/'
# Select real CSV files by extension. Testing "'csv' in name" would also pick
# up names such as "en_list.csv.bak" or "csv_notes.txt". sorted() keeps the
# load order deterministic across runs.
files = [os.path.join(path, f) for f in sorted(os.listdir(path)) if f.endswith('.csv')]
files

In [None]:
# One frame per word-list file: a single 'words' column (the files have no
# header row) plus a 'lang' column taken from the file-name prefix before the
# first underscore.
bad_words = [
    pd.read_csv(csv_path, header=None, names=['words'])
      .assign(lang=csv_path.split('/')[-1].split('_')[0])
    for csv_path in files
]

In [None]:
# Stack the per-file frames into one table with a fresh 0..n-1 index and drop
# rows that are exact (words, lang) duplicates.
# NOTE: this rebinds `bad_words` from a list to a DataFrame, so the cell cannot
# be re-run without re-running the loading cell first.
bad_words = pd.concat(bad_words, ignore_index=True).drop_duplicates()

In [None]:
# Display the combined profane-vocabulary table.
bad_words

## Pre-Processing 

### Language Identification

In [None]:
# Load the fastText model used below to predict each tweet's language.
model = fasttext.load_model('data/lid.176.bin')

Remove redundant spaces, links and emojis

In [None]:
# Text used for language identification: links and emojis removed, whitespace
# normalised.
# Whitespace is collapsed and stripped LAST: removing a link or an emoji from
# the middle of a text leaves two adjacent spaces (or a text made only of
# blanks), so normalising first would leave those artefacts in the result.
# The final text contains no newlines, which model.predict requires.
data_clean_text = (
    data.text
        .str.replace(r'http\S+', '', regex=True)
        .parallel_apply(lambda s: emoji.replace_emoji(s, replace=''))
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
)
data_clean_text

In [None]:
# Predict the two most likely labels (k=2) for every cleaned text in one call.
# The result is unpacked in the next cells into labels and confidences.
langs = model.predict(data_clean_text.tolist(), k=2)

In [None]:
# zip(*langs) pairs the two parallel sequences returned by model.predict, giving
# one (labels, confidences) row per text.
# The frame reuses the index of the texts that were scored, so the index-based
# join back onto `data` lines up even when that index is not 0..n-1.
# Assumes index labels are unique — TODO confirm.
df = pd.DataFrame(list(zip(*langs)), columns=['pred_lang', 'pred_conf'], index=data_clean_text.index)
df

In [None]:
# Each cell of pred_lang / pred_conf currently holds the k=2 predictions for a
# tweet. Split them into a best-guess column and a runner-up column.
# Order matters: pred_lang is read as a sequence column first, then overwritten.
df[['pred_lang', 'pred_lang_2']] = pd.DataFrame(df.pred_lang.values.tolist(), index=df.index)
# Drop the '__label__' prefix so only the language code remains.
df['pred_lang'] = df.pred_lang.str.replace('__label__', '')
df['pred_lang_2'] = df.pred_lang_2.str.replace('__label__', '')
# Same split for the confidences.
df[['pred_conf', 'pred_conf_2']] = pd.DataFrame(df.pred_conf.values.tolist(), index=df.index)
df

In [None]:
# Quick look at the distribution of the top-prediction confidence.
df.pred_conf.hist()

In [None]:
# Histogram (as probabilities, 20 bins) with a KDE overlay of the confidence of
# the top language prediction.
sns.displot(df.pred_conf, kde=True, height=4, aspect=1.5, bins=20, stat='probability')
title = plt.title('Language Identification Confidence')
# savefig does not create missing directories, so make sure the target exists
# before writing. The file is named after the plot title.
plot_dir = 'plots/lang_id/'
os.makedirs(plot_dir, exist_ok=True)
plt.savefig(plot_dir + title.get_text() + '.svg', format='svg', bbox_inches="tight")
plt.show()

In [None]:
# Attach the prediction columns to the tweets. join() aligns rows on the index,
# so df's index must correspond to data's index.
# NOTE: re-running this cell fails because the prediction columns already exist.
data = data.join(df)
data

In [None]:
# Absolute counts of tweets where the existing `lang` column agrees (True) or
# disagrees (False) with the predicted language.
(data.lang == data.pred_lang).value_counts(normalize=False)

In [None]:
# Same comparison, expressed as fractions of all tweets.
(data.lang == data.pred_lang).value_counts(normalize=True)

In [None]:
# Inspect the tweets whose top prediction has confidence below 0.5 (these are
# dropped in the next cell).
data[data.pred_conf < 0.5]

In [None]:
# Keep tweets whose language prediction has confidence >= 0.5 and whose cleaned
# text still has content. The text is stripped before measuring, so a text
# reduced to blanks (e.g. only emojis separated by spaces) is dropped as well
# instead of counting as non-empty.
has_text = data_clean_text.str.strip().str.len() > 0
data = data[(data.pred_conf >= 0.5) & has_text]
len(data)

### English Tweets

In [None]:
# Profane words whose list was tagged 'en', as a plain Python list.
en_bad_words = bad_words[bad_words.lang == 'en'].words.tolist()
len(en_bad_words)

In [None]:
# Tweets predicted to be English. copy() makes the subset independent of `data`
# so columns can be added to it later.
data_en = data[data.pred_lang == 'en'].copy()
data_en.shape

In [None]:
# Sample of tweets predicted as English whose `lang` column says otherwise.
data_en[data_en.lang != 'en'][['lang', 'text']].head(30)

In [None]:
# Print the full text of the tweet with index label 681 (hard-coded label;
# raises KeyError if that row is not in data_en).
print(data_en.loc[681].text)

In [None]:
# Print the full text of the tweet with index label 3183 (hard-coded label).
print(data_en.loc[3183].text)

In [None]:
# Print the full text of the tweet with index label 3004 (hard-coded label).
print(data_en.loc[3004].text)

In [None]:
# Persist the English subset (including the prediction columns) for later use.
data_en.to_parquet('data/tweets/en/english_tweets.parquet')

## News Identification Using Users

In [None]:
# Table of news accounts. Column names are supplied here, so the first line of
# the file is read as data rather than as a header.
news_users = pd.read_csv('data/news/twitter_users_news.csv', names=['handle', 'id', 'website'])

In [None]:
# Flag tweets whose author id appears in the news accounts table.
# author_id is cast to int before the lookup — presumably it is stored as text
# while news_users['id'] is numeric; TODO confirm the dtypes match.
data_en['news_user'] = data_en.author_id.astype(int).isin(news_users['id'])

## News Identification Using Text Attributes

### Text Cleaning

### Remove redundant spaces, links and emojis

In [None]:
# Start from the raw tweet text of the English subset.
data_en_texts = data_en.text

In [None]:
# Remove links and emojis, then normalise whitespace.
# Whitespace is collapsed and stripped LAST: removing a link or an emoji from
# the middle of a text leaves two adjacent spaces (or stray leading/trailing
# blanks), so normalising first would leave redundant spaces in the result.
data_en_texts = (
    data_en_texts
        .str.replace(r'http\S+', '', regex=True)
        .parallel_apply(lambda s: emoji.replace_emoji(s, replace=''))
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
)
data_en_texts

### Tokenize

In [None]:
from time import time

In [None]:
%%time
# Tokenize every cleaned text with NLTK's word_tokenize, in parallel.
# Each element becomes a list of token strings.
data_en_tokens = data_en_texts.parallel_apply(word_tokenize)
data_en_tokens

### Tag Tokens

In [None]:
%%time
# Part-of-speech tag each token list with NLTK's pos_tag, in parallel.
# Each element becomes a list of (token, tag) pairs.
data_en_tokens_tags = data_en_tokens.parallel_apply(pos_tag)
data_en_tokens_tags

### Separate Text from Mentions and Hashtags

In [None]:
def separate(ls):
    """Split tagged tokens into plain text, hashtags and mentions.

    `ls` is an iterable of (token, tag) pairs. A '#' or '@' token is not kept
    itself; instead the token right after it is recorded as a hashtag or
    mention (sigil + token, tag discarded), whatever that token is. A sigil at
    the very end has nothing to attach to and is dropped. Every other pair is
    kept unchanged under 'text'.

    Returns a dict with keys 'text', 'hashtags' and 'mentions'.
    """
    plain, hashtags, mentions = [], [], []
    # Sigil ('#' or '@') seen on the previous token and still awaiting its word.
    pending = None

    for token, tag in ls:
        if pending is not None:
            target = hashtags if pending == '#' else mentions
            target.append(pending + token)
            pending = None
        elif token == '#' or token == '@':
            pending = token
        else:
            plain.append((token, tag))

    return {'text': plain, 'hashtags': hashtags, 'mentions': mentions}

In [None]:
# Split every tweet's tagged tokens into text / hashtags / mentions ...
english_tokens_list = data_en_tokens_tags.parallel_apply(separate)
# ... and expand the resulting dicts into one column per key, keeping the
# tweets' original index.
df_english_tokens = pd.DataFrame(english_tokens_list.tolist(), index=data_en_tokens_tags.index)
df_english_tokens

### Translate Tags

In [None]:
# One row per (token, tag) pair: explode() repeats the tweet's index label for
# each of its tokens, and dropna() removes the NaN row produced for tweets
# whose text list is empty.
df_english_token_texts = df_english_tokens.text.explode().dropna()
df_english_token_texts

In [None]:
# https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/
def pos_tagger(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Only the leading letter of the tag matters: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag returns None.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return None

In [None]:
# For each (token, tag) pair: lower-case the token and translate the NLTK tag
# into its WordNet equivalent (None when there is none).
df_english_token_texts = df_english_token_texts.parallel_apply(lambda x: (x[0].lower(), pos_tagger(x[1])))
df_english_token_texts

In [None]:
# Unpack the (token, tag) tuples into two columns. The index still carries the
# tweet each token came from, so labels repeat once per token.
df_english_tagged_tokens = pd.DataFrame(df_english_token_texts.tolist(), index=df_english_token_texts.index, columns=['token', 'tag'])
df_english_tagged_tokens

### Clean Text

#### Remove Punctuations

In [None]:
# Matches a token made up entirely of non-word, non-space characters, or a lone
# underscore (which \w would otherwise count as a word character).
# Written as a raw string: '\w' and '\s' are invalid escape sequences in a
# normal string literal and trigger a warning on current Python versions.
all_punct_regex = r'^([^\w\s]+|_)$'

In [None]:
# A token is punctuation if it is a single ASCII punctuation character or if it
# consists only of punctuation according to all_punct_regex.
is_single_punct = df_english_tagged_tokens.token.isin(list(string.punctuation))
is_all_punct = df_english_tagged_tokens.token.str.match(all_punct_regex)
# Keep every token that is neither.
df_english_tagged_tokens = df_english_tagged_tokens[~(is_single_punct | is_all_punct)]
df_english_tagged_tokens

#### Remove All Numbers

In [None]:
# Drop tokens that consist solely of numeric characters.
# NOTE(review): str.isnumeric() is False for tokens containing a sign, decimal
# point or thousands separator (e.g. '3.5', '1,000'), so those are kept —
# confirm this matches the intent of "all numbers".
df_english_tagged_tokens = df_english_tagged_tokens[(~df_english_tagged_tokens.token.str.isnumeric())]
df_english_tagged_tokens

#### Remove Stopwords

In [None]:
# These tokens come from tweets identified as English, so only the English stop
# list applies. Calling stopwords.words() without a language returns the stop
# words of every language in the corpus, which would also remove ordinary
# English words that are stop words elsewhere.
# The NLTK and spaCy lists are merged once into a set for a single lookup.
english_stop_words = set(stopwords.words('english')) | set(sp.Defaults.stop_words)
lowered_tokens = df_english_tagged_tokens.token.str.lower()
df_english_tagged_tokens = df_english_tagged_tokens[~lowered_tokens.isin(english_stop_words)]
df_english_tagged_tokens

### Lemmatize

In [None]:
# Single shared lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()
def lemmatize(word, tag):
    """Return the lemma of `word` for WordNet POS `tag`.

    When `tag` is None (no WordNet equivalent was found for the token's POS
    tag), the word is returned unchanged.
    """
    return word if tag is None else lemmatizer.lemmatize(word, tag)

In [None]:
# Replace each token by its lemma (row-wise, in parallel). assign() works on a
# copy, so the filtered frame this one was sliced from is left untouched.
df_english_tagged_tokens = df_english_tagged_tokens.assign(
    token=df_english_tagged_tokens.parallel_apply(
        lambda row: lemmatize(row['token'], row['tag']),
        axis=1,
    )
)
df_english_tagged_tokens

### News Words Identification

In [None]:
# Load the news vocabulary: the CSV's 'index' column is renamed to 'words' and
# extracted as an array.
news_words = pd.read_csv('data/news/news_words.csv').rename(columns={'index': 'words'})['words'].values
news_words

In [None]:
# Mark every token that belongs to the news vocabulary. isin() always yields
# plain booleans, so no NaN handling is needed afterwards.
is_news_word = df_english_tagged_tokens.token.isin(news_words)
# The index repeats the tweet label once per token, so grouping on it and
# reducing with any() gives one flag per tweet: True if at least one token is a
# news word. The built-in groupby reduction avoids a Python call per group.
news_series = is_news_word.groupby(level=0).any().rename('news_words')
news_series

In [None]:
# Add the per-tweet 'news_words' flag. Tweets with no row in news_series (no
# tokens left after cleaning) get NaN, hence the fillna(False) where it is used.
# NOTE: re-running this cell fails because the column already exists.
data_en = data_en.join(news_series)

## Candidate News Tweets

In [None]:
# Preview the English tweets with the two news indicator columns.
data_en.head()

In [None]:
# Number of tweets written by a known news account.
len(data_en[data_en.news_user])

In [None]:
# Number of tweets containing at least one news word (missing flag = False).
len(data_en[data_en.news_words.fillna(False)])

In [None]:
# Candidate news tweets: written by a news account OR containing a news word.
# copy() so that new columns can be added without touching data_en.
candidate_news = data_en[(data_en.news_words.fillna(False)) | (data_en.news_user)].copy()
len(candidate_news)

## Structure Filtering

In [None]:
# Preview the candidate set before the structural filter.
candidate_news.head()

In [None]:
# Structural test on the raw text: splitting on 'http' must give exactly two
# parts (i.e. exactly one occurrence of 'http') and the part after it must be a
# single whitespace-delimited word — the link is the last thing in the tweet.
def _has_single_trailing_link(parts):
    word_counts = [len(part.split()) for part in parts]
    return len(word_counts) == 2 and word_counts[-1] == 1

text_parts = candidate_news.text.str.split('http')
candidate_news['text_url'] = text_parts.apply(_has_single_trailing_link)

## Profanity and Emoji Filtering

In [None]:
# Build the lookup set once: testing membership in a list scans the whole list
# for every word of every tweet, while a set lookup is constant-time and gives
# the same answer.
en_bad_word_set = set(en_bad_words)
# For each tweet, collect the whitespace-separated words found in the English
# profanity list.
# NOTE(review): matching is exact and case-sensitive, so 'WORD' or 'word,' will
# not match 'word' — confirm this is intended.
candidate_news['profane_words'] = candidate_news.text.parallel_apply(lambda s: [w for w in s.split() if w in en_bad_word_set])

In [None]:
# A tweet is clean when its list of matched profane words is empty.
candidate_news['not_profane'] = candidate_news['profane_words'].apply(len) < 1

In [None]:
# Preview the candidates with the text_url / profane_words / not_profane columns.
candidate_news.head()

## News

In [None]:
# Final news tweets: candidates that pass both the structure filter (text_url)
# and the profanity filter (not_profane). This cell counts them.
len(candidate_news[candidate_news.text_url & candidate_news.not_profane])

In [None]:
# Show the text of the tweets that pass both filters.
candidate_news[candidate_news.text_url & candidate_news.not_profane].text

In [None]:
# Export the `id` column of the tweets that pass both filters, without a header
# row. index=None is presumably treated like index=False (no index column) —
# the parameter is normally a bool; TODO confirm.
candidate_news[candidate_news.text_url & candidate_news.not_profane]['id'].to_csv('data/news/news_indexes.csv', index=None, header=False)