In [11]:
import pandas as pd
import nltk
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
from dateutil import parser
import re
from geotext import GeoText

# Spacy for tokenizing our texts

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Gensim is needed for modeling

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from gensim.utils import tokenize

from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Gab Daos\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [7]:
# Setting up Spacy Tokenizer: instantiate spaCy's English language class.
# The lemmatizer and stop-word components defined below are registered on
# this object with nlp.add_pipe, so calling nlp(text) runs them in order.
nlp = English()

def lemmatizer(doc):
    """Rebuild ``doc`` from the lemmas of its tokens.

    Tokens whose lemma is the placeholder '-PRON-' (pronouns such as "I"
    and "you") are skipped. The remaining lemmas are joined with single
    spaces and re-tokenized through ``nlp.make_doc``.
    """
    kept_lemmas = []
    for token in doc:
        lemma = token.lemma_
        if lemma == '-PRON-':
            continue
        kept_lemmas.append(lemma)
    return nlp.make_doc(u' '.join(kept_lemmas))
    
def remove_stopwords(doc):
    """Return the text of every token that is neither a stop word nor punctuation.

    The result is a plain list of strings (``token.text``), which is the
    input format Gensim expects.
    """
    kept_texts = []
    for token in doc:
        # Skip anything flagged as a stop word or as punctuation.
        if token.is_stop or token.is_punct:
            continue
        kept_texts.append(token.text)
    return kept_texts

# Register the custom components on the pipeline in processing order:
# lemmatize first, then drop stop words and punctuation as the final step.

pipeline_steps = (
    (lemmatizer, {'name': 'lemmatizer'}),
    (remove_stopwords, {'name': "stopwords", 'last': True}),
)
for component, options in pipeline_steps:
    nlp.add_pipe(component, **options)

In [8]:
# This function fits a Gensim LDA topic model on a list of tokenized texts and returns the fitted model


def topic_modeler(tokenized_texts, no_topics, no_words):
    """Fit a Gensim LDA topic model on pre-tokenized documents.

    Parameters
    ----------
    tokenized_texts : list of list of str
        One list of token strings per document.
    no_topics : int
        Number of topics the LDA model should learn.
    no_words : int
        Not used when fitting. Kept so existing callers that pass it keep
        working; the number of words per topic is chosen by the caller when
        querying the returned model (e.g. ``show_topic(..., topn=...)``).

    Returns
    -------
    gensim.models.ldamodel.LdaModel
        The fitted model. ``random_state`` is fixed at 3 so repeated runs on
        the same input give the same topics.
    """
    dictionary = corpora.Dictionary(tokenized_texts)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_texts]

    # A c_v coherence score used to be computed here and then thrown away.
    # It never influenced the returned model, so the (expensive) computation
    # was dropped; build a CoherenceModel from the returned model if needed.
    return gensim.models.ldamodel.LdaModel(corpus=bow_corpus,
                                           id2word=dictionary,
                                           random_state=3,
                                           num_topics=no_topics)

In [389]:
# Load the scraped Rappler articles and drop the first CSV column
# (NOTE(review): presumably a saved index column -- confirm).
df = pd.read_csv('rappler_scraping.csv')
df = df.iloc[:, 1:]
# Normalise dates to 'YYYY-MM-DD' strings; later cells compare them as text.
df['date'] = [parser.parse(date).strftime('%Y-%m-%d') for date in df['date']]
# Keep only articles mentioning 'coronavirus'. na=False treats a missing text
# as "no match" instead of producing a NaN mask that pandas refuses to index with.
df = df[df['text'].str.contains('coronavirus', na=False)]
df = df.reset_index(drop=True)

# Province lookup table. Later cells read location['Pro_Name'], so it has to be
# loaded here for the notebook to run from a fresh kernel. Values are lower-cased
# to match the lower-cased names stored in PH_Loc.
# NOTE(review): applymap(str.lower) assumes every column holds strings -- confirm.
location = pd.read_csv('ph_locations.csv')
location = location.applymap(str.lower)

In [390]:
# LDA Topics 

# Lower-case every article and run it through the spaCy pipeline. Each
# tokenized article is wrapped in its own list so that it forms a
# one-document corpus for the topic model.
words = df['text'].str.lower()
listWords = [[nlp(article)] for article in words]

# Fit a single-topic LDA model per article and keep the top 30 words of
# that topic (the weights returned alongside the words are discarded).
topics = [
    [term_weight[0] for term_weight in topic_modeler(corpus, 1, 30).show_topic(0, topn = 30)]
    for corpus in listWords
]

df['LDA_Topics'] = topics

In [391]:
# Extracting all the counting phrases in the articles

# A counting phrase starts at a digit and optionally continues through a run
# of characters that are not letters/apostrophes/hyphens, followed by one word.
count_phrase = re.compile("(?:[a-zA-Z'-]+[^a-zA-Z'-]+){0,0}[0-9](?:[^a-zA-Z'-]+[a-zA-Z'-]+){0,1}")
df['count_docs'] = df['text'].apply(lambda article: count_phrase.findall(article))

checker = ['confirmed','suspected','quarantine','case','infected','monitoring','chinese']

# For every article keep at most one phrase: the first one that mentions
# any of the case-related keywords. Articles without such a phrase get [].
count_docs = []
for phrases in df['count_docs']:
    first_hit = next(
        (phrase for phrase in phrases if any(ext in phrase.lower() for ext in checker)),
        None,
    )
    count_docs.append([] if first_hit is None else [first_hit])

df['count_docs'] = count_docs

In [392]:
# Extracting all the PH Locations using geotext on the articles

# For every article: collect the Philippine city names GeoText finds, lower-case
# them and remove the word 'city'. The result is stripped so that e.g.
# 'cebu city' becomes 'cebu' rather than 'cebu ' (the trailing space would stop
# the name from matching a province name later on). Names are de-duplicated
# after normalisation and empty leftovers are discarded.
df['PH_Loc'] = [
    list(
        {name.lower().replace('city', '').strip() for name in GeoText(content, 'PH').cities}
        - {''}
    )
    for content in df['text']
]

In [393]:
# Identifying which articles are about suspicious or confirmed cases of the virus

# Topic words that mark an article as reporting a suspected case.
suspect_terms = ['suspected', 'quarantine', 'case', 'infected', 'monitoring']

status = []
for _, row in df.iterrows():
    row_topics = row['LDA_Topics']
    title = row['title']
    # 'confirmed' among the topic words only counts from 2020-01-30 onwards
    # (dates are ISO strings, so plain string comparison orders them correctly).
    # A former extra branch that also required 'confirm' in the title produced
    # the same label and was fully covered by this condition, so it was merged.
    if 'confirmed' in row_topics and row['date'] >= '2020-01-30':
        status.append('confirmed')
    # Fact-check / debunking articles are never labelled as suspected cases.
    elif (any(term in row_topics for term in suspect_terms)
          and 'FACT CHECK' not in title
          and 'FALSE' not in title):
        status.append('suspected')
    else:
        status.append('')
df['status'] = status

In [396]:
# Selecting Provinces in the identified locations

# Keep only those detected names that are also province names in the lookup table.
province_names = set(location['Pro_Name'].unique())
df['PH_Loc'] = [list(set(loc) & province_names) for loc in df['PH_Loc']]

In [397]:
# For locations not identified through the text, it will check with the LDA topics if a location is identified and use it instead

# Rebuild the column in one pass instead of assigning list values cell by cell
# through df.loc: that assignment could raise ValueError, which was silently
# skipped and left the row without its fallback location.
fallback_provinces = set(location['Pro_Name'].unique())
df['PH_Loc'] = [
    found if len(found) > 0 else list(set(row_topics) & fallback_provinces)
    for found, row_topics in zip(df['PH_Loc'], df['LDA_Topics'])
]

In [403]:
# Cleaning the document counts to just numbers

# The number is whatever precedes the first space of the article's counting
# phrase; articles without a phrase get 0.
counts = [
    phrases[0].split(' ')[0] if len(phrases) > 0 else 0
    for phrases in df['count_docs']
]

# Store everything as text with thousands separators and periods removed.
df['counts'] = [str(raw).replace(',', '').replace('.', '') for raw in counts]

In [404]:
# Finalizing Locations

# One location per article: the first entry of PH_Loc, or '' when the list is empty.
ph_loc = [loc[0] if len(loc) > 0 else '' for loc in df['PH_Loc']]
df['Loc'] = ph_loc

# Processing for CSV

In [407]:
# Renumber the rows from zero and write the fully parsed frame to disk
# (without the index column).
df = df.reset_index(drop=True)
df.to_csv(path_or_buf='rappler_parsed.csv', index=False)

In [408]:
# Keep only the article metadata and the derived fields needed for the export.
export_columns = ['source_id', 'date', 'category', 'title', 'author',
                  'text', 'status', 'counts', 'Loc']
df_test = df[export_columns]

In [409]:
# Province table; the cells below use its 'Pro_Name' and 'coordinates' columns.
prov = pd.read_csv(filepath_or_buffer='provinces.csv')

In [410]:
# Left join: every article is kept; province columns are attached where
# the article's 'Loc' equals a province's 'Pro_Name'.
df_test = pd.merge(df_test, prov, how='left', left_on='Loc', right_on='Pro_Name')

In [411]:
# After the merge, keep the article fields plus the province coordinates only.
final_columns = ['source_id', 'date', 'category', 'title', 'author',
                 'text', 'status', 'counts', 'Loc', 'coordinates']
df_test = df_test[final_columns]

In [412]:
# Write the merged export frame to disk without the index column.
df_test.to_csv(path_or_buf='rappler_test.csv', index=False)

In [21]:
# NOTE(review): `filtered_df` is not defined in any cell visible in this notebook,
# and this cell's execution count (21) is far lower than the cells above it. It
# looks like a leftover from an earlier session and will raise NameError on a
# fresh Restart & Run All -- confirm which frame was meant, or remove the cell.
filtered_df.to_csv('sample_output.csv', index = False)