In [75]:
import pandas as pd
import nltk
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
from dateutil import parser
import re
from geotext import GeoText
import numpy as np
from scipy import stats 
import geopandas as gpd

# Spacy for tokenizing our texts

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Gensim is needed for modeling

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from gensim.utils import tokenize

from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jrdaos\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Setting up Spacy Tokenizer
# Instantiate spaCy's English language class; the custom components defined
# later in this cell are registered on this object with `nlp.add_pipe`.
nlp = English()

def lemmatizer(doc):
    """Pipeline component: rebuild `doc` from the lemmas of its tokens.

    Tokens whose lemma is the placeholder '-PRON-' (what pronouns such as
    "I" and "you" are lemmatized to) are dropped. The remaining lemmas are
    joined with single spaces and turned back into a document with
    `nlp.make_doc`.
    """
    kept_lemmas = []
    for tok in doc:
        lemma = tok.lemma_
        if lemma == '-PRON-':
            continue
        kept_lemmas.append(lemma)
    return nlp.make_doc(u' '.join(kept_lemmas))
    
def remove_stopwords(doc):
    """Pipeline component: keep the text of tokens that carry content.

    A token is dropped when it is flagged as a stop word (`is_stop`) or as
    punctuation (`is_punct`).

    Returns a plain list of strings (`token.text`) rather than a document,
    because that is the form Gensim needs downstream.
    """
    # Plain boolean logic instead of comparing the flags against True.
    return [token.text for token in doc if not (token.is_stop or token.is_punct)]

# This will add pipelines in our tokenization process.
# Order matters: `lemmatizer` is registered first and `remove_stopwords` is
# pinned to the end (last=True). Because the final component returns a list of
# strings, nlp(text) is presumably that list rather than a Doc — the LDA cell
# relies on this when it feeds nlp(...) output to Gensim.
# NOTE(review): passing bare callables to add_pipe looks like the spaCy v2 API;
# confirm the pinned spaCy version before upgrading.

nlp.add_pipe(lemmatizer,name='lemmatizer')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [3]:
# Fits an LDA topic model on a collection of already-tokenized texts.


def topic_modeler(tokenized_texts, no_topics, no_words):
    """Fit a Gensim LDA model on pre-tokenized documents.

    Parameters
    ----------
    tokenized_texts : list of list of str
        One list of tokens per document.
    no_topics : int
        Number of topics to fit (passed to LdaModel as `num_topics`).
    no_words : int
        Currently unused; kept so existing callers keep working. Ask the
        returned model for its top words with `show_topic(..., topn=...)`.

    Returns
    -------
    gensim.models.ldamodel.LdaModel
        The fitted model, trained with a fixed `random_state=3`.
    """
    dictionary = corpora.Dictionary(tokenized_texts)
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_texts]

    # The c_v coherence score that used to be computed here was never
    # returned or displayed, so that work has been dropped; the returned
    # model is built exactly as before.
    return gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           random_state=3,
                                           num_topics=no_topics)

In [169]:
# Load the scraped articles, drop the first (saved index) column, normalise
# dates to 'YYYY-MM-DD', keep Philippine articles mentioning 'coronavirus',
# then de-duplicate and renumber the rows.
df = (
    pd.read_csv('rappler_scraping.csv')
    .iloc[:, 1:]
    .assign(date=lambda frame: [parser.parse(raw).strftime('%Y-%m-%d') for raw in frame['date']])
    .loc[lambda frame: frame['text'].str.contains('coronavirus')]
    .loc[lambda frame: frame['category'] == 'Philippines']
    .drop_duplicates()
    .reset_index(drop=True)
)

# Reference table of Philippine locations, lower-cased cell by cell so it can
# be matched against lower-cased names extracted from the articles.
location = pd.read_csv('ph_locations.csv').applymap(str.lower)

In [170]:
# LDA Topics
# Each article is modelled on its own: its token list is wrapped into a
# one-document corpus, a single-topic model is fitted, and the 30 highest
# ranked words of that topic are stored for the article.

words = df['text'].str.lower()
listWords = [[nlp(article)] for article in words]

topics = []
for single_doc_corpus in listWords:
    fitted = topic_modeler(single_doc_corpus, 1, 30)
    ranked_terms = fitted.show_topic(0, topn = 30)
    # show_topic yields (word, weight) pairs; only the word is kept.
    topics.append([pair[0] for pair in ranked_terms])

df['LDA_Topics'] = topics

In [171]:
# Extracting all the counting phrases in the articles
# A counting phrase is a number (optionally with one ",ddd" group) followed by
# whitespace and a word, e.g. "14 cases" or "1,200 passengers".
# The pattern is a raw string: "\d" and "\s" are not valid escapes in a normal
# Python string literal and trigger an invalid-escape warning.

df['count_docs'] =  df['text'].apply(lambda x: re.findall(r"\d+(?:,\d+)?\s+[a-zA-Z]+", x))

checker = ['confirmed','suspected','quarantine','case','infected','monitoring','chinese','monitored']


def _first_case_phrase(phrases):
    """Return a list holding the first phrase that contains a `checker` keyword, or [] if none does."""
    for phrase in phrases:
        lowered = phrase.lower()
        if any(keyword in lowered for keyword in checker):
            return [phrase]
    return []


# Only the first matching phrase per article is kept (a list of length 0 or 1).
count_docs = [_first_case_phrase(phrases) for phrases in df['count_docs']]

df['count_docs'] = count_docs

In [172]:
# Extracting all the PH Locations using geotext on the articles


def _normalize_city(name):
    """Lower-case a city name and remove the word 'city' together with the whitespace it leaves behind."""
    # Without the strip, "Cebu City" becomes "cebu " (trailing space), which
    # can never equal an entry of location['Pro_Name'] in the later matching cell.
    return name.lower().replace('city', '').strip()


df['PH_Loc'] = [
    [_normalize_city(city) for city in set(GeoText(content, 'PH').cities)]
    for content in df['text']
]

In [173]:
# Identifying which articles are about suspicious or confirmed cases of the virus

# 'confirmed' is only assigned to articles dated on or after this day. Plain
# string comparison is enough because the load cell formats dates as 'YYYY-MM-DD'.
CONFIRMED_FROM_DATE = '2020-01-30'
SUSPECTED_TERMS = ['suspected','quarantine','case','infected','monitoring']


def _classify_status(lda_topics, title, date):
    """Return 'confirmed', 'suspected' or '' for one article.

    The original had an extra first branch that also required 'confirm' in the
    title, but it produced the same label as the branch below it, so the two
    are merged here without changing any result.
    """
    if 'confirmed' in lda_topics and date >= CONFIRMED_FROM_DATE:
        return 'confirmed'
    mentions_suspicion = any(term in lda_topics for term in SUSPECTED_TERMS)
    # Fact-check articles are excluded from the 'suspected' bucket.
    if mentions_suspicion and 'FACT CHECK' not in title and 'FALSE' not in title:
        return 'suspected'
    return ''


status = [
    _classify_status(lda_topics, title, date)
    for lda_topics, title, date in zip(df['LDA_Topics'], df['title'], df['date'])
]
df['status'] = status

In [174]:
# Selecting Provinces in the identified locations

# Build the province lookup once instead of once per article.
province_names = set(location['Pro_Name'].unique())

# sorted() makes the order of the kept names deterministic. A bare
# list(set & set) has arbitrary order, and a later cell takes the first
# element as the article's location.
df['PH_Loc'] = [sorted(set(loc) & province_names) for loc in df['PH_Loc']]

In [175]:
# For locations not identified through the text, it will check with the LDA topics if a location is identified and use it instead

# The column is rebuilt in one pass. The previous per-row `df.loc[...] = [list]`
# assignment inside iterrows skipped any row on which pandas raised ValueError,
# so such articles silently kept an empty location list.
fallback_provinces = set(location['Pro_Name'].unique())

df['PH_Loc'] = [
    found if len(found) > 0 else sorted(set(lda_terms) & fallback_provinces)
    for found, lda_terms in zip(df['PH_Loc'], df['LDA_Topics'])
]

In [176]:
# Cleaning the document counts to just numbers
# Every entry of df['count_docs'] is a list holding at most one phrase such as
# "14 cases"; it is split into the number ('counts') and the word ('case').

counts = []
case = []
for count in df['count_docs']:
    # split() without an argument splits on any run of whitespace. The
    # extraction regex separates number and word with \s+, so the separator
    # may be a newline, a tab or several spaces, which split(' ') mishandles.
    parts = count[0].split() if len(count) > 0 else []
    counts.append(parts[0] if len(parts) > 0 else 0)
    case.append(parts[1] if len(parts) > 1 else '')

# Stored as digit-only strings: thousands separators and dots are removed.
df['counts'] = [str(count).replace(',', '').replace('.', '') for count in counts]
df['case'] = case

In [177]:
# Finalizing Locations
# Each article gets a single location: the first entry of its PH_Loc list,
# or an empty string when the list has no entries.


def _first_location(candidates):
    """Return candidates[0], or '' when indexing raises IndexError."""
    try:
        return candidates[0]
    except IndexError:
        return ''


ph_loc = [_first_location(candidates) for candidates in df['PH_Loc']]
df['Loc'] = ph_loc

In [178]:
# Fixing confirmed counts
# An article classified as 'confirmed' whose counted word is not 'confirmed'
# is recorded as a single case; every other article keeps its extracted count.

count_fixer = [
    1 if (row_status == 'confirmed' and row_case != 'confirmed') else row_count
    for row_status, row_case, row_count in zip(df['status'], df['case'], df['counts'])
]

df['counts'] = count_fixer

# Processing for CSV

In [183]:
# Renumber the rows from 0 and persist the parsed articles; index=False keeps
# the row index out of the CSV.
df = df.reset_index(drop = True)
df.to_csv('rappler_parsed.csv', index = False)

In [184]:
# Reload the parsed articles from disk.
# NOTE(review): the CSV round trip changes the data — the df.info() output
# printed by the parse cell shows `counts` as int64 and status/case/Loc with
# fewer non-null rows, presumably because empty strings are read back as NaN
# (and list columns as plain strings). Confirm this is intended.
df = pd.read_csv('rappler_parsed.csv')

In [185]:
def parse(df):
    """Aggregate reported counts per date, location and status.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'date', 'Loc', 'status' and 'counts'.

    Returns
    -------
    pandas.DataFrame
        One row per (date, Loc) with flattened columns named '<agg>_<status>',
        where <agg> is min, max, mean or mode. Zeros are treated as missing
        and forward-filled within each location. Rows are grouped by location
        in the order locations first appear in `df['Loc']`.

    Side effects
    ------------
    Prints `df.info()`. The result is also stored in the module-level name
    `dfb`, kept only so existing code that reads `dfb` continues to work.
    """
    global dfb

    # DataFrame.info() prints on its own and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    df.info()

    # Get min/max/mean/mode values. String names select the same pandas
    # aggregations as the builtins min/max and np.mean did and give the same
    # column labels; stats.mode has no string equivalent here.
    dfa = pd.pivot_table(
        df,
        values='counts',
        index=['date', 'Loc'],
        columns='status',
        aggfunc=['min', 'max', 'mean', stats.mode],
    )

    # Remove multi-index: ('min', 'confirmed') -> 'min_confirmed'
    dfa.columns = ["_".join(pair) for pair in dfa.columns]
    dfa = dfa.reset_index()

    # Replace 0 with np.nan to forward fill null values
    dfa = dfa.replace(0, np.nan)

    # Forward filling needs to be by area, otherwise one location's numbers
    # would leak into the next location's rows.
    places = list(df['Loc'].unique())
    filled = [dfa[dfa['Loc'] == place].ffill() for place in places]

    # One concat instead of DataFrame.append in a loop (append was removed in
    # pandas 2.0 and copied the accumulated frame on every iteration).
    dfb = pd.concat(filled) if filled else pd.DataFrame()
    return dfb

In [186]:
# Aggregate the reloaded articles; `parse` also prints a df.info() summary.
res = parse(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 13 columns):
source_id     68 non-null int64
date          68 non-null object
category      68 non-null object
title         68 non-null object
author        68 non-null object
text          68 non-null object
LDA_Topics    68 non-null object
count_docs    68 non-null object
PH_Loc        68 non-null object
status        31 non-null object
counts        68 non-null int64
case          17 non-null object
Loc           30 non-null object
dtypes: int64(2), object(11)
memory usage: 7.0+ KB
None


In [187]:
# Display the aggregated table.
res

Unnamed: 0,date,Loc,min_confirmed,min_suspected,max_confirmed,max_suspected,mean_confirmed,mean_suspected,mode_confirmed,mode_suspected
1,2020-01-23,cebu,,,,555.0,,185.0,,"([0], [2])"
2,2020-01-24,cebu,,14.0,,14.0,,14.0,,"([14], [1])"
7,2020-01-30,cebu,,14.0,,14.0,,14.0,,"([0], [2])"
9,2020-01-31,cebu,1.0,100.0,1.0,100.0,1.0,100.0,"([1], [2])","([100], [1])"
10,2020-02-02,cebu,1.0,100.0,1.0,100.0,1.0,100.0,"([1], [1])","([100], [1])"
11,2020-02-05,cebu,1.0,100.0,1.0,100.0,1.0,100.0,"([1], [1])","([100], [1])"
0,2020-01-23,aklan,,3.0,,3.0,,3.0,,"([3], [1])"
3,2020-01-25,aklan,,80.0,,80.0,,80.0,,"([80], [1])"
5,2020-01-28,aklan,,11.0,,11.0,,11.0,,"([11], [1])"
6,2020-01-30,aklan,1.0,11.0,1.0,11.0,1.0,11.0,"([1], [1])","([11], [1])"


In [188]:
# Keep only the per-day minimum suspected/confirmed counts for each location.
res = res[['date','Loc', 'min_suspected','min_confirmed']]

In [189]:
# Values still missing after the per-location forward fill in `parse` become 0.
res = res.fillna(0)

In [182]:
# NOTE(review): `prov` is not assigned anywhere above this cell in the notebook
# as saved (its only visible assignment is the gpd.read_file cell that follows),
# so this line fails on Restart & Run All — confirm where `prov` is meant to
# come from, or drop this cell if provinces.shp already exists on disk.
prov.to_file("provinces.shp")

In [107]:
# Load the province table; later cells join it on `Pro_Name` and keep its
# `long` / `lat` columns.
prov = gpd.read_file('provinces.shp')

In [198]:
# Attach province attributes to each aggregated row (default inner join, so
# rows whose Loc has no matching Pro_Name are dropped).
df = res.merge(prov, left_on='Loc', right_on='Pro_Name')

In [199]:
# Keep the six reporting columns and give them their export names.
df = df[['date','Loc','min_suspected','min_confirmed','long','lat']].rename(
    columns={
        'date': 'Date',
        'Loc': 'Location',
        'min_suspected': 'Suspected',
        'min_confirmed': 'Confirmed',
        'long': 'Longitude',
        'lat': 'Latitude',
    }
)

In [200]:
# Convert 'YYYY-MM-DD' dates into ISO-8601 timestamps with microseconds,
# e.g. '2020-01-23' -> '2020-01-23T00:00:00.000000'.
# This uses pandas' datetime handling: the previous version called
# datetime.datetime.strptime, but the `datetime` module is not imported in the
# notebook's import cell, so the cell raised NameError on a fresh kernel.
df['Date'] = (
    pd.to_datetime(df['Date'].astype(str), format='%Y-%m-%d')
    .dt.strftime('%Y-%m-%dT%H:%M:%S.%f')
)

In [202]:
# Write the final table (Date, Location, Suspected, Confirmed, Longitude,
# Latitude) to disk without the row index.
df.to_csv('ncov_parsed.csv', index = False)