In [1]:
#%% libraries
import os
import itertools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

import plotly.plotly as py
import plotly.graph_objs as go

from datetime import datetime
%matplotlib inline

# run for jupyter notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

### functions

In [3]:
def date_stitcher(prefix,ybegin, yend,
                  months = ['{num:02d}'.format(num=x) for x in range(1, 13)],
                  days= ['{num:02d}'.format(num=x) for x in range(1, 32)] ):
    """Enumerate candidate file names of the form ``<prefix>YYYY_MM_DD.csv``.

    Every (year, month, day) combination is emitted, with the year varying
    slowest and the day fastest. No calendar validation is performed, so
    impossible dates such as ``02_31`` are part of the result.

    Parameters
    ----------
    prefix : str
        Text placed in front of the date stamp.
    ybegin, yend : int
        Year range; ``ybegin`` is included, ``yend`` is excluded.
    months, days : list of str
        Month and day labels to combine. They default to the zero-padded
        labels ``01``-``12`` and ``01``-``31``.

    Returns
    -------
    list of str
        One file name per combination.
    """
    year_labels = map(str, range(ybegin, yend))
    return [
        prefix + "_".join(stamp) + '.csv'
        for stamp in itertools.product(year_labels, months, days)
    ]

In [4]:
def kw_search(flist, keywords, inpdir,fuzzymatch=False,verbose=False):
    """Tally keyword frequencies across per-day word-frequency CSV files.

    Each file is read with its ``word`` column as index and its ``freq``
    column as data. Files listed in ``flist`` that are not present in
    ``inpdir`` are skipped and left out of the result.

    Parameters
    ----------
    flist : list of str
        File names, relative to ``inpdir``.
    keywords : list of str
        Words to look up; each becomes a column of the result.
    inpdir : str
        Directory holding the CSV files.
    fuzzymatch : bool, default False
        If False, sum the ``freq`` of rows whose word equals the keyword
        exactly; a keyword that is absent from a file stays NaN.
        If True, sum the ``freq`` of every word that contains the keyword.
        The keyword is handed to ``Series.str.contains`` and is therefore
        interpreted as a regular expression (pandas' default); no match
        gives 0.
    verbose : bool, default False
        Print one progress line per file.

    Returns
    -------
    pandas.DataFrame
        Indexed by file name and restricted to files that exist, with the
        columns ``exists``, ``wordcount`` (sum of ``freq``) and one column
        per keyword. The frame is an independent copy, so callers can
        modify it without triggering ``SettingWithCopyWarning``.
    """
    freqcols = ['word', 'freq']
    # initialise the tally table with one NaN-filled column per statistic
    tallies = pd.DataFrame(flist, columns=['file']).set_index('file')
    for col in ['exists', 'wordcount'] + list(keywords):
        tallies[col] = np.nan

    # count in all files
    for infile in flist:
        path = os.path.join(inpdir, infile)
        if not os.path.exists(path):
            tallies.at[infile, 'exists'] = 0
            if verbose: print(infile, ' does not exist')
            continue

        tallies.at[infile, 'exists'] = 1
        tkp = pd.read_csv(path, usecols=freqcols, index_col='word')
        tallies.at[infile, 'wordcount'] = tkp['freq'].sum()

        # Built once per file rather than once per keyword.
        words = tkp.index.to_series()
        for kw in keywords:
            if fuzzymatch:
                # na=False: words that read_csv turned into NaN (its default
                # NA markers include strings such as "null" and "NA") must
                # count as "no match"; otherwise the mask contains NaN and
                # the lookup fails for the whole file.
                hit = words.str.contains(kw, na=False).values
                tallies.at[infile, kw] = tkp.loc[hit, 'freq'].sum()
            else:
                # Explicit equality mask instead of tkp.loc[kw]: a missing
                # keyword simply leaves the NaN in place, and a word listed
                # more than once is summed instead of being dropped.
                hit = (tkp.index == kw)
                if hit.any():
                    tallies.at[infile, kw] = pd.to_numeric(tkp.loc[hit, 'freq']).sum()
        if verbose: print(infile, 'loaded and searched')

    # subset to files that were found; copy so the result is not a view
    return tallies[tallies['exists'] == 1].copy()

In [5]:
def prep_ngram_data(flist,keywords,inp,fuzzymatch=False,percent=False):
    """Build a date-indexed table of keyword counts from the daily files.

    Parameters
    ----------
    flist : list of str
        Candidate file names; each must contain a ``YYYY_MM_DD`` date stamp.
    keywords : list of str
        Words to tally (passed on to ``kw_search``).
    inp : str
        Directory containing the word-frequency files.
    fuzzymatch : bool, default False
        Passed on to ``kw_search``.
    percent : bool, default False
        If True, each keyword count is divided by that day's ``wordcount``
        and multiplied by 100.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` with the columns ``wordcount`` and one column
        per keyword.

    Raises
    ------
    ValueError
        If a file name that was found carries no ``YYYY_MM_DD`` stamp, or
        the stamp is not a valid date.
    """
    # reset_index() returns a new frame, so the column assignments below
    # never write into a filtered slice (the source of the former barrage
    # of SettingWithCopyWarning messages).
    data = kw_search(flist, keywords, inp, fuzzymatch).reset_index()

    # Locate the date stamp by pattern instead of fixed character offsets,
    # so the result no longer depends on the length of the file-name prefix.
    stamps = data['file'].str.extract(r'(\d{4}_\d{2}_\d{2})', expand=False)
    if stamps.isna().any():
        raise ValueError('file name without a YYYY_MM_DD date stamp: %r'
                         % data.loc[stamps.isna(), 'file'].iloc[0])
    data['date'] = pd.to_datetime(stamps, format='%Y_%m_%d')

    # normalize by day word-count
    if percent:
        for k in keywords:
            data[k] = (data[k]/data['wordcount'])*100

    keepvars = ['date','wordcount'] + list(keywords)
    return data[keepvars].set_index('date')

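When called with `percent=True`, `prep_ngram_data` normalises each keyword count by that day's total word count, giving the share (in percent)
$$
P_{w,t} = 100 \times \frac{N_{w,t}}{\sum_{i=1}^{I} N_{i,t}}
$$

for word $w$ on day $t$, where $N_{i,t}$ is the number of occurrences of word $i$ on day $t$ and the denominator is that day's total word count over all $I$ distinct words.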

## interactive plots 
(Uses plotly. Please open an issue on the GitHub repo if the plots return a 404.)

In [6]:
def plot_interactive(df,vars,header='Appearances in TKP archive'):
    """Draw one interactive line per column of ``df`` against its index.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; its index supplies the x-values and the x-axis is
        declared as a date axis.
    vars : list of str
        Columns of ``df`` to draw, one trace each, named after the column.
    header : str
        Figure title.

    Returns
    -------
    Whatever ``py.iplot`` returns for the assembled figure.
    """
    traces = [go.Scatter(x = df.index,y = df[col],name=col) for col in vars]

    def lookback(count, label, step):
        # range-selector button that looks `count` steps back from the end
        return {'count': count, 'label': label, 'step': step, 'stepmode': 'backward'}

    range_buttons = [
        lookback(1, '1m', 'month'),
        lookback(6, '6m', 'month'),
        {'count': 1, 'label': 'YTD', 'step': 'year', 'stepmode': 'todate'},
        lookback(1, '1y', 'year'),
        lookback(2, '2y', 'year'),
        {'step': 'all'},
    ]
    x_axis = {
        'rangeselector': {'buttons': range_buttons},
        'rangeslider': {},
        'type': 'date',
    }
    figure = {'data': traces, 'layout': {'title': header, 'xaxis': x_axis}}
    return py.iplot(figure)

In [8]:
# preliminaries: archive location (also made the working directory),
# scratch directory, and the folder holding the daily word-frequency files
working = '/media/alal/LAL_DATA/Newspapers/The Kathmandu Post'
os.chdir(working)
tmp = '/home/alal/tmp'
inp = '{}/word_frequencies'.format(working)

# candidate daily file names; the end year is exclusive, so this spans 2007-2017
flist = date_stitcher(prefix='wfreqs_TKP_', ybegin=2007, yend=2018)

In [9]:
# keywords tallied in the natural-disaster series
disasters = [
    'earthquake',
    'fire',
    'flood',
    'drought',
    'landslide',
]

In [11]:
%%time
stressors = prep_ngram_data(flist,disasters,inp,fuzzymatch=True)

CPU times: user 1min 24s, sys: 5.33 s, total: 1min 30s
Wall time: 1min 34s


# Interactive Plots

First, check that daily word counts do not fluctuate too much over time; if they did, trends in the raw keyword counts would be misleading.

### Overall Word Count

In [13]:
plot_interactive(df=stressors, vars=['wordcount'], header='word count TKP')

### Natural Disasters

In [14]:
plot_interactive(df=stressors, vars=disasters, header='Natural Disasters')