In [1]:
#%% libraries
import os
import itertools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

import plotly.plotly as py
import plotly.graph_objs as go

from datetime import datetime
%matplotlib inline

# Notebook display setting: show the value of every top-level expression in a
# cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
def date_stitcher(ybegin, yend, months=None, days=None):
    """Build candidate word-frequency file names for a range of years.

    Every (year, month, day) combination is emitted as
    ``wfreqs_TKP_<year>_<month>_<day>.csv``. No calendar validation is done,
    so impossible dates such as ``02_31`` are included; callers are expected
    to check which files actually exist (``kw_search`` does).

    Parameters
    ----------
    ybegin : int
        First year to include.
    yend : int
        Year at which to stop; exclusive, as with ``range``.
    months : iterable of str, optional
        Month components of the file names. Defaults to '01'..'12'.
    days : iterable of str, optional
        Day components of the file names. Defaults to '01'..'31'.

    Returns
    -------
    list of str
        File names ordered by year, then month, then day.
    """
    # None sentinels instead of list defaults: default lists are created once
    # at definition time and would be shared between calls.
    if months is None:
        months = ['{num:02d}'.format(num=x) for x in range(1, 13)]
    if days is None:
        days = ['{num:02d}'.format(num=x) for x in range(1, 32)]
    years = [str(x) for x in range(ybegin, yend)]
    return ['wfreqs_TKP_' + '_'.join(parts) + '.csv'
            for parts in itertools.product(years, months, days)]


def kw_search(flist, keywords, inpdir, fuzzymatch=False, verbose=False):
    """Tally keyword frequencies across a list of word-frequency CSV files.

    Each file is expected to have ``word`` and ``freq`` columns. Files that
    are not present in ``inpdir`` are skipped and dropped from the result.

    Parameters
    ----------
    flist : list of str
        Candidate file names, relative to ``inpdir``.
    keywords : iterable of str
        Words to look up; each becomes a column of the result.
    inpdir : str
        Directory containing the CSV files.
    fuzzymatch : bool, default False
        If True, sum ``freq`` over every word that contains the keyword
        (``str.contains``, so the keyword is interpreted as a regular
        expression); a keyword with no hits gets 0.
        If False, use the ``freq`` of the word equal to the keyword; a keyword
        absent from the file is left as NaN.
    verbose : bool, default False
        Print a line per file, and per keyword lookup that failed.

    Returns
    -------
    pandas.DataFrame
        Indexed by file name (existing files only), with an ``exists`` column
        and one column per keyword. It is an independent copy, so callers can
        modify it without SettingWithCopyWarning.
    """
    freqcols = ['word', 'freq']
    # one row per candidate file; keyword columns start out as NaN
    tallies = pd.DataFrame(flist, columns=['file']).set_index('file')
    tallies['exists'] = np.nan
    for kw in keywords:
        tallies[kw] = np.nan

    for infile in flist:
        path = os.path.join(inpdir, infile)
        if not os.path.exists(path):
            tallies.at[infile, 'exists'] = 0
            if verbose: print(infile, ' does not exist')
            continue

        tallies.at[infile, 'exists'] = 1
        tkp = pd.read_csv(path, usecols=freqcols, index_col='word')
        # the words as a Series, built once per file rather than once per keyword
        words = tkp.index.to_series() if fuzzymatch else None
        for kw in keywords:
            try:
                if fuzzymatch:
                    # na=False: words such as "null"/"NA" are parsed as NaN by
                    # read_csv; without it the mask contains NaN and boolean
                    # indexing raises, losing the tally for this file.
                    hits = words.str.contains(kw, na=False).to_numpy(dtype=bool)
                    tallies.at[infile, kw] = tkp.loc[hits, 'freq'].sum()
                elif kw in tkp.index:
                    # label-based lookup; summing also covers a word that is
                    # listed more than once in the file
                    tallies.at[infile, kw] = pd.to_numeric(tkp.loc[[kw], 'freq']).sum()
            except Exception as err:
                # best effort: leave this cell as NaN and carry on, but do not
                # swallow KeyboardInterrupt the way a bare except would
                if verbose:
                    print(infile, 'could not be searched for', kw, ':', err)
        if verbose: print(infile, 'loaded and searched')

    # keep only the files that were found; copy so the result is not a view
    return tallies[tallies['exists'] == 1].copy()

def prep_ngram_data(flist, keywords, inp, fuzzymatch=False):
    """Return keyword tallies as a date-indexed DataFrame.

    Runs ``kw_search`` over ``flist`` and converts the file names of the
    files that exist into dates.

    Parameters
    ----------
    flist : list of str
        Candidate file names of the form ``wfreqs_TKP_YYYY_MM_DD.csv``.
    keywords : list of str
        Keywords to tally; these become the columns of the result.
    inp : str
        Directory containing the word-frequency files.
    fuzzymatch : bool, default False
        Passed through to ``kw_search``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` (datetime64), one column per keyword.
    """
    # reset_index() without inplace returns a new frame, so the assignment
    # below never writes into a slice of kw_search's internal table (the
    # source of the SettingWithCopyWarning flood in the in-place version).
    data = kw_search(flist, keywords, inp, fuzzymatch).reset_index()
    # 'wfreqs_TKP_' is 11 characters long; the next 10 are 'YYYY_MM_DD'
    data['date'] = pd.to_datetime(data['file'].str[11:21], format='%Y_%m_%d')
    keepvars = ['date'] + list(keywords)
    return data[keepvars].set_index('date')

In [8]:
# ---- paths ----
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
working = '/media/alal/NEPALINEWS/The Kathmandu Post'
tmp = '/home/alal/tmp'  # not used by any cell shown here
inp = '/media/alal/NEPALINEWS/The Kathmandu Post/word_frequencies'  # passed to prep_ngram_data as the CSV directory
os.chdir(working)

# ---- candidate files: one name per day for 2007 through 2017 (yend is exclusive) ----
flist = date_stitcher(ybegin=2007, yend=2018)

# ---- keyword groups, one per analysis ----
keywords = [
    'language', 'bhasa', 'maithili',
    'newar', 'tharu', 'tamang',
]
langs_nep = [
    'maithili', 'bhojpuri', 'tharu', 'tamang', 'newar',
    'bajjika', 'magar', 'dotyali', 'urdu', 'awadhi',
    'limbu', 'gurung', 'baitadeli', 'rai', 'aachami',
]
disasters = [
    'earthquake', 'fire', 'flood', 'drought', 'landslide',
]

In [4]:
%%time
# Substring (fuzzy) tallies for the general language keywords, indexed by date.
# Every candidate daily file is checked and read, hence the long run time.
clean = prep_ngram_data(flist,keywords,inp,fuzzymatch=True)

CPU times: user 1min 34s, sys: 641 ms, total: 1min 35s
Wall time: 1min 35s


In [5]:
%%time
# Same tally for the list of language names in langs_nep; each daily file is
# read again, so this cell repeats the full pass over the archive.
langs = prep_ngram_data(flist,langs_nep,inp,fuzzymatch=True)

CPU times: user 3min 34s, sys: 860 ms, total: 3min 35s
Wall time: 3min 35s


In [9]:
%%time
# Same tally for the disaster keywords. Fuzzy matching counts any word that
# contains the keyword, e.g. 'fire' also counts words that merely contain 'fire'.
stressors = prep_ngram_data(flist,disasters,inp,fuzzymatch=True)

CPU times: user 1min 31s, sys: 144 ms, total: 1min 31s
Wall time: 1min 31s


### Interactive Plot function 

In [10]:
def plot_interactive(df, vars):
    """Draw an interactive plotly line chart of selected columns of ``df``.

    One ``go.Scatter`` trace is created per name in ``vars``, with ``df.index``
    on the x axis (declared as a date axis) and the column values on the y
    axis. The x axis gets range-selector buttons (1m, 6m, YTD, 1y, all) and a
    range slider.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; its index supplies the x values.
    vars : iterable of str
        Column names of ``df`` to plot, one trace each.

    Returns
    -------
    Whatever ``py.iplot`` returns for the assembled figure.
    """
    traces = [go.Scatter(x=df.index, y=df[col], name=col) for col in vars]

    # quick-zoom buttons shown above the plot
    range_buttons = [
        {'count': 1, 'label': '1m', 'step': 'month', 'stepmode': 'backward'},
        {'count': 6, 'label': '6m', 'step': 'month', 'stepmode': 'backward'},
        {'count': 1, 'label': 'YTD', 'step': 'year', 'stepmode': 'todate'},
        {'count': 1, 'label': '1y', 'step': 'year', 'stepmode': 'backward'},
        {'step': 'all'},
    ]
    x_axis = {
        'rangeselector': {'buttons': range_buttons},
        'rangeslider': {},
        'type': 'date',
    }
    layout = {'title': 'Appearances in TKP archive', 'xaxis': x_axis}

    return py.iplot({'data': traces, 'layout': layout})

# Interactive Plots

In [12]:
# General language keywords over time.
plot_interactive(clean,keywords)

In [13]:
# One trace per language name (15 traces); plotly reports a slow draw time for this figure.
plot_interactive(langs,langs_nep)

The draw time for this plot will be slow for clients without much RAM.



Estimated Draw Time Slow



In [11]:
# Disaster keywords over time.
plot_interactive(stressors,disasters)