In [13]:
#%% libraries
import os
import sys
import glob
import itertools
import textract
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

import plotly.plotly as py
import plotly.graph_objs as go

from datetime import datetime
%matplotlib inline

# run for jupyter notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [9]:
def date_stitcher(ybegin, yend,
                  months=['{num:02d}'.format(num=x) for x in range(1, 13)],
                  days=['{num:02d}'.format(num=x) for x in range(1, 32)]):
    """Enumerate word-frequency file names for every year/month/day slot.

    Parameters
    ----------
    ybegin, yend : int
        Year range; ``yend`` itself is excluded (``range`` semantics).
    months, days : list of str
        Labels joined into the name. They default to zero-padded '01'..'12'
        and '01'..'31'.

    Returns
    -------
    list of str
        Names of the form ``wfreqs_TKP_<year>_<month>_<day>.csv``, ordered by
        year, then month, then day. The full cartesian product is produced, so
        impossible calendar dates (e.g. ``02_31``) are included as well.
    """
    year_labels = map(str, range(ybegin, yend))
    return [
        'wfreqs_TKP_' + "_".join(stamp) + '.csv'
        for stamp in itertools.product(year_labels, months, days)
    ]


def kw_search(flist, keywords, inpdir,fuzzymatch=False,verbose=False):
    """Tally keyword frequencies across a set of word-frequency CSV files.

    Each file is expected to provide a ``word`` and a ``freq`` column.

    Parameters
    ----------
    flist : list of str
        File names to look for inside ``inpdir``.
    keywords : iterable of str
        Words to tally; each becomes a column of the result.
    inpdir : str
        Directory holding the CSV files.
    fuzzymatch : bool, default False
        If True, sum ``freq`` over every word that *contains* the keyword.
        The keyword is handed to ``Series.str.contains`` and is therefore
        interpreted as a regular expression. A file with no matching word
        gets 0.
        If False, only words equal to the keyword count, and a file without
        the keyword keeps NaN.
    verbose : bool, default False
        Print one progress line per file.

    Returns
    -------
    pandas.DataFrame
        Indexed by file name, with an ``exists`` column plus one column per
        keyword, restricted to the files that were found on disk. The frame is
        an independent copy, so callers can modify it without triggering
        ``SettingWithCopyWarning``.

    Raises
    ------
    re.error
        In fuzzy mode, when a keyword is not a valid regular expression.
    """
    freqcols = ['word', 'freq']
    # initialise dataframe: one row per candidate file, NaN until filled in
    tallies = pd.DataFrame(flist, columns=['file']).set_index('file')
    tallies['exists'] = np.nan
    for kw in keywords:
        tallies[kw] = np.nan
    # count in all files
    for infile in flist:
        path = os.path.join(inpdir, infile)
        if not os.path.exists(path):
            tallies.at[infile, 'exists'] = 0
            if verbose: print(infile, ' does not exist')
            continue
        tallies.at[infile, 'exists'] = 1
        # Keep `word` as an ordinary string column: no index alignment issues
        # with duplicated words, and the .str accessor is always available.
        tkp = pd.read_csv(path, usecols=freqcols, dtype={'word': str})
        words = tkp['word']
        # Unparseable frequencies become NaN and are skipped by sum().
        freqs = pd.to_numeric(tkp['freq'], errors='coerce')
        for kw in keywords:
            if fuzzymatch:
                # na=False: words that read_csv parsed as missing ("null",
                # "NA", ...) simply do not match, instead of producing a
                # non-boolean mask that makes the whole lookup fail.
                hits = words.str.contains(kw, na=False)
                tallies.at[infile, kw] = freqs[hits].sum()
            else:
                # strict match; rows repeating the same word are added up
                hits = words == kw
                if hits.any():
                    tallies.at[infile, kw] = freqs[hits].sum()
        if verbose: print(infile, 'loaded and searched')
    # subset to files that exist; copy so the result is not a view of `tallies`
    return tallies[tallies['exists'] == 1].copy()

def prep_ngram_data(flist,keywords,inp,fuzzymatch=False):
    """Run the keyword search and return the tallies as a date-indexed frame.

    Parameters
    ----------
    flist : list of str
        Candidate file names, passed through to ``kw_search``.
    keywords : list of str
        Keywords to tally; they become the columns of the result.
    inp : str
        Directory containing the word-frequency files.
    fuzzymatch : bool, default False
        Passed through to ``kw_search``.

    Returns
    -------
    pandas.DataFrame
        One row per file that exists, indexed by ``date`` (parsed from the
        file name), with one column per keyword.
    """
    tallies = kw_search(flist, keywords, inp, fuzzymatch)
    # reset_index() builds a new frame, so the column added below never writes
    # into a filtered slice -- that in-place mutation is what used to produce
    # the barrage of SettingWithCopyWarning messages.
    data = tallies.reset_index()
    # File names look like 'wfreqs_TKP_YYYY_MM_DD.csv': the prefix is 11
    # characters long, so characters 11..20 hold the date stamp.
    stamps = data['file'].str[11:21]
    data['date'] = pd.to_datetime(stamps, format='%Y_%m_%d')
    keepvars = ['date'] + list(keywords)
    return data[keepvars].set_index('date')

In [17]:
# preliminaries: locations of the archive, scratch space and frequency tables
working = '/media/alal/NEPALINEWS/The Kathmandu Post'
inp = '/media/alal/NEPALINEWS/The Kathmandu Post/word_frequencies'
tmp = '/home/alal/tmp'
os.chdir(working)

# candidate file names for every calendar slot from 2007 up to (not including) 2018
flist = date_stitcher(2007, 2018)

# search terms
keywords = [
    'language', 'bhasa', 'maithili',
    'newar', 'tharu', 'tamang',
]
langs_nep = [
    'maithili', 'bhojpuri', 'tharu', 'tamang', 'newar',
    'bajjika', 'magar', 'dotyali', 'urdu', 'awadhi',
    'limbu', 'gurung', 'baitadeli', 'rai', 'aachami',
]

In [18]:
%%time
# Tally every keyword across all candidate files using substring (fuzzy)
# matching; `clean` is the date-indexed result that the plotting cells read.
# The recorded run took roughly a minute and a half of wall time.
clean = prep_ngram_data(flist,keywords,inp,fuzzymatch=True)

CPU times: user 1min 26s, sys: 144 ms, total: 1min 26s
Wall time: 1min 26s


# Interactive Plot

In [19]:
def plot_interactive(vars):
    """Draw one interactive plotly line per column name in ``vars``.

    Reads the module-level ``clean`` frame: its index supplies the x values
    and ``clean[name]`` the y values of each trace. The x axis is declared as
    a date axis with a range slider and range-selector buttons (1m, 6m, YTD,
    1y, all).

    Parameters
    ----------
    vars : list of str
        Column names of ``clean`` to plot; they are also appended,
        comma-separated, to the figure title.

    Returns
    -------
    Whatever ``py.iplot`` returns for the assembled figure.
    """
    traces = [
        go.Scatter(x=clean.index, y=clean[name], name=name)
        for name in vars
    ]

    def _range_button(count, label, step, stepmode):
        # one range-selector button specification
        return {'count': count, 'label': label, 'step': step, 'stepmode': stepmode}

    selector_buttons = [
        _range_button(1, '1m', 'month', 'backward'),
        _range_button(6, '6m', 'month', 'backward'),
        _range_button(1, 'YTD', 'year', 'todate'),
        _range_button(1, '1y', 'year', 'backward'),
        {'step': 'all'},
    ]

    date_axis = {
        'rangeselector': {'buttons': selector_buttons},
        'rangeslider': {},
        'type': 'date',
    }
    layout = {
        'title': 'Appearance in TKP archive:' + ",".join(vars),
        'xaxis': date_axis,
    }
    return py.iplot({'data': traces, 'layout': layout})

In [20]:
# Identical to the keyword list set in the preliminaries cell; every entry
# must be a column of `clean`, because plot_interactive reads clean[<name>].
keywords = ['language', 'bhasa', 'maithili','newar', 'tharu', 'tamang']
plot_interactive(keywords)