In [1]:
#%% libraries
import os
import itertools
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

import plotly.plotly as py
import plotly.graph_objs as go

from datetime import datetime
%matplotlib inline

# run for jupyter notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

### functions

In [2]:
def date_stitcher(prefix,ext,ybegin, yend,
                  months = ['{num:02d}'.format(num=x) for x in range(1, 13)],
                  days= ['{num:02d}'.format(num=x) for x in range(1, 32)] ):
    """Build candidate file names of the form ``prefix + YYYY_MM_DD + ext``.

    Parameters
    ----------
    prefix, ext : str
        Text placed before and after the underscore-joined date.
    ybegin, yend : int
        Year range; ``yend`` is exclusive (``range(ybegin, yend)``).
    months, days : sequence of str
        Month and day labels; default to zero-padded '01'..'12' and
        '01'..'31'.

    Returns
    -------
    list of str
        One name per (year, month, day) combination, ordered by year,
        then month, then day. Every day label is paired with every month,
        so impossible dates such as ``02_31`` are included.
    """
    year_labels = list(map(str, range(ybegin, yend)))
    return [
        prefix + "_".join(parts) + ext
        for parts in itertools.product(year_labels, months, days)
    ]

In [3]:
def lines_with_words(infile, tup):
    """Count the lines of ``infile`` that contain every term in ``tup``.

    Parameters
    ----------
    infile : str
        Path of the text file to scan; opened in text mode with the
        platform default encoding.
    tup : iterable of str
        Terms that must all occur on a line, in any order. Each term is
        interpolated into the regular expression unescaped, so it is
        treated as a regex fragment. Matching is case-sensitive and has
        no word boundaries ('court' also matches 'courtyard').

    Returns
    -------
    int
        Number of lines on which every term was found.
    """
    # One lookahead per term; the assembled pattern looks like
    # r'^(?=.*{0}).*(?=.*{1}).*$'.format(word1, word2, ...)
    pattern = re.compile(
        '^' + ''.join('(?=.*{0}).*'.format(arg) for arg in tup) + '$'
    )

    # The context manager closes the handle even if reading fails; this
    # function is called once per (file, keyword tuple) pair, so a leaked
    # handle per call adds up quickly.
    with open(infile, 'r') as handle:
        return sum(1 for line in handle if pattern.search(line))

In [4]:
def kw_tuple_search(flist, inpdir, listOfTuples):
    """Tally, per file, the lines containing each keyword tuple.

    Parameters
    ----------
    flist : list of str
        Candidate file names, relative to ``inpdir``; names with no file
        on disk are skipped.
    inpdir : str
        Directory in which the files are looked up.
    listOfTuples : iterable of tuple of str
        Keyword tuples; each yields one column named by joining the tuple
        with '-'.

    Returns
    -------
    pandas.DataFrame
        Indexed by file name ('file'), restricted to files that exist,
        with an 'exists' column (always 1 in the result) and one count
        column per keyword tuple. A count stays NaN when tallying that
        tuple failed. The frame is an independent copy, so callers may
        modify it freely.
    """
    import warnings  # local import keeps this cell self-contained

    keyword_sets = list(listOfTuples)
    colnames = ['-'.join(kw) for kw in keyword_sets]

    # initialise dataframe: one row per candidate file, all cells NaN
    tallies = pd.DataFrame(flist, columns=['file']).set_index('file')
    tallies['exists'] = np.nan
    for colname in colnames:
        tallies[colname] = np.nan

    # count in all files
    for infile in flist:
        file = os.path.join(inpdir, infile)
        if not os.path.exists(file):
            tallies.at[infile, 'exists'] = 0
            continue

        tallies.at[infile, 'exists'] = 1
        for kw, colname in zip(keyword_sets, colnames):
            try:
                tallies.at[infile, colname] = lines_with_words(file, kw)
            except Exception as err:
                # Best effort: leave the cell as NaN and carry on, but say
                # so. Catching Exception (not a bare except) lets
                # KeyboardInterrupt / SystemExit stop a long scan.
                warnings.warn(
                    'could not tally {0} in {1}: {2!r}'.format(colname, file, err)
                )

    # subset to rows for files that exist; .copy() detaches the result from
    # `tallies` so later in-place edits do not raise SettingWithCopyWarning
    return tallies[tallies['exists'] == 1].copy()


In [5]:
def prep_ngram_data(flist,keywords,inp):
    """Run the keyword search and return the tallies indexed by date.

    Parameters
    ----------
    flist : list of str
        Candidate file names; each must contain a ``YYYY_MM_DD`` date.
    keywords : iterable of tuple of str
        Keyword tuples forwarded to ``kw_tuple_search``.
    inp : str
        Directory forwarded to ``kw_tuple_search``.

    Returns
    -------
    pandas.DataFrame
        One row per existing file, indexed by a datetime 'date' parsed
        from the file name, with one count column per keyword tuple
        (the 'file' and 'exists' helper columns are dropped).

    Raises
    ------
    ValueError
        If a file name contains no ``YYYY_MM_DD`` date.
    """
    tallies = kw_tuple_search(flist,inp,keywords)

    # reset_index without inplace returns a new frame, so nothing below
    # writes into a slice of the search result (no chained-assignment
    # warnings).
    data = tallies.reset_index(level=0)

    # Pull the date out by pattern rather than by fixed character offsets,
    # so the file-name prefix may have any length.
    stamps = data['file'].str.extract(r'(\d{4}_\d{2}_\d{2})', expand=False)
    missing = stamps.isna()
    if missing.any():
        raise ValueError(
            'no YYYY_MM_DD date found in file name(s): {0}'.format(
                data.loc[missing, 'file'].tolist()
            )
        )

    clean = (
        data
        .assign(date=pd.to_datetime(stamps, format='%Y_%m_%d'))
        .drop(['file','exists'], axis=1)
        .set_index('date')
    )

    return(clean)

## interactive plots 
(Uses plotly — please open an issue on the GitHub repo if the plots return a 404.)

In [13]:
def plot_interactive(df,header='Appearances same sentence in TKP archive'):
    """Draw one plotly line per column of ``df`` against its index.

    Parameters
    ----------
    df : pandas.DataFrame
        Each column becomes a ``go.Scatter`` trace named after the column,
        plotted against ``df.index`` on a date-typed x axis.
    header : str
        Figure title.

    Returns
    -------
    Whatever ``py.iplot`` returns for the assembled figure.
    """
    traces = [
        go.Scatter(x = df.index, y = df[column], name=column)
        for column in list(df)
    ]

    # (count, label, step, stepmode) for each zoom button on the range selector
    presets = [
        (1, '1m', 'month', 'backward'),
        (6, '6m', 'month', 'backward'),
        (1, 'YTD', 'year', 'todate'),
        (1, '1y', 'year', 'backward'),
        (2, '2y', 'year', 'backward'),
    ]
    buttons = [
        dict(count=count, label=label, step=step, stepmode=stepmode)
        for count, label, step, stepmode in presets
    ]
    buttons.append(dict(step='all'))  # final button resets to the full range

    x_axis = {
        'rangeselector': {'buttons': buttons},
        'rangeslider': {},
        'type': 'date',
    }
    figure = {'data': traces, 'layout': {'title': header, 'xaxis': x_axis}}
    return py.iplot(figure)

In [7]:
# preliminaries
working = '/media/alal/NEPALINEWS/The Kathmandu Post'
os.chdir(working)
tmp = '/home/alal/tmp'

# candidate file names for every day label; yend is exclusive, so 2007-2017
flist = date_stitcher('sentences_TKP_','.txt',2007, 2018)
# directory searched for the files named in flist (the earlier assignment to
# the word_frequencies directory was overwritten before use, so it is dropped)
inp = os.path.join(working, 'sentences')

In [8]:
# each group term paired with each of 'court' and 'school', group-major order
keywords = [
    (group, venue)
    for group in ('tamang', 'newar', 'bhojpuri', 'maithili', 'gurung')
    for venue in ('court', 'school')
]

In [13]:
%%time
clean = prep_ngram_data(flist,keywords,inp)

CPU times: user 2min 30s, sys: 1.65 s, total: 2min 32s
Wall time: 2min 32s


In [14]:
clean.sum()

tamang-court        88.0
tamang-school      192.0
newar-court         30.0
newar-school        28.0
bhojpuri-court       0.0
bhojpuri-school      4.0
maithili-court      10.0
maithili-school     12.0
gurung-court       237.0
gurung-school      325.0
dtype: float64

In [56]:
plot_interactive(clean)

## `{tamang,newar,bhojpuri,maithili,gurung} + language + {court,school}` in the same sentence

In [9]:
# each group term combined with 'language' and each of 'court' and 'school'
keywords = [
    (group, 'language', venue)
    for group in ('tamang', 'newar', 'bhojpuri', 'maithili', 'gurung')
    for venue in ('court', 'school')
]

In [10]:
%%time
data2 = prep_ngram_data(flist,keywords,inp)

CPU times: user 2min 41s, sys: 1.92 s, total: 2min 43s
Wall time: 2min 43s


In [11]:
data2.sum()

tamang-language-court       0.0
tamang-language-school      3.0
newar-language-court        3.0
newar-language-school       3.0
bhojpuri-language-court     0.0
bhojpuri-language-school    3.0
maithili-language-court     3.0
maithili-language-school    5.0
gurung-language-court       1.0
gurung-language-school      4.0
dtype: float64

In [14]:
plot_interactive(data2)

The draw time for this plot will be slow for clients without much RAM.



Estimated Draw Time Slow

