In [2]:
#%% libraries
import os
import sys
import glob
import io
import itertools
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

# Notebook-only display setting: with ast_node_interactivity = 'all', IPython
# echoes the value of every expression statement in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [5]:
# Newspaper to process; the name is also used as the folder name when `root` is built.
newspaper_name = 'The Himalayan Times'

In [6]:

# Archive folder of the selected newspaper (machine-specific absolute path).
# The trailing '/' matters: later cells build paths as root + 'raw_txts'.
root = '/media/alal/LAL_DATA/Newspapers/' + newspaper_name + '/'
# Make the archive folder the working directory, then echo it as a sanity check.
os.chdir(root)
%pwd()

'/media/alal/LAL_DATA/Newspapers/The Himalayan Times'

In [7]:
# Locations used by the word-frequency step.
# `tmp` is a scratch directory; NOTE(review): it is not used by the cells shown here.
tmp = '/home/alal/tmp'
# os.path.join avoids the doubled separator that root (which already ends in '/')
# plus a leading '/' would produce.
inp = os.path.join(root, 'raw_txts')
out = os.path.join(root, 'word_frequencies')

# exist_ok=True replaces the exists()/makedirs() pair: no check-then-create race,
# and the cell can be re-run safely.
os.makedirs(out, exist_ok=True)

In [8]:
def write_word_freqs(inputfile,outdir):
    """Write a word-frequency table for one text file as a CSV.

    The text is tokenized with ``nltk.word_tokenize``, lower-cased, and
    filtered against English stopwords, punctuation characters and a short
    list of corpus-specific words. The remaining tokens are counted and
    written, most frequent first, to
    ``<outdir>/wfreqs_<input file name without its extension>.csv``.
    The CSV has the columns ``word`` and ``freq`` preceded by the DataFrame
    index column.

    Parameters
    ----------
    inputfile : str
        Path of the raw text file to analyse.
    outdir : str
        Directory that receives the CSV; it is not created here.

    Returns
    -------
    None
        The result is the file written to ``outdir``.
    """
    filterout = set(stopwords.words('english') +
                    list(string.punctuation) +
                    ['\'\'', '``', '\'s', '’', "“", "”",
                     'the', 'said', 'nepal', 'world', 'kathmandu'])
    cols = ['word', 'freq']

    # splitext removes only the final extension. Splitting on the first '.'
    # would map 'THT_2015.01.02.txt' and 'THT_2015.03.04.txt' to the same
    # output name, so one file's result would overwrite the other's.
    fname = os.path.basename(os.path.abspath(inputfile))
    stem = os.path.splitext(fname)[0]
    writepath = os.path.join(outdir, 'wfreqs_' + stem + '.csv')

    # Context manager guarantees the handle is closed, even if read() fails.
    with open(inputfile) as f:
        raw = f.read()

    tokens = [token.lower() for token in nltk.word_tokenize(raw)]
    cleaned = [token for token in tokens if token not in filterout]

    fdict = dict(nltk.FreqDist(cleaned))
    df = pd.DataFrame(list(fdict.items()), columns=cols)
    df = df.sort_values('freq', ascending=False)

    df.to_csv(writepath, columns=cols)

**The sentence splitter relies on NLTK data.**

Run

```python
nltk.download()
```

and select 'all packages' (about a 3 GB download to `/home/<user>/`).


In [9]:
# Add this folder to the list of locations NLTK searches for its downloaded data.
nltk.data.path.append('/media/alal/LAL_DATA/Newspapers/nltk_data')

In [10]:
def write_sentences(inputfile,outdir):
    """Split one text file into lower-cased sentences, one per output line.

    Newlines in the input are replaced by spaces, the text is split with
    ``nltk.tokenize.sent_tokenize``, and each sentence is lower-cased and
    written on its own line to
    ``<outdir>/sentences_<input file name without its extension>.txt``.

    Parameters
    ----------
    inputfile : str
        Path of the raw text file to split.
    outdir : str
        Directory that receives the sentence file; it is not created here.

    Returns
    -------
    None
        The result is the file written to ``outdir``.
    """
    # splitext removes only the final extension, so dotted names such as
    # 'THT_2015.01.02.txt' keep their full stem and do not collide.
    fname = os.path.basename(os.path.abspath(inputfile))
    stem = os.path.splitext(fname)[0]
    writepath = os.path.join(outdir, 'sentences_' + stem + '.txt')

    # Context managers close both handles even when reading or writing fails.
    with open(inputfile) as source:
        raw = source.read()

    # Hard line breaks are treated as ordinary spaces before sentence splitting.
    # (Named `flattened`, not `string`, so the imported module is not shadowed.)
    flattened = raw.replace('\n', " ")
    sentences = [sentence.lower()
                 for sentence in nltk.tokenize.sent_tokenize(flattened)]

    with open(writepath, "w") as target:
        target.writelines(sentence + "\n" for sentence in sentences)

## Parallelize execution of word-counter function

In [11]:
# Input and output locations for the parallel word-frequency run.
# `tmp` is a scratch directory; NOTE(review): it is not used by the cells shown here.
tmp = '/home/alal/tmp'
# os.path.join avoids the doubled separator that root (which already ends in '/')
# plus a leading '/' would produce.
inp = os.path.join(root, 'raw_txts')
out = os.path.join(root, 'word_frequencies')

# exist_ok=True replaces the exists()/makedirs() pair: no check-then-create race,
# and the cell can be re-run safely.
os.makedirs(out, exist_ok=True)

In [12]:
# Every raw text file of the paper (named 'THT_*.txt') in the input folder.
files = glob.glob(os.path.join(inp, 'THT_*.txt'))

In [13]:
# joblib distributes the per-file work; the job count is set to the number of
# CPUs reported by the operating system.
from joblib import Parallel, delayed
import multiprocessing
num_cores = multiprocessing.cpu_count()

In [14]:
%%time
results = Parallel(n_jobs=num_cores)(delayed(write_word_freqs)(i,out) \
                                     for i in files)

CPU times: user 747 ms, sys: 89.5 ms, total: 837 ms
Wall time: 2min 57s


## Parallelize execution of sentence splitter 

In [15]:
# Input and output locations for the sentence-splitting run.
# `tmp` is a scratch directory; NOTE(review): it is not used by the cells shown here.
tmp = '/home/alal/tmp'
# os.path.join avoids the doubled separator that root (which already ends in '/')
# plus a leading '/' would produce.
inp = os.path.join(root, 'raw_txts')
out = os.path.join(root, 'sentences')

# exist_ok=True replaces the exists()/makedirs() pair: no check-then-create race,
# and the cell can be re-run safely.
os.makedirs(out, exist_ok=True)

In [16]:
# Every raw text file of the paper (named 'THT_*.txt') in the input folder.
files = glob.glob(os.path.join(inp, 'THT_*.txt'))

In [17]:
# Repeats the joblib setup of the word-frequency section: the job count is the
# number of CPUs reported by the operating system.
from joblib import Parallel, delayed
import multiprocessing
num_cores = multiprocessing.cpu_count()

In [18]:
%%time
results = Parallel(n_jobs=num_cores)(delayed(write_sentences)(i,out) \
                                     for i in files)

CPU times: user 159 ms, sys: 57.2 ms, total: 217 ms
Wall time: 39.5 s
