In [98]:
import os

import pandas as pd
import spacy
from more_itertools import unique_everseen
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm_notebook as tqdm

In [108]:
# Load spaCy's small English pipeline; `nlp` is the callable that
# lemmatize_doc (defined further down) uses to tokenize and lemmatize text.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Show what is available in the shared data directory (relative to this notebook).
os.listdir('../data/')

['all_records_df.csv',
 'CCADMANUAL 2019 040319 - excl. SAR codes.pdf',
 'glove.6B.50d.txt',
 'merged_records_df.csv',
 'mo_codes_matched.csv',
 'nyt-corpus-crime-stores.csv',
 'raw_data.csv',
 'Xtown_ 2019 Hate Crime Write-Ups.txt',
 'Xtown_ Hate Crime Redesign - Sheet1.csv']

In [3]:
# Load the crime-story corpus, using the CSV's first column as the DataFrame index.
# NOTE(review): the truncated output below this cell looks like the tail of a
# pandas warning (possibly a DtypeWarning about mixed-type columns) - confirm,
# and consider passing explicit dtypes if so.
df = pd.read_csv("../data/nyt-corpus-crime-stores.csv", index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Remove the 'LEAD:' marker from every story body, then trim the
# whitespace left at either end.
body_without_lead = df['body'].str.replace('LEAD:', '')
df['body'] = body_without_lead.str.strip()

In [58]:
def _tidy_paragraphs(lines):
    """Strip each line, drop repeats (keeping first occurrences), drop empties."""
    stripped = [line.strip() for line in lines]
    return [line for line in unique_everseen(stripped) if line != '']


# Split every story into lines, tidy them, and glue the survivors back
# together as one newline-separated string per story.
paragraphs = df['body'].str.split('\n').apply(_tidy_paragraphs).str.join('\n')

In [43]:
import util

In [59]:
# Run the project-local util.preprocess_lite over every story (its exact
# behaviour is defined in util, not in this notebook).
# NOTE: this rebinds `paragraphs` to its own transformed value, so re-running
# the cell applies preprocess_lite a second time to already-processed text.
paragraphs = paragraphs.apply(util.preprocess_lite)

In [125]:
def lemmatize_doc(doc):
    """Lemmatize a newline-delimited document one paragraph at a time.

    Args:
        doc: Text whose paragraphs are separated by newline characters.

    Returns:
        A list with one string per paragraph; each string is the
        space-joined ``lemma_`` of every token the module-level spaCy
        pipeline ``nlp`` produced for that paragraph.
    """
    # nlp.pipe feeds the paragraphs through the pipeline in batches and
    # yields one Doc per input text, in input order; this avoids the
    # per-call overhead of invoking nlp() separately for every paragraph.
    return [
        ' '.join(token.lemma_ for token in graf_doc)
        for graf_doc in nlp.pipe(doc.split('\n'))
    ]

In [126]:
# Lemmatize every story, showing a progress bar. This cell relies on `tqdm`
# being imported in the notebook's import cell so that it runs on a fresh
# kernel (Restart & Run All).
processed_docs = list(
    tqdm(map(lemmatize_doc, paragraphs), total=len(paragraphs))
)

HBox(children=(IntProgress(value=0, max=48695), HTML(value='')))

In [129]:
# Wrap the per-story lists of lemmatized paragraphs in a Series. No index is
# passed, so it gets a default 0..n-1 index, which is not necessarily df's index.
lemma_docs = pd.Series(processed_docs)

In [131]:
# Join each story's lemmatized paragraphs into one newline-separated string.
# Built from `processed_docs` rather than from `lemma_docs` itself so that
# re-running this cell is harmless: `.str.join` applied to already-joined
# strings would insert '\n' between every character.
lemma_docs = pd.Series(processed_docs).str.join('\n')

In [None]:
# Write the lemmatized stories to their own CSV.
# NOTE(review): this cell has no execution count (it was not run in the saved
# session), and the trailing '-' before '.csv' looks unintentional - confirm
# the intended filename before relying on this output.
lemma_docs.to_csv('../data/nyt-corpus-crime-stories-.csv')

In [133]:
# Attach the lemmas positionally. `lemma_docs` carries a fresh 0..n-1 index
# while `df` is indexed by the CSV's first column, so assigning the Series
# directly would align on index labels and could misplace rows or produce NaN.
# The rows are in the same order as df (they derive from df['body']), so
# `.values` gives the correct row-for-row assignment.
df['lemmas'] = lemma_docs.values

In [134]:
# Save the corpus, now including the 'lemmas' column, next to the input file.
df.to_csv('../data/nyt-corpus-crime-stores-lemmatized.csv')

In [138]:
# Vocabulary builder. Float thresholds are document-frequency proportions:
# ignore terms found in fewer than 0.5% or in more than 60% of the documents,
# and drop scikit-learn's built-in English stop words.
cv = CountVectorizer(min_df=.005, max_df=.6, stop_words='english')

In [139]:
# Learn the vocabulary from the lemmatized stories (one document per story).
# The cells shown below only read `cv.vocabulary_`; no count matrix is built here.
cv.fit(lemma_docs)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.6, max_features=None, min_df=0.005,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [145]:
def process_paragraphs(body_text):
    """Encode a document as per-paragraph lists of vocabulary ids.

    The text is split on newlines; each paragraph is passed through
    ``util.preprocess_lite`` and split on whitespace. Tokens present in the
    fitted vectorizer's ``cv.vocabulary_`` are replaced by their integer id,
    and all other tokens are dropped.

    Args:
        body_text: Newline-delimited document text.

    Returns:
        A list with one entry per paragraph, each a list of ``int`` ids in
        token order (empty when no token of the paragraph is in the vocabulary).
    """
    vocabulary = cv.vocabulary_
    encoded = []
    for graf in body_text.split('\n'):
        tokens = util.preprocess_lite(graf).split()
        encoded.append([int(vocabulary[tok]) for tok in tokens if tok in vocabulary])
    return encoded

In [146]:
from tqdm import tqdm_notebook as tqdm

In [147]:
# Encode every lower-cased lemmatized story as per-paragraph lists of
# vocabulary ids, with a progress bar over the stories.
lowercased_docs = lemma_docs.str.lower()
all_output = [
    process_paragraphs(story)
    for story in tqdm(lowercased_docs, total=len(paragraphs))
]

HBox(children=(IntProgress(value=0, max=48695), HTML(value='')))

In [86]:
import json

In [148]:
# Create the output directory for the topic-model inputs. exist_ok=True keeps
# the cell re-runnable instead of raising FileExistsError on a second run.
os.makedirs('../topic_model/data_lemmas', exist_ok=True)

In [149]:
# Turn the term -> column-id mapping into a Series (index: term, value: id) and
# order it by id, so that iterating `vocab.index` yields the terms in id order.
vocab = pd.Series(cv.vocabulary_).sort_values()

In [150]:
# One JSON object per line, one line per story:
# {"paragraphs": [[vocab ids of paragraph 1], [vocab ids of paragraph 2], ...]}
with open('../topic_model/data_lemmas/doc_vecs.json', 'w', encoding='utf-8') as f:
    for doc in tqdm(all_output):
        f.write(json.dumps({'paragraphs': doc}))
        f.write('\n')

# One vocabulary term per line, in the order of `vocab.index`.
# The encoding is explicit so that non-ASCII terms are written the same way
# regardless of the platform's default text encoding.
with open('../topic_model/data_lemmas/vocab.txt', 'w', encoding='utf-8') as f:
    for word in vocab.index:
        f.write(word)
        f.write('\n')

HBox(children=(IntProgress(value=0, max=48695), HTML(value='')))