In [3]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models.ldamulticore import LdaMulticore

from gensim.models import Phrases
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, strip_numeric, strip_punctuation, strip_short, stem_text
import matplotlib.pyplot as plt

In [4]:
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary

## Custom Year Definition
Modify these variables for your needs

In [63]:
FILE_PATH = '../Files/gensim/'

# Selected years
SELECTED_YEARS = [2012, 2013]

# Add bigrams and trigrams
ADD_BIGRAMS = True

# Only add bigrams that appear BIGRAMS_MIN_COUNT times or more
BIGRAMS_MIN_COUNT = 20

# Filter out words that occur in less than FILTER_NO_ABOVE documents
FILTER_NO_BELOW = 20

In [6]:
from ipynb.fs.full.util import load_data
X, y = load_data()

  "    relevant_cols = [\"PERMID\", \"CIK\", \"Ticker\", \"year\", \"FilingDate\", \"company_name\", \"Dividend Payer\", \"DPS growth\", \"DPS cut\", \"zEnvironmental\", \"dEnvironmental\", \"sector\"]\n",


In [38]:
filtered_by_year = X[X.year.isin(SELECTED_YEARS)]
items = {
    'item1a': filtered_by_year['item1a_risk'],
    'item7': filtered_by_year['item7_mda']
}

In [40]:
print(f'Got {filtered_by_year.shape[0]} documents')

Got 5119 documents


In [41]:
tokenizer = RegexpTokenizer(r'\w+')
for item in items:
    docs = items[item]
    for idx in range(len(docs)):
        docs.iloc[idx] = docs.iloc[idx].lower()  # Convert to lowercase.
        docs.iloc[idx] = tokenizer.tokenize(docs.iloc[idx])  # Split into words.
        docs.iloc[idx] = docs.iloc[idx][4:] # Remove first 4 words
    
    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

    # Remove words that are only one character.
    docs = [[token for token in doc if len(token) > 1] for doc in docs]
    
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
    
    # Add bigrams and trigrams to docs (only ones that appear BIGRAMS_MIN_COUNT times or more).
    if ADD_BIGRAMS:
        bigram = Phrases(docs, min_count=BIGRAMS_MIN_COUNT)
        for idx in range(len(docs)):
            for token in bigram[docs[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    docs[idx].append(token)
                
    items[item] = docs

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [42]:
dictionaries = {}
for item in items:
    dictionaries[item] = Dictionary(items[item])

In [43]:
for dictionary in dictionaries.values():
    # Filter out words that occur in less than FILTER_NO_BELOW documents.
    dictionary.filter_extremes(no_below=FILTER_NO_BELOW)
    
#     dictionary.filter_extremes(no_below=20, no_above=0.1)

In [44]:
corpus = {}
for item in items:
    corpus[item] = [dictionaries[item].doc2bow(doc) for doc in items[item]]

In [45]:
for item in items:
    print(item + ':')
    print('\tNumber of unique tokens: %d' % len(dictionaries[item]))
    print('\tNumber of documents: %d' % len(corpus[item]))

item1a:
	Number of unique tokens: 12265
	Number of documents: 5119
item7:
	Number of unique tokens: 12891
	Number of documents: 5119


In [46]:
id2word = {}
for item in items:
    temp = dictionaries[item][0] # Initialize id2token mappings
    id2word[item] = dictionaries[item].id2token

### Write output to file

In [64]:
import pickle

base_name = str(SELECTED_YEARS[0])
if len(SELECTED_YEARS) > 1: base_name += f'-{SELECTED_YEARS[-1]}'
base_name += '_{}_{}.pkl'

str_mapping = {
    'corpus': corpus,
    'id2word': id2word
}

for item in items:
    for obj in str_mapping:
        with open(FILE_PATH+base_name.format(item, obj), 'wb') as file:
            pickle.dump(str_mapping[obj][item], file)

In [51]:
test = pickle.dumps(corpus['item1a'])

In [53]:
test_id2word = pickle.dumps(id2word['item1a'])