In [6]:
# Data Structures
import numpy  as np
import pandas as pd
import geopandas as gpd
import json

# Corpus Processing
import re
import nltk.corpus
from unidecode                        import unidecode
from nltk.tokenize                    import word_tokenize
from nltk                             import SnowballStemmer
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.preprocessing            import normalize

# K-Means
from sklearn import cluster

# Visualization and Analysis
import matplotlib.pyplot  as plt
import matplotlib.cm      as cm
import seaborn            as sns
from sklearn.metrics                  import silhouette_samples, silhouette_score

In [13]:
data = pd.read_csv('usc_seating_data.csv', encoding='utf-8')

In [16]:
data['Abstract 1'] = data['Abstract 1'].fillna(' ')
data['Abstract 2'] = data['Abstract 2'].fillna(' ')
data['Abstract 3'] = data['Abstract 3'].fillna(' ')

In [17]:
data['added_abstracts'] = data['Abstract 1'] + ' ' + data['Abstract 2'] + ' ' + data['Abstract 3']

In [24]:
data_cleaned = data[['First name', 'Last Name', 'School', 'Category', 'PhD', 'Provided Keywords', 'Random', 'Skip', 'added_abstracts']]

In [25]:
data_cleaned

Unnamed: 0,First name,Last Name,School,Category,PhD,Provided Keywords,Random,Skip,added_abstracts
0,Kevin,Murphy,USC,Finance,No,,,,Finance theory says that companies in declinin...
1,Robert,Dittmar,University of Michigan,Finance,No,Asset pricing,,,We conduct a systematic examination of the ret...
2,Mitch,Warachka,Chapman University,Finance,No,Innovation\nValue Creation\nCulture,,,A large literature reports that proximity infl...
3,William,Mullins,UC San Diego,Finance,No,"Finance, Corporate and Household finance, poli...",,,We find evidence of selective exposure to conf...
4,Stan,Markov,UT Dallas,Accounting,No,"social media, big data, market efficiency,\",,,We examine how increased competition stemming ...
...,...,...,...,...,...,...,...,...,...
73,Kristi,Rennekamp,Cornell University,Accounting,No,,,,As firms increasingly use social media to prov...
74,Suzanne,Burzillo,USC,Accounting,Yes,,,,Recent years have witnessed growing interest i...
75,Nick,Bloom,Stanford,Economics,No,,,,We construct the World Uncertainty Index (WUI)...
76,Anastassia,Fedyk,Berkeley Haas,Finance,No,,,,We study the use and economic impact of artifi...


In [28]:
corpus = data_cleaned['added_abstracts'].tolist()

In [31]:
# removes stopwords from a tokenized list
def remove_stopwords(tokens, words):
    return [token for token in tokens if token not in words]

# apply stemming to a list of tokens
def apply_stemming(tokens, stemmer):
    return [stemmer.stem(token) for token in tokens]

# find words <= 2 letters or >= 21 letters
def find_two_letters(tokens):
    two_letters = []
    for token in tokens:
        if len(token) <= 2 or len(token) >= 21:
            two_letters.append(token)
    return two_letters

In [36]:
def processCorpus(corpus, language):   
    stopwords = nltk.corpus.stopwords.words(language)
    param_stemmer = SnowballStemmer(language)
    other_words = [line.rstrip('\n') for line in open('stopwords_scrapmaker.txt')] # Load .txt file line by line
    
    for document in corpus:
        index = corpus.index(document)
        corpus[index] = corpus[index].replace(u'\ufffd', '8')   # Replaces the ASCII '�' symbol with '8'
        corpus[index] = corpus[index].replace(',', '')          # Removes commas
        corpus[index] = corpus[index].rstrip('\n')              # Removes line breaks
        corpus[index] = corpus[index].casefold()                # Makes all letters lowercase
        
        corpus[index] = re.sub('\W_',' ', corpus[index])        # removes specials characters and leaves only words
        corpus[index] = re.sub("\S*\d\S*"," ", corpus[index])   # removes numbers and words concatenated with numbers IE h4ck3r. Removes road names such as BR-381.
        corpus[index] = re.sub("\S*@\S*\s?"," ", corpus[index]) # removes emails and mentions (words with @)
        corpus[index] = re.sub(r'http\S+', '', corpus[index])   # removes URLs with http
        corpus[index] = re.sub(r'www\S+', '', corpus[index])    # removes URLs with www

        listOfTokens = word_tokenize(corpus[index])
        twoLetterWord = find_two_letters(listOfTokens)

        listOfTokens = remove_stopwords(listOfTokens, stopwords)
        listOfTokens = remove_stopwords(listOfTokens, other_words)
        
        listOfTokens = apply_stemming(listOfTokens, param_stemmer)
        listOfTokens = remove_stopwords(listOfTokens, other_words)

        corpus[index]   = " ".join(listOfTokens)
        corpus[index] = unidecode(corpus[index])

    return corpus

In [40]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alekseyvalouev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alekseyvalouev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
language = 'english'
corpus = processCorpus(corpus, language)
corpus[18][0:460]

'use trade-level data examin role activ manag fund ( amf ) earn news dissemin . find amf drawn particip disproportion earn announc ( ea ) includ bundl manageri guidanc . two piec news direct inconsist amf trade direct futur guidanc rather current earn . amf exhibit abil discern adapt trade bias bundl guidanc . amf trade ea general profit non-ea trade result revers guidanc bias extrem . overal find increas amf trade ea lead faster price adjust . collect find'