In [9]:
import pandas as pd
import numpy as np
import unicodecsv as csv
from PyDictionary import PyDictionary
from os import listdir


## Open manually coded words

In [None]:
# manual-word-tags.csv contains 249 most common words in corpus,
# manually sorted into "name", "personal", "stopword", "descriptor", "health", and "action"
# here we import them into a dictionary, "codings"

codings = {}
reader = csv.reader( open('manual-word-tags.csv','rU'))
reader.next()
for row in reader:
    word = row[0]
    tag = row[1]
    if tag not in codings:
        codings[tag] = []
    codings[tag].append(word)
for category,terms in codings.iteritems():
    print "%s - %d terms" % (category, len(terms))

In [None]:
# use PyDictionary to broaden word categories by getting synonyms of all words in each category

to_expand = ['personal','descriptor','health','action']
expanded_codings = {}
for category in to_expand:
    
    terms = codings[category][:] 
    expanded_codings[category] = terms #add terms we already have
    
    dictionary = PyDictionary(terms)
    synonymlists = dictionary.getSynonyms(formatted=False)
    
    #"synonymlists" is a list of lists. each list is synonyms of a word in "terms"
    #dictionary.getSynonyms(formatted = True) would return a list of dicts
    
    synonyms = [word for sublist in synonymlists for word in sublist ] #flatten list of lists into a list
    synonyms = list(set(synonyms)) #remove duplicates
    
    expanded_codings[category] += synonyms
    
for category,terms in expanded_codings.iteritems():
    print "%s - %d terms" % (category, len(terms))
    

In [None]:
# do we like our new categories of words? e.g. "flower" and "paraphernalia" are both in "action"
# and "damaged" and "wounded" are in "descriptor"
# and "bankrupt" is in "health"

# also, should check if any word appears in multiple categories - how choose which one?
expanded_codings

## Open and work with data

In [None]:
df = pd.read_csv('data/word_freq_by_race.csv')

In [None]:
top200white = df.sort_values('white',ascending=False).head(200)['word']
#for some reason, #24048 gets put at the top?

In [None]:
top200other = df.sort_values('other',ascending=False).head(200)['word']

In [None]:
top200other[~top200other.isin(top200white)]

## Find Synonyms

In [None]:
dictionary = PyDictionary("threw")
dictionary.getSynonyms()

## Analyze Category Use by Race

In [None]:
uses = {}
for category,terms in codings.iteritems():
    white_uses = 0
    other_uses = 0
    uses[category] = {'white':df[df['word'].isin(terms)].sum(axis=0)['white pct'],
                      'other':df[df['word'].isin(terms)].sum(axis=0)['other pct']}
uses

## Count number of times each qb is mentioned in their corpora

In [7]:
# count number of times each qb is mentioned (full name) in their own .txt file of sentences

qbfiles = listdir(r'data\corpora')
names = [name[:-4] for name in qbfiles]

counts = {}

for n in names:
    filename = 'data\\corpora\\'+n+'.txt'
    
    with open(filename, 'r') as f:
        alltext = f.read()
    
    counts[n] = alltext.count(n)


In [34]:
# get races of each quarterback according to qb_table
races = pd.read_csv('qb-table.csv', index_col=2)
allraces = list(set(races.index))

whiteqbs = list(races.loc['white']['qb_name'])

nonwhiteqbs = []
for race in allraces:
    if race != 'white':
        addnames = races.loc[race]['qb_name']
        nonwhiteqbs += [addnames]

#nonwhiteqbs = [list(races.loc[race]['qb_name']) for race in allraces if race != 'white']
nonwhiteqbs

[race
 hispanic       Tony Romo
 hispanic    Mark Sanchez
 Name: qb_name, dtype: object, 'Marcus Mariota', race
 black         Tyrod Taylor
 black            EJ Manuel
 black           Cam Newton
 black          Matt Cassel
 black         Josh Freeman
 black    Teddy Bridgewater
 black         Michael Vick
 black     Colin Kaepernick
 black       Russell Wilson
 black       Jameis Winston
 Name: qb_name, dtype: object]