In [1]:
import pandas as pd
import numpy as np
import unicodecsv as csv
from PyDictionary import PyDictionary
from os import listdir

## Open manually coded words

In [2]:
# manual-word-tags.csv contains the 249 most common words in the corpus,
# manually sorted into "name", "personal", "stopword", "descriptor", "health", and "action".
# Here we import them into a dictionary, "codings", mapping each tag to its list of words.

codings = {}

# a context manager guarantees the file handle is closed once all rows are read
with open('manual-word-tags.csv', 'rU') as f:
    reader = csv.reader(f)
    next(reader, None)  # discard the first row (presumably the header); tolerate an empty file
    for row in reader:
        word = row[0]
        tag = row[1]
        # create the tag's list on first sight, then append
        codings.setdefault(tag, []).append(word)

# summary: number of words per tag
for category, terms in codings.items():
    print("%s - %d terms" % (category, len(terms)))

name - 50 terms
personal - 14 terms
stopword - 115 terms
descriptor - 25 terms
health - 16 terms
action - 29 terms


In [3]:
# use PyDictionary to broaden word categories by getting synonyms of all words in each category

to_expand = ['personal','descriptor','health','action']

# start every expanded category from a copy of its manually coded words
expanded_codings = {category: codings[category][:] for category in to_expand}

# The two exclusion sources below do not change between categories, so build them once
# instead of on every pass of the loop.

# if the word is already tagged something else in "codings", don't include it
allcodingswords = [word for sublist in codings.values() for word in sublist]

# if the word is in our precompiled list of stopwords (stop-words-english4.txt), don't include it
with open('stop-words-english4.txt', 'r') as f:
    stopwords = f.read().split()

for category in to_expand:

    dictionary = PyDictionary(expanded_codings[category])
    synonymlists = dictionary.getSynonyms(formatted=False)

    # "synonymlists" is a list of lists. each list is synonyms of a word in "terms"
    # dictionary.getSynonyms(formatted = True) would return a list of dicts

    # flatten list of lists into a list; skip empty/None entries
    # (NOTE(review): PyDictionary appears to yield None for words it cannot look up - confirm)
    synonyms = [word for sublist in synonymlists if sublist for word in sublist]
    synonyms = list(set(synonyms))  # remove duplicates

    # remove all words that are already in the current expanded lists
    # note that this biases word tags: if a new word is tagged both "personal" and "health",
    # the code decides to only tag it with "personal" and remove it from "health"
    # (this one must be rebuilt every pass because expanded_codings grows as we go)
    allwords = [word for sublist in expanded_codings.values() for word in sublist]

    # one set for membership tests instead of concatenating three lists per candidate word
    excluded = set(allwords)
    excluded.update(allcodingswords)
    excluded.update(stopwords)

    # remove all undesired words
    synonyms = [word for word in synonyms if word not in excluded]

    expanded_codings[category] += synonyms

for category, terms in expanded_codings.items():
    print("%s - %d terms" % (category, len(terms)))

# write to a CSV "expanded-word-tags.csv"
# going through the csv writer quotes any word containing a comma or quote character
# and encodes unicode words; lineterminator='\n' keeps the same line endings as before
with open('expanded-word-tags.csv', 'w') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(['word', 'tag'])

    for category in to_expand:
        for s in expanded_codings[category]:
            writer.writerow([s, category])

# check if there are any words in multiple categories
allwords = [word for sublist in expanded_codings.values() for word in sublist]  # flatten to a 1-level list

# single pass with a "seen" set rather than calling allwords.count() for every word
seen = set()
duplicated = set()
for word in allwords:
    if word in seen:
        duplicated.add(word)
    else:
        seen.add(word)
repeats = sorted(duplicated)

if len(repeats) == 0:
    print("No repeats!")
else:
    print("Repeated words: %s" % (repeats,))

personal - 73 terms
descriptor - 119 terms
health - 73 terms
action - 132 terms
No repeats!




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [4]:
# we want to manually check our new word tags: this is done in "expanded-word-tags-manual-edit.csv"
# so we re-import the words into a new dictionary, expanded_manual_codings,
# mapping each tag to its list of words:

expanded_manual_codings = {}

# a context manager guarantees the file handle is closed once all rows are read
with open('expanded-word-tags-manual-edit.csv', 'rU') as f:
    reader = csv.reader(f)
    next(reader, None)  # discard the first row (presumably the header); tolerate an empty file
    for row in reader:
        word = row[0]
        tag = row[1]
        # create the tag's list on first sight, then append
        expanded_manual_codings.setdefault(tag, []).append(word)

# summary: number of words per tag
for category, terms in expanded_manual_codings.items():
    print("%s - %d terms" % (category, len(terms)))

personal - 71 terms
stopword - 119 terms
health - 46 terms
action - 68 terms
descriptor - 93 terms


## Open and work with data

In [5]:
# Load the per-word frequency table. Later cells read its 'word', 'white', 'other',
# 'white pct' and 'other pct' columns.
df = pd.read_csv('data/word_freq_by_race.csv')

In [None]:
# words of the 200 rows with the largest values in the 'white' column
# open question: for some reason, #24048 gets put at the top?
top200white = df.sort_values(by='white', ascending=False)['word'].head(200)

In [None]:
# words of the 200 rows with the largest values in the 'other' column
top200other = df.sort_values(by='other', ascending=False)['word'].head(200)

In [None]:
# words in the "other" top 200 that do not also appear in the "white" top 200
also_in_white = top200other.isin(top200white)
top200other[~also_in_white]

## Analyze Category Use by Race

In [6]:
# using codings, no synonyms
# For each category, add up the 'white pct' and 'other pct' columns over the rows of df
# whose word belongs to that category.
pct_cols = ['white pct', 'other pct']

uses = {}
for category, terms in codings.items():
    # build the row mask once and sum only the two numeric columns we need
    # (avoids summing every column of df, including the string 'word' column)
    totals = df.loc[df['word'].isin(terms), pct_cols].sum()
    uses[category] = {'white': totals['white pct'],
                      'other': totals['other pct']}
uses

{u'action': {'other': 0.03791378, 'white': 0.038188879999999994},
 u'descriptor': {'other': 0.040160762, 'white': 0.04241355999999999},
 u'health': {'other': 0.028549151, 'white': 0.023718885999999995},
 u'name': {'other': 0.046217126, 'white': 0.038795537},
 u'personal': {'other': 0.019783949, 'white': 0.016239604999999997},
 u'stopword': {'other': 0.3200348539999997, 'white': 0.3064274950000001}}

In [7]:
# using expanded_codings, with synonyms
# For each category, add up the 'white pct' and 'other pct' columns over the rows of df
# whose word belongs to that category.
pct_cols = ['white pct', 'other pct']

uses = {}
for category, terms in expanded_codings.items():
    # build the row mask once and sum only the two numeric columns we need
    # (avoids summing every column of df, including the string 'word' column)
    totals = df.loc[df['word'].isin(terms), pct_cols].sum()
    uses[category] = {'white': totals['white pct'],
                      'other': totals['other pct']}
uses

{'action': {'other': 0.04337066300000003, 'white': 0.04362302499999999},
 'descriptor': {'other': 0.04620107200000006, 'white': 0.047628353000000005},
 'health': {'other': 0.030651998, 'white': 0.025786419999999984},
 'personal': {'other': 0.023033549000000014, 'white': 0.020322015999999988}}

In [8]:
# using expanded_manual_codings
# For each category, add up the 'white pct' and 'other pct' columns over the rows of df
# whose word belongs to that category.
pct_cols = ['white pct', 'other pct']

uses = {}
for category, terms in expanded_manual_codings.items():
    # build the row mask once and sum only the two numeric columns we need
    # (avoids summing every column of df, including the string 'word' column)
    totals = df.loc[df['word'].isin(terms), pct_cols].sum()
    uses[category] = {'white': totals['white pct'],
                      'other': totals['other pct']}
uses

{u'action': {'other': 0.04104851500000001, 'white': 0.04140602299999999},
 u'descriptor': {'other': 0.04442236800000004, 'white': 0.046455509},
 u'health': {'other': 0.030743599, 'white': 0.025870138999999986},
 u'personal': {'other': 0.022587996000000013, 'white': 0.019858655999999985},
 u'stopword': {'other': 0.004454803999999995, 'white': 0.003769487}}

## Count number of times each qb is mentioned in their corpora

In [None]:
# need a proxy for measuring "how athletic/good each qb is"
# "rushing yards" is sometimes used - plot rushing yards for each quarterback, color by white/nonwhite
# also see if you can find a source for forty-yard dash times - make the same chart as above

In [None]:
# count number of times each qb is mentioned (full name) in their own .txt file of sentences

#TODO: give me a bar chart of number of sentences per quarterback
#and also a chart white vs other, number of sentences divided by # of quarterbacks in each category

# forward slashes work on Windows as well as POSIX, matching the 'data/...' path used for df
corpora_dir = 'data/corpora'

# only consider .txt files, so stripping the 4-character extension below is always valid
qbfiles = [name for name in listdir(corpora_dir) if name.endswith('.txt')]
names = [name[:-4] for name in qbfiles]

# counts maps each file's base name to how many times that exact string occurs in the file
counts = {}

for n in names:
    filename = corpora_dir + '/' + n + '.txt'

    with open(filename, 'r') as f:
        alltext = f.read()

    counts[n] = alltext.count(n)


In [None]:
# get races of each quarterback according to qb_table
# (the third column of qb-table.csv becomes the index, so the index holds each row's race)
races = pd.read_csv('qb-table.csv', index_col=2)
allraces = list(set(races.index))

# Boolean row masks keep both results as flat lists of names, even when a race has a
# single row (label-based .loc[race]['qb_name'] returns a bare string in that case)
# and instead of collecting one whole Series per race.
is_white = races.index == 'white'

whiteqbs = races.loc[is_white, 'qb_name'].tolist()
nonwhiteqbs = races.loc[~is_white, 'qb_name'].tolist()

nonwhiteqbs