Asociación de Palabras

In [68]:
from xml.dom import minidom
xmldoc = minidom.parse('ap/ap.xml')
doclist = xmldoc.getElementsByTagName('DOC')

docs = {}
corpus_text = ''

for doc in doclist:
    docno = doc.getElementsByTagName('DOCNO')[0].firstChild.data.strip()
    text = doc.getElementsByTagName('TEXT')[0].firstChild.data.strip()
    docs[docno] = text
    corpus_text += text

In [10]:
from nltk.corpus import wordnet as wn

def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']

def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']

def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']

def penn_to_wn(tag):
    if is_adjective(tag):
        return wn.ADJ
    elif is_noun(tag):
        return wn.NOUN
    elif is_adverb(tag):
        return wn.ADV
    elif is_verb(tag):
        return wn.VERB
    return None

In [45]:
import re

cList = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "I'd've": "I would have",
  "I'll": "I will",
  "I'll've": "I will have",
  "I'm": "I am",
  "I've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you you will",
  "you'll've": "you you will have",
  "you're": "you are",
  "you've": "you have"
}

caseInsCList = {}

for key, value in cList.items():
    caseInsCList[key.lower()] = value

c_re = re.compile('(%s)' % '|'.join(caseInsCList.keys()), re.IGNORECASE)

def expandContractions(text, c_re=c_re):
    def replace(match):
        return caseInsCList[match.group(0).lower()]
    return c_re.sub(replace, text)

In [177]:
import nltk
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords

from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

def lemmatize_with_tag(word, tag):
    wordType = penn_to_wn(tag)
    if wordType is None:
        return None
    return wordnet_lemmatizer.lemmatize(word, pos=wordType)

vocabulary = {}
corpus_tokens = []
corpus_processed_tokens = []

for docno, text in docs.items():
    sents = sent_detector.tokenize(text)
    for sent in sents:
        sent = expandContractions(sent)
        tokens = tokenizer.tokenize(sent)
        corpus_tokens += tokens
        token_with_tags = nltk.pos_tag(tokens)
        for token_with_tag in token_with_tags:
            word = lemmatize_with_tag(token_with_tag[0], token_with_tag[1])
            if word is not None and word not in stopwords.words('english'):
                corpus_processed_tokens.append(word)
                if word in vocabulary:
                    vocabulary[word] += 1
                else:
                    vocabulary[word] = 1

In [167]:
print(len(vocabulary))

37118


In [168]:
import operator
sorted_vocabulary = sorted(vocabulary.items(), key=operator.itemgetter(1))
print(sorted_vocabulary)



In [169]:
the500 = list(map(lambda x: x[0], sorted_vocabulary[-500:]))[::-1]
print(the500)

['say', 'year', 'percent', 'people', 'also', 'government', 'make', 'U.S.', 'official', 'report', 'last', 'go', 'take', '_', 'state', 'time', 'Bush', 'include', 'new', 'get', 'day', 'first', 'Soviet', 'tell', 'week', 'New', 'United', 'company', 'use', 'call', 'work', 'month', 'country', 'give', 'come', 'today', 'plan', 'president', 'police', 'Thursday', 'American', 'group', 'States', 'Tuesday', 'market', 'member', 'price', 'President', 'issue', 'Monday', 'high', 'Friday', 'force', 'Wednesday', 'know', 'think', 'want', 'begin', 'find', 'charge', 'leader', 'home', 'end', 'York', 'leave', 'show', 'hold', 'case', 'program', 'kill', 'good', 'help', 'military', 'support', 'increase', 'nation', 'House', 'right', 'see', 'federal', 'late', 'order', 'many', 'vote', 'city', 'talk', 'ask', 'expect', 'rate', 'part', 'political', 'former', 'way', 'spokesman', 'campaign', 'party', 'court', 'pay', 'news', 'rise', 'trade', 'close', 'law', 'Department', 'move', 'business', 'back', 'early', 'try', 'Nation

In [188]:
from nltk.collocations import BigramCollocationFinder
bigram_measures = nltk.collocations.BigramAssocMeasures()

def getBestPairContaining(word, tokens):
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_ngram_filter(lambda w1, w2: word not in (w1, w2))
    return finder.nbest(bigram_measures.pmi, 1)[0]

In [189]:
for word in the500:
    print(word)
    print(getBestPairContaining(word, corpus_tokens))

say
('Demographers', 'say')
year
('hectic', 'year')
percent
('1.0', 'percent')
people
('1,008', 'people')
also
('Flames', 'also')
government
('Communist-dominated', 'government')
make
('_Quickly', 'make')
U.S.
('U.S.', 'warship')
official
('embezzeling', 'official')
report
('28-page', 'report')
last
('120.30', 'last')
go
('dared', 'go')
take
('breeze', 'take')
_
('1,205.75', '_')
state
('10,520', 'state')
time
('daylight-saving', 'time')
Bush
('Bush', 'affectionately')
include
('Exceptions', 'include')
new
('11,300', 'new')
get
('get', 'Zhelyu')
day
('155th', 'day')
first
('first', 'all-woman')
Soviet
('Soviet', 'Lunakod')
tell
('tell', 'ya')
week
('fitful', 'week')
New
('New', 'Delhi')
United
('United', 'Technologies')
company
('Disney-controlled', 'company')
use
('Condom', 'use')
call
('intimates', 'call')
work
('work', 'stoppage')
month
('90th', 'month')
country
('democratized', 'country')
give
('give', 'continuity')
come
('dares', 'come')
today
('1.7695', 'today')
plan
('cash-for-f

('exploitable', 'source')
union
('1.6-million-member', 'union')
never
('never', 'ceases')
appear
('Hannah', 'appear')
small
('circumnavigating', 'small')
prison
('22-year', 'prison')
employee
('employee', 'stock-ownership')
strike
('four-week', 'strike')
turn
('turn', 'soliders')
Co.
('Co.', 'Amplifying')
Committee
('Anti-Discrimination', 'Committee')
write
('write', 'Rosty')
reach
('reach', '200,000-250,000')
action
('action', 'adventures')
City
('Air-Conditioned', 'City')
chairman
('chairman', 'K.F')
involve
('involve', 'trimming')
less
('less', 'abrasive')
budget
("'91", 'budget')
estimate
('degradation', 'estimate')
aid
('15-billion', 'aid')
State
('Brushy', 'State')
June
('June', '10-17')
board
('Scrabble', 'board')
March
('Closes', 'March')
general
('four-star', 'general')
concern
('deep-seated', 'concern')
interview
('14-minute', 'interview')
committee
('ministerial', 'committee')
German
('1.5245', 'German')
buy
('buy', 'hard-to-get')
Japan
('Japan', 'Chambaer')
Ms.
('Ms.', 'Wol

In [190]:
for word in the500:
    print(word)
    print(getBestPairContaining(word, corpus_processed_tokens))

say
('Buth', 'say')
year
('1-to-10', 'year')
percent
('DK-NA', 'percent')
people
('Funerals', 'people')
also
('14-10', 'also')
government
('Bhutto', 'government')
make
("'80s", 'make')
U.S.
('1869-1951', 'U.S.')
official
('Baldus', 'official')
report
('28-page', 'report')
last
('Arcadia', 'last')
go
('Automakers', 'go')
take
('Castel', 'take')
_
('1981-86', '_')
state
('Moslem-majority', 'state')
time
('fainter', 'time')
Bush
('257-176', 'Bush')
include
('Assts', 'include')
new
('Cupid', 'new')
get
('12-1', 'get')
day
('million-share', 'day')
first
('Bridgeton', 'first')
Soviet
('Approval', 'Soviet')
tell
('Conti', 'tell')
week
('Briefs', 'week')
New
('Lauder', 'New')
United
('United', 'Technologies')
company
('Disney-controlled', 'company')
use
('Adamses', 'use')
call
('212-587-1111', 'call')
work
('Deidre', 'work')
month
('Antall', 'month')
country
('Centres', 'country')
give
('431-355', 'give')
come
('Condemnations', 'come')
today
('Civilians', 'today')
plan
('cash-for-food', 'plan'

('Tchibanga', 'small')
prison
('22-year', 'prison')
employee
('10-million-share', 'employee')
strike
('Sympathy', 'strike')
turn
('Ballots', 'turn')
Co.
('Co.', 'Amplifying')
Committee
('Anti-Discrimination', 'Committee')
write
('Barkett', 'write')
reach
('Campomanas', 'reach')
action
('Sylmar', 'action')
City
('City', 'Poros')
chairman
('Ayres', 'chairman')
involve
('SII', 'involve')
less
('Broadside', 'less')
budget
("'91", 'budget')
estimate
('Cap-Haitien', 'estimate')
aid
('15-billion', 'aid')
State
('Armacost', 'State')
June
('June', '16,100-foot')
board
('Bross', 'board')
March
('Closes', 'March')
general
('four-star', 'general')
concern
('1968', 'concern')
interview
('14-minute', 'interview')
committee
('40-man', 'committee')
German
('Budweis', 'German')
buy
('Dominguez', 'buy')
Japan
('Atsugi', 'Japan')
Ms.
('Drenkmann', 'Ms.')
file
('Gregorie', 'file')
recent
('APS', 'recent')
study
('NASA-funded', 'study')
Court
('Chancery', 'Court')
require
('Corridor', 'require')
refuse
('H