# Palabras clave (2ª versión)

## Cargar dataset generado

In [None]:
import json
import zlib

import config

# Load the zlib-compressed JSON dataset.
# The file is opened in binary mode because compressed data is raw bytes:
# text mode would try to decode it on Python 3 and may translate line
# endings on Windows, corrupting the stream before zlib sees it.
with open(config.DATASET_MUCHOCINE, 'rb') as fd:
    documents = json.loads(zlib.decompress(fd.read()))

# Sample document inspected by the cells that follow.
my_document = documents[1478]  # 'Harry Potter y la piedra filosofal'

In [None]:
# Show the title of the sample document. The parenthesised single-argument
# form prints the same text on Python 2 and Python 3.
print(my_document['title'])

## Calcular TF

In [None]:
from collections import Counter

# Term frequency: store on each document a Counter that maps every token of
# its body to the number of times that token occurs in it.
for doc in documents:
    tokens = doc['body_tokens']
    doc['TF'] = Counter(tokens)

In [None]:
# Ten most frequent terms of the sample document, as (term, count) pairs in
# descending order of count. 'TF' is a Counter, so most_common() replaces the
# manual sort (whose tuple-unpacking lambda is a syntax error on Python 3).
my_document['TF'].most_common(10)

## Calcular IDF

In [None]:
from collections import defaultdict
from math import log

# Document frequency: for every term, the number of documents it appears in.
term_docs = defaultdict(int)
for d in documents:
    # Iterating the TF mapping yields each distinct term of the document
    # exactly once, so a term is counted once per document.
    for term in d['TF']:
        term_docs[term] += 1

In [None]:
# Inverse document frequency: idf(term) = ln(N / df(term)), where N is the
# total number of documents and df(term) the number of documents containing
# the term.
# N is kept as a float so the ratio is a true division on Python 2 as well.
n_docs_total = float(len(documents))
idf_terms = {
    term: log(n_docs_total / n_docs)
    for term, n_docs in term_docs.items()  # .items() exists on Python 2 and 3
}

In [None]:
# Ten terms of the sample document with the highest IDF, as (term, idf)
# pairs in descending order of IDF. Terms without an IDF entry are skipped.
# Order among terms sharing the same IDF is unspecified.
my_document_idf = [
    (term, idf_terms[term])
    for term in my_document['TF']
    if term in idf_terms
]
sorted(my_document_idf, key=lambda pair: pair[1], reverse=True)[:10]

## Calcular [TF-IDF](https://es.wikipedia.org/wiki/Tf-idf)

In [None]:
# TF-IDF: weight each term's in-document frequency by its corpus-wide IDF
# and store the resulting {term: score} mapping on the document.
for d in documents:
    d['TFIDF'] = {
        term: freq * idf_terms[term]
        for term, freq in d['TF'].items()  # .items() exists on Python 2 and 3
    }

In [None]:
def get_keywords(tfidf, limit=10):
    """Return the highest-scoring terms of a TF-IDF mapping.

    Args:
        tfidf: Mapping of term -> TF-IDF score.
        limit: Maximum number of terms to return (defaults to 10).

    Returns:
        A list with at most ``limit`` terms, ordered by descending score.
    """
    # Index the (term, score) pair instead of unpacking it in the lambda
    # signature: tuple parameters are a syntax error on Python 3.
    ranked = sorted(tfidf.items(), key=lambda pair: pair[1], reverse=True)
    return [term for term, _ in ranked[:limit]]

def explore(title):
    """Print the keywords of every document whose title equals ``title``.

    Prints a header line with the title, then one line per matching
    document listing its keywords (as returned by ``get_keywords`` on the
    document's 'TFIDF' mapping) separated by commas. Nothing is returned.

    Args:
        title: Title to look up; compared for equality with each
            document's 'title' field.
    """
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("Título: {}".format(title))
    for d in documents:
        if title == d['title']:
            print(' ->  {}'.format(', '.join(get_keywords(d['TFIDF']))))

In [None]:
# Print the keyword lines for documents titled exactly as below
# (the title is passed as a unicode literal).
explore(u'Harry Potter y la piedra filosofal')

In [None]:
# Print the keyword lines for documents titled 'High School Musical'.
explore('High School Musical')

In [None]:
# Print the keyword lines for documents titled 'Los puentes de Madison'.
explore('Los puentes de Madison')

In [None]:
# Print the keyword lines for documents titled '2001, Odisea del espacio'.
explore('2001, Odisea del espacio')

In [None]:
# Print the keyword lines for documents titled 'Alien vs Predator 2'.
explore('Alien vs Predator 2')

In [None]:
# Print the keyword lines for documents titled 'Apocalypto'.
explore('Apocalypto')