# Palabras clave (2ª versión)

## Cargar dataset generado

In [1]:
import json
import zlib

import config

# Load the dataset: a zlib-compressed JSON file whose path comes from the
# project-local `config` module.
# The file is opened in binary mode because its content is compressed bytes:
# text mode can corrupt it (newline translation on Windows) and, on Python 3,
# would hand zlib a str instead of bytes.
with open(config.DATASET_MUCHOCINE, 'rb') as fd:
    documents = json.loads(zlib.decompress(fd.read()))

# Sample document used by the cells below to inspect intermediate results.
my_document = documents[1478]  # 'Harry Potter y la piedra filosofal'

In [2]:
# Sanity check: show the title of the sample document picked by index.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(my_document['title'])

Harry Potter y la piedra filosofal


## Calcular TF

In [3]:
from collections import Counter

# Term frequency: count how many times each token occurs in the document
# body and store the counts on the document under the 'TF' key.
for doc in documents:
    tokens = doc['body_tokens']
    doc['TF'] = Counter(tokens)

In [4]:
# Ten most frequent terms of the sample document, highest count first.
# Counter.most_common replaces the hand-rolled sort, whose tuple-unpacking
# lambda is Python-2-only syntax.
my_document['TF'].most_common(10)

[(u'pelicul', 4),
 (u'gust', 3),
 (u'primer', 2),
 (u'personaj', 2),
 (u'encant', 2),
 (u'fan', 2),
 (u'magic', 2),
 (u'cas', 2),
 (u'cre', 2),
 (u'harry', 2)]

## Calcular IDF

In [5]:
from collections import defaultdict
from math import log

# Document frequency: for every term, the number of documents it appears in.
# Each document contributes at most 1 per term because we iterate over the
# distinct keys of its 'TF' mapping, not over its tokens.
term_docs = defaultdict(int)
for doc in documents:
    for term in doc['TF']:
        term_docs[term] += 1

In [6]:
# Inverse document frequency per term: log(N / df), where N is the total
# number of documents and df the number of documents containing the term.
# N is kept as a float so the division is a true division on Python 2 too.
n_docs_total = float(len(documents))
# .items() is used instead of .iteritems(), which only exists on Python 2.
idf_terms = {term: log(n_docs_total / n_docs) for term, n_docs in term_docs.items()}

In [7]:
# Ten terms of the sample document with the highest IDF, i.e. the ones that
# appear in the fewest documents of the collection.
sorted(
    [
        (term, idf_terms[term])
        for term in set(my_document['TF']) & set(idf_terms)
    ],
    # Sort by the IDF value (second element of each pair); a plain indexing
    # lambda is used because tuple-unpacking lambdas are Python-2-only syntax.
    key=lambda pair: pair[1],
    reverse=True
)[:10]

[(u'hacerinfantiloid', 8.263074835802596),
 (u'magosl', 8.263074835802596),
 (u'warsl', 8.263074835802596),
 (u'estadopd2', 8.263074835802596),
 (u'bienbesitospd', 8.263074835802596),
 (u'canteresum', 8.263074835802596),
 (u'vej', 7.569927655242652),
 (u'hogarts', 7.164462547134487),
 (u'hagr', 6.653636923368497),
 (u'ale', 6.317164686747284)]

## Calcular [TF-IDF](https://es.wikipedia.org/wiki/Tf-idf)

In [8]:
# TF-IDF score of every term of every document: the term's count in the
# document multiplied by the term's IDF. The scores are stored on the
# document under the 'TFIDF' key.
# .items() is used instead of .iteritems(), which only exists on Python 2.
for doc in documents:
    doc['TFIDF'] = {term: count * idf_terms[term] for term, count in doc['TF'].items()}

In [9]:
def get_keywords(tfidf, n=10):
    """Return the `n` highest-scoring terms of a term -> score mapping.

    Args:
        tfidf: mapping from term to its numeric score.
        n: maximum number of terms to return (defaults to 10).

    Returns:
        A list of terms ordered by descending score. Terms with equal scores
        keep the relative order in which the mapping yields them (the sort is
        stable). The list is shorter than `n` when the mapping has fewer
        entries, and empty for an empty mapping.
    """
    # Index the (term, score) pair instead of unpacking it in the lambda
    # signature: tuple parameters are Python-2-only syntax.
    ranked = sorted(tfidf.items(), key=lambda pair: pair[1], reverse=True)
    # Slice before projecting so only the kept pairs are unpacked.
    return [term for term, _score in ranked[:n]]

def explore(title):
    """Print the top keywords of every document whose title equals `title`.

    Prints a header line with the title, then one line per matching entry of
    the global `documents` list, holding the comma-separated result of
    `get_keywords` for that document's 'TFIDF' scores. Only the header is
    printed when no document matches.

    Args:
        title: title to look up; compared with `==` against each document's
            'title' value.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # unlike the print statement, which is a syntax error on Python 3.
    print("Título: {}".format(title))
    for doc in documents:
        if title == doc['title']:
            print(' ->  {}'.format(', '.join(get_keywords(doc['TFIDF']))))

In [10]:
# Keywords of every document with this title (one output line per document).
explore(u'Harry Potter y la piedra filosofal')

Título: Harry Potter y la piedra filosofal
 ->  mag, inan, snap, voldemort, nios, reforz, fantasi, huerfan, cuaj, magi
 ->  hacerinfantiloid, warsl, magosl, estadopd2, bienbesitospd, canteresum, pott, fan, vej, hogarts


In [11]:
# Keywords of every document titled 'High School Musical'.
explore('High School Musical')

Título: High School Musical
 ->  disc, gastronteritis, meditant, disney, herbi, duff, chavalit, ostra, revolc, ejem
 ->  channel, school, programacion, high, infantilciment, aptisim, unidosel, dinty, juvenil, coreografi


In [12]:
# Keywords of every document titled 'Los puentes de Madison'.
explore('Los puentes de Madison')

Título: Los puentes de Madison
 ->  amor, espiritual, renuev, cruc, bla, desarraig, francesc, panfletari, conserv, famili


In [13]:
# Keywords of every document titled '2001, Odisea del espacio'.
explore('2001, Odisea del espacio')

Título: 2001, Odisea del espacio
 ->  2001, kubrick, strauss, moonwatch, monolit, bowm, astronaut, 1977, primit, inedit


In [14]:
# Keywords of every document titled 'Alien vs Predator 2'; several documents
# share this title, so several keyword lines are printed.
explore('Alien vs Predator 2')

Título: Alien vs Predator 2
 ->  terrorcon, 300rod, pestiotod, streus, clipl, rescuem, aburrid, super, predators, doblet
 ->  predator, avp, xenoform, predali, ali, nav, straus, versus, aliens, extraterrestr
 ->  predator, cutr, vs, ali, evitentel, telivision, cumpliri, contien, patet, teen
 ->  predator, aliens, vs, ali, requiem, depred, ident, jeunet, straus, ripley
 ->  straus, depred, videojueg, ali, silvestri, predator, constantin, vs, pearl, aliens
 ->  straus, vs, depred, version, poseenali, avpla, 126, pasajerordqu, dolaresinevit, 171
 ->  depred, aliens, ud, nav, nom, ali, pueblit, convoc, parasit, hellip
 ->  aliens, depred, raz, anchas, exact, cazarlosel, comicbas, exprimamosl, peleal, peliculapart
 ->  cuartill, jej, aliens, tont, queresum, rubickel, superoriginal, pantallasin, fiessst, screeners
 ->  predator, ali, mentiri, mojon, hug, mach, sanchez, tierr, jugab, ciruel


In [15]:
# Keywords of every document titled 'Apocalypto'; several documents share
# this title, so several keyword lines are printed.
explore('Apocalypto')

Título: Apocalypto
 ->  may, nativ, provinci, geograf, apocalypt, crist, crei, aportari, autosacrifici, excasez
 ->  apocalypt, gibson, may, caz, civilizacion, guerrer, jagu, persecucion, cazador, capturaronreal
 ->  gibson, jagu, mel, may, apocalypt, moj, cultur, vais, sacrifici, crist
 ->  aborig, apocalypt, gibson, jabali, aborigen, may, aventur, variabl, espesur, colect
 ->  civilizacion, personajesaunqu, caractetiz, cintatampoc, extremezc, violent, histor, espereis, leg, decliv
 ->  espectaculom, subtituloshay, vezn, aztec, tub, son, entiend, aviv, brasile, bendit
 ->  gibson, jagu, max, apocalypt, mel, selv, salvaj, cazador, bosquey, seml
 ->  gibson, aztec, excedent, metrajeconoc, australianocu, sacrificiosl, inhabit, espectadorper, abc, pid
 ->  may, alde, libeeeertaa, rarosresum, pocol, familiahac, destru, grab, trepid, cautiveri
 ->  mel, indigen, musculatur, jagu, creenciasv, elementalesm, masal, facilel, seventh, cintascont
 ->  apocalypt, gibson, mel, temporal, fotogramal,