# Limpieza de datos y preprocesado de textos

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import unicodedata,string

import config
PATH=config.DATASET_MUCHOCINE_RAW + "/"

def clean(x):
    """Normalize a piece of text for bag-of-words processing.

    Diacritics and every other non-ASCII character are removed, the text is
    lowercased, and each run of ASCII punctuation and/or whitespace
    (newlines and tabs included) is replaced by a single space.

    Args:
        x: Unicode text to clean.

    Returns:
        The cleaned text (a text string, not bytes): lowercase ASCII tokens
        separated by single spaces, without leading or trailing whitespace.
    """
    # NFKD splits an accented letter into base letter + combining mark; the
    # ASCII round-trip then drops the marks and any other non-ASCII symbol.
    ascii_text = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('ascii')
    # Punctuation and whitespace are both word separators. Handling them in
    # one character class also removes newlines, which the previous pattern
    # never matched because its class brackets were misplaced.
    separators = '[' + re.escape(string.punctuation) + r'\s]+'
    return re.sub(separators, ' ', ascii_text.lower()).strip()

def parseFile(filename):
    """Parse one review XML file into a cleaned, comma-separated record.

    Args:
        filename: Path of the XML file to read.

    Returns:
        The string "author,title,text", where each field has been passed
        through clean(), or an empty list when the file cannot be read or
        parsed (the error is printed instead of raised).
    """
    try:
        # Read raw bytes so BeautifulSoup can work out the document encoding
        # itself; the context manager closes the handle in every case.
        with open(filename, 'rb') as handle:
            soup = BeautifulSoup(handle.read(), "html.parser")
        review = soup.find("review")
        fields = [review["author"], review["title"], soup.body.get_text()]
        # The comma is the field separator, so this relies on clean()
        # stripping commas from each field.
        result = ','.join(clean(field) for field in fields)

    except Exception as e:
        # Best effort: a missing or malformed file must not abort a batch
        # run, so report the problem and return the empty marker.
        print(e)
        result = []

    return result

# Try the parser on a single review file (id 999) and show the result.
sample_file = PATH + str(999) + ".xml"
parseFile(sample_file)

In [None]:
# Parse the review files with ids 0 .. N_DOCS-1 into a DataFrame.
N_DOCS = 1000
mydict = {i: parseFile(PATH + str(i) + ".xml") for i in range(N_DOCS)}
# sorted() turns the items into a real list (a dict view is not a sequence
# on Python 3) and fixes the row order by id instead of relying on the
# iteration order of the dict.
df = pd.DataFrame(data=sorted(mydict.items()), columns=['id', 'text'])

# Every parsed record is "author,title,text": split it once and reuse the
# pieces for the three columns.
# NOTE(review): files that failed to parse hold [] instead of a string;
# presumably their pieces come out as NaN and are dropped below - confirm.
record_fields = df['text'].str.split(',')
df['author'] = record_fields.str[0]
df['title'] = record_fields.str[1]
df['text'] = record_fields.str[2]

# Discard reviews whose text is empty and rows with any missing field.
df = df[df["text"] != ""].dropna()

df.to_csv(PATH + "clean_reviews.csv", index=False)
df.head()

# Representación Bag of Words con TF-IDF

In [None]:
# One cleaned review text per document, in DataFrame row order.
# (For a quick experiment, a short list of literal sentences works as well.)
corpus = list(df["text"])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# max_df=0.3 ignores terms that occur in more than 30% of the documents;
# lowercase=False turns off the vectorizer's own case folding.
vectorizer = TfidfVectorizer(analyzer=u'word', use_idf=True, max_df=0.3, lowercase=False)
tfidf_matrix = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); try the new name first and fall back so the cell
# runs on both old and new releases.
try:
    features = list(vectorizer.get_feature_names_out())
except AttributeError:
    features = vectorizer.get_feature_names()

# Extracción de palabras clave

In [None]:
import operator
import numpy as np

def getKeywords(myvector, top_n, features):
    """Return the highest-weighted terms of a single document vector.

    Args:
        myvector: One-row matrix exposing ``toarray()`` that holds one
            weight per feature.
        top_n: Maximum number of terms to return.
        features: Sequence mapping a column index to its term.

    Returns:
        A list of at most ``top_n`` terms ordered by decreasing weight.
        Terms whose weight is zero are never returned, so the list is
        shorter than ``top_n`` for documents with few weighted terms.
    """
    myweights = myvector.toarray().flatten()
    best_first = np.argsort(myweights)[::-1][:top_n]
    # A zero weight means the term carries no weight in this document;
    # without this filter short documents were padded with arbitrary
    # vocabulary entries.
    return [features[i] for i in best_first if myweights[i] != 0]

N_KEYWORDS = 10


def keywords(myid):
    """Return the N_KEYWORDS keywords of the document stored at row `myid`."""
    # Slice instead of indexing so getKeywords receives a one-row matrix.
    document_row = tfidf_matrix[myid:myid + 1]
    return getKeywords(document_row, N_KEYWORDS, features)

# Show the keywords of the document at row 1. The parenthesised form is
# valid on both Python 2 and Python 3.
print(keywords(1))

In [None]:
# Materialise the per-row keyword lists: on Python 3 map() returns a lazy
# iterator, which is not a list of per-row values for the new column.
df["keywords"] = list(map(keywords, range(len(df))))

In [None]:
# Preview the first five rows, now including the keywords column.
df.head(n=5)

# Clustering

In [None]:
from sklearn.cluster import KMeans

num_clusters = 100
clustering = KMeans(n_clusters=num_clusters)
%time clustering.fit(tfidf_matrix)

clusters = clustering.labels_.tolist()

In [None]:
# Label every review with the cluster id it was assigned.
# NOTE(review): assumes `clusters` follows the same row order as df - confirm.
df["cluster"] = clusters
df.head()

In [None]:
# Reviews assigned to cluster 4.
df.loc[df["cluster"] == 4, :]

In [None]:
# Reviews assigned to cluster 1.
df.loc[df["cluster"] == 1, :]