In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import nltk
import gensim



In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### 1. importamos el dataset

In [4]:
df_sentiments = pd.read_csv("sentiments.csv" , delimiter=";")

In [5]:
df_sentiments.head()

Unnamed: 0,Sentiments,Unnamed: 1
0,The Da Vinci Code book is just awesome.,
1,this was the first clive cussler i've ever rea...,
2,i liked the Da Vinci Code a lot.,
3,i liked the Da Vinci Code a lot.,
4,I liked the Da Vinci Code but it ultimatly did...,


In [6]:
df_sentiments = df_sentiments[["Sentiments"]]

In [7]:
df_sentiments.head()

Unnamed: 0,Sentiments
0,The Da Vinci Code book is just awesome.
1,this was the first clive cussler i've ever rea...
2,i liked the Da Vinci Code a lot.
3,i liked the Da Vinci Code a lot.
4,I liked the Da Vinci Code but it ultimatly did...


In [8]:
df_sentiments.shape

(6918, 1)

#### 2. Tokenizacion

In [9]:
df_sentiments["words_tk"] = df_sentiments.apply(lambda row: nltk.word_tokenize(row["Sentiments"]) , axis=1)

In [10]:
df_sentiments.head()

Unnamed: 0,Sentiments,words_tk
0,The Da Vinci Code book is just awesome.,"[The, Da, Vinci, Code, book, is, just, awesome..."
1,this was the first clive cussler i've ever rea...,"[this, was, the, first, clive, cussler, i, 've..."
2,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]"
3,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]"
4,I liked the Da Vinci Code but it ultimatly did...,"[I, liked, the, Da, Vinci, Code, but, it, ulti..."


In [11]:
df_sentiments["tokens"] = df_sentiments.apply(lambda row: len(row["words_tk"]) , axis=1)

In [12]:
df_sentiments.head()

Unnamed: 0,Sentiments,words_tk,tokens
0,The Da Vinci Code book is just awesome.,"[The, Da, Vinci, Code, book, is, just, awesome...",9
1,this was the first clive cussler i've ever rea...,"[this, was, the, first, clive, cussler, i, 've...",27
2,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]",9
3,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]",9
4,I liked the Da Vinci Code but it ultimatly did...,"[I, liked, the, Da, Vinci, Code, but, it, ulti...",18


#### 3. Remocion de Stop Words

In [13]:
stop_words = set(nltk.corpus.stopwords.words("english"))

In [14]:
def stopWords(words_list):
    filtered_sentence = []
    for w in words_list:
        if w not in stop_words:
            filtered_sentence.append(w)
    return filtered_sentence

In [15]:
df_sentiments["words_filtered"] = df_sentiments.apply(lambda row : stopWords(row["words_tk"]) , axis=1)

In [16]:
df_sentiments.head()

Unnamed: 0,Sentiments,words_tk,tokens,words_filtered
0,The Da Vinci Code book is just awesome.,"[The, Da, Vinci, Code, book, is, just, awesome...",9,"[The, Da, Vinci, Code, book, awesome, .]"
1,this was the first clive cussler i've ever rea...,"[this, was, the, first, clive, cussler, i, 've...",27,"[first, clive, cussler, 've, ever, read, ,, ev..."
2,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]",9,"[liked, Da, Vinci, Code, lot, .]"
3,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]",9,"[liked, Da, Vinci, Code, lot, .]"
4,I liked the Da Vinci Code but it ultimatly did...,"[I, liked, the, Da, Vinci, Code, but, it, ulti...",18,"[I, liked, Da, Vinci, Code, ultimatly, n't, se..."


In [17]:
df_sentiments["tokens_word_filtered"] = df_sentiments.apply(lambda row: len(row["words_filtered"]) , axis=1)

In [18]:
df_sentiments.head()

Unnamed: 0,Sentiments,words_tk,tokens,words_filtered,tokens_word_filtered
0,The Da Vinci Code book is just awesome.,"[The, Da, Vinci, Code, book, is, just, awesome...",9,"[The, Da, Vinci, Code, book, awesome, .]",7
1,this was the first clive cussler i've ever rea...,"[this, was, the, first, clive, cussler, i, 've...",27,"[first, clive, cussler, 've, ever, read, ,, ev...",17
2,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]",9,"[liked, Da, Vinci, Code, lot, .]",6
3,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]",9,"[liked, Da, Vinci, Code, lot, .]",6
4,I liked the Da Vinci Code but it ultimatly did...,"[I, liked, the, Da, Vinci, Code, but, it, ulti...",18,"[I, liked, Da, Vinci, Code, ultimatly, n't, se...",11


#### 4. Segundo stop removing

In [19]:
df_sentiments["words_tk_relevant"] = df_sentiments.apply(lambda row: [x for x in row["words_filtered"] if len(x) > 2] , axis=1)

In [20]:
df_sentiments.head()

Unnamed: 0,Sentiments,words_tk,tokens,words_filtered,tokens_word_filtered,words_tk_relevant
0,The Da Vinci Code book is just awesome.,"[The, Da, Vinci, Code, book, is, just, awesome...",9,"[The, Da, Vinci, Code, book, awesome, .]",7,"[The, Vinci, Code, book, awesome]"
1,this was the first clive cussler i've ever rea...,"[this, was, the, first, clive, cussler, i, 've...",27,"[first, clive, cussler, 've, ever, read, ,, ev...",17,"[first, clive, cussler, 've, ever, read, even,..."
2,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]",9,"[liked, Da, Vinci, Code, lot, .]",6,"[liked, Vinci, Code, lot]"
3,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]",9,"[liked, Da, Vinci, Code, lot, .]",6,"[liked, Vinci, Code, lot]"
4,I liked the Da Vinci Code but it ultimatly did...,"[I, liked, the, Da, Vinci, Code, but, it, ulti...",18,"[I, liked, Da, Vinci, Code, ultimatly, n't, se...",11,"[liked, Vinci, Code, ultimatly, n't, seem, hold]"


In [21]:
df_sentiments["tokens_words_relevant"] = df_sentiments.apply(lambda row: len(row["words_tk_relevant"]) , axis=1)

In [22]:
df_sentiments.head()

Unnamed: 0,Sentiments,words_tk,tokens,words_filtered,tokens_word_filtered,words_tk_relevant,tokens_words_relevant
0,The Da Vinci Code book is just awesome.,"[The, Da, Vinci, Code, book, is, just, awesome...",9,"[The, Da, Vinci, Code, book, awesome, .]",7,"[The, Vinci, Code, book, awesome]",5
1,this was the first clive cussler i've ever rea...,"[this, was, the, first, clive, cussler, i, 've...",27,"[first, clive, cussler, 've, ever, read, ,, ev...",17,"[first, clive, cussler, 've, ever, read, even,...",13
2,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]",9,"[liked, Da, Vinci, Code, lot, .]",6,"[liked, Vinci, Code, lot]",4
3,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]",9,"[liked, Da, Vinci, Code, lot, .]",6,"[liked, Vinci, Code, lot]",4
4,I liked the Da Vinci Code but it ultimatly did...,"[I, liked, the, Da, Vinci, Code, but, it, ulti...",18,"[I, liked, Da, Vinci, Code, ultimatly, n't, se...",11,"[liked, Vinci, Code, ultimatly, n't, seem, hold]",7


In [23]:
df_sentiments["words_tk_relevant_joined"] = df_sentiments.apply(lambda row: " ".join(row["words_tk_relevant"]) , axis=1)

In [24]:
df_sentiments.head()

Unnamed: 0,Sentiments,words_tk,tokens,words_filtered,tokens_word_filtered,words_tk_relevant,tokens_words_relevant,words_tk_relevant_joined
0,The Da Vinci Code book is just awesome.,"[The, Da, Vinci, Code, book, is, just, awesome...",9,"[The, Da, Vinci, Code, book, awesome, .]",7,"[The, Vinci, Code, book, awesome]",5,The Vinci Code book awesome
1,this was the first clive cussler i've ever rea...,"[this, was, the, first, clive, cussler, i, 've...",27,"[first, clive, cussler, 've, ever, read, ,, ev...",17,"[first, clive, cussler, 've, ever, read, even,...",13,first clive cussler 've ever read even books l...
2,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]",9,"[liked, Da, Vinci, Code, lot, .]",6,"[liked, Vinci, Code, lot]",4,liked Vinci Code lot
3,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]",9,"[liked, Da, Vinci, Code, lot, .]",6,"[liked, Vinci, Code, lot]",4,liked Vinci Code lot
4,I liked the Da Vinci Code but it ultimatly did...,"[I, liked, the, Da, Vinci, Code, but, it, ulti...",18,"[I, liked, Da, Vinci, Code, ultimatly, n't, se...",11,"[liked, Vinci, Code, ultimatly, n't, seem, hold]",7,liked Vinci Code ultimatly n't seem hold


#### 5. Word2Vec Process

In [29]:
model_w2v = gensim.models.Word2Vec(df_sentiments.words_tk_relevant.values.tolist() , min_count=0 , vector_size=5 , sg=1)


In [30]:
def avgVectorSentece(list_word, model):
    vector = [model_w2v.wv[x] for x in list_word]
    return sum(vector)/len(vector)

In [31]:
df_sentiments["result_w2v"] = df_sentiments.apply(lambda row: avgVectorSentece(row["words_tk_relevant"] , model_w2v) , axis=1)


In [33]:
df_sentiments.head()

Unnamed: 0,Sentiments,words_tk,tokens,words_filtered,tokens_word_filtered,words_tk_relevant,tokens_words_relevant,words_tk_relevant_joined,result_w2v
0,The Da Vinci Code book is just awesome.,"[The, Da, Vinci, Code, book, is, just, awesome...",9,"[The, Da, Vinci, Code, book, awesome, .]",7,"[The, Vinci, Code, book, awesome]",5,The Vinci Code book awesome,"[0.25692824, 1.6083925, -0.08842452, -0.454953..."
1,this was the first clive cussler i've ever rea...,"[this, was, the, first, clive, cussler, i, 've...",27,"[first, clive, cussler, 've, ever, read, ,, ev...",17,"[first, clive, cussler, 've, ever, read, even,...",13,first clive cussler 've ever read even books l...,"[0.3124121, 1.0235685, 0.48780698, -0.32405186..."
2,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]",9,"[liked, Da, Vinci, Code, lot, .]",6,"[liked, Vinci, Code, lot]",4,liked Vinci Code lot,"[0.9609831, 1.4148765, 0.42268047, -0.05549311..."
3,i liked the Da Vinci Code a lot.,"[i, liked, the, Da, Vinci, Code, a, lot, .]",9,"[liked, Da, Vinci, Code, lot, .]",6,"[liked, Vinci, Code, lot]",4,liked Vinci Code lot,"[0.9609831, 1.4148765, 0.42268047, -0.05549311..."
4,I liked the Da Vinci Code but it ultimatly did...,"[I, liked, the, Da, Vinci, Code, but, it, ulti...",18,"[I, liked, Da, Vinci, Code, ultimatly, n't, se...",11,"[liked, Vinci, Code, ultimatly, n't, seem, hold]",7,liked Vinci Code ultimatly n't seem hold,"[0.56878424, 1.1457283, 0.37263504, 0.05806257..."


#### 6. Clustering de vectores de oraciones

In [34]:
kmeans = KMeans(n_clusters=400 , random_state=1).fit(df_sentiments.result_w2v.tolist())

In [35]:
df_sentiments["predicted_cluster"] = df_sentiments.apply(lambda row: kmeans.predict([row["result_w2v"].tolist()])[0] , axis=1)

In [40]:
df_sentiments.sort_values("predicted_cluster").head(40)

Unnamed: 0,Sentiments,words_tk,tokens,words_filtered,tokens_word_filtered,words_tk_relevant,tokens_words_relevant,words_tk_relevant_joined,result_w2v,predicted_cluster
77,i LOVE the da vinci code!,"[i, LOVE, the, da, vinci, code, !]",7,"[LOVE, da, vinci, code, !]",5,"[LOVE, vinci, code]",3,LOVE vinci code,"[1.429567, 1.2159837, -0.18167357, -1.0443093,...",0
5078,Harry Potter sucks.,"[Harry, Potter, sucks, .]",4,"[Harry, Potter, sucks, .]",4,"[Harry, Potter, sucks]",3,Harry Potter sucks,"[-0.57239026, 1.1760925, 1.2163883, -0.7332012...",1
5135,Harry Potter sucks.,"[Harry, Potter, sucks, .]",4,"[Harry, Potter, sucks, .]",4,"[Harry, Potter, sucks]",3,Harry Potter sucks,"[-0.57239026, 1.1760925, 1.2163883, -0.7332012...",1
5122,Harry Potter sucks.,"[Harry, Potter, sucks, .]",4,"[Harry, Potter, sucks, .]",4,"[Harry, Potter, sucks]",3,Harry Potter sucks,"[-0.57239026, 1.1760925, 1.2163883, -0.7332012...",1
5144,. Harry Potter sucks..,"[., Harry, Potter, sucks, ..]",5,"[., Harry, Potter, sucks, ..]",5,"[Harry, Potter, sucks]",3,Harry Potter sucks,"[-0.57239026, 1.1760925, 1.2163883, -0.7332012...",1
5069,Harry Potter sucks.,"[Harry, Potter, sucks, .]",4,"[Harry, Potter, sucks, .]",4,"[Harry, Potter, sucks]",3,Harry Potter sucks,"[-0.57239026, 1.1760925, 1.2163883, -0.7332012...",1
5065,Harry Potter sucks.,"[Harry, Potter, sucks, .]",4,"[Harry, Potter, sucks, .]",4,"[Harry, Potter, sucks]",3,Harry Potter sucks,"[-0.57239026, 1.1760925, 1.2163883, -0.7332012...",1
6664,So Brokeback Mountain was really depressing.,"[So, Brokeback, Mountain, was, really, depress...",7,"[So, Brokeback, Mountain, really, depressing, .]",6,"[Brokeback, Mountain, really, depressing]",4,Brokeback Mountain really depressing,"[0.13791016, 1.0707715, 0.33869937, -1.2424672...",2
6111,I'd just like to say that Brokeback Mountain i...,"[I, 'd, just, like, to, say, that, Brokeback, ...",15,"[I, 'd, like, say, Brokeback, Mountain, depres...",9,"[like, say, Brokeback, Mountain, depressing, m...",6,like say Brokeback Mountain depressing movie,"[0.13858767, 1.1235873, 0.26044187, -1.0612671...",2
6754,So Brokeback Mountain was really depressing.,"[So, Brokeback, Mountain, was, really, depress...",7,"[So, Brokeback, Mountain, really, depressing, .]",6,"[Brokeback, Mountain, really, depressing]",4,Brokeback Mountain really depressing,"[0.13791016, 1.0707715, 0.33869937, -1.2424672...",2


In [37]:
df_sentiments.shape

(6918, 10)

In [38]:
#df_sentiments[["Sentiments" , "words_tk_relevant_joined" , "predicted_cluster"]].to_csv("Clustering_Sentiments.csv" , sep="\t" , header=True , index=False)