In [54]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import gensim
from IPython.display import display, HTML

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


 **Load data file**

In [55]:
data = pd.read_csv('training.csv', sep=",", encoding = "ISO-8859-1", names=["ID", "DATE", "QUERY", "NAME", "COMMENT"], error_bad_lines=False)

In [56]:
data.head()

Unnamed: 0,ID,DATE,QUERY,NAME,COMMENT
0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [58]:
x_raw = data["COMMENT"]
x_raw = x_raw.values.tolist()
x_raw = [a.replace("just", "") for a in x_raw]
x_raw = [a.replace("good", "") for a in x_raw]
x_raw = [a.replace("like", "") for a in x_raw]
x_raw = [a.replace("today", "") for a in x_raw]
x_raw = [a.replace("day", "") for a in x_raw]
x_raw = [a.replace("got", "") for a in x_raw]
x_raw = [a.replace("don", "") for a in x_raw]
x_raw = [a.replace("quot", "") for a in x_raw]

**Preprocess**

In [35]:
#Preprocess corpus
import string
from string import punctuation
from nltk.corpus import stopwords

def preprocess(text):
    # word tokenisation, including punctuation removal
    tokenizer = RegexpTokenizer(r'\w+')

    # lowercasing
    text = [t.lower() for t in text]

    # stopword removal
    stop = set(stopwords.words('english'))
    text = [t for t in text if t not in stop]
    
    # lemmatisation
    lemmatiser = WordNetLemmatizer()
    text = [lemmatiser.lemmatize(t) for t in text]

    # remove numbers and empty space
    digits = ' 0123456789'
    text = [t for t in text if t not in digits]

    #tokenize sentences
    for sent in text:
        yield(gensim.utils.simple_preprocess(str(sent)))
        
    return text

**Define Model and parameters**

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

model = ['LDA', 'NMF']
vectorizer = ['count', 'tfidf']

def topic_model(data, model, vectorizer, n_topics=10):
    """
    model: method for topic model
    vectorizer: method for representation of words
    n_topics = number of topics
    """

    if vectorizer == 'count':
        vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
        x_train = vect.fit_transform(x)

    if vectorizer == 'tfidf':
        vect = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')
        x_train = vect.fit_transform(x)

    if model == 'LDA':
        LDA = LatentDirichletAllocation(n_components=n_topics)
        _model = LDA.fit(x_train)
        print('Done!')
    
    if model == 'NMF':
        nmf = NMF(n_components=n_topics, init= 'nndsvd')
        _model = nmf.fit(x_train)
        print('Done!')

    return _model, vect, x_train

**Prepare data to train**

In [83]:
x = list(preprocess(x_raw))
bigram = gensim.models.Phrases(x, min_count=2, threshold=40)
bigram = gensim.models.phrases.Phraser(bigram)
#trigram = gensim.models.Phrases(bigram[x], threshold=30)  
#x = [trigram[bigram[line]] for line in x]
x = [bigram[line] for line in x]
x = [' '.join(i) for i in x]


In [87]:
print('Training...')
LDA, lda_vect, lda_xtrain = topic_model(x, model='LDA', vectorizer='count')
#LDA.get_params()
nmf, nmf_vect, nmf_xtrain = topic_model(x, model='NMF', vectorizer='tfidf')
#nmf.get_params()

Training...
Done!
Done!


**Show topics infered**

In [89]:
def create_topics_table(model, vectorizer, n_words):
    keywords = np.array(vectorizer.get_feature_names())
    topics = []
    for weights in model.components_:
        top_index = (-weights).argsort()[:n_words]
        #topic = keywords.take(top_index)
        #w = np.sort(-weights)[:5]
        #w = w.round(1)
        #w = w.astype(str)
        #ind = map(' '.join, zip(topic, w))
        #topics.append(ind)
        topics.append(keywords.take(top_index))
    return topics

print('Top 10 words for each topic with LDA')
topic_keywords = create_topics_table(model=LDA, vectorizer=lda_vect, n_words=10)       
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
display(df_topic_keywords)

print('Top 10 words for each topic with NMF')
topic_keywords2 = create_topics_table(model=nmf, vectorizer=nmf_vect, n_words=10)
df_topic_keywords2 = pd.DataFrame(topic_keywords2)
df_topic_keywords2.columns = ['Word '+str(i) for i in range(df_topic_keywords2.shape[1])]
df_topic_keywords2.index = ['Topic '+str(i) for i in range(df_topic_keywords2.shape[0])]
display(df_topic_keywords2)



Top 10 words for each topic with LDA


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,haha,oh,hope,yay,great,love,awesome,thank,ll,better
Topic 1,sleep,sad,think,im,feel,going,bored,rain,feeling,bed
Topic 2,new,thanks,love,just,music,listening,song,great,coming,ll
Topic 3,work,weekend,morning,going,tomorrow,time,hate,ll,week,fun
Topic 4,night,tomorrow,com,hope,sorry,good,know,tonight,great,fun
Topic 5,lol,im,sun,going,wish,come,yeah,happy,play,tho
Topic 6,twitter,http,com,thanks,yes,lol,follow,know,people,bit_ly
Topic 7,home,time,com,http_twitpic,amp,just,need,car,mom,coffee
Topic 8,want,watching,miss,watch,love,lt,bit,movie,finished,tv
Topic 9,really,hair,love,make,eat,know,lol,amp,ve,face


Top 10 words for each topic with NMF


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,time,know,lol,im,twitter,think,oh,ll,hope,feel
Topic 1,work,tomorrow,getting_ready,morning,hours,weekend,tired,wanna,week,early
Topic 2,going,bed,tomorrow,school,tonight,sleep,im,soon,watch,later
Topic 3,love,lt,song,new,watching,life,haha,guys,amazing,movie
Topic 4,really,bad,feel,sad,need,tired,sick,wanna,feeling,bored
Topic 5,home,night,sleep,tired,bed,tomorrow,good,fun,finally,sick
Topic 6,thanks,follow,following,great,hey,followfri,guys,haha,awesome,ff
Topic 7,miss,gonna,friends,ll,baby,guys,come,lt,school,sad
Topic 8,com,http_twitpic,http_plurk,http,http_tinyurl,www,new,says,bit_ly,followers_using
Topic 9,want,sleep,school,doesn,tomorrow,come,leave,bed,new,watch


**Words for each topics (Human defined) LDA vs NMF**

In [10]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML

topics_lda = np.array([['Expression', 'Feelings', 'Entertainment', 'Work', 'Activities', 'Outdoors', 'Twitter/Internet', 'Activities', 'Entertainment/Movies', 'Products']])
df = pd.DataFrame(data=topics_lda)
df.index = ['Words']
df.columns = ['Topic '+str(i) for i in range(10)]
print('LDA')
display(df)
topics_nmf = np.array([['Feelings', 'Work', 'Activities ', 'Entertainment', 'Feelings/Mood', 'Health', 'Relationship/Following', 'Feelings', 'Internet', 'Activities']])
df2 = pd.DataFrame(data=topics_nmf)
df2.index = ['Words']
df2.columns = ['Topic '+str(i) for i in range(10)]
print('\n NMF')
display(df2)

LDA


Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9
Words,Expression,Feelings,Entertainment,Work,Activities,Outdoors,Twitter/Internet,Activities,Entertainment/Movies,Products



 NMF


Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9
Words,Feelings,Work,Activities,Entertainment,Feelings/Mood,Health,Relationship/Following,Feelings,Internet,Activities


**Show reviews and topic assigned**

LDA

In [90]:
# Topic ID for each review in dataset: review ID, review text, topic ID

def show_results(model, x_train):
    topic_values = model.transform(x_train)
    data['Topic'] = topic_values.argmax(axis=1)

    #data[['REVIEW_TEXT', 'Topic']]

    result = data[['COMMENT', 'Topic']][:50]

    display(HTML(result.to_html()))

In [91]:
lda_result = show_results(LDA, lda_xtrain)

Unnamed: 0,COMMENT,Topic
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",7
0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,6
0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,9
0,my whole body feels itchy and like its on fire,1
0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",4
0,@Kwesidei not the whole crew,0
0,Need a hug,7
0,"@LOLTrish hey long time no see! Yes.. Rains a bit ,only a bit LOL , I'm fine thanks , how's you ?",5
0,@Tatiana_K nope they didn't have it,4
0,@twittera que me muera ?,1


NMF

In [92]:
nmf_result = show_results(nmf, nmf_xtrain)

Unnamed: 0,COMMENT,Topic
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",8
0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,0
0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,0
0,my whole body feels itchy and like its on fire,0
0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",0
0,@Kwesidei not the whole crew,0
0,Need a hug,0
0,"@LOLTrish hey long time no see! Yes.. Rains a bit ,only a bit LOL , I'm fine thanks , how's you ?",6
0,@Tatiana_K nope they didn't have it,0
0,@twittera que me muera ?,0


In [93]:
import joblib

joblib.dump(LDA, 'lda_model.jl')
#lda_model = joblib.load('lda_model.jl')

joblib.dump(nmf, 'nmf_model.jl')
#lda_model = joblib.load('lda_model.jl')

['nmf_model.jl']

In [94]:
!pip install pyLDAvis
from pyLDAvis import sklearn as sklearn_lda
import os
import pickle 
import pyLDAvis
LDAvis_data_filepath = os.path.join('./ldavis_prepared_'+str(10))

if 1 == 1:
    x_train = vect.fit_transform(x)
    LDAvis_prepared = sklearn_lda.prepare(LDA, lda_xtrain, lda_vect)
with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

with open(LDAvis_data_filepath, 'rb') as f:
     LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(10) +'.html')

pyLDAvis.display(LDAvis_prepared)

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 2.8MB/s 
Collecting funcy
[?25l  Downloading https://files.pythonhosted.org/packages/ce/4b/6ffa76544e46614123de31574ad95758c421aae391a1764921b8a81e1eae/funcy-1.14.tar.gz (548kB)
[K     |████████████████████████████████| 552kB 17.6MB/s 
Building wheels for collected packages: pyLDAvis, funcy
  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97711 sha256=726435c7629aab8482dbf7c056b431069d6c564c71526835efc522112d9b8213
  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
  Building wheel for funcy (setup.py) ... [?25l[?25hdone
  Created wheel for funcy: filename=funcy-1.14-py2.py3-none-any.whl size=32042 sha256=a8993d07