## LDA topic modeling


In [51]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import csv

In [2]:
documents = pd.read_csv('train_processed.csv')

In [3]:
documents.head()

Unnamed: 0,Label,Review
0,__label__1,this book goes into great detail on the histor...
1,__label__0,no chance to say whatsoever the vhs in in ntsc...
2,__label__1,an example of steinbecks early work with migra...
3,__label__1,this game is a long awaited relief to the othe...
4,__label__0,bought this brought it home wont play apparent...


In [4]:
len(documents)

1260000

In [7]:
# separate the dataframe into positive and negative reviews
negative = documents[documents['Label']=='__label__0'].sample(25000)
positive = documents[documents['Label']=='__label__1'].sample(25000)

In [8]:
positive.shape

(25000, 2)

## Topic Modeling

In [9]:
# define a simple tokenizer (NLTK won't be available to us later on, in our Lambda function)

def simple_tokenizer(input_text):
    REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(=)|(`)")
    REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)|(\n)|(\t)")
    tokens = REPLACE_NO_SPACE.sub("", input_text.lower())
    tokens = REPLACE_WITH_SPACE.sub(" ", tokens) # note that blazing text expects space-separated tokens
    return tokens

In [10]:
# Get the top 15 keywords for each topic

def show_topics(vectorizer, lda_model, n_words):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [11]:
def create_topics(dataframe):
    # extract the reviews
    documents=dataframe['Review'].values
    # tokenize the text
    tok_documents = [simple_tokenizer(doc) for doc in documents]
    # vectorize with TF-IDF
    no_features = 1000
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(tok_documents)
    # extract feature names
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    # Instantiate the LDA class object from scikitlearn
    lda = LatentDirichletAllocation(n_components=10, 
                                    max_iter=5, 
                                    learning_method='online', 
                                    learning_offset=50.,
                                    random_state=0)

    # Fit our LDA model onto the vectorized text data
    lda.fit(tfidf)
    # column names
    topicnames = ['Topic' + str(i) for i in range(lda.n_components)]
    # Topic-Keyword Matrix
    df_topic_keywords = pd.DataFrame(lda.components_)
    # Assign Column and Index
    df_topic_keywords.columns = tfidf_vectorizer.get_feature_names()
    df_topic_keywords.index = topicnames
    # get the top 15 words
    topic_keywords = show_topics(vectorizer=tfidf_vectorizer, lda_model=lda, n_words=15)
    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
    # Extract topic probability scores with LDA Transform
    topic_probability_scores = lda.transform(tfidf)
    # create a dataframe with our results
    df_final = pd.DataFrame(documents, columns=['text'])
    dom_topics=[np.argmax(topic_probability_scores[index]) for index in range(len(topic_probability_scores))]
    df_final['pred_topic']=dom_topics
    df_final['topic_words']=df_final['pred_topic'].apply(lambda x: df_topic_keywords.iloc[x].values.tolist())
    return df_final

In [12]:
# apply the function to our datasets
final_pos = create_topics(positive)

In [13]:
final_neg = create_topics(negative)

In [58]:
# check out the results
final_neg.head()

Unnamed: 0,text,Label,topic_words
0,i only read the first 2 chapters of this book ...,__label____label____label____label__8,"[book, read, story, books, characters, like, r..."
1,i have to start by saying i love this movie an...,__label____label____label____label__7,"[movie, film, watch, movies, dvd, bad, just, l..."
2,i started this book a gift threw it across the...,__label____label____label____label__8,"[book, read, story, books, characters, like, r..."
3,poor quality lasted for less than a month even...,__label____label____label____label__6,"[product, use, bought, just, months, used, buy..."
4,im sorry to say that this book almost put me t...,__label____label____label____label__8,"[book, read, story, books, characters, like, r..."


In [118]:
print(final_neg[:20000].shape)
print(final_neg[20000:].shape)

(20000, 3)
(5000, 3)


In [119]:
def prep_for_bt(df1, output_train, output_valid):
    # Prefix the index-ed label with __label__
    df = df1.copy()
    df.rename(columns={'pred_topic':'Label'}, inplace=True)
    df.drop(['topic_words'], axis=1, inplace=True)
#     df['Label']=df['Label'].apply(lambda row: "__label__" +  str(row) )
    
    # convert the transformed dataframe into a list
    transformed_train = np.array(df[:20000]).tolist()
    transformed_validation = np.array(df[20000:]).tolist()

    # write to csv file (for blazingtext)
    with open(output_train, 'w') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n') # notice the delimiter.
        csv_writer.writerows(transformed_train)

    # write to csv file (for blazingtext)
    with open(output_valid, 'w') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n') # notice the delimiter.
        csv_writer.writerows(transformed_validation)

In [120]:
# save to file
prep_for_bt(final_neg, 'topics_negative.train', 'topics_negative.validation')
prep_for_bt(final_neg, 'topics_positive.train', 'topics_positive.validation')

## Make a list of the top 10 words for each topic

In [107]:
pos_topics = final_pos.groupby('Label')[['topic_words']].max()
pos_topics.to_csv('top10words_pos.csv', index=False)


In [121]:
neg_topics = final_neg.groupby('Label')[['topic_words']].max()
neg_topics.to_csv('top10words_neg.csv', index=False)

In [109]:
pos_topics.values

array([[list(['cold', 'memory', 'program', 'mouse', 'shes', 'period', 'crazy', 'minor', 'relate', 'wall', 'historical', 'software', 'war', 'outside', 'role'])],
       [list(['condition', 'arrived', 'product', 'amazon', 'expected', 'service', 'order', 'delivery', 'ordered', 'quickly', 'price', 'came', 'great', 'headphones', 'exactly'])],
       [list(['book', 'information', 'great', 'informative', 'helpful', 'easy', 'guide', 'questions', 'useful', 'reference', 'text', 'good', 'excellent', 'recommend', 'read'])],
       [list(['movie', 'film', 'dvd', 'movies', 'watch', 'great', 'season', 'series', 'seen', 'watching', 'good', 'best', 'love', 'acting', 'tv'])],
       [list(['book', 'read', 'books', 'story', 'reading', 'life', 'characters', 'great', 'good', 'author', 'written', 'like', 'time', 'really', 'people'])],
       [list(['video', 'workout', 'everyday', 'speakers', 'white', 'tape', 'ok', 'gone', 'videos', 'green', 'body', 'hooked', 'black', 'blue', 'rich'])],
       [list(['cd', '

In [122]:
neg_topics.values

array([[list(['book', 'information', 'books', 'author', 'read', 'guide', 'edition', 'useful', 'errors', 'good', 'text', 'learn', 'does', 'pages', 'written'])],
       [list(['cd', 'album', 'music', 'songs', 'like', 'song', 'sound', 'band', 'just', 'good', 'listen', 'sounds', 'dont', 'rock', 'voice'])],
       [list(['product', 'work', 'use', 'like', 'does', 'software', 'windows', 'bought', 'program', 'did', 'tried', 'buy', 'dont', 'just', 'good'])],
       [list(['game', 'games', 'play', 'video', 'toy', 'dont', 'money', 'fun', 'like', 'graphics', 'just', 'playing', 'really', 'kids', 'buy'])],
       [list(['amazon', 'product', 'received', 'item', 'ordered', 'phone', 'sent', 'return', 'order', 'service', 'dvd', 'work', 'did', 'buy', 'refund'])],
       [list(['kindle', 'hair', 'points', 'rate', 'sleep', 'eyes', 'guys', 'michael', 'face', 'guy', 'sold', 'people', 'lines', 'skin', 'body'])],
       [list(['product', 'use', 'bought', 'just', 'months', 'used', 'buy', 'time', 'work', 'plasti