In [25]:
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import pyLDAvis
import pyLDAvis.sklearn
from preprocess import lemm_string

pyLDAvis.enable_notebook()

# load and drop NaNs
train = pd.read_excel('../../tweets/csv/train.xls', header=None)
train.dropna(inplace=True)
train[0] = train[0].map(lemm_string)

test = pd.read_excel('../../tweets/csv/test1.xls')
test.dropna(inplace=True)
test['tweets'] = test['tweets'].map(lemm_string)

# segment data by company
# Starbucks
sbux_train = train[train[0].str.contains('starbucks')]
sbux_test = test[test['tweets'].str.contains('starbucks')]
# Chipotle
cmg_train = train[train[0].str.contains('chipotle')]
cmg_test = test[test['tweets'].str.contains('chipotle')]
# McDonalds
mcd_train = train[train[0].str.contains('mcdonalds')]
mcd_test = test[test['tweets'].str.contains('mcdonalds')]

n_features = 500
ngrams = [1,2]
max_df = .5
n_topics = 8
n_top_words = 10


def print_top_words(model, feature_names, n_top_words):
    words = []
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % (topic_idx + 1))
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))


def tfidf(train, test):
    tf = CountVectorizer(stop_words=ENGLISH_STOP_WORDS, max_features=n_features, ngram_range=ngrams, max_df=.5)
    train = tf.fit_transform(train)
    feature_names = tf.get_feature_names()
    test = tf.transform(test)
    return train, test, feature_names, tf


def decompose(model, train, test, tf):
    feature_names = tf.get_feature_names()
    model.fit(train)
    print_top_words(model, feature_names, n_top_words)
    train_dist_preds = model.transform(train)
    train_preds = np.argmax(train_dist_preds, axis=1)
    test_dist_preds = model.transform(test)
    test_preds = np.argmax(test_dist_preds, axis=1)
    return train_preds, test_preds, model

def topic_summaries(df, test_preds, mod_name):
    df['topics'] = test_preds.tolist()
    df['topics'] = df['topics'] + 1
    file_dir = '../../tweets/topic_dfs/{}_topic_preds.csv'.format(mod_name)
    df.to_csv(file_dir)
    topic_df = pd.read_csv(file_dir)
    summary = topic_df.groupby('topics')['labels'].mean()
    count = topic_df.groupby(['topics','labels'])['tweets'].count()
    print summary
    return summary, count

In [26]:
# UPDATE COMPANY HERE
train =  cmg_train.values.reshape(cmg_train.values.shape[0],)
# UPDATE COMPANY HERE
test = cmg_test['tweets'].values.reshape(cmg_test.values.shape[0],)
train, test, feature_names, tf = tfidf(train, test)
lda = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', learning_offset=50.)
nmf = NMF(n_components=n_topics, init='random')
lsa = TruncatedSVD(n_components=n_topics, algorithm='randomized')
###############################################
model = lda # Set model here
mod_name = 'lda'
###############################################
train_preds, test_preds, model = decompose(model, train, test, tf)
# UPDATE COMPANY HERE
summary, count = topic_summaries(cmg_test, test_preds, mod_name=mod_name)

Topic #1:
burrito, food, right, chipotle burrito, youre, white, good, ass, line, sounds
Topic #2:
like, im, day, love, need, eating, valentines, craving, ive, craving chipotle
Topic #3:
sob, people, really, thinkingface, extra, order, im, thats, lunch, gonna
Topic #4:
free, bowl, time, free chipotle, know, burrito, mexican, ill, fundraiser, food
Topic #5:
eat, chipotletweets, eat chipotle, good, hearteyes, lol, burritos, come, day, skintone3
Topic #6:
want, want chipotle, joy, just, got, jacksfilms, im, support, think, joy joy
Topic #7:
dont, better, life, yum, work, make, tomorrow, buy, bring, best
Topic #8:
today, chicken, weary, taco, bad, chipotle today, sauce, new, cheese, ate
topics
1    2.160714
2    2.396825
3    2.152542
4    2.311111
5    2.479167
6    2.698630
7    2.266667
8    2.204082
Name: labels, dtype: float64


In [27]:
print count


topics  labels
1       1.0       15
        2.0       17
        3.0       24
2       1.0       13
        2.0       12
        3.0       38
3       1.0       18
        2.0       14
        3.0       27
4       1.0        8
        2.0       15
        3.0       22
5       1.0       10
        2.0        5
        3.0       33
6       1.0        5
        2.0       12
        3.0       56
7       1.0       10
        2.0       13
        3.0       22
8       1.0       13
        2.0       13
        3.0       23
Name: tweets, dtype: int64


In [24]:
pyLDAvis.sklearn.prepare(model, train, tf, R=20)