In [2]:
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
import gensim as gs
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
import lda
from load import all_subreddits_data, tv_subreddits_data, author_data


In [5]:
# data is of the form {class_label: list of documents}
all_data = author_data(10)
# tv_data = tv_subreddits_data()

In [6]:
# all_data2 = {'buildapc': all_data['buildapc'], 'anime': all_data['anime']}
all_data.keys()

[u'Stacieinhorrorland',
 u'JustMe80',
 u'Flowsephine',
 u'MexicanSpaceProgram',
 u'MrJacksEnigma',
 u'Asdyc',
 u'beauty_and_the_beach',
 u'without_gravity',
 u'jojodancer5',
 u'blamb211',
 u'-_-Equinox666-_-',
 u'GustavoFrings',
 u'diegojones4',
 u'KubrickIsMyCopilot',
 u'tfyuhjnbgf',
 u'suddenweightloss',
 u'tinyhousebuilder',
 u'iKnowALotOfStuff',
 u'sybaritic_footstool',
 u'Screwj4ck',
 u'anthonymyers3000',
 u'Late_Night_Grumbler',
 u'Springheeljac',
 u'StiltzkinTheMoogle',
 u'roguetroll',
 u'NaturalInclination',
 u'GeorgeFromManagement',
 u'Sexyschizophrenic',
 u'snow_yoshi',
 u'Megaross',
 u'CorDeFerrum',
 u'goodgirl112',
 u'MyBiologicalRomance',
 u'iam4real',
 u'cocofoshosho1122',
 u'OptimisticRobotLord',
 u'NakayamaTakayoshi',
 u'UniversalChairs',
 u'Universal-Cereal-Bus',
 u'CAN_ZIGZAG',
 u'InsertSomeName',
 u'DefenestratedEgo',
 u'BlueInventive',
 u'BiagioLargo',
 u'Back2Bach',
 u'KeganRhode',
 u'crogi',
 u'silverblaze92']

In [7]:
def tt_split(data, test_size=.1):
    """Splits a dictionary {class_label: list of documents}"""
    """into two dictionaries of the same shape"""
    train_data = {}; test_data = {}
    for label, docs in data.iteritems():
        train, test = train_test_split(docs, test_size=test_size)
        train_data[label] = train
        test_data[label] = test
    return train_data, test_data
        
        

In [8]:
def tokens_to_vocab(class_tokens):
    """{class_label : list of tokenized documents} -> vocab"""
    vocab = set([])
    for _class, tokenized_docs in class_tokens.iteritems():
        for d in tokenized_docs:
            vocab = vocab.union(set(d))
    return {word: i for i, word in enumerate(vocab)}
        

def word_tokenize_doc(doc):
    """Word tokenize a single document"""
    to_remove = set(['http', 'faq', 'https', 'amp','source', 'deletion', 'sfw',
              'nsfw', 'gt', 'gon', 'na', 'delete', 'comment', 'profile'])
    def _filter(w):
        return all([w.isalnum(), w not in stopwords.words('english'), w not in to_remove])
    tokens = word_tokenize(doc)
    tokens = filter(_filter, tokens)
    return tokens

def tokenize_all_words(data):
    """basic get_tokens method"""
    """{class_label: list of documents} ->""" 
    """{class_label : list of tokenized documents}"""
    for c, docs in data.iteritems():
        data[c] = map(word_tokenize_doc, docs)
    return data

# test_data = {
#     'c1': ['aa bb c', 'bb dd'],
#     'c2': ['dd ee ff', 'ee gg']
# }
# test_tokens = tokenize_all_words(test_data)
# test_vocab = tokens_to_vocab(test_tokens)


In [9]:
# def get_lda_topics(train_tokens, vocab, n_topics=30):
#     """{class_label: list of tokenized docs} ->"""
#     """{class_label: list of topic distributions from LDA"""
#     all_class_tokens = train_tokens.values()
#     flattened_classes = [item for sublist in all_class_tokens for item in sublist]
#     flattened_documents = [item for sublist in flattened_classes for item in sublist]
#     vectorizer = CountVectorizer(min_df=2, vocabulary=vocab)
#     X = vectorizer.fit_transform(flattened_documents)
#     lda = LatentDirichletAllocation(n_topics=n_topics)
#     X_new = lda.fit_transform(X.toarray())
#     return X_new

def get_hlda_models(train_tokens, vocab, n_topics=40):
    models = {}
    dictionary = gs.corpora.Dictionary(map(lambda x: [x], vocab.keys()))
    for label, docs in train_tokens.items():
        corpus = [dictionary.doc2bow(d) for d in docs]
        models[label] = gs.models.HdpModel(corpus, dictionary, T=n_topics)
    return models, dictionary

def get_lda_models(train_tokens, vocab, n_topics=40):
    all_models = {}
    def fit_model((label, docs)):
        model = lda.LDA(n_topics=n_topics, n_iter=1500)
        vectorizer = CountVectorizer(min_df=2, vocabulary = vocab, stop_words=None)
        X = vectorizer.fit_transform(map(lambda s: ' '.join(s), docs))
        model.fit(X)
        all_models[label] = model
        print 'done fitting for ', label
    map(fit_model, train_tokens.items())
    return all_models


def hlda_pred(models, dictionary, doc):
    corpus = [dictionary.doc2bow(word_tokenize_doc(doc))]
    label_score = []
    for label, hdp in models.iteritems():
        label_score.append((label, hdp.evaluate_test_corpus(corpus)))
    return max(label_score, key = lambda x:x[1])[0]

def lda_pred(models, vocab, doc):
    """Get a class prediction for a document """
    tokenized = word_tokenize_doc(doc)
    vectorizer = CountVectorizer(min_df=1, vocabulary = vocab, stop_words=None)
    X = vectorizer.fit_transform([' '.join(tokenized)])
    label_score = []
    for label, model in models.iteritems():
        n_topics = len(model.components_)
        topic_dist = model.transform(X)
        log_likelihood = 0
        for token in tokenized:
            if token in vocab:
                max_likelihood = -1 * 10 ** 8
                for topic in range(n_topics):
                    print label, token, topic, model.components_[topic][vocab[token]],(topic_dist[0][topic])
                    ll = np.log(model.components_[topic][vocab[token]]) + np.log(topic_dist[0][topic])
                    max_likelihood = max_likelihood if max_likelihood > ll else ll
                log_likelihood += max_likelihood
        label_score.append((label, log_likelihood))
    return max(label_score, key = lambda x:x[1])[0]

# models = get_lda_models(test_tokens, test_vocab, n_topics=2)


In [10]:
# lda_pred(models, test_vocab, 'aa aa')

# models['c2'].topic_word_, test_vocab

In [11]:
# def run_model(data, get_tokens=tokenize_all_words,
#               get_models = get_lda_models):
#     """Vectorizes, topic models, classifies, and returns score"""
data = all_data
get_tokens = tokenize_all_words
get_models = get_lda_models

train, test = tt_split(data)
print 'done splitting'
train_tokens = get_tokens(train)
print 'done tokenizing'
vocab = tokens_to_vocab(train_tokens)



done splitting
done tokenizing


In [12]:
hlda_models, dictionary = get_hlda_models(train_tokens, vocab, n_topics = 40)



In [13]:
correct = 0
total = 0
for label, docset in test.iteritems():
    for doc in docset:
        total += 1
        if hlda_pred(hlda_models, dictionary, doc) == label:
            correct += 1
print correct, total

205 912


In [14]:
# a = [u'StiltzkinTheMoogle',
#  u'Stacieinhorrorland',
#  u'JustMe80',
#  u'Flowsephine',
#  u'MexicanSpaceProgram',
#  u'MrJacksEnigma',
#  u'Asdyc',
#  u'beauty_and_the_beach',]
# hlda_models.keys()
# map(lambda (l, t): map(lambda x: x[0], t), hlda_models['Flowsephine'].show_topics(formatted=False))

In [15]:
lda_models = get_lda_models(train_tokens, vocab)



done fitting for  Stacieinhorrorland
done fitting for 



 JustMe80
done fitting for 



 Flowsephine
done fitting for 



 MexicanSpaceProgram
done fitting for 



 Asdyc
done fitting for 



 without_gravity
done fitting for 



 jojodancer5
done fitting for 



 blamb211
done fitting for 



 -_-Equinox666-_-
done fitting for 



 diegojones4
done fitting for 



 KubrickIsMyCopilot
done fitting for 



 tfyuhjnbgf
done fitting for 



 suddenweightloss
done fitting for 



 sybaritic_footstool
done fitting for 



 iKnowALotOfStuff
done fitting for 



 tinyhousebuilder
done fitting for 



 DefenestratedEgo
done fitting for 



 Late_Night_Grumbler
done fitting for 



 Springheeljac
done fitting for 



 anthonymyers3000
done fitting for 



 roguetroll
done fitting for 



 UniversalChairs
done fitting for 



 NaturalInclination
done fitting for 



 GeorgeFromManagement
done fitting for 



 Sexyschizophrenic
done fitting for 



 Universal-Cereal-Bus
done fitting for 



 snow_yoshi
done fitting for 



 Megaross
done fitting for 



 CorDeFerrum
done fitting for 



 goodgirl112
done fitting for 



 MyBiologicalRomance
done fitting for 



 KeganRhode
done fitting for 



 iam4real
done fitting for 



 cocofoshosho1122
done fitting for 



 OptimisticRobotLord
done fitting for 



 silverblaze92
done fitting for 



 NakayamaTakayoshi
done fitting for 



 beauty_and_the_beach
done fitting for 



 GustavoFrings
done fitting for 



 StiltzkinTheMoogle
done fitting for 



 InsertSomeName
done fitting for 



 Screwj4ck
done fitting for 



 BlueInventive
done fitting for 



 BiagioLargo
done fitting for 



 Back2Bach
done fitting for 



 CAN_ZIGZAG
done fitting for 



 crogi
done fitting for  MrJacksEnigma


In [16]:
correct = 0
total = 0
for label, docset in test.iteritems():
    for doc in docset:
        total += 1
        if pred(lda_models, vocab, doc) == label:
            correct += 1
print correct, total

NameError: name 'pred' is not defined