In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sp
from time import time
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances
from scipy.stats.stats import pearsonr
import graphlab
from sklearn.cross_validation import train_test_split
from math import sqrt
import scipy.sparse as sp
from scipy.sparse.linalg import svds
%matplotlib inline
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF as nmf

In [27]:
def build_text_vectorizer(contents, use_tfidf=True, use_stemmer=False, max_features=None):
    '''
    Fit a text vectorizer on `contents` and return `(vectorizer, vocabulary)`.

    `vectorizer` is a **callable**: given an iterable of documents it returns
    a dense array with one row per document. `vocabulary` is a numpy array of
    the fitted feature names, so `vocabulary[j]` is the word behind column `j`.

    Parameters:
      - contents: iterable of text documents used to fit the model.
      - use_tfidf: use TfidfVectorizer when True, CountVectorizer (plain
        Bag-of-Words) otherwise.
      - use_stemmer: when True, pass every kept token through the Porter
        stemmer; otherwise tokens are kept as they are.
      - max_features: forwarded to the scikit-learn vectorizer to cap the
        vocabulary size; None means no cap.

    Tokens are runs of word characters and apostrophes; tokens found in
    NLTK's English stop-word list are dropped before stemming.
    '''
    english_stop_words = set(stopwords.words('english'))
    word_splitter = RegexpTokenizer(r"[\w']+")

    # Only build a stemmer when it is actually requested.
    if use_stemmer:
        normalize = PorterStemmer().stem
    else:
        normalize = lambda word: word

    # Closure handed to scikit-learn as its tokenizer.
    def tokenize(text):
        kept = []
        for word in word_splitter.tokenize(text):
            if word not in english_stop_words:
                kept.append(normalize(word))
        return kept

    if use_tfidf:
        model_class = TfidfVectorizer
    else:
        model_class = CountVectorizer

    model = model_class(tokenizer=tokenize, max_features=max_features)
    model.fit(contents)
    vocabulary = np.array(model.get_feature_names())

    # Closure over the fitted model; densifies the sparse transform output.
    def vectorizer(X):
        return model.transform(X).toarray()

    return vectorizer, vocabulary


In [28]:
# Load the pre-processed data set from the current working directory.
df = pd.read_csv("processed_data.csv")

In [29]:
# Keep only the rows whose user_review_count is greater than 2.
df = df[df["user_review_count"] > 2]

In [30]:
# (rows, columns) remaining after the filter.
df.shape

(47583, 36)

In [31]:
# Course titles as a plain Python list, in the same row order as df.
title = df["published_title"].tolist()

In [32]:
# Fit a Tf-Idf vectorizer (no stemming, vocabulary capped at 5000 terms) on the
# titles, then encode those same titles as a dense document-term matrix.
vectorizer, vocabulary = build_text_vectorizer(
    title, use_tfidf=True, use_stemmer=False, max_features=5000)
X = vectorizer(title)


In [33]:
# Factorize the document-term matrix X into 15 latent topics.
# The fitted model gets its own name: rebinding `nmf` would overwrite the
# imported class alias and make this cell fail when it is run a second time.
nmf_model = nmf(n_components=15, max_iter=100, random_state=12345, alpha=0.0)
W = nmf_model.fit_transform(X)  # one row of topic weights per title
H = nmf_model.components_       # one row of word weights per topic
print('reconstruction error: %s' % nmf_model.reconstruction_err_)

reconstruction error: 180.577482053


In [34]:
def hand_label_topics(H, vocabulary, n_top_words=20):
    '''
    Print the most influential words of each latent topic, and prompt the user
    to label each topic. The user should use their humanness to figure out what
    each latent topic is capturing.

    Parameters:
      - H: 2-D array-like with one row of word weights per topic.
      - vocabulary: numpy array mapping a column index of H to its word.
      - n_top_words: how many of the highest-weighted words to show per topic
        (default 20).

    Returns the list of labels typed by the user, in topic order.
    '''
    # raw_input only exists on Python 2; Python 3 renamed it to input.
    try:
        ask = raw_input
    except NameError:
        ask = input

    hand_labels = []
    for i, row in enumerate(H):
        # argsort is ascending, so reverse it to get the heaviest words first.
        top_words = np.argsort(row)[::-1][:n_top_words]
        print('topic %d' % i)
        print('--> ' + ' '.join(vocabulary[top_words]))
        hand_labels.append(ask('please label this topic: '))
        print('')
    return hand_labels

In [35]:
# Interactive step: prompts once per topic (row of H) for a human-readable label.
hand_labels = hand_label_topics(H, vocabulary)

topic 0
--> web developer 2 bootcamp course become job scratch modern workflow mastering ready stack django git full services development certified master
please label this topic: web development

topic 1
--> javascript understand es6 scratch 2016 jquery answers interview top questions next generation learn complete fast easy tutorial basics hour introduction
please label this topic: javascript

topic 2
--> 2 angular guide complete practical vuejs developers webpack nodejs mongodb ultimate apps beginners building ionic photography inventory drop shipping ebay
please label this topic: angular

topic 3
--> react redux native tutorial meteor graphql flux complete js advanced apps mastering router web node fullstack express universal ansible workflow
please label this topic: react redux

topic 4
--> learn angularjs code jumpstart scratch days linux marketing 5 digital building development hacking ethical android 10 programming projects jquery game
please label this topic: mobile apps

topi

In [36]:
# Number of rows in W; expected to equal the number of titles that were vectorized.
len(W)

47583

In [37]:
def softmax(v, temperature=1.0):
    '''
    A heuristic to convert arbitrary real-valued scores into probabilities.
    See: https://en.wikipedia.org/wiki/Softmax_function

    Parameters:
      - v: array-like of scores. All entries are normalized together, so the
        whole result sums to 1.
      - temperature: positive scale; values below 1 sharpen the distribution,
        values above 1 flatten it.

    Returns an array of the same shape as `v`. Empty input gives an empty
    array.
    '''
    # float() guards against integer division when both operands are ints.
    scaled = np.asarray(v) / float(temperature)
    if scaled.size == 0:
        return scaled
    # Softmax is unchanged by subtracting a constant from every score. Shifting
    # by the maximum keeps every exponent <= 0, so np.exp cannot overflow to
    # inf and turn the result into NaN.
    expv = np.exp(scaled - np.max(scaled))
    return expv / np.sum(expv)

In [56]:
def analyze_article(article_index, title, W, hand_labels):
    '''
    Return the hand-assigned label of the dominant topic of one document.

    Parameters:
      - article_index: row of `W` to inspect.
      - title: unused; kept so existing callers do not need to change.
      - W: 2-D array with one row of topic weights per document.
      - hand_labels: sequence of labels, `hand_labels[k]` naming topic `k`.

    Returns the label whose topic has the largest weight in the row. Ties go
    to the lowest topic index.

    A softmax over the weights preserves their order, so the most probable
    topic is simply the argmax of the raw weights. Working on the raw weights
    avoids overflow for large weights and does not let the label text decide
    ties.
    '''
    # Only topics that have a label can be returned, so ignore extra columns.
    topic_weights = np.asarray(W[article_index])[:len(hand_labels)]
    return hand_labels[int(np.argmax(topic_weights))]


In [57]:
# Spot-check: topic label assigned to the first title.
analyze_article(0, title, W, hand_labels)

'mobile apps'

In [58]:
# Map every title position to the label of its highest-scoring topic.
title_category = {
    idx: analyze_article(idx, title, W, hand_labels)
    for idx in xrange(len(title))
}

In [59]:
# Inspect the position -> label mapping.
# NOTE(review): this echoes the entire dict (one entry per title); consider
# displaying only a small sample to keep the notebook output readable.
title_category

{0: 'mobile apps',
 1: 'mobile apps',
 2: 'mobile apps',
 3: 'mobile apps',
 4: 'mobile apps',
 5: 'mobile apps',
 6: 'mobile apps',
 7: 'mobile apps',
 8: 'mobile apps',
 9: 'mobile apps',
 10: 'mobile apps',
 11: 'mobile apps',
 12: 'mobile apps',
 13: 'mobile apps',
 14: 'mobile apps',
 15: 'mobile apps',
 16: 'mobile apps',
 17: 'mobile apps',
 18: 'mobile apps',
 19: 'mobile apps',
 20: 'mobile apps',
 21: 'mobile apps',
 22: 'mobile apps',
 23: 'mobile apps',
 24: 'mobile apps',
 25: 'mobile apps',
 26: 'mobile apps',
 27: 'mobile apps',
 28: 'mobile apps',
 29: 'mobile apps',
 30: 'mobile apps',
 31: 'mobile apps',
 32: 'mobile apps',
 33: 'mobile apps',
 34: 'mobile apps',
 35: 'mobile apps',
 36: 'mobile apps',
 37: 'mobile apps',
 38: 'mobile apps',
 39: 'mobile apps',
 40: 'mobile apps',
 41: 'mobile apps',
 42: 'mobile apps',
 43: 'mobile apps',
 44: 'mobile apps',
 45: 'mobile apps',
 46: 'mobile apps',
 47: 'mobile apps',
 48: 'mobile apps',
 49: 'mobile apps',
 50: 'mobi

In [60]:
# Sanity check: there should be exactly one label per title.
len(title_category)

47583

In [61]:
# title_category is keyed by row position 0 .. n-1. Read it back in explicit
# key order rather than through dict.values(), whose iteration order is not
# guaranteed on Python 2 and could silently misalign labels with rows.
df["title_category_nmf"] = [title_category[i] for i in range(len(title_category))]

In [65]:
# Preview the last rows, including the title_category_nmf column.
df.tail()

Unnamed: 0.1,Unnamed: 0,_class_x,content,course_id,created,id,modified,rating,title_x,user,...,user_review_count,course_review_count,course_rating_avg,instructor_name,instructor_title,new_course_id,new_user_id,course_category,Unnamed: 35,title_category_nmf
337071,337072,course_review,This course was great! Never heard of anyone ...,851826,2016-05-27T15:34:27Z,2281140,2016-05-27T15:34:27Z,5.0,,"{u'_class': u'user', u'display_name': u'Suzann...",...,3,20,4.5,u'James Canzanella',u'Utterly Simple Online Profits'}],728,16590,art,,"wordpress, rails, javascript, ruby"
337091,337092,course_review,This course provides a great introduction to c...,281692,2014-11-14T11:31:20Z,234800,2014-11-14T11:31:20Z,5.0,A great starting point for making Youtube vide...,"{u'_class': u'user', u'display_name': u'Kennet...",...,5,5,5.0,u'Robert Scot',u'Dr YouTube is a multi-media Marketing Exper...,194,13240,business,,"wordpress, rails, javascript, ruby"
337096,337097,course_review,I find the instructor easy to listen to as he ...,847460,2016-06-03T02:38:12Z,2356902,2016-06-18T09:00:30Z,5.0,,"{u'_class': u'user', u'display_name': u'Rudolf...",...,3,3,5.0,u'Diego Davila',u'Your Online Academy'}],724,59860,software development / mobile application,,"web design - html, css, wordpress"
337126,337127,course_review,Thank you very much for sharing these awesome ...,640408,2015-10-16T04:35:56Z,725148,2015-10-16T04:35:56Z,5.0,Amazing course!,"{u'_class': u'user', u'display_name': u'Vaness...",...,3,4,4.25,u'Sandor Kiss',u'Entrepreneur,521,86326,business,,mobile apps
337138,337139,course_review,I feel that I have been taken on a journey tow...,537750,2015-06-30T09:11:28Z,446584,2015-06-30T09:11:28Z,5.0,Inspirational!A supreme guide to setting and a...,"{u'_class': u'user', u'display_name': u'Katarz...",...,3,1,5.0,u'Jimmy Naraine',u'Winner of Udemy's Innovation Award & 78,422,54379,business,,java programming


In [66]:
# Persist the labelled frame to the working directory.
# NOTE(review): to_csv writes the index as an extra column by default; the
# "Unnamed: 0" / "Unnamed: 0.1" columns visible in df.tail() look like indexes
# saved this way in earlier steps. Consider index=False once downstream
# readers are confirmed not to rely on that column.
df.to_csv("reg_nmf.csv")