In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import text 
from pattern.en import parse
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
punctuation = list(".,;:!?()[]{}`'\"@#$^&*+-|=~_")
from mrjob.job import MRJob
from collections import Counter, defaultdict
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import gensim
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
from gensim import corpora

Using TensorFlow backend.


In [2]:
df = pd.read_csv("processed_data.csv")

In [75]:
df.shape

(337211, 36)

In [3]:
df["resp"] = df["rating"] >= df["user_rating_avg"]

In [77]:
np.mean(df["resp"])

0.94625323610439749

In [78]:
priorp = np.mean(df["resp"])
priorn = 1 - priorp
priorp, priorn

(0.94625323610439749, 0.053746763895602512)

In [4]:
stop = set(stopwords.words('english'))
stop.add("course")
stop.add("course.")
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

In [5]:
df = df[df["user_review_count"] > 2]

In [6]:
df = df[df["content"].notnull()]

In [7]:
def is_ascii(s):
    return all(ord(c) < 128 for c in s)


In [8]:
df["valid_char"] = df["content"].apply(is_ascii)

In [9]:
df = df[df["valid_char"] == True]

In [10]:
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [11]:
df["clean_content"] = df["content"].apply(clean)

In [12]:
df["clean_content"].head()

7                                                  love
26    wonderful explanation clearly deep understandi...
35    learned lot hope youll release angular 20 sure...
40    excellent course anthony great job explaining ...
48    great far always anthony delivers great qualit...
Name: clean_content, dtype: object

In [13]:
def get_parts(text):
    nouns = set()
    descriptives=set()
    
    text = text.lower().split()
    text = [i for i in text if i not in stop]
    text = [i for i in text if i not in punctuation]
    text = [i for i in text if len(i) > 1]
    for word, pos in nltk.pos_tag(text): # remove the call to nltk.pos_tag if `sentence` is a list of tuples as described above

        if pos in ['NN', "NNP"]: # feel free to add any other noun tags
            nouns.add(word)
        elif pos in ["JJ", "JJR"]:
            descriptives.add(word)
    return list(nouns), list(descriptives)

In [14]:
df["review"] = df["clean_content"].apply(get_parts)

In [15]:
review_parts=df.review.tolist()

In [16]:
nouns = [e[0] for e in review_parts]

In [17]:
len(nouns)

46887

In [93]:
flat_nouns = [item for sublist in nouns for item in sublist]

In [18]:
dictionary = corpora.Dictionary(nouns)

In [19]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in nouns]

In [96]:
Lda = gensim.models.ldamodel.LdaModel

In [97]:
ldamodel = Lda(doc_term_matrix, num_topics=2, id2word = dictionary, passes=20)

In [33]:
ldamodel.print_topics()

[(0,
  u'0.018*"way" + 0.017*"time" + 0.015*"thing" + 0.012*"work" + 0.010*"beginner" + 0.009*"im" + 0.009*"anyone" + 0.008*"use" + 0.007*"lot" + 0.007*"video"'),
 (1,
  u'0.054*"instructor" + 0.041*"information" + 0.029*"thank" + 0.026*"lot" + 0.024*"explanation" + 0.021*"content" + 0.018*"material" + 0.018*"understand" + 0.018*"example" + 0.013*"teacher"')]

In [34]:
for bow in doc_term_matrix[0:100:15]:
    print bow
    print ldamodel.get_document_topics(bow)
    print " ".join([dictionary[e[0]] for e in bow])
    print "========================================"

[(0, 1)]
[(0, 0.25001156641181005), (1, 0.74998843358819001)]
love
[(13, 1), (47, 1), (48, 1)]
[(0, 0.12778427204630366), (1, 0.87221572795369628)]
content learning point
[(45, 1), (62, 1), (74, 1), (75, 1)]
[(0, 0.8810777943172069), (1, 0.11892220568279317)]
way cover perfect knowledge
[(6, 1), (13, 1), (17, 1), (60, 1), (61, 1), (91, 1), (107, 1), (108, 1)]
[(0, 0.36289018292176883), (1, 0.63710981707823122)]
lot content lesson detail everything time start attention
[(10, 1), (26, 1)]
[(0, 0.4866915613050587), (1, 0.51330843869494125)]
thank tony
[(127, 1), (155, 1)]
[(0, 0.49614729995503543), (1, 0.50385270004496452)]
teacher scene
[(2, 1), (50, 1)]
[(0, 0.16671904541311267), (1, 0.83328095458688733)]
material instructor


In [35]:
adj = [e[1] for e in review_parts]

In [36]:
len(adj)

46887

In [37]:
flat_adj = [item for sublist in adj for item in sublist]

In [38]:
flat_adj[:10]

[u'wonderful',
 u'deep',
 'angular',
 u'great',
 u'subject',
 u'excellent',
 u'content',
 u'great',
 u'old',
 u'current']

In [39]:
adjvo = set(flat_adj)
adjvocab ={}
for i, word in enumerate(adjvo):
    adjvocab[word] = i

In [40]:
adjvocab["great"]

1465

In [41]:
dictionary_adj = corpora.Dictionary(adj)

In [42]:
len(dictionary_adj)

7468

In [43]:
doc_term_matrix_adj = [dictionary_adj.doc2bow(doc) for doc in adj]

In [44]:
len(doc_term_matrix_adj)

46887

In [45]:
import itertools
xarray= map(lambda i: " ".join(list(itertools.chain.from_iterable(i))), adj)


In [46]:
len(xarray)

46887

In [47]:
resp = df["resp"].tolist()

In [48]:
len(resp), len(xarray)

(46887, 46887)

In [49]:
from sklearn.cross_validation import train_test_split
itrain, itest = train_test_split(xrange(len(xarray)), train_size=0.7)
mask=np.ones(len(xarray), dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)



In [50]:
X=np.array(xarray)
y=np.array(resp)

In [51]:
def make_xy(X_col, y_col, vectorizer):
    X = vectorizer.fit_transform(X_col)
    y = y_col
    return X, y


In [52]:
def log_likelihood(clf, x, y):
    prob = clf.predict_log_proba(x)
    negs = y == 0
    posivs = ~negs
    return prob[negs, 0].sum() + prob[posivs, 1].sum()

In [53]:
from sklearn.cross_validation import KFold

def cv_score(clf, x, y, score_func, nfold=5):
    
    result = 0
    for train, test in KFold(y.size, nfold): # split data into train/test groups, 5 times
        clf.fit(x[train], y[train]) # fit
        result += score_func(clf, x[test], y[test]) # evaluate score function on held-out data
    return result / nfold # average


In [54]:
len(adjvocab)

7468

In [55]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


In [56]:
alphas = [ 0.1, 1, 5, 10, 50]
#min_dfs = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.5, 0.9]

#Find the best value for alpha and min_df, and the best classifier
best_alpha = None
#best_min_df = None
max_loglike = -np.inf

for alpha in alphas:        
    vectorizer = CountVectorizer(vocabulary=adjvocab)       
    aX, ay = make_xy(X, y, vectorizer)
    tX=aX[mask]
    ty=ay[mask]
    clf = MultinomialNB(alpha=alpha)
    loglike = cv_score(clf, tX, ty, log_likelihood)

    if loglike > max_loglike:
        max_loglike = loglike
        best_alpha = alpha


In [57]:
best_alpha

0.1

In [58]:
vectorizer = CountVectorizer(vocabulary=adjvocab)
aX, ay = make_xy(X, y, vectorizer)
tX=aX[mask]
ty=ay[mask]
teX=aX[~mask]
tey=ay[~mask]

clfnew = MultinomialNB(alpha=best_alpha).fit(tX, ty)

training_accuracy = clfnew.score(tX, ty)
test_accuracy = clfnew.score(teX, tey)

print "Accuracy on training data: %0.2f" % (training_accuracy)
print "Accuracy on test data:     %0.2f" % (test_accuracy)

Accuracy on training data: 0.84
Accuracy on test data:     0.84


In [59]:
logpositives=dict(zip(vectorizer.get_feature_names(),clfnew.feature_log_prob_[1]))
lognegatives=dict(zip(vectorizer.get_feature_names(),clfnew.feature_log_prob_[0]))

In [60]:
vectorizer.get_feature_names()

[u'yellow',
 u'drawpose',
 u'francesco',
 u'nodejavascript',
 u'localized',
 u'unblocked',
 'fouryear',
 u'unanswered',
 u'accrej',
 u'shortcutshelpful',
 u'goodpractical',
 u'informativeif',
 u'digit',
 'rational',
 u'helpfulinsightful',
 'uncertain',
 'werid',
 u'invokable',
 u'prize',
 u'customizable',
 u'rabbitmq',
 u'oooh',
 u'straight',
 u'muchengagingpractical',
 u'tired',
 u'wellspoken',
 u'elegant',
 'second',
 u'inanimate',
 u'admire',
 u'excelllent',
 u'everthibg',
 u'hreflite',
 u'designing',
 u'resilient',
 u'numeral',
 u'specialist',
 u'widget',
 u'hero',
 'intentioned',
 u'specialise',
 u'affiliated',
 'conversational',
 u'gobut',
 u'uplifting',
 u'elaborate',
 u'controversy',
 u'military',
 u'numerical',
 u'golden',
 u'divide',
 u'breaksdown',
 'explained',
 u'lengthen',
 u'brought',
 u'unix',
 u'notary',
 u'udacitys',
 u'browse',
 u'dnn',
 u'webpack',
 u'understanable',
 u'strike',
 u'holy',
 'successful',
 u'jms',
 u'hurt',
 u'hole',
 u'hold',
 'addon',
 u'pursue',
 u

In [61]:
def calc_pplus(adjlist, lp, ln, pp,pn):
    if type(adjlist) == list:
        pos = np.exp(np.sum([lp[a] for a in adjlist]))
        neg = np.exp(np.sum([ln[a] for a in adjlist]))
        
    else:
        pos = np.sum(lp[adjlist])
        neg = np.sum(ln[adjlist])
        
    pplus=1./(1. + (neg*pn)/(pos*pp))
    return pplus

In [62]:
ids = df["id"].tolist()

In [63]:
len(ids)

46887

In [98]:
def choose_topic(ldamodel, bow):
    tee = ldamodel.get_document_topics(bow)
    if len(tee)==2:
        t1,t2=tee
        if t2[1] >= t1[1]:
            topicis=t2[0]
        else:
            topicis=t1[0]
    elif len(tee)==1:
        teetuple=tee[0]
        topicis=teetuple[0]
    return topicis


In [110]:
noun_dic = {}
for i in xrange(len(df)):
    noun = df.iloc[i]["review"][0]
    topic = max(ldamodel.get_document_topics(dictionary.doc2bow(df.iloc[i]["review"][0])), key = lambda i: i[1])
    if len(noun) == 0:
        noun_dic[i] = "no topic"
    
    elif topic[0] == 0:
        noun_dic[i] = "course content and video quality"
    else:
        noun_dic[i] = "instructor and explanations"
    

In [115]:
topics = noun_dic.values()

In [65]:
counter=0
reviewdict={}
for i, rid in enumerate(ids):
    rlist=[]
    nlist, alist = review_parts[i]
    ln=len(nlist)
    localbow=doc_term_matrix[counter:counter+ln]
    for bow, adj, noun in zip(localbow, alist, nlist):
        doc=" ".join([dictionary[e[0]] for e in bow])
        pplus=calc_pplus(adj, logpositives, lognegatives, priorp, priorn)
        topicis=choose_topic(ldamodel, bow)
        ldict={"topic": topicis, 'pplus':pplus}
        rlist.append(ldict)
    reviewdict[rid]=rlist
    counter=counter+ln

In [66]:
def topic(x):
    if len(reviewdict[x]) == 0:
        return "no topic"
    elif reviewdict[x][0]["topic"] == 0:
        return "instructor and his explanations"
    else:
        return "course content and video quality"
    

In [67]:
def pplus(x):
    if len(reviewdict[x]) == 0:
        return "no comments"
    elif reviewdict[x][0]["pplus"] > 0.5:
        return "positive"
    else:
        return "negative"

In [116]:
df["topic"] = pd.Series(topics)

In [117]:
len(df[df["topic"] == "no topic"])

747

In [177]:
df["topic"].head()

7                             no topic
26    course content and video quality
35    course content and video quality
40     instructor and his explanations
48    course content and video quality
Name: topic, dtype: object

In [118]:
df.to_csv("reg_text.csv")