In [1]:
import pandas as pd
import numpy as np
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn import naive_bayes

##  (a)
Download Sentiment Labelled Sentences Data Set. There are three data ﬁles under the root folder. yelp_labelled.txt, amazon_cells_labelled.txt and imdb_labelled.txt. Parse each ﬁle with the speciﬁcations in readme.txt. Are the labels balanced? If not, what’s the ratio between the two labels? Explain how you process these ﬁles. 

In [50]:
import csv
data1 = pd.read_csv('imdb_labelled.txt', delimiter="\t", names=['sentence', 'score'], quoting=csv.QUOTE_NONE, header = None, encoding='utf-8')
data2 = pd.read_csv('amazon_cells_labelled.txt', names=['sentence', 'score'], quoting=csv.QUOTE_NONE, delimiter="\t", header = None, encoding='utf-8')
data3 = pd.read_csv('yelp_labelled.txt', names=['sentence', 'score'], quoting=csv.QUOTE_NONE, delimiter="\t", header = None, encoding='utf-8')
df = data1.append([data2, data3])
print data1.groupby(['score']).count()
print data2.groupby(['score']).count()
print data3.groupby(['score']).count()
print df.groupby(['score']).count()

       sentence
score          
0           500
1           500
       sentence
score          
0           500
1           500
       sentence
score          
0           500
1           500
       sentence
score          
0          1500
1          1500


Process each files by parsing them into dataframe by using tab as a delimiter. Then group each dataframe by 'score' and count them. We get the count of each label in each dataframe.
The labels are balanced.
Ratio between lable 0 and lable 1 is: 1

## (b) 
Pick your preprocessing strategy. Since these sentences are online reviews, they may contain signiﬁcant amounts of noise and garbage. You may or may not want to do one or all of thefollowing. Explain the reasons for each of your decision (why or why not). <br>
• Lowercase all of the words. <br>
• Lemmatization of all the words(i.e.,convert every word to its root so that all of “running,” “run,” and “runs” are converted to “run” and and all of “good,” “well,” “better,” and “best” are converted to “good”; this is easily done using nltk.stem). <br>
• Strip punctuation. <br>
• Strip the stop words, e.g., “the”, “and”, “or”. <br>
• Something else? Tell us about it. <br>

1) Lowercase all the words: I will do that because uppercase and lowercase doesn't effect the meaning. (Except some abbreviations, which is rare.)<br>
2) Lemmatization of all the words: I will do that because these words have almost same meaning.<br>
3) Strip punctuation: I will do that since punctuation don't have much meaning/information.<br>
4) Strip the stop words: I will do that since these stop words don't have much meaning/information.<br>
5) What else: rule out the number words like "one" "two" "three", "1", "2", "3", which we think somewaht meaningless.

In [51]:
def preprocessing(A):
    stop_words = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()
    ps = PorterStemmer()
    A = A.lower()                                       # 1) lower case
    A = A.split()                                       # 2) split by space
    A = " ".join([w for w in A if not w in stop_words]) # 3) filter by stop_words to delete cases like "shouldn't"
    A = nltk.word_tokenize(A)                           # 4) split by tokenize
    ret = []                                            # 5) deal with case like 'apple.banana' which can't be split by tokenize
    for elt in A:
        ret+=elt.split('.')
    ret = [w for w in ret if not w in stop_words]       # 6) filter by stop_words again
    num_set_digit = ['0','1','2','3','4','5','6','7','8','9']
    num_set_str = ['zero','one','two','three','four','five','six','seven','eight','nine']
    
                                                        # 7) delete the punctuation and number
    ret = [''.join(c for c in s if (c not in string.punctuation and c not in num_set_digit)) for s in ret if (s not in num_set_str)] 
    ret = [w for w in ret if not w in stop_words]
    ret = [s for s in ret if s]
    for i,elt in enumerate(ret):
        ret[i]=wnl.lemmatize(wnl.lemmatize(wnl.lemmatize(elt,'v'),'a'))
        ret[i]=ps.stem(ret[i])
    ret = [w for w in ret if not w in stop_words]
    return ret

In [35]:
df1=data1.copy()
df1['sentence'] = df1['sentence'].apply(preprocessing) 
df2=data2.copy()
df2['sentence'] = df2['sentence'].apply(preprocessing) 
df3=data3.copy()
df3['sentence'] = df3['sentence'].apply(preprocessing) 

## (c)
Split training and testing set. In this assignment, for each ﬁle, please use the ﬁrst 400 instances for each label as the training set and the remaining 100 instances as testing set. In total, there are 2400 reviews for training and 600 reviews for testing.

In [52]:
train1 = df1[df1['score']==0][:400].append(df1[df1['score']==1][:400])
test1 = df1[df1['score']==0][400:].append(df1[df1['score']==1][400:])
train2 = df2[df2['score']==0][:400].append(df2[df2['score']==1][:400])
test2 = df2[df2['score']==0][400:].append(df2[df2['score']==1][400:])
train3 = df3[df3['score']==0][:400].append(df3[df3['score']==1][:400])
test3 = df3[df3['score']==0][400:].append(df3[df3['score']==1][400:])

In [53]:
train = train1.append([train2,train3])
test = test1.append([test2,test3])

In [54]:
train_label = [0]*400+[1]*400+[0]*400+[1]*400+[0]*400+[1]*400
test_label = [0]*100+[1]*100+[0]*100+[1]*100+[0]*100+[1]*100

## (d)
Bag of Words model. Extract features and then represent each review using bag of words model, i.e., every word in the review becomes its own element in a feature vector. In order to do this, ﬁrst, make one pass through all the reviews in the training set (Explain why we can’t use testing set at this point) and build a dictionary of unique words. Then,make another pass through the review in both the training set and testing set and count up the occurrences of each word in your dictionary. The ith element of a review’s feature vector is the number of occurrences of the ith dictionary word in the review. Implement the bag of words model and report feature vectors of any two reviews in the training set. 

In [55]:
words_collection=train['sentence'].values
words = set()
for i in words_collection:
    words = words.union(i)
word_list = list(words)

wordsList = []
for i in range(train.shape[0]): #len(trainData)
    words1 = [0] * len(word_list)
    sub_list = words_collection[i]
    for j in range(len(word_list)):
        if word_list[j] in sub_list:
            count = len([w for w in sub_list if w==word_list[j]])
            words1[j] = count
    wordsList.append(words1)
wordsMatrix = np.matrix(wordsList)
print wordsMatrix.shape

df_train=pd.DataFrame(wordsMatrix, columns=word_list)

words_collection=test['sentence'].values
wordsList = []
for i in range(test.shape[0]):     # len(trainData)
    words = [0] * len(word_list)
    sub_list = words_collection[i]
    for j in range(len(word_list)):
        if word_list[j] in sub_list:
            words[j] = 1
    wordsList.append(words)
wordsMatrix = np.matrix(wordsList)
print wordsMatrix.shape

df_test=pd.DataFrame(wordsMatrix, columns=word_list)

(2400L, 3379L)
(600L, 3379L)


Explain why we can’t use testing set during the step of feature extraction: <br>
Answer: 
Will result in biased performance estimates.<br>
To get an unbiased performance estimate, the test data must not be used in any way to make choices about the model, including feature selection.<br>
Also since including testing data into feature selection will generate more features. It will probably cause some level of overfitting.

Report feature vectors of any two reviews in the training set:

In [56]:
df_train.iloc[[0,1]].values

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## (e)
Pick your post processing strategy. Since the vast majority of English words will not appear in most of the reviews, most of the feature vector elements will be 0. This suggests that we need a postprocessing or normalization strategy that combats the huge variance of the elements in the feature vector. You may want to use one of the following strategies. Whatever choices you make, explain why you made the decision.<br> 
• log-normalization. For each element of the feature vector x, transform it into f (x) = log(x+1).<br>
• l1 normalization. Normalize the l1 norm of the feature vector, x'=x/|x|. <br>
• l2 normalization. Normalize the l2 norm of the feature vector, x'=x/||x||. <br>
• Standardize the data by subtracting the mean and dividing by the variance. 

Explain: I pick the "log-normalization", because intuitively, 
it is true that the importance of a word increase when it appear twice in one review.
However, appearing twice doesn't means importance of this word was doubled. 
According to our intuitive sense, the contribution to the significance of the word that it appears for the second time is lower than it appears for the first time. That is why I pick log-normalization since this transformation make more sense according to our intuition.

In [57]:
def log_trans(x):
    return np.log(x+1)
df_train1 = df_train.copy()
df_test1 = df_test.copy()
df_train1 = df_train1.applymap(log_trans)
df_test1 = df_test1.applymap(log_trans)

## (f)
Sentiment prediction. Train a logistic regression model (you can use existing packages here) on the training set and test on the testing set. Report the classiﬁcation accuracy and confusion matrix. Inspecting the weight vector of the logistic regression, what are the words that play the most important roles in deciding the sentiment of the reviews? Repeat this with a Naive Bayes classiﬁer and compare performance. 

In [58]:
# Train the logistic regression model
optimal_features = df_train1.columns
X_optimal = df_train1[optimal_features]
y = train_label
logreg = LogisticRegression()
logreg.fit(X_optimal, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [59]:
# Apply the logistic model on the testing data set and get the prediction labels
y_test_pred = logreg.predict(df_test1[optimal_features])
def accuracy(Y_predict,Y_true):
    return metrics.accuracy_score(Y_true, Y_predict)
acc1 = accuracy(y_test_pred,test_label)
acc1

0.82333333333333336

Classiﬁcation accuracy: 0.82333

In [60]:
metrics.confusion_matrix(test_label, y_test_pred, labels=None, sample_weight=None)

array([[260,  40],
       [ 66, 234]], dtype=int64)

Confusion Matrix: <br>
       [[260,  40]<br>
        [ 66, 234]]

In [61]:
words_list1=[]
import_list1 =[]
words_list2=[]
import_list2 =[]

sort_index = np.argsort(logreg.coef_[0])
for i in range(20):
    words_list1.append(df_train1.columns[sort_index[i]])
    import_list1.append(logreg.coef_[0][sort_index[i]])
    
sort_index = np.argsort(logreg.coef_[0])[::-1]
for i in range(20):
    words_list2.append(df_train1.columns[sort_index[i]])
    import_list2.append(logreg.coef_[0][sort_index[i]])
print words_list1
print import_list1
print words_list2
print import_list2

[u'bad', u'poor', u'terribl', u'wast', u'slow', u'suck', u'aw', u'disappoint', u'pay', u'stupid', u'break', u'horribl', u'plot', u'start', u'piec', u'hour', u'bland', u'fail', u'avoid', u'rude']
[-3.2812639168359525, -2.2952325396057014, -1.8830955400031202, -1.8383040390163916, -1.6722630273227566, -1.6590876495976508, -1.6563185505203799, -1.5306297468568248, -1.5274601219199613, -1.4907652807106695, -1.4704691175824878, -1.4693957172218488, -1.4105980791144417, -1.3545151205861194, -1.3505709792640193, -1.3380327701836003, -1.3366533717593363, -1.3332305293879845, -1.3077009646160229, -1.2901152960382396]
[u'great', u'love', u'excel', u'delici', u'nice', u'amaz', u'fantast', u'best', u'awesom', u'beauti', u'happi', u'comfort', u'good', u'friendli', u'well', u'perfect', u'wonder', u'incred', u'price', u'cool']
[3.5869427072843343, 3.1320609338342966, 2.5626991828511585, 2.3783504171011569, 2.2501784334299848, 2.2076555578318624, 2.0426724896413044, 1.9575433108119216, 1.9155577409091

Words that play the most important roles in deciding the sentiment of the reviews:<br>
[u'bad', u'poor', u'terribl', u'wast', u'slow', u'suck', u'aw', u'disappoint', u'pay', u'stupid', u'break', u'horribl', u'plot', u'start', u'piec', u'hour', u'bland', u'fail', u'avoid', u'rude']<br>
Related importance:<br>
[-3.2812639168359525, -2.2952325396057014, -1.8830955400031202, -1.8383040390163916, -1.6722630273227566, -1.6590876495976508, -1.6563185505203799, -1.5306297468568248, -1.5274601219199613, -1.4907652807106695, -1.4704691175824878, -1.4693957172218488, -1.4105980791144417, -1.3545151205861194, -1.3505709792640193, -1.3380327701836003, -1.3366533717593363, -1.3332305293879845, -1.3077009646160229, -1.2901152960382396]<br>
Related importance:<br>
[u'great', u'love', u'excel', u'delici', u'nice', u'amaz', u'fantast', u'best', u'awesom', u'beauti', u'happi', u'comfort', u'good', u'friendli', u'well', u'perfect', u'wonder', u'incred', u'price', u'cool']<br>
[3.5869427072843343, 3.1320609338342966, 2.5626991828511585, 2.3783504171011569, 2.2501784334299848, 2.2076555578318624, 2.0426724896413044, 1.9575433108119216, 1.9155577409091826, 1.8910153220435948, 1.761196493089376, 1.688207045798499, 1.5759415371444434, 1.5566423448593143, 1.5521054396629341, 1.5365707566970275, 1.4999175751306653, 1.4181468618024908, 1.3815519531354241, 1.3646135503027823]

In [62]:
X_optimal.shape

(2400, 3379)

In [63]:
# Naive Bayes:
clf = naive_bayes.BernoulliNB()
clf.fit(X_optimal, y)
y_test_pred2 = clf.predict(df_test1[optimal_features])
acc2 = accuracy(y_test_pred2,test_label)
acc2

0.82333333333333336

Classiﬁcation accuracy: 0.82333

In [64]:
metrics.confusion_matrix(test_label, y_test_pred2, labels=None, sample_weight=None)

array([[257,  43],
       [ 63, 237]], dtype=int64)

Confusion Matrix: <br>
       [[257,  43]<br>
       [ 63, 237]]

In [65]:
words_list1=[]
import_list1 =[]
words_list2=[]
import_list2 =[]

sort_index = np.argsort(clf.coef_[0])
for i in range(20):
    words_list1.append(df_train1.columns[sort_index[i]])
    import_list1.append(clf.coef_[0][sort_index[i]])
    
sort_index = np.argsort(clf.coef_[0])[::-1]
for i in range(20):
    words_list2.append(df_train1.columns[sort_index[i]])
    import_list2.append(clf.coef_[0][sort_index[i]])
print words_list1
print import_list1
print words_list2
print import_list2

[u'inexperi', u'disconnect', u'sync', u'unsatisfi', u'sand', u'heist', u'charismafre', u'ticker', u'ticket', u'verg', u'fat', u'reader', u'fals', u'godfath', u'neglig', u'grab', u'humili', u'sashimi', u'unbear', u'dollar']
[-7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153]
[u'great', u'good', u'film', u'phone', u'love', u'movi', u'work', u'well', u'like', u'best', u'food', u'make', u'realli', u'place', u'time', u'excel', u'nice', u'go', u'servic', u'price']
[-2.0041457798627693, -2.0418861078456159, -2.6972929604227138, -2.7742540015588428, -2.8150759960790976, -2.8290622380538375, -2.8576356104978933, -3.0844089298626818, -3.121450201543031,

Words that play the most important roles in deciding the sentiment of the reviews:<br>
[u'inexperi', u'disconnect', u'sync', u'unsatisfi', u'sand', u'heist', u'charismafre', u'ticker', u'ticket', u'verg', u'fat', u'reader', u'fals', u'godfath', u'neglig', u'grab', u'humili', u'sashimi', u'unbear', u'dollar']
[-7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153]
[u'great', u'good', u'film', u'phone', u'love', u'movi', u'work', u'well', u'like', u'best', u'food', u'make', u'realli', u'place', u'time', u'excel', u'nice', u'go', u'servic', u'price']
[-2.0041457798627693, -2.0418861078456159, -2.6972929604227138, -2.7742540015588428, -2.8150759960790976, -2.8290622380538375, -2.8576356104978933, -3.0844089298626818, -3.121450201543031, -3.1404983965137254, -3.1599164823708272, -3.1599164823708272, -3.2205411041872618, -3.2415945133850945, -3.3075524811768919, -3.3075524811768919, -3.3540724968117845, -3.3540724968117845, -3.378170048390845, -3.4028626609812167]

## (g)
N-gram model. Similar to the bag of words model, but now you build up a dictionary of ngrams, which are contiguous sequences of words. For example, “Alice fell down the rabbit hole” would then map to the 2-grams sequence: ["Alice fell", "fell down", "down the", "the rabbit","rabbithole"], and all ﬁve of those symbols would be member softhen-gram dictionary. Try n=2, repeat (d)-(g) and report your results.

In [66]:
%%time
print 'start'
def two_gram(list_of_words):
    ret = []
    if(len(list_of_words)<2):
        return list_of_words
    for i in range(len(list_of_words)-1):
        ret.append(list_of_words[i]+' '+list_of_words[i+1])
    return ret

words_collection=train['sentence'].values
words_two = set()
for i in words_collection:
    words_two = words_two.union(two_gram(i))
word_list_two = list(words_two)

wordsList_two = []
for i in range(train.shape[0]): #len(trainData)
    words_two = [0] * len(word_list_two)
    sub_list = two_gram(words_collection[i])
    for j in range(len(word_list_two)):
        if word_list_two[j] in sub_list:
            count = len([w for w in sub_list if w==word_list_two[j]])
            words_two[j] = count
    wordsList_two.append(words_two)
wordsMatrix_two = np.matrix(wordsList_two)
print wordsMatrix_two.shape

df_train_two=pd.DataFrame(wordsMatrix_two, columns=word_list_two)

words_collection=test['sentence'].values
wordsList_two = []
for i in range(test.shape[0]):     # len(trainData)
    words_two = [0] * len(word_list_two)
    sub_list = two_gram(words_collection[i])
    for j in range(len(word_list_two)):
        if word_list_two[j] in sub_list:
            count = len([w for w in sub_list if w==word_list_two[j]])
            words_two[j] = count
    wordsList_two.append(words_two)
wordsMatrix_two = np.matrix(wordsList_two)
print wordsMatrix_two.shape

df_test_two=pd.DataFrame(wordsMatrix_two, columns=word_list_two)

df_train_two1 = df_train_two.copy()
df_train_two1 = df_train_two1.applymap(log_trans)
df_test_two1 = df_test_two.copy()
df_test_two1 = df_test_two1.applymap(log_trans)
print 'end'

start
(2400L, 10633L)
(600L, 10633L)
end
Wall time: 1min 48s


In [67]:
optimal_features = df_train_two1.columns
X_optimal = df_train_two1[optimal_features]
y = train_label
logreg = LogisticRegression()
logreg.fit(X_optimal, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [68]:
y_test_pred = logreg.predict(df_test_two1[optimal_features])
acc2 = accuracy(y_test_pred,test_label)
acc2

0.63666666666666671

Accuracy: 0.636666667

In [69]:
metrics.confusion_matrix(test_label, y_test_pred, labels=None, sample_weight=None)

array([[269,  31],
       [187, 113]], dtype=int64)

Confusion Matrix:<br>
    [[269,  31]<br>
    [187, 113]]<br>

In [70]:
words_list1=[]
import_list1 =[]
words_list2=[]
import_list2 =[]

sort_index = np.argsort(logreg.coef_[0])
for i in range(20):
    words_list1.append(word_list_two[sort_index[i]])
    import_list1.append(logreg.coef_[0][sort_index[i]])
#     print df_train_two1.columns[sort_index[i]], logreg.coef_[0][sort_index[i]]
# print ''
sort_index = np.argsort(logreg.coef_[0])[::-1]
for i in range(20):
#     print df_train_two1.columns[sort_index[i]], logreg.coef_[0][sort_index[i]]
    words_list2.append(word_list_two[sort_index[i]])
    import_list2.append(logreg.coef_[0][sort_index[i]])
print words_list1
print import_list1
print words_list2
print import_list2

[u'wast time', u'disappoint', u'wast money', u'custom servic', u'ever go', u'poor qualiti', u'stay away', u'piec junk', u'horribl', u'rat', u'bad phone', u'bad film', u'bad ever', u'buy product', u'realli bad', u'wait wait', u'make mistak', u'even bad', u'act bad', u'anytim soon']
[-1.6444815091250979, -1.2855503906807295, -1.1904644557780506, -0.86593733385660054, -0.83844893270422161, -0.81449177317578758, -0.80894479262792474, -0.77163482414965578, -0.75944116231123826, -0.75944116231123826, -0.75525665583100299, -0.75175495638943091, -0.7454796755159071, -0.74059635096571419, -0.7240540258640854, -0.71217195190918381, -0.70123416877166989, -0.68061171733171044, -0.66954482917475155, -0.66455038697387958]
[u'work great', u'highli recommend', u'great phone', u'great product', u'food good', u'realli good', u'easi use', u'great food', u'great film', u'reason price', u'good price', u'food delici', u'great servic', u'love place', u'film great', u'work fine', u'pretti good', u'well make',

Words that play the most important roles in deciding the sentiment of the reviews:<br>
[u'wast time', u'disappoint', u'wast money', u'custom servic', u'ever go', u'poor qualiti', u'stay away', u'piec junk', u'horribl', u'rat', u'bad phone', u'bad film', u'bad ever', u'buy product', u'realli bad', u'wait wait', u'make mistak', u'even bad', u'act bad', u'anytim soon']
[-1.6444815091250979, -1.2855503906807295, -1.1904644557780506, -0.86593733385660054, -0.83844893270422161, -0.81449177317578758, -0.80894479262792474, -0.77163482414965578, -0.75944116231123826, -0.75944116231123826, -0.75525665583100299, -0.75175495638943091, -0.7454796755159071, -0.74059635096571419, -0.7240540258640854, -0.71217195190918381, -0.70123416877166989, -0.68061171733171044, -0.66954482917475155, -0.66455038697387958]
[u'work great', u'highli recommend', u'great phone', u'great product', u'food good', u'realli good', u'easi use', u'great food', u'great film', u'reason price', u'good price', u'food delici', u'great servic', u'love place', u'film great', u'work fine', u'pretti good', u'well make', u'love', u'good product']
[2.0307227755669195, 1.7491887827540205, 1.2796754398367292, 1.1787131919639442, 1.0702006992867952, 1.0695341840187276, 1.0020469987169651, 0.9703220313588089, 0.92408224795863603, 0.90895296550718518, 0.89200123011964405, 0.8819378547993203, 0.8783656346385359, 0.86171341154701209, 0.83238795128683918, 0.82670170073928551, 0.80310424663431712, 0.78276069062816844, 0.77861898713901612, 0.77093319037129282]

In [71]:
# Naive Bayes:
clf = naive_bayes.BernoulliNB()
clf.fit(X_optimal, y)
y_test_pred2 = clf.predict(df_test_two1[optimal_features])
acc2 = accuracy(y_test_pred2,test_label)
acc2

0.63666666666666671

Accuracy: 0.63666666666666671

In [72]:
metrics.confusion_matrix(test_label, y_test_pred2, labels=None, sample_weight=None)

array([[270,  30],
       [188, 112]], dtype=int64)

Confusion Matrix:<br>
    [[270,  30]<br>
    [188, 112]]

In [74]:
words_list1=[]
import_list1 =[]
words_list2=[]
import_list2 =[]

sort_index = np.argsort(clf.coef_[0])
for i in range(20):
    words_list1.append(word_list_two[sort_index[i]])
    import_list1.append(clf.coef_[0][sort_index[i]])
    
sort_index = np.argsort(clf.coef_[0])[::-1]
for i in range(20):
    words_list2.append(word_list_two[sort_index[i]])
    import_list2.append(clf.coef_[0][sort_index[i]])
print words_list1
print import_list1
print words_list2
print import_list2

[u'littl els', u'obviou bluegreenscreen', u'low qualiti', u'servic receiv', u'waitress littl', u'alway old', u'entir movi', u'muffin come', u'see ok', u'redeem would', u'belowpar script', u'uneasi bad', u'never treat', u'superfici movi', u'work balanc', u'ear hurt', u'headset time', u'side restaur', u'feel cheat', u'time serv']
[-7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153]
[u'work great', u'highli recommend', u'work well', u'sound qualiti', u'film great', u'great phone', u'great food', u'go back', u'great product', u'come back', u'batteri life', u'great film', u'realli good', u'great servic', u'food good', u'ca nt', u'easi use', u'realli

Words that play the most important roles in deciding the sentiment of the reviews:<br>
[u'littl els', u'obviou bluegreenscreen', u'low qualiti', u'servic receiv', u'waitress littl', u'alway old', u'entir movi', u'muffin come', u'see ok', u'redeem would', u'belowpar script', u'uneasi bad', u'never treat', u'superfici movi', u'work balanc', u'ear hurt', u'headset time', u'side restaur', u'feel cheat', u'time serv']
[-7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153, -7.091742115095153]
[u'work great', u'highli recommend', u'work well', u'sound qualiti', u'film great', u'great phone', u'great food', u'go back', u'great product', u'come back', u'batteri life', u'great film', u'realli good', u'great servic', u'food good', u'ca nt', u'easi use', u'realli like', u'work fine', u'year ago']
[-3.9562478991660033, -4.3836919139929424, -4.7891570221011071, -4.7891570221011071, -5.0123005734153168, -5.0123005734153168, -5.0123005734153168, -5.0123005734153168, -5.0123005734153168, -5.1458319660398395, -5.1458319660398395, -5.1458319660398395, -5.1458319660398395, -5.1458319660398395, -5.1458319660398395, -5.1458319660398395, -5.2999826458670984, -5.2999826458670984, -5.2999826458670984, -5.2999826458670984]

## (h)
PCA for bag of words model. The features in the bag of words model have large redundancy. Implement PCA to reduce the dimension of features calculated in (e) to 10, 50 and 100 respectively. Using these lower-dimensional feature vectors and repeat (f), (g). Report corresponding clustering and classiﬁcation results. (Note: You should implement PCA yourself, but you can usenumpy.svdor some other SVD package. Feel free to double-check your PCA implementation against an existing one) 

In [98]:
(U,s,Vh) = np.linalg.svd(df_train1,full_matrices=False, compute_uv=True)

In [99]:
print U.shape
print s.shape
print Vh.shape

(2400L, 2400L)
(2400L,)
(2400L, 3379L)


In [110]:
Xr_train1=np.dot(df_train1,np.transpose(Vh[:10,:]))
Xr_train2=np.dot(df_train1,np.transpose(Vh[:50,:]))
Xr_train3=np.dot(df_train1,np.transpose(Vh[:100,:]))
Xr_test1=np.dot(df_test1,np.transpose(Vh[:10,:]))
Xr_test2=np.dot(df_test1,np.transpose(Vh[:50,:]))
Xr_test3=np.dot(df_test1,np.transpose(Vh[:100,:]))

#### 10 features:

In [111]:
# Train the logistic regression model
X_optimal = Xr_train1
y = train_label
logreg = LogisticRegression()
logreg.fit(X_optimal, y)
# Apply the logistic model on the testing data set and get the prediction labels
y_test_pred = logreg.predict(Xr_test1)
def accuracy(Y_predict,Y_true):
    return metrics.accuracy_score(Y_true, Y_predict)
acc1 = accuracy(y_test_pred,test_label)
print acc1
metrics.confusion_matrix(test_label, y_test_pred, labels=None, sample_weight=None)

0.618333333333


array([[251,  49],
       [180, 120]], dtype=int64)

Accuracy: 0.618<br>
Confusion Matrix:<br>
[[251,  49]<br>
       [180, 120]]

In [112]:
# Naive Bayes:
clf = naive_bayes.BernoulliNB()
clf.fit(X_optimal, y)
y_test_pred2 = clf.predict(Xr_test1)
acc2 = accuracy(y_test_pred2,test_label)
print acc2
metrics.confusion_matrix(test_label, y_test_pred2, labels=None, sample_weight=None)

0.606666666667


array([[190, 110],
       [126, 174]], dtype=int64)

Accuracy: 0.6067<br>
Confusion Matrix:<br>
       [[190, 110]<br>
       [126, 174]]

#### 50 features:

In [113]:
# Train the logistic regression model
X_optimal = Xr_train2
y = train_label
logreg = LogisticRegression()
logreg.fit(X_optimal, y)
# Apply the logistic model on the testing data set and get the prediction labels
y_test_pred = logreg.predict(Xr_test2)
def accuracy(Y_predict,Y_true):
    return metrics.accuracy_score(Y_true, Y_predict)
acc1 = accuracy(y_test_pred,test_label)
print acc1
metrics.confusion_matrix(test_label, y_test_pred, labels=None, sample_weight=None)

0.695


array([[256,  44],
       [139, 161]], dtype=int64)

Accuracy: 0.695<br>
Confusion Matrix:<br>
[[256,  44]<br>
[139, 161]]

In [114]:
# Naive Bayes:
clf = naive_bayes.BernoulliNB()
clf.fit(X_optimal, y)
y_test_pred2 = clf.predict(Xr_test2)
acc2 = accuracy(y_test_pred2,test_label)
print acc2
metrics.confusion_matrix(test_label, y_test_pred2, labels=None, sample_weight=None)

0.626666666667


array([[213,  87],
       [137, 163]], dtype=int64)

Accuracy: 0.626666666667<br>
Confusion Matrix:<br>
[[213,  87]<br>
[137, 163]]

#### 100 features:

In [115]:
# Train the logistic regression model
X_optimal = Xr_train3
y = train_label
logreg = LogisticRegression()
logreg.fit(X_optimal, y)
# Apply the logistic model on the testing data set and get the prediction labels
y_test_pred = logreg.predict(Xr_test3)
def accuracy(Y_predict,Y_true):
    return metrics.accuracy_score(Y_true, Y_predict)
acc1 = accuracy(y_test_pred,test_label)
print acc1
metrics.confusion_matrix(test_label, y_test_pred, labels=None, sample_weight=None)

0.715


array([[246,  54],
       [117, 183]], dtype=int64)

Accuracy: 0.715<br>
Confusion Matrix:<br>
[[246,  54]<br>
[117, 183]]

In [116]:
# Naive Bayes:
clf = naive_bayes.BernoulliNB()
clf.fit(X_optimal, y)
y_test_pred2 = clf.predict(Xr_test3)
acc2 = accuracy(y_test_pred2,test_label)
print acc2
metrics.confusion_matrix(test_label, y_test_pred2, labels=None, sample_weight=None)

0.65


array([[210,  90],
       [120, 180]], dtype=int64)

Accuracy: 0.65<br>
Confusion Matrix:<br>
[[210,  90]<br>
[120, 180]]

## 2-gram

In [117]:
df_train_two1
(U,s,Vh) = np.linalg.svd(df_train_two1,full_matrices=False, compute_uv=True)
print U.shape
print s.shape
print Vh.shape

(2400L, 2400L)
(2400L,)
(2400L, 10633L)


In [118]:
Xr_train1=np.dot(df_train_two1,np.transpose(Vh[:10,:]))
Xr_train2=np.dot(df_train_two1,np.transpose(Vh[:50,:]))
Xr_train3=np.dot(df_train_two1,np.transpose(Vh[:100,:]))
Xr_test1=np.dot(df_test_two1,np.transpose(Vh[:10,:]))
Xr_test2=np.dot(df_test_two1,np.transpose(Vh[:50,:]))
Xr_test3=np.dot(df_test_two1,np.transpose(Vh[:100,:]))

#### 10 features:

In [132]:
# Train the logistic regression model
X_optimal = Xr_train1
y = train_label
logreg = LogisticRegression()
logreg.fit(X_optimal, y)
# Apply the logistic model on the testing data set and get the prediction labels
y_test_pred = logreg.predict(Xr_test1)
print y_test_pred.shape
def accuracy(Y_predict,Y_true):
    return metrics.accuracy_score(Y_true, Y_predict)
acc1 = accuracy(y_test_pred,test_label)
print acc1
metrics.confusion_matrix(test_label, y_test_pred, labels=None, sample_weight=None)

(600L,)
0.515


array([[297,   3],
       [288,  12]], dtype=int64)

Accuracy: 0.515<br>
Confusion Matrix:<br>
[[297,   3]<br>
[288,  12]]

In [135]:
# Naive Bayes:
clf = naive_bayes.BernoulliNB()
clf.fit(X_optimal, y)
y_test_pred2 = clf.predict(Xr_test1)
acc2 = accuracy(y_test_pred2,test_label)
print acc2
metrics.confusion_matrix(test_label, y_test_pred2, labels=None, sample_weight=None)

0.573333333333


array([[234,  66],
       [190, 110]], dtype=int64)

Accuracy: 0.573333333333<br>
Confusion Matrix:<br>
[[234,  66]<br>
[190, 110]]

#### 50 features:

In [138]:
# Train the logistic regression model
X_optimal = Xr_train2
y = train_label
logreg = LogisticRegression()
logreg.fit(X_optimal, y)
# Apply the logistic model on the testing data set and get the prediction labels
y_test_pred = logreg.predict(Xr_test2)
def accuracy(Y_predict,Y_true):
    return metrics.accuracy_score(Y_true, Y_predict)
acc1 = accuracy(y_test_pred,test_label)
print acc1
metrics.confusion_matrix(test_label, y_test_pred, labels=None, sample_weight=None)

0.525


array([[294,   6],
       [279,  21]], dtype=int64)

Accuracy: 0.525<br>
Confusion Matrix:<br>
[[294,   6]<br>
[279,  21]]

In [140]:
# Naive Bayes:
clf = naive_bayes.BernoulliNB()
clf.fit(X_optimal, y)
y_test_pred2 = clf.predict(Xr_test2)
acc2 = accuracy(y_test_pred2,test_label)
print acc2
metrics.confusion_matrix(test_label, y_test_pred2, labels=None, sample_weight=None)

0.57


array([[233,  67],
       [191, 109]], dtype=int64)

Accuracy: 0.57<br>
Confusion Matrix:<br>
[[233,  67]<br>
[191, 109]]

#### 100 features:

In [146]:
# Train the logistic regression model
X_optimal = Xr_train3
y = train_label
logreg = LogisticRegression()
logreg.fit(X_optimal, y)
# Apply the logistic model on the testing data set and get the prediction labels
y_test_pred = logreg.predict(Xr_test3)
def accuracy(Y_predict,Y_true):
    return metrics.accuracy_score(Y_true, Y_predict)
acc1 = accuracy(y_test_pred,test_label)
print acc1
metrics.confusion_matrix(test_label, y_test_pred, labels=None, sample_weight=None)

0.538333333333


array([[287,  13],
       [264,  36]], dtype=int64)

Accuracy: 0.538333333333<br>
Confusion Matrix:<br>
[[287,  13]<br>
[264,  36]]

In [148]:
# Naive Bayes:
clf = naive_bayes.BernoulliNB()
clf.fit(X_optimal, y)
y_test_pred2 = clf.predict(Xr_test3)
acc2 = accuracy(y_test_pred2,test_label)
print acc2
metrics.confusion_matrix(test_label, y_test_pred2, labels=None, sample_weight=None)

0.561666666667


array([[234,  66],
       [197, 103]], dtype=int64)

Accuracy: 0.561666666667<br>
Confusion Matrix:<br>
[[234,  66]<br>
[197, 103]]