In [48]:
%matplotlib inline
# from numpy import *
import numpy as np
import math
import re
import glob

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn import linear_model

## b) Preprocessing and c) Split training and testing set
Since these sentences are online reviews, they may contain significant amounts of noise and garbage.

* Lowercase all of the words
* Lemmatization of all the words
* Strip punctuation
* Strip the stop words

For each file, use the first 400 instances of each label as the training set and the remaining 100 instances of each label as the testing set. In total, there are 2400 reviews for training and 600 reviews for testing.

In [28]:
entries = []  # tokenised words of every review, across all files
labels = []   # sentiment label of every review, across all files

# Train/test split, filled while the files are read
test_X = []
test_Y = []
train_X = []
train_Y = []

# Per file, the first TRAIN_PER_LABEL reviews of each label go to training;
# everything after that goes to testing.
TRAIN_PER_LABEL = 400

# Built once: neither object depends on the line being processed.
stop = set(stopwords.words('english'))
lmtzr = WordNetLemmatizer()

# glob returns paths in arbitrary order; sorting keeps the row order of the
# resulting lists the same from run to run.
for filename in sorted(glob.glob('./data/*.txt')):
    positiveCnt = 0
    negativeCnt = 0
    with open(filename, 'r') as f:  # closed automatically, even on error
        for line in f:
            temp = line.split('\t')  # review text, then sentiment label
            if len(temp) < 2 or not temp[1]:
                continue  # skip blank or malformed lines instead of crashing
            label = temp[1][0]
            labels.append(label)

            # Lowercase, then split on anything that is not a letter or an
            # apostrophe; filter(None, ...) drops the empty strings the split
            # leaves behind.
            sentence = filter(None, re.split(r'[^a-zA-Z\']', temp[0].lower()))

            # Drop stop words such as 'to', 'i', 'that' and 'the', then
            # lemmatize what remains.
            parsed = [lmtzr.lemmatize(word) for word in sentence if word not in stop]
            entries.append(parsed)

            # Cap the training reviews per label so the training set is balanced.
            if label == '1' and positiveCnt < TRAIN_PER_LABEL:
                train_X.append(parsed)
                train_Y.append(label)
                positiveCnt += 1
            elif label == '0' and negativeCnt < TRAIN_PER_LABEL:
                train_X.append(parsed)
                train_Y.append(label)
                negativeCnt += 1
            else:
                test_X.append(parsed)
                test_Y.append(label)

# Single-string print calls produce the same output under Python 2 and 3.
print('There are totally ' + str(len(train_X)) + ' reviews for training')
print(str(len(test_X)) + ' reviews for testing.')

There are totally 2400 reviews for training
600 reviews for testing.


## d) Bag of Words model
Extract features and then represent each review using bag of words model, i.e.,  
every word in the review becomes its own element in a feature vector. In order to do this,  
1. make one pass through all the reviews in the training set and build a dictionary of unique words.  
2. Then, make another pass through the review in both the training set and testing set and count up the occurrences of each word in your dictionary. 

The i-th element of a review's feature vector is the number of occurrences of the i-th dictionary word in the review.  
Implement the bag of words model and report feature vectors of any two reviews in the training set.

In [35]:
"""
Build the bag-of-words count matrices.
    The vocabulary is the set of unique words in the training reviews; every review
    (training and test) becomes one row of per-word occurrence counts.

Note: if a list of unicode strings is printed directly under Python 2, each item shows a
    u'...' prefix. That is only the repr; printing a single item (e.g. some_list[0])
    does not show the prefix.
"""
def BagOfWordModel(train_X, test_Y):
    """Turn tokenised reviews into bag-of-words count matrices.

    The vocabulary is the set of unique words in ``train_X``. Words that
    appear only in the test reviews are ignored.

    Parameters
    ----------
    train_X : list of list of str
        Tokenised training reviews.
    test_Y : list of list of str
        Tokenised test reviews. The parameter name is kept for backward
        compatibility; it receives the test reviews, not the test labels.

    Returns
    -------
    (train_counts, test_counts) : tuple of numpy.ndarray
        Float arrays with one row per review and one column per vocabulary
        word, columns ordered by the sorted vocabulary. Each entry is the
        number of times the word occurs in the review.
    """
    # Use the argument that was passed in, not a notebook-level global.
    test_reviews = test_Y

    # Sorting fixes the column order, which set iteration does not guarantee.
    vocabulary = sorted({word for review in train_X for word in review})
    # word -> column index, so each lookup is O(1) instead of a list scan.
    column_of = {word: col for col, word in enumerate(vocabulary)}

    def count_matrix(reviews):
        counts = np.zeros((len(reviews), len(vocabulary)))
        for row, review in enumerate(reviews):
            for word in review:
                col = column_of.get(word)
                if col is not None:  # word not in the training vocabulary
                    counts[row, col] += 1
        return counts

    return count_matrix(train_X), count_matrix(test_reviews)

In [None]:
# Report the feature vectors of two randomly chosen training reviews.
# The matrices are built here because the arrays created inside
# BagOfWordModel are local to that function.
train_X_data, test_X_data = BagOfWordModel(train_X, test_X)
for _sample in range(2):
    print(train_X_data[np.random.randint(len(train_X_data))].tolist())

## e) Pick your postprocessing strategy

In [42]:
# Post-processing strategy: L2 normalisation. Each row is divided by its
# Euclidean norm (square root of the sum of squares).
def postprocessing(train_X_data):
    """Scale every row of ``train_X_data`` to unit Euclidean length, in place.

    Parameters
    ----------
    train_X_data : numpy.ndarray
        2-D float array with one row per review. It is modified in place.

    Returns
    -------
    numpy.ndarray
        The same array object, with each non-zero row divided by its L2 norm.
        All-zero rows are left as zeros.
    """
    norms = np.sqrt(np.sum(train_X_data * train_X_data, axis=1, keepdims=True))
    # A row with no counts has norm 0; divide by 1 so it stays all zeros
    # instead of becoming NaN.
    norms[norms == 0] = 1.0
    train_X_data /= norms
    return train_X_data

## f) Clustering

In [16]:
# Euclidean distance
def distance(vecA, vecB):
    """Return the Euclidean distance between ``vecA`` and ``vecB``.

    Both arguments must be NumPy arrays or matrices whose difference
    ``vecA - vecB`` is defined. NumPy functions are called through the ``np``
    namespace because the notebook does not star-import numpy.
    """
    return np.sqrt(np.sum(np.power(vecA - vecB, 2)))


# use random values to initialize the centroids
def initialCentroids(dataSet, k):
    """Draw ``k`` random centroids inside the bounding box of ``dataSet``.

    Parameters
    ----------
    dataSet : numpy.ndarray
        2-D array with one row per sample and one column per feature.
    k : int
        Number of centroids to create.

    Returns
    -------
    numpy.matrix
        ``k`` x ``n_features`` matrix. Column ``j`` is drawn uniformly between
        the minimum and maximum of column ``j`` of ``dataSet``.
    """
    n = np.shape(dataSet)[1]
    centroid = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        minE = np.min(dataSet[:, j])
        maxE = np.max(dataSet[:, j])
        rangeE = float(maxE - minE)
        # rand(k, 1) is in [0, 1), so the column stays inside [minE, maxE].
        centroid[:, j] = minE + rangeE * np.random.rand(k, 1)
    return centroid


def kMeans(dataSet, k):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Parameters
    ----------
    dataSet : numpy.ndarray
        2-D array with one row per sample.
    k : int
        Number of clusters.

    Returns
    -------
    (centroids, result)
        ``centroids`` is the k x n_features matrix of cluster centres.
        ``result`` is an m x 2 matrix: column 0 holds the index of the cluster
        assigned to each sample, column 1 the squared distance to its centroid.
    """
    m = np.shape(dataSet)[0]
    result = np.asmatrix(np.zeros((m, 2)))
    # Start from an assignment no cluster can have. Starting from 0 would let
    # the loop stop after a single pass whenever every sample first lands in
    # cluster 0, before the updated centroids are ever used.
    result[:, 0] = -1
    centroids = initialCentroids(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach every sample to its nearest centroid.
        for i in range(m):
            minDist, minIndex = np.inf, -1
            for j in range(k):
                distJI = distance(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist, minIndex = distJI, j
            if result[i, 0] != minIndex:
                clusterChanged = True
            result[i, :] = minIndex, minDist ** 2
        # Update step: move each centroid to the mean of its members.
        for centroid in range(k):
            points = dataSet[np.nonzero(result[:, 0].A == centroid)[0]]
            # The mean of an empty selection is NaN; keep the previous
            # centroid when a cluster has no members.
            if len(points) > 0:
                centroids[centroid, :] = np.mean(points, axis=0)
    return centroids, result

In [30]:
# Cluster Evaluator
# count the positive & negative reviews in each cluster
def evaluate_cluster(train_X_data, train_labels=None):
    """Run 2-means on ``train_X_data`` and print the label mix of each cluster.

    Parameters
    ----------
    train_X_data : numpy.ndarray
        Feature matrix with one row per training review.
    train_labels : sequence of str, optional
        Sentiment label ('0' or '1') for each row. Defaults to the
        notebook-level ``train_Y``, which is what the original code read.

    Prints, for cluster1 and cluster2, the number of reviews and how many of
    them are negative (label '0') and positive (any other label).
    """
    if train_labels is None:
        train_labels = train_Y
    centroids, result = kMeans(train_X_data, 2)

    # tallies[cluster] == [negative count, positive count]
    tallies = [[0, 0], [0, 0]]
    for (assigned, _sq_dist), label in zip(result.tolist(), train_labels):
        cluster = 0 if assigned == 0 else 1
        tallies[cluster][0 if label == '0' else 1] += 1

    # One string per print call gives the same output under Python 2 and 3.
    # The wording matches the outputs recorded in the notebook.
    for number, (negative, positive) in enumerate(tallies, 1):
        print("cluster" + str(number) + " has " + str(negative + positive) + " reviews")
        print("negative: " + str(negative) + "  posive: " + str(positive))

## h) N-gram Model. N = 2

In [32]:
def NgramModel(train_X, test_X):
    """Turn tokenised reviews into bigram (N = 2) count matrices.

    A bigram is two adjacent tokens joined by a single space. The dictionary
    is the set of bigrams in ``train_X``. Bigrams that appear only in
    ``test_X`` are ignored.

    Parameters
    ----------
    train_X : list of list of str
        Tokenised training reviews.
    test_X : list of list of str
        Tokenised test reviews.

    Returns
    -------
    (train_counts, test_counts) : tuple of numpy.ndarray
        Float arrays with one row per review and one column per bigram,
        columns ordered by the sorted dictionary. Each entry is the number of
        times the bigram occurs in the review.
    """
    def bigrams(review):
        return [review[j] + " " + review[j + 1] for j in range(len(review) - 1)]

    # Sorting fixes the column order, which set iteration does not guarantee.
    dictionary = sorted({pair for review in train_X for pair in bigrams(review)})
    # bigram -> column index, so each lookup is O(1) instead of a list scan.
    column_of = {pair: col for col, pair in enumerate(dictionary)}

    def count_matrix(reviews):
        counts = np.zeros((len(reviews), len(dictionary)))
        for row, review in enumerate(reviews):
            for pair in bigrams(review):
                col = column_of.get(pair)
                if col is not None:  # bigram not in the training dictionary
                    counts[row, col] += 1
        return counts

    return count_matrix(train_X), count_matrix(test_X)

In [54]:
# Run clustering and logistic regression on each feature representation.
# Each entry: (clustering title, classifier title, feature builder).
# The titles are the strings this cell printed originally.
experiments = [
    ("Clustering on Bag of Word Model",
     "Logistic Regression with Bag of Word Model",
     BagOfWordModel),
    ("Clustering on N-gram Model",
     "Logic Regression with N-gram Model",
     NgramModel),
]

for position, (cluster_title, classifier_title, build_features) in enumerate(experiments):
    if position:
        print("")  # blank line between experiments
    print(cluster_title)
    print("")
    train, test = build_features(train_X, test_X)
    # train = postprocessing(train)
    evaluate_cluster(train)
    print("")
    print(classifier_title)
    cls = linear_model.LogisticRegression()
    # One string per print call gives the same output under Python 2 and 3.
    print("accuracy: " + str(cls.fit(train, train_Y).score(test, test_Y)))

Clustering on Bag of Word Model

cluster1 has 2399 reviews
negative: 1199  posive: 1200
cluster2 has 1 reviews
negative: 1  posive: 0

Logistic Regression with Bag of Word Model
accuracy: 0.803333333333

Clustering on Ngram Model

cluster1 has 2400 reviews
negative: 1200  posive: 1200
cluster2 has 0 reviews
negative: 0  posive: 0

Logic Regression with N-gram Model
accuracy: 0.638333333333


## i) PCA for bag of word model.
The feature in *bag of words* has large redundancy.  
Implement PCA first to reduce dimension of features to 10, 50, 100.

In [61]:
def PCA(train, test, r):
    """Project ``train`` and ``test`` onto the top ``r`` principal axes of ``train``.

    Parameters
    ----------
    train : array-like, shape (n_train, d)
        Data used to estimate the mean and the principal axes.
    test : array-like, shape (n_test, d)
        Data projected with the mean and axes estimated from ``train``.
    r : int
        Number of principal components to keep.

    Returns
    -------
    (F, K) : tuple of numpy.ndarray
        ``F`` is the projected training data, shape (n_train, r).
        ``K`` is the projected test data, shape (n_test, r).
    """
    train = np.asarray(train, dtype=float)
    test = np.asarray(test, dtype=float)

    # Centre both sets with the training mean. The axes are computed from the
    # centred data, so the projection has to use centred data as well.
    mean_vector = np.mean(train, axis=0)
    centered_train = train - mean_vector

    # The right singular vectors of the centred data are the eigenvectors of
    # X^T X, so the d x d scatter matrix never has to be formed. The reduced
    # SVD is enough unless more axes are requested than it provides.
    need_full = r > min(centered_train.shape)
    U, s, Vt = np.linalg.svd(centered_train, full_matrices=need_full)
    components = Vt[:r, :].T

    F = np.dot(centered_train, components)
    K = np.dot(test - mean_vector, components)
    return F, K
# Build the bag-of-words features once, then reduce those ORIGINAL features to
# each target dimension. Feeding one reduction's output into the next would
# make each result depend on the previous call.
bow_train, bow_test = BagOfWordModel(train_X, test_X)
for r in (100, 50, 10):
    train, test = PCA(bow_train, bow_test, r)
    cls = linear_model.LogisticRegression()
    # One string per print call gives the same output under Python 2 and 3.
    print("accuracy: " + str(cls.fit(train, train_Y).score(test, test_Y)))

accuracy: 0.696666666667
accuracy: 0.681666666667
accuracy: 0.613333333333
