In [14]:
# Taken from http://web.stanford.edu/class/cs221/ Assignment #2 Support Code
def dotProduct(d1, d2):
    """
    Sparse dot product of two feature vectors stored as dicts.

    @param dict d1: a feature vector represented by a mapping from a feature (string) to a weight (float).
    @param dict d2: same as d1
    @return float: the dot product between d1 and d2 (0 when either is empty)
    """
    # Walk the smaller vector and probe the larger one; features missing
    # from the larger vector contribute nothing.
    if len(d1) < len(d2):
        d1, d2 = d2, d1
    total = 0
    for feature, value in d2.items():
        total += d1.get(feature, 0) * value
    return total

def increment(d1, scale, d2):
    """
    In-place sparse update: d1 += scale * d2.

    @param dict d1: the feature vector which is mutated.
    @param float scale: multiplier applied to every value of d2.
    @param dict d2: a feature vector; it is only read.

    NOTE: Nothing is returned. d1 is modified directly because updating
    the existing dictionary is much cheaper than building and returning
    a new one.
    """
    for feature in d2:
        # Features absent from d1 start from 0.
        d1[feature] = d1.get(feature, 0) + d2[feature] * scale

In [15]:
import os
import numpy as np
import pickle
import random

'''
Note: This code is just a hint for people who are not familiar with text processing in Python. You are not obliged to use it, though you may if you like.
'''


def folder_list(path,label):
    '''
    Read every file in a directory and tag its contents with a label.

    Input: path to where the data is located and label (1 or -1 depending on positive or negative)
    Output: one list per file, holding what read_data returned for that file
            with the label appended as the last element
    '''
    review = []
    for name in os.listdir(path):
        tokens = read_data(os.path.join(path, name))
        # The label rides along as the final entry of each example.
        tokens.append(label)
        review.append(tokens)
    return review

def read_data(file):
    '''
    Read a file into a list of cleaned word strings.

    The text is split on single spaces, every character listed in
    `symbols` is removed from each piece, surrounding whitespace is
    stripped, and pieces that end up empty are dropped.

    Example:
    ["it's", 'a', 'curious', 'thing', "i've", 'found', 'that', 'when', 'willis', 'is', 'not', 'called', 'on', 
    ...'to', 'carry', 'the', 'whole', 'movie', "he's", 'much', 'better', 'and', 'so', 'is', 'the', 'movie']

    Input: path of the file to read
    Output: list of strings (always a real list, so callers may append to it)
    '''
    symbols = '${}()[].,:;+-*/&|<>=~" '
    # The context manager closes the handle even if reading fails.
    with open(file) as f:
        pieces = f.read().split(' ')
    words = []
    for piece in pieces:
        # Per-character filtering behaves the same on Python 2 and 3,
        # unlike str.translate(None, symbols), which is Python 2 only.
        cleaned = ''.join(ch for ch in piece if ch not in symbols).strip()
        if cleaned:
            words.append(cleaned)
    return words
	
###############################################
######## YOUR CODE STARTS FROM HERE. ##########
###############################################

def shuffle_data(pos_path="data/data/pos", neg_path="data/data/neg", seed=None):
    '''
    Load the positive and negative reviews and return them in random order.

    pos_path is where you save positive review data.
    neg_path is where you save negative review data.
    seed is optional. When given, the shuffle uses a private
    random.Random(seed), so the global random state is left untouched and
    the order is repeatable as long as folder_list returns the files in
    the same order. When None, the module-level random.shuffle is used.

    Output: list with the entries folder_list produced for both folders
            (positive labelled 1, negative labelled -1), shuffled.
    '''
    review = folder_list(pos_path, 1) + folder_list(neg_path, -1)
    if seed is None:
        random.shuffle(review)
    else:
        random.Random(seed).shuffle(review)
    return review

In [16]:
# Load the labelled reviews: +1 for the positive folder, -1 for the negative one.
pos_path, neg_path = "data/data/pos", "data/data/neg"
pos_review = folder_list(pos_path, 1)
neg_review = folder_list(neg_path, -1)

In [17]:
# Shuffle all reviews once; the first 1500 become the training set and
# whatever remains is held out for testing.
shuffle = shuffle_data()
train, test = shuffle[:1500], shuffle[1500:]

In [18]:
import collections
def counter(data):
    """
    Split labelled examples into bag-of-words features and labels.

    @param list data: examples, each a sequence whose last element is the
        label and whose preceding elements are the tokens.
    @return tuple: (X, y) where X[i] is a collections.Counter of the tokens
        of example i and y[i] is its label.
    """
    X, y = [], []
    for example in data:
        y.append(example[-1])
        X.append(collections.Counter(example[:-1]))
    return X, y

In [19]:
# Bag-of-words features and labels for the training split.
X, y = counter(train)

In [20]:
def pegasos(X, y, lambda_reg, max_epochs):
    """
    Train a linear SVM with the Pegasos stochastic sub-gradient method,
    applying every update directly to a sparse weight dict.

    For each example, with step size eta = 1 / (t * lambda_reg):
        w <- (1 - eta * lambda_reg) * w                 always
        w <- w + eta * y_i * x_i                        if y_i * <w, x_i> < 1

    @param list X: feature vectors, each a dict mapping feature -> value.
    @param list y: labels aligned with X (the margin test assumes +1 / -1).
    @param float lambda_reg: L2 regularization strength; must be > 0.
    @param int max_epochs: number of full passes over X.
    @return dict: the learned sparse weight vector.
    @raise ValueError: if lambda_reg is not positive.
    """
    if lambda_reg <= 0:
        raise ValueError("lambda_reg must be positive")
    # The step counter starts at 2 and is bumped before use, so the first
    # step size is 1 / (3 * lambda_reg).
    t = 2
    w = dict()
    epoch = 0
    while epoch < max_epochs:
        epoch += 1
        for i in range(len(X)):
            t += 1
            # 1.0 forces true division even for integer lambda_reg on Python 2.
            stepSize = 1.0 / (t * lambda_reg)
            # The margin must be measured on the weights *before* shrinking.
            margin = y[i] * dotProduct(w, X[i])
            shrink = 1 - stepSize * lambda_reg
            # Regularization shrink touches every stored weight; only values
            # change, so iterating the dict while assigning is safe.
            for f in w:
                w[f] *= shrink
            if margin < 1:
                increment(w, stepSize * y[i], X[i])
    return w

In [21]:
def pegasos_2(X_train, y_train, lambda_reg, max_epochs):
    """
    Pegasos for a linear SVM using the scaled representation w = s * W.

    The regularization shrink w <- (1 - eta * lambda_reg) * w is applied
    to the scalar s alone, and a margin violation adds
    (eta * y_i / s) * x_i to W. Each step therefore only touches the
    features present in the current example, while s * W follows the
    plain Pegasos update exactly.

    @param list X_train: feature vectors, each a dict mapping feature -> value.
    @param list y_train: labels aligned with X_train (the margin test assumes +1 / -1).
    @param float lambda_reg: L2 regularization strength; must be > 0.
    @param int max_epochs: number of full passes over X_train.
    @return dict: the learned sparse weight vector s * W, as a new dict.
    @raise ValueError: if lambda_reg is not positive.
    """
    if lambda_reg <= 0:
        raise ValueError("lambda_reg must be positive")
    # t starts at 2 and is bumped before use, so every shrink factor
    # 1 - 1/t is at least 2/3 and s stays non-zero for the division below.
    t = 2
    W = dict()
    s = 1.0
    epoch = 0
    while epoch < max_epochs:
        epoch += 1
        for i in range(len(X_train)):
            t += 1
            # 1.0 forces true division even for integer lambda_reg on Python 2.
            stepSize = 1.0 / (t * lambda_reg)
            # Measure the margin with the current weights, before s shrinks.
            margin = s * y_train[i] * dotProduct(W, X_train[i])
            s *= (1 - stepSize * lambda_reg)
            if margin < 1:
                # Divide by the updated s so that s * W gains exactly
                # stepSize * y_i * x_i.
                increment(W, stepSize * y_train[i] / s, X_train[i])
    return dict((f, s * v) for f, v in W.items())