In [1]:
import pandas
import numpy
import random
import copy
import time

In [2]:
#Helper functions that treat dictionaries as sparse vectors (add/subtract, dot product, scalar multiply)
def dic_linear(dic1,dic2,operator):
    '''
        Add or subtract two sparse vectors stored as dictionaries.

        dic1,dic2 - left and right operand; neither one is modified
        operator  -  1: dic1 + dic2
                    -1: dic1 - dic2
        Return a new dictionary with the result. Keys that only exist in
        dic2 start from 0; with any other operator value they stay at 0
        and the values copied from dic1 are left as they are.
    '''
    result=dict(dic1)
    for key,value in dic2.items():
        #A key missing from dic1 counts as a zero entry
        result.setdefault(key,0)
        if operator==1:
            result[key]+=value
        if operator==-1:
            result[key]-=value
    return result

def dic_dot(dic1,dic2):
    '''
        Dot product of two sparse vectors stored as dictionaries.

        dic1,dic2 - the two operands
        Return the sum of dic1[k]*dic2[k] over the keys present in both
        (0 when they share no key).
    '''
    return sum(dic1[key]*value for key,value in dic2.items() if key in dic1)

def dic_multi(diction,number):
    '''
        Scale a sparse vector stored as a dictionary.

        diction - the dictionary to scale; it is not modified
        number  - the scalar every value is multiplied by
        Return a new dictionary with the same keys and scaled values.
    '''
    scaled={}
    for key,value in diction.items():
        scaled[key]=value*number
    return scaled

In [3]:
#Functions used to turn a review into a feature dictionary (tf, idf, tf-idf)
def tf(text,n=1):
    '''
        Count the n-grams of a review.

        text - the review to transform; tokens are obtained with split(' ')
        n    - n_gram:
                        1 - unigram
                        2 - bigram
        Return a dictionary mapping each n-gram to its number of occurrences.
        The tokens of an n-gram are joined with a single space, so distinct
        token sequences never share a key ('a bc' vs 'ab c'). Unigram keys
        are the bare tokens. A text with fewer than n tokens gives {}.
    '''
    words=text.split(' ')
    word_dic={}
    for i in range(len(words)-n+1):
        #Joining with a separator keeps the token boundary inside the key
        n_gram=' '.join(words[i:i+n])
        word_dic[n_gram]=word_dic.get(n_gram,0)+1
    return word_dic

def idf(reviews):
    '''
        Compute the inverse document frequency of every word.

        reviews - a collection of documents (strings); tokens are obtained
                  with split(' ')
        Return a dictionary mapping each word to log10(D/df), where D is
        the number of documents and df the number of documents that contain
        the word as a whole token. Keys are in order of first appearance.
    '''
    D=len(reviews)
    doc_freq={}
    for review in reviews:
        #dict.fromkeys removes repeated tokens but keeps their order, so
        #each review adds at most 1 to a word's document frequency.
        #Tokens are compared, not substrings: 'a' is not counted in 'cat'.
        for word in dict.fromkeys(review.split(' ')):
            doc_freq[word]=doc_freq.get(word,0)+1
    #Every key was seen in at least one review, so the count is never 0
    return {word:numpy.log10(D/count) for word,count in doc_freq.items()}
    
def tf_idf(text,word_all):
    '''
        Weight the unigram counts of a review by their idf.

        text     - the review to transform
        word_all - a dictionary holding the idf of each known word
        Return a dictionary mapping each word of text that is present in
        word_all to count*idf. Words without an idf entry are left out.
    '''
    counts=tf(text)
    #Words never seen when the idf table was built have no weight; skip them
    return {word:count*word_all[word]
            for word,count in counts.items()
            if word in word_all}

def representation(text,word_all={},pre='unigram'):
    '''
        Turn a text into its sparse feature vector.

        text     - the text to transform
        word_all - a dictionary holding the idf of each word;
                   only read (never modified) and only when pre is 'tf-idf'
        pre      - 'unigram'
                   'bigram'
                   'tf-idf'
        Return the feature dictionary, extended with the constant entry
        'GG' set to 1.
        Raise ValueError when pre is not one of the three names above.
    '''
    if pre=='unigram':
        vec=tf(text)
    elif pre=='bigram':
        vec=tf(text,n=2)
    elif pre=='tf-idf':
        vec=tf_idf(text,word_all)
    else:
        #Fail with a clear message instead of an UnboundLocalError on vec
        raise ValueError("pre must be 'unigram', 'bigram' or 'tf-idf', got %r"%(pre,))
    #Lifting: a constant feature so the learned weights include a bias term.
    #Note that this overwrites the entry of a feature literally named 'GG'.
    vec['GG']=1
    return vec

In [4]:
#Functions used to get the classifier
def preprocessing(data):
    '''
        Relabel the data in place: every label equal to 0 becomes -1.

        data - a mapping with a 'text' and a 'label' collection; the number
               of entries visited is len(data['text'])
        Return nothing.
    '''
    labels=data['label']
    count=len(data['text'])
    for pos in range(count):
        if labels[pos]==0:
            labels[pos]=-1

def Online_Perceptron(training,word_all={},pre='unigram'):
    '''
        Train a perceptron with two shuffled passes over the training set
        and return the average of the weights seen during the second pass.

        training - the training set: a mapping with a 'text' and a 'label'
                   collection; it is deep-copied, so the caller's data is
                   not modified
        word_all - the idf dictionary, required when pre is 'tf-idf'
        pre      - the representation to use:
                        'unigram'
                        'tf-idf'
                        'bigram'
        Return the averaged weight dictionary.
    '''
    tr=copy.deepcopy(training)
    #Labels 0 become -1 so that label*score>0 means "classified correctly"
    preprocessing(tr)
    size=len(tr['text'])
    order=list(range(size))

    #First pass: plain perceptron updates in random order
    random.shuffle(order)
    w={}
    for idx in order:
        x=representation(tr['text'][idx],word_all,pre)
        score=dic_dot(w,x)
        if tr['label'][idx]*score<=0:
            w=dic_linear(w,x,tr['label'][idx])

    #Second pass: keep updating w and accumulate every weight vector,
    #weighted by the number of steps it stayed in use
    random.shuffle(order)
    total={}
    streak=1                         #steps the current w has been in use
    for idx in order:
        x=representation(tr['text'][idx],word_all,pre)
        score=dic_dot(w,x)
        if tr['label'][idx]*score<=0:
            #Mistake: bank the outgoing w (streak times), then update it
            weighted=w if streak==1 else dic_multi(w,streak)
            total=dic_linear(total,weighted,1)
            w=dic_linear(w,x,tr['label'][idx])
            streak=1
        else:
            streak+=1
    #The last weight vector has not been banked yet
    total=dic_linear(total,dic_multi(w,streak),1)
    #The streaks add up to size+1: the starting w plus one step per example
    return dic_multi(total,1/(size+1))

def posi_nega(w):
    '''
        Find the 10 most positive and the 10 most negative words.

        w - a weight dictionary mapping word to weight
        Return {'posi': ..., 'nega': ...}; each value is a list of
        (word, weight) tuples in ascending weight order: 'posi' holds the
        last 10 of the ranking and 'nega' the first 10.
    '''
    ranked=sorted(w.items(),key=lambda pair:pair[1])
    return {'posi':ranked[-10:],'nega':ranked[:10]}

In [5]:
def testit(train,test,pre):
    '''
        Train on train, evaluate on test and time the three steps.

        train,test - mappings with a 'text' and a 'label' collection;
                     neither one is modified
        pre        - the representation to use:
                        'unigram'
                        'bigram'
                        'tf-idf'
        Return a dictionary with
            'accu'      - fraction of test items with label*score>0
            'step1'     - seconds spent building the idf table
                          (only built when pre is 'tf-idf')
            'step2'     - seconds spent training
            'step3'     - seconds spent classifying the test set
            'posi_nega' - the result of posi_nega on the learned weights
    '''
    #No copies are taken here: this function only reads train and test,
    #and Online_Perceptron deep-copies its training set before relabelling.
    word_all={}
    timing=[]
    #perf_counter is meant for measuring intervals, unlike the wall clock
    timing.append(time.perf_counter())
    if pre=='tf-idf':
        word_all=idf(train['text'])
    timing.append(time.perf_counter())
    w=Online_Perceptron(train,word_all,pre)
    timing.append(time.perf_counter())
    count=0
    for i in range(len(test['text'])):
        vec=representation(test['text'][i],word_all,pre)
        dot=dic_dot(w,vec)
        #Test labels are still 0/1; map 0 to -1 as done for training
        label=-1 if test['label'][i]==0 else 1
        if dot*label>0:
            count+=1
    timing.append(time.perf_counter())
    accuracy=count/len(test['text'])
    pn=posi_nega(w)
    return {'accu':accuracy,
            'step1':timing[1]-timing[0],
            'step2':timing[2]-timing[1],
            'step3':timing[3]-timing[2],
            'posi_nega':pn}

In [6]:
#Load the training and test reviews from the working directory.
#header=0 takes the column names from the first row of each file;
#later cells read the 'text' and 'label' columns.
data_tr=pandas.read_csv('reviews_tr.csv',header=0)
data_te=pandas.read_csv('reviews_te.csv',header=0)

In [7]:
#Number of leading rows taken from each file; lower it for a quicker run
N_SAMPLES=100000

#Independent copies of the first N_SAMPLES texts and labels, stored as
#plain dictionaries with the two keys the functions above expect
ttr={col:copy.deepcopy(data_tr[col][:N_SAMPLES]) for col in ('text','label')}
tte={col:copy.deepcopy(data_te[col][:N_SAMPLES]) for col in ('text','label')}

In [None]:
#Train on ttr and evaluate on tte with bigram features; the result holds the
#accuracy, the three step timings and the most positive/negative features.
re100000b=testit(ttr,tte,pre='bigram')

In [39]:
#Show all collected results side by side.
#NOTE(review): only re100000b is assigned in the cells visible here; re10000u,
#re100000u, re10000idf and re10000b must already exist in the kernel
#(presumably from earlier testit runs with other sizes/representations) -
#confirm, or add the cells that create them, before Restart & Run All.
[re10000u,re100000u,re10000idf,re10000b,re100000b]

[{'accu': 0.859,
  'posi_nega': {'nega': [('ok', -79.18108189181082),
    ('okay', -65.72382761723829),
    ('bland', -64.70832916708329),
    ('decent', -64.54054594540547),
    ('worst', -50.32446755324468),
    ('nothing', -48.358864113588645),
    ('mediocre', -45.68173182681732),
    ('average', -44.76842315768423),
    ('not', -44.17398260173983),
    ('terrible', -42.53904609539046)],
   'posi': [('awesome', 48.49595040495951),
    ('fresh', 49.214878512148786),
    ('favorite', 49.98300169983002),
    ('best', 55.36556344365564),
    ('great', 55.81661833816619),
    ('amazing', 63.87091290870913),
    ('definitely', 68.33286671332867),
    ('excellent', 73.68023197680232),
    ('perfect', 74.79932006799321),
    ('delicious', 95.05099490050995)]},
  'step1': 0.0,
  'step2': 29.63592219352722,
  'step3': 1.514094591140747},
 {'accu': 0.88512,
  'posi_nega': {'nega': [('mediocre', -156.12238877611225),
    ('worst', -155.74240257597424),
    ('bland', -140.16636833631665),
    (