In [157]:
import numpy as np
import scipy as sp
from collections import Counter
from stemming.porter2 import stem
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
nltk.download("stopwords")

def _clean_reviews(reviews):
    r""" Convert a Series of raw review strings into a Series of token lists.

    Steps, applied to every review in this order:
      1. replace anything that looks like an HTML tag (``<...>``) with a space
      2. lower-case and split on whitespace
      3. drop NLTK English stop words
      4. drop tokens made only of digits
      5. strip every character outside ``a-z`` from the remaining tokens
      6. drop tokens that became empty, then Porter2-stem the rest

    NOTE(review): stop words are matched *before* punctuation is stripped, so
    a token such as "it," survives as "it". The order is kept as it was;
    confirm whether that is intended.
    """
    tag_pattern = re.compile('<.*?>')
    # A set makes the per-token membership test O(1) instead of a list scan.
    stop = set(stopwords.words('english'))

    def tokenize(text):
        tokens = []
        for word in tag_pattern.sub(' ', text).lower().split():
            if word in stop:
                continue
            if re.search("[^0-9]", word) is None:
                continue  # purely numeric token
            word = re.sub("[^a-z]+", "", word)
            if word:  # punctuation-only tokens collapse to "" - not a real term
                tokens.append(stem(word))
        return tokens

    return reviews.apply(tokenize)


def PreProcess_train_File(file):
    r""" Read a labelled review file and return a tokenized DataFrame.

    Each non-blank line is expected to be ``<label>\t<review text>``.
    The label "+1" maps to 1, anything else maps to -1.

    Returns a DataFrame with columns ``SentimentClass`` (int) and ``Review``
    (list of stemmed tokens), one row per non-blank line. The first row is
    printed as a sanity check when the file is not empty.
    """
    labels, reviews = [], []
    with open(file, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # a blank line carries neither label nor review
            # Split on the first tab only so tabs inside the review are kept.
            label, _, review = line.partition("\t")
            labels.append(1 if label.strip() == "+1" else -1)
            reviews.append(review)

    df = pd.DataFrame({"SentimentClass": labels, "Review": reviews},
                      columns=["SentimentClass", "Review"])
    df["Review"] = _clean_reviews(df["Review"])
    if len(df):
        print(df.at[0, 'Review'])
        print(df.at[0, 'SentimentClass'])
    return df


def PreProcess_test_File(file):
    r""" Read an unlabelled review file (one review per line) and tokenize it.

    Every line, including blank ones, yields one row so that row i of the
    result always corresponds to line i of the file.

    Returns a DataFrame with a single ``Review`` column holding lists of
    stemmed tokens. The first row is printed as a sanity check when the file
    is not empty.
    """
    with open(file, 'r') as f:
        reviews = list(f)

    df = pd.DataFrame({"Review": reviews}, columns=["Review"])
    df["Review"] = _clean_reviews(df["Review"])
    if len(df):
        print(df.at[0, 'Review'])
    return df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aravi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [158]:
from scipy.sparse import csr_matrix
    
def build_matrix(docs):
    r""" Build sparse matrix from a list of documents, 
    each of which is a list of word/terms in the document.  

    Row i of the result corresponds to docs[i]. Column ids are assigned to
    terms in order of first appearance across docs, and each stored value is
    the number of times the term occurs in that document.

    Returns a scipy.sparse.csr_matrix of shape (len(docs), n_distinct_terms)
    with float64 data and sorted column indices.
    """
    nrows = len(docs)
    idx = {}   # term -> column id
    nnz = 0    # total number of (document, distinct term) pairs
    for d in docs:
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = len(idx)
    ncols = len(idx)

    # set up memory
    # (np.int was only an alias of the builtin int and no longer exists in
    # current NumPy releases, so the builtin is used directly)
    ind = np.zeros(nnz, dtype=int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=int)
    n = 0  # non-zero counter
    # transfer values
    for i, d in enumerate(docs):
        # The order within a row does not matter: sort_indices() runs below.
        for w, count in Counter(d).items():
            ind[n] = idx[w]
            val[n] = count
            n += 1
        ptr[i + 1] = n

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()

    return mat


def csr_info(mat, name="", non_empy=False):
    r""" Print a one-line summary of a CSR matrix: its label, row and column
    counts and the number of stored values. With non_empy set, the summary
    additionally reports how many rows and how many columns hold at least
    one stored entry.
    """
    n_rows, n_cols = mat.shape[0], mat.shape[1]
    stored = len(mat.data)

    if not non_empy:
        print("%s [nrows %d, ncols %d, nnz %d]" % (name, n_rows, n_cols, stored))
        return

    # a row is non-empty when its slice of indptr has positive length
    filled_rows = 0
    for r in range(n_rows):
        if mat.indptr[r + 1] > mat.indptr[r]:
            filled_rows += 1
    # a column is non-empty when its id occurs among the stored indices
    filled_cols = len(np.unique(mat.indices))

    summary = "%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
        name, n_rows, filled_rows, n_cols, filled_cols, stored)
    print(summary)

In [190]:
#scale matrix and normalize its rows
from collections import defaultdict
def csr_idf(mat, copy=False, **kargs):
    r""" Multiply every stored value of a CSR matrix by the inverse document
    frequency of its column, log(nrows / number of stored entries in the column).

    With copy left False the matrix is scaled in place and the mapping
    column id -> idf factor is returned. With copy=True the input is left
    untouched and only the scaled copy is returned.
    Extra keyword arguments are accepted and ignored.
    """
    if copy is True:
        mat = mat.copy()
    n_docs = mat.shape[0]
    total = mat.nnz
    cols, data = mat.indices, mat.data

    # count stored entries per column (document frequency)
    weights = defaultdict(int)
    for col in cols:
        weights[col] += 1

    # overwrite each count with its idf factor, reusing the same mapping
    for col in list(weights):
        weights[col] = np.log(n_docs / float(weights[col]))

    # apply the factor to every stored value
    for pos in range(total):
        data[pos] *= weights[cols[pos]]

    if copy is False:
        return weights
    return mat

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Rescale each row of a CSR matrix to unit Euclidean length.

    Rows whose stored values have a zero sum of squares are left as they are.
    With copy=True the input is untouched and the normalized copy is
    returned; otherwise the matrix is modified in place and nothing is
    returned. Extra keyword arguments are accepted and ignored.
    """
    if copy is True:
        mat = mat.copy()
    data, row_ptr = mat.data, mat.indptr

    # consecutive indptr entries delimit the stored values of one row
    for start, stop in zip(row_ptr[:-1], row_ptr[1:]):
        sq_total = 0.0
        for pos in range(start, stop):
            sq_total += data[pos]**2
        if sq_total != 0.0:
            scale = 1.0/np.sqrt(sq_total)
            for pos in range(start, stop):
                data[pos] *= scale

    return mat if copy is True else None


In [161]:
# Load the labelled training reviews from train.dat and tokenize them.
train_df = PreProcess_train_File('train.dat')



['although', 'film', 'bruce', 'willi', 'alway', 'worth', 'watch', 'better', 'skip', 'one', 'watch', 'one', 'televis', 'didnt', 'plunk', 'cash', 'it', 'lucki', 'me', 'plot', 'develop', 'slowli', 'slowli', 'although', 'first', 'minut', 'quit', 'believ', 'get', 'unbeliev', 'toward', 'end', 'high', 'question', 'season', 'soldier', 'like', 'lt', 'water', 'would', 'disobey', 'direct', 'order', 'even', 'would', 'rest', 'platoon', 'would', 'know', 'put', 'direct', 'danger', 'know', 'certain', 'die', 'follow', 'him', 'heck', 'lt', 'let', 'say', 'despit', 'direct', 'order', 'rememb', 'still', 'nice', 'scene', 'movi', 'somewhat', 'save', 'villag', 'total', 'popul', 'massacr', 'rebel', 'well', 'save', 'dozen', 'villag', 'so', 'rest', 'alreadi', 'kill', 'strang', 'part', 'it', 'take', 'truck', 'rebel', 'left', 'behind', 'rather', 'go', 'foot', 'mayb', 'road', 'unsaf', 'explan', 'it', 'anyway', 'think', 'earn', 'movi', 'one', 'point', 'gave', 'it', 'made', 'movi', 'insult', 'brain', 'henc', 'complet

In [162]:
# Load the unlabelled test reviews from test.dat and tokenize them.
test_df = PreProcess_test_File('test.dat')



['glad', 'watch', 'everi', 'time', 'movi', 'ray', 'sidney', 'pottier', 'st', 'centuri', 'jami', 'foxx', 'play', 'role', 'evid', 'brilliant', 'abil', 'actor', 'take', 'posit', 'besid', 'great', 'actor', 'hollywood', 'golden', 'support', 'strong', 'abil', 'afro', 'american', 'actor', 'time', 'period', 'etern', 'work', 'evid', 'etern', 'ray', 'charl', 'grand', 'prove', 'legend', 'appear', 'everi', 'time', 'success', 'win', 'oscar', 'prize', 'best', 'actor', 'lead', 'role', 'best', 'mix', 'sound', 'fact', 'cinemat', 'scene', 'upon', 'ray', 'jami', 'mix', 'two', 'copi', 'ray', 'ray', 'past', 'ray', 'act', 'person', 'jami', 'foxxit', 'nice', 'director', 'choos', 'song', 'adapt', 'dramat', 'scene', 'accid', 'film', 'enter', 'atmospher', 'success', 'legend', 'corn', 'legend', 'movi', 'hollywood', 'sinc', 'till', 'clever', 'choos', 'sharon', 'warrn', 'role', 'ray', 's', 'mother', 'succeed', 'role', 'brilliant', 'analysi', 'core', 'charact', 'ray', 'mother', 'turn', 'point', 'upon', 'grew', 'ind

In [184]:
# Build ONE term-count matrix over train + test so both share the same
# vocabulary / column ids. Training rows come first, test rows follow.
trainlist = train_df["Review"].tolist()
testlist = test_df["Review"].tolist()
totallist = trainlist + testlist
joint_mat = build_matrix(totallist)


(50000, 123451)


In [192]:
# Scale the counts by idf, then L2-normalize every row. copy=True at each
# step keeps joint_mat and idf_matrix unmodified.
idf_matrix = csr_idf(joint_mat, copy=True)
idf_normal_matrix = csr_l2normalize(idf_matrix, copy=True)


AttributeError: round not found

In [199]:
# Round the stored values to 3 decimals on a copy of the matrix.
# np.around() applied to the sparse matrix itself relies on the matrix having a
# .round() method; this notebook recorded "AttributeError: round not found"
# for that call, so the underlying .data array is rounded instead.
rounded_array = idf_normal_matrix.copy()
rounded_array.data = np.around(rounded_array.data, 3)
#print idf_normal_matrix[0].todense()
#print rounded_array[0].todense()

[[ 0.07878077  0.00842491  0.07411369 ...,  0.          0.          0.        ]]
[[ 0.079  0.008  0.074 ...,  0.     0.     0.   ]]


In [None]:
# k-nearest-neighbour sentiment vote.
# Rows of rounded_array are L2-normalized, so the dot product of two rows is
# their cosine similarity.
K_NEIGHBORS = 20  # number of most similar training reviews that may vote

# Training rows come first in the joint matrix, test rows follow. Slicing by
# the real row count keeps the last document of each part (the previous
# hard-coded bounds 24999 / 49999 silently dropped one row from each).
n_train = len(train_df)
train_mat = rounded_array[:n_train]
test_mat = rounded_array[n_train:]
train_labels = train_df["SentimentClass"].values

classification_list = []

for row in range(test_mat.shape[0]):
    # One sparse matrix-vector product yields the similarity of this test
    # review to every training review at once.
    sims = train_mat.dot(test_mat[row].T).toarray().ravel()

    # Indices of the most similar training reviews, best first. A stable sort
    # keeps the earlier training row first among equal similarities.
    nearest = np.argsort(-sims, kind="mergesort")[:K_NEIGHBORS]
    nearest_sims = sims[nearest]
    voting = nearest_sims > 0  # neighbours with no shared terms do not vote

    # Weight each label by similarity squared so closer neighbours count more
    # (dividing by the similarity gave the least similar ones the most weight).
    knnscore = float(np.sum(train_labels[nearest][voting] * nearest_sims[voting] ** 2))
    classification_list.append(knnscore)
print ("end of computation")    
# One prediction per line; a score of exactly 0 is written as -1.
with open('format.dat', 'w') as f:
    for item in classification_list:
        f.write('+1\n' if item > 0 else '-1\n')
print ("end")



In [205]:
# Number of test reviews that have received a score so far.
len(classification_list)


4763