In [None]:
import numpy as np
import scipy as sp
from collections import Counter
from stemming.porter2 import stem
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
nltk.download("stopwords")

def _clean_reviews(df):
    r""" Turn the raw text in ``df["Review"]`` into lists of stemmed tokens.

    Shared by the train and test loaders so both corpora go through exactly
    the same steps, in this order:
      1. replace anything matching ``<.*?>`` (markup tags) with a space
      2. lower-case and split on whitespace
      3. drop English stop words
      4. drop tokens made only of digits
      5. strip every character outside ``a-z``
      6. drop tokens left empty by step 5, then stem the rest

    The column is replaced in place and ``df`` is returned for convenience.
    """
    tag_pattern = re.compile('<.*?>')
    # A set gives O(1) membership tests; the lookup runs once per token.
    stop_words = set(stopwords.words('english'))

    def to_tokens(text):
        tokens = []
        for word in re.sub(tag_pattern, ' ', text).lower().split():
            if word in stop_words:
                continue
            if re.search("[^0-9]", word) is None:
                continue  # purely numeric token
            word = re.sub("[^a-z]+", "", word)
            if word:
                # Tokens such as "!!!" become "" after stripping; keeping
                # them would add an empty-string term to the vocabulary.
                tokens.append(stem(word))
        return tokens

    df["Review"] = df["Review"].apply(to_tokens)
    return df


def PreProcess_train_File(file):
    r""" Load a labelled review file and tokenise its reviews.

    Each non-blank line must be ``<label>\t<review>``. Only the first tab
    separates the label from the text, so tabs inside a review are kept as
    ordinary whitespace. Blank lines are skipped.

    Returns a DataFrame with columns ``SentimentClass`` (label string as
    read) and ``Review`` (list of stemmed tokens). The first tokenised
    review is printed as a quick sanity check.

    Raises ValueError if a non-blank line contains no tab.
    """
    rows = []
    with open(file, 'r') as f:
        for lineno, line in enumerate(f, 1):
            if not line.strip():
                continue  # blank line: nothing to label
            label, sep, review = line.partition("\t")
            if not sep:
                raise ValueError(
                    "%s:%d: expected '<label>\\t<review>'" % (file, lineno))
            rows.append((label, review))

    df = _clean_reviews(
        pd.DataFrame(rows, columns=['SentimentClass', 'Review']))
    if len(df):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(df.at[0, 'Review'])
    return df


def PreProcess_test_File(file):
    r""" Load an unlabelled review file (one review per line) and tokenise it.

    Every line, including a blank one, yields exactly one row so that row
    ``i`` of the result always corresponds to line ``i`` of the file.

    Returns a DataFrame with a single ``Review`` column holding lists of
    stemmed tokens. The first tokenised review is printed as a sanity check.
    """
    with open(file, 'r') as f:
        lines = list(f)

    df = _clean_reviews(pd.DataFrame(lines, columns=['Review']))
    if len(df):
        print(df.at[0, 'Review'])
    return df

In [None]:
from scipy.sparse import csr_matrix

# Notebook-wide vocabulary state, filled in by build_common_index().
idx = {}    # term -> column id
tid = 0     # next unused column id
nnz = 0     # running total of distinct terms per document, over all documents seen
ncols = 0   # number of distinct terms registered so far (== len(idx))


def build_common_index(combinedReview):
    r""" Register every term of every document in the shared vocabulary.

    combinedReview is an iterable of documents, each an iterable of terms.
    Terms not seen before receive the next free column id in ``idx``.
    The module-level counters ``tid``, ``nnz`` and ``ncols`` are updated;
    calling the function again adds to the existing state rather than
    resetting it. Returns None.
    """
    # Without this declaration the augmented assignments below would create
    # function-local names and fail with UnboundLocalError on first use.
    global tid, nnz, ncols
    for d in combinedReview:
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = tid
                tid += 1
    ncols = len(idx)
    
def build_matrix(docs, index=None):
    r""" Build sparse matrix from a list of documents, 
    each of which is a list of word/terms in the document.  

    docs must support ``len()`` and be iterable more than once (a list or
    a pandas Series both qualify).

    index maps each term to its column id and must cover the ids
    ``0 .. len(index)-1``. When omitted, the notebook-wide ``idx``
    vocabulary is used.

    Returns a ``len(docs) x len(index)`` CSR matrix of term counts with
    sorted column indices. Raises KeyError if a document contains a term
    that is missing from the vocabulary.
    """
    if index is None:
        index = idx
    nrows = len(docs)
    ncols = len(index)
    # One stored entry per distinct term per document. Sizing from `docs`
    # itself keeps the arrays exact instead of relying on a global total
    # that describes a different (combined) corpus.
    nnz = sum(len(set(d)) for d in docs)

    # set up memory (builtin int: the np.int alias no longer exists)
    ind = np.zeros(nnz, dtype=int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=int)
    n = 0  # non-zero counter
    # transfer values
    for i, d in enumerate(docs):
        cnt = Counter(d)
        for j, (term, count) in enumerate(cnt.items()):
            ind[n + j] = index[term]
            val[n + j] = count
        n += len(cnt)
        ptr[i + 1] = n

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    # Entries were written in Counter order; CSR consumers expect each
    # row's column ids ascending.
    mat.sort_indices()

    return mat


def csr_info(mat, name="", non_empy=False):
    r""" Print a one-line summary of a CSR matrix.

    The summary always shows the row count, column count and number of
    stored values. With non_empy set, it additionally shows how many rows
    hold at least one stored value and how many distinct column ids occur.
    """
    n_rows, n_cols = mat.shape[0], mat.shape[1]
    stored = len(mat.data)

    if not non_empy:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, n_rows, n_cols, stored) )
        return

    # A row is non-empty when its indptr slice has positive length.
    rows_used = sum(1 for i in range(n_rows)
                    if mat.indptr[i+1] > mat.indptr[i])
    cols_used = len(np.unique(mat.indices))
    print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
            name, n_rows, rows_used, n_cols, cols_used, stored))

In [None]:
#scale matrix and normalize its rows
from collections import defaultdict
def csr_idf(mat, copy=False, **kargs):
    r""" Scale the stored values of a CSR matrix by inverse document frequency.

    A column's document frequency is the number of stored entries that
    reference it; its factor is ``log(nrows / frequency)``.

    With copy=False the matrix is scaled in place and the factors are
    returned as a dict keyed by column id. Otherwise the scaled matrix is
    returned; it is a fresh copy only when copy is exactly True.
    Extra keyword arguments are accepted and ignored.
    """
    target = mat.copy() if copy is True else mat
    n_docs = target.shape[0]
    cols, vals = target.indices, target.data

    # Count how many stored entries reference each column.
    factors = defaultdict(int)
    for col in cols:
        factors[col] += 1

    # Overwrite each count with its idf factor (same dict, no extra memory).
    for col in list(factors):
        factors[col] = np.log(n_docs / float(factors[col]))

    # Apply the per-column factor to every stored value.
    for pos in range(target.nnz):
        vals[pos] *= factors[cols[pos]]

    if copy is False:
        return factors
    return target

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Scale every row of a CSR matrix to unit Euclidean length.

    Rows whose squared norm is zero are left untouched. With copy=True a
    normalized copy is returned and the input is not modified; otherwise
    the input is modified in place and nothing is returned.
    Extra keyword arguments are accepted and ignored.
    """
    wants_copy = copy is True
    if wants_copy:
        mat = mat.copy()
    data, indptr = mat.data, mat.indptr

    for row in range(mat.shape[0]):
        start, stop = indptr[row], indptr[row+1]
        # Accumulate the squared norm entry by entry, in storage order.
        sq_norm = 0.0
        for pos in range(start, stop):
            sq_norm += data[pos]**2
        if sq_norm != 0.0:
            scale = 1.0/np.sqrt(sq_norm)
            for pos in range(start, stop):
                data[pos] *= scale

    if wants_copy:
        return mat


In [None]:
# Load the labelled reviews; 'train.dat' is resolved relative to the kernel's
# working directory. The result has 'SentimentClass' and 'Review' columns.
train_df = PreProcess_train_File('train.dat')

In [None]:
# Load the unlabelled reviews (one per line) through the same token pipeline;
# 'test.dat' is resolved relative to the kernel's working directory.
test_df = PreProcess_test_File('test.dat')

In [None]:
# pandas Series has no .extend(); concatenate the train and test token lists.
combined_Review_For_Commom_index = pd.concat(
    [train_df["Review"], test_df["Review"]], ignore_index=True)
# Register every term from both corpora so the train and test matrices share
# one column numbering. build_matrix() looks terms up in this index.
build_common_index(combined_Review_For_Commom_index)

In [None]:
# Raw term-count matrices, one row per review.
train_mat = build_matrix(train_df["Review"])
test_mat = build_matrix(test_df["Review"])
# copy=True makes each stage return a new matrix and leaves its input intact.
# NOTE(review): idf factors are derived from each matrix on its own, so train
# and test columns are scaled by different weights — confirm this is intended.
train_mat_csr_idf = csr_idf(train_mat, copy=True)
test_mat_csr_idf = csr_idf(test_mat, copy=True)
# Unit-length rows, so a dot product between two rows is their cosine similarity.
train_mat_csr_idf_norm = csr_l2normalize(train_mat_csr_idf, copy=True)
test_mat_csr_idf_norm = csr_l2normalize(test_mat_csr_idf, copy=True)

In [None]:
# np.dot is not aware of scipy sparse matrices; use the matrix's own .dot()
# so the product is computed as a sparse matrix multiplication.
# Full test-vs-train similarity would be:
#dp2 = test_mat_csr_idf_norm.dot(train_mat_csr_idf_norm.T)
# Similarity between the first two test reviews (1x1 sparse result).
dp2 = test_mat_csr_idf_norm[0].dot(test_mat_csr_idf_norm[1].T)

