# 103590450 馬茂源 四資四

In [295]:
import pandas as pd
import numpy as np
import time
import os
import sys
import string
import html
import hashlib
import re
from collections import defaultdict
import itertools
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
t0 = time.time()

In [3]:
if not os.path.exists('result'):
    os.mkdir('result')

In [4]:
file_names = ['./data/reut2-{0:0>3}.sgm'.format(i) for i in range(22)]
file_names = [file_names[0]]

## (1) Given the Reuters-21578 dataset, please calculate all kshingles and output the set representation of the text dataset as a matrix. 

In [5]:
def parser(file_name):
    with open(file_name, 'r', encoding='ISO-8859-1') as f:
        file = f.read()
    news = []
    start = 0
    for i in range(len(file)):
        if file[i:i+6] == '<BODY>':
            start = i+6
        elif file[i:i+7] == '</BODY>':
            n = file[start:i].replace('\n', ' ')
            n = n.replace('REUTER &#3;', '')
            news.append(n)
    return news

In [6]:
news = []
for i in file_names:
    each_news = parser(i) 
    news.extend(each_news)

In [7]:
len(news)

925

In [8]:
news_data = pd.DataFrame(data=news, columns=['news'])

In [9]:
news_data.head()

Unnamed: 0,news
0,Showers continued throughout the week in the B...
1,Standard Oil Co and BP North America Inc said ...
2,Texas Commerce Bancshares Inc's Texas Commerce...
3,BankAmerica Corp is not under pressure to act ...
4,The U.S. Agriculture Department reported the f...


In [10]:
def tokenizer(text):
    strip_chars = '.' + ' –…' + string.punctuation
    text = html.unescape(text)
    text = text.lower()
    text = text.strip(strip_chars)
    text = text.replace('reuter', '')
    text = re.sub(re.compile('<.*?>'), '', text)
    return re.findall(r'\w+', text)

In [11]:
test_news = news_data['news'][1]

In [12]:
#tokenizer(test_news)

In [13]:
news_data['news_token'] = news_data.apply(lambda x: tokenizer(x['news']), axis=1)

In [14]:
#news_data

In [15]:
def k_shingle(text, k):
    string = ' '.join(text)
    shingles = set([])
    for i in range(len(string)-k + 1):
        shingles.add(string[i:i+k])
    return (shingles)

In [16]:
news_data['news_token'] = news_data['news_token'].apply(lambda x: k_shingle(x, 5))

In [17]:
shingles = set([])
for s in news_data['news_token'].values:
    shingles |= s
shingles = list(shingles)
len(shingles)

78082

In [18]:
shingles_dict_ = {s:i for i, s in enumerate(shingles)}

In [37]:
def encode_shingles(row, shingles_dict_):
    v = np.zeros(len(shingles_dict_), dtype='int')
    idx_list = [shingles_dict_[r] for r in row] 
    v[idx_list] = 1
    return v

In [38]:
news_data['shingles'] = news_data['news_token'].apply(encode_shingles, args=(shingles_dict_,))

In [39]:
output_shingles = np.array(news_data['shingles'].tolist()).T
output_shingles.shape

(78082, 925)

In [40]:
# np.savetxt("result/task_1.csv", output_shingles, delimiter=",",  fmt='%d')

## (2) Given the set representation, compute the minhash signatures of all documents using MapReduce.

In [41]:
news_shingles = output_shingles

In [42]:
def get_prime(greater_than):
    
    def is_prime(n):
        if n % 2 == 0 and n > 2: 
            return False
        return all(n % i for i in range(3, int(np.sqrt(n)) + 1, 2))
    is_p = False
    
    
    
    while not is_p:
        greater_than += 1
        is_p = is_prime(greater_than)
        
    return greater_than

In [43]:
def get_hash_func_list(n, k=100):
    p = get_prime(n)
    func_list = []
    for a, b in zip(np.random.randint(0, n, size=k),
                   np.random.randint(0, n, size=k)):
        func_list.append(lambda x, a=a,b=b,p=p,n=n: ((a*x+b)%p)%n)
    return np.array(func_list)

In [44]:
def one_pass_minhashing(shingles, k=100):
    n = shingles.shape[0]
    hash_list = get_hash_func_list(n, k=k)
    singnature = np.full((k, shingles.shape[1]), fill_value=np.inf)
    
    for i in range(n):
        hash_value = np.array([h(i) for h in hash_list])

        for j, c in enumerate(shingles[i, :] == 1):
            if c:
                mask = singnature[:, j] > hash_value
                singnature[:, j][mask] = hash_value[mask]
    
    return singnature.astype('int')

In [45]:
test_input = np.array([[1,0,1,0],
                       [1,0,0,1],
                       [0,1,0,1],
                       [0,1,0,1],
                       [0,1,0,1],
                       [1,0,1,0],
                       [1,0,1,0]])
test_singnature = one_pass_minhashing(test_input, k=6)
test_singnature.astype('int')

array([[0, 0, 0, 0],
       [1, 0, 1, 0],
       [0, 2, 0, 2],
       [0, 0, 0, 0],
       [1, 0, 1, 0],
       [2, 0, 2, 0]])

In [46]:
singnature = one_pass_minhashing(news_shingles, k=100)

In [47]:
singnature.shape

(100, 925)

In [48]:
singnature.dtype

dtype('int64')

In [49]:
#np.savetxt("result/task_2.csv", singnature, delimiter=",",  fmt='%d')

## (3) Implement the LSH algorithm by MapReduce and output the resulting candidate pairs of similar documents.

In [50]:
hashlib.sha512().block_size

128

In [51]:
def LSH(singnature, b=20):
    buckets = [defaultdict(set) for i in range(b)]
    k = singnature.shape[0]
    r = k // b
    for i, doc in enumerate(singnature.T):
        x = np.array2string(doc.astype('int'), separator='', precision=0)
        for j, start_idx in enumerate(range(0, k, r)):
            #print(j, j+r)
            key = hashlib.sha512(x[start_idx:start_idx+r].encode()).hexdigest()
            buckets[j][key].add(i)
    return buckets

In [52]:
def get_candidate(buckets):
    candidates = set([])
    for bucket in buckets:
        for k, item in bucket.items():
            if len(item) > 1:
                pairs = itertools.combinations(item, 2)
                for p in pairs:
                    candidates.add(p)

    return candidates

In [53]:
# bucket = LSH(test_singnature, b=2)
# bucket

In [54]:
buckets = LSH(singnature, b=20)
candidates = get_candidate(buckets)
candidates = list(candidates)

In [55]:
len(candidates)

25904

In [56]:
def get_distance(s1, s2):
    intersection = np.logical_and(s1, s2)
    union = np.logical_or(s1, s2)
    return intersection.sum() / float(union.sum())

In [308]:
new_candidates = []
for d_id_1, d_id_2 in candidates:
    if get_distance(singnature[:, d_id_1], 
                   singnature[:, d_id_2]) >= 0.8:
        new_candidates.append((d_id_1, d_id_2))
len(new_candidates)

25904

In [309]:
new_candidates.sort()

In [323]:
with open('result/task3.csv', 'w') as f:
    for i, g in itertools.groupby(new_candidates, key=lambda x: x[0]):
        f.write('%5d, %s\n'%(i, str(list(i[1] for i in g))))

In [290]:
#news_data.loc[new_candidates[0][0]]['news']

In [291]:
#news_data.loc[new_candidates[0][1]]['news']

## (4) Implement K-nearest neighbor (KNN) search using LSH and compare its performance with linear search.

In [327]:
singnature.shape

(100, 925)

In [328]:
class LSH_Knn:
    def __init__(self, singnature, n_hyper=10, k=3):
        # self._candidates = candidates
        self._singnature = singnature.T
        self.n_hyper = n_hyper
        self.hyper_planes = np.random.randn(self.n_hyper, 
                                            self._singnature.shape[1])
        self.regions = self.get_regions(self._singnature)
        self.k = k
        
    def get_regions(self, singnature):
        return (singnature.dot(self.hyper_planes.T) > 0).astype('int')
    
    def get_distance(self, s1, s2):
        intersection = np.logical_and(s1, s2)
        union = np.logical_or(s1, s2)
        return intersection.sum() / float(union.sum())
    
    def _get_nn(self, singnature, candidates_idx):
        s2 = singnature
        temp_candidates_sing = self._singnature[candidates_idx, :]
        #print(temp_candidates_sing.shape)
        dis = np.apply_along_axis(lambda s1, s2: self.get_distance(s1, s2), 
                            1, 
                            temp_candidates_sing,
                            s2=s2)
        #print(dis)
        idx_of_idx = np.argsort(dis)[:self.k]
        # print(candidates_idx[idx_of_idx])
        return candidates_idx[idx_of_idx]
    
    def _predict(self, singnature):
        r = self.get_regions(singnature)
        nn = np.all(r == self.regions, axis=1).astype('int')
        candidates_idx = np.argwhere(nn==1).reshape(-1,)
        return self._get_nn(singnature, candidates_idx)

In [329]:
model = LSH_Knn(singnature)

In [330]:
N = singnature.T.shape[0]//10

In [331]:
task4_output = ''
t1 = time.time()
for i in range(N):
    idx = model._predict(singnature.T[i])
task4_output += ('[LSH Knn] cost {:.3f} second in {} test data.'
                 .format(time.time()-t1, N))
task4_output

'[LSH Knn] cost 0.041 second in 92 test data.'

In [332]:
class MyKNeighborsClassifier:
    
    def __init__(self, n_neighbors=3, **kwargs):
        self._k = n_neighbors
        self._X = self._y = None
        self.set_params(**kwargs)
            
    def get_params(self, deep=True):
        # suppose this estimator has parameters "alpha" and "recursive"
        return self.__dict__

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
    
    def fit(self, X):
        self._X = X.copy()
        # self._y = y.copy()
        
    def get_distance(self, s1, s2):
        intersection = np.logical_and(s1, s2)
        union = np.logical_or(s1, s2)
        return intersection.sum() / float(union.sum())
    
    def _predict(self, x):
        distances = np.apply_along_axis(lambda x1: get_distance(x, x1), 
                                        1, self._X)
        X_candidates = np.argsort(distances)[:self._k]
        # y_candidates = self._y[X_candidates]
        return X_candidates
    
    def predict(self, X):
        return np.apply_along_axis(lambda x: self._predict(x), 1, X)

In [333]:
model = MyKNeighborsClassifier()
model.fit(singnature.T)

In [334]:
t2 = time.time()
for i in range(N):
    idx = model._predict(singnature.T[i])
task4_output += ('\n[Linear search Knn] cost {:.3f} second in {} test data.'
                 .format(time.time()-t2, N))

print(task4_output)

[LSH Knn] cost 0.041 second in 92 test data.
[Linear search Knn] cost 1.392 second in 92 test data.


In [335]:
with open('result/task4.txt', 'w') as f:
    f.write(task4_output)

In [336]:
print('cost:{:.3f} min'.format((time.time()-t0)/60))

cost:149.393 min
