# 103590450 馬茂源 四資四

In [206]:
import pandas as pd
import numpy as np
import time
import os
import sys
import string
import html
import hashlib
from collections import defaultdict

In [207]:
t0 = time.time()

In [208]:
if not os.path.exists('result'):
    os.mkdir('result')
for i in range(1, 5):
    dir_ = 'result/task{}'.format(i)
    if not os.path.exists(dir_):
        os.mkdir(dir_)

In [209]:
file_names = ['./data/reut2-{0:0>3}.sgm'.format(i) for i in range(22)]
file_names = [file_names[0]]

## (1) Given the Reuters-21578 dataset, please calculate all kshingles and output the set representation of the text dataset as a matrix. 

In [210]:
def parser(file_name):
    with open(file_name, 'r', encoding='ISO-8859-1') as f:
        file = f.read()
    news = []
    start = 0
    for i in range(len(file)):
        if file[i:i+6] == '<BODY>':
            start = i+6
        elif file[i:i+7] == '</BODY>':
            n = file[start:i].replace('\n', ' ')
            n = n.replace('REUTER &#3;', '')
            news.append(n)
    return news

In [211]:
news = []
for i in file_names:
    each_news = parser(i) 
    news.extend(each_news)

In [212]:
len(news)

925

In [213]:
news_data = pd.DataFrame(data=news, columns=['news'])

In [214]:
news_data.head()

Unnamed: 0,news
0,Showers continued throughout the week in the B...
1,Standard Oil Co and BP North America Inc said ...
2,Texas Commerce Bancshares Inc's Texas Commerce...
3,BankAmerica Corp is not under pressure to act ...
4,The U.S. Agriculture Department reported the f...


In [215]:
def tokenizer(text):
    strip_chars = ' –…' + string.punctuation
    result = []
    for token in html.unescape(str(text)).split():
        token = token.lower()
        token = token.strip(strip_chars)
        if not token.isalpha():
            continue
        length = len(token)
        if length < 1 or length > 26:
            continue
        result.append(token)
    return result

In [216]:
news_data['news_token'] = news_data.apply(lambda x: tokenizer(x['news']), axis=1)

In [217]:
def k_shingle(text, k):
    string = ' '.join(tokenizer(text))
    shingles = set([])
    for i in range(len(string)-k + 1):
        shingles.add(string[i:i+k])
    return shingles

In [218]:
news_data['news_token'] = news_data.apply(lambda x: k_shingle(x['news_token'], 5), axis=1)

In [219]:
shingles = set([])
for s in news_data['news_token'].values:
    shingles |= s

In [220]:
N = len(shingles)
N

60332

In [221]:
def encode_shingles(row, shingles):
    vector = np.zeros(len(shingles), dtype='int')
    for shingle, idx in zip(shingles, range(len(shingles))):
        if shingle in row:
            vector[idx] = 1
    return vector

In [222]:
news_data['shingles'] = news_data['news_token'].apply(encode_shingles, args=(shingles,))

In [223]:
output_shingles = np.array(news_data['shingles'].tolist()).T.astype('int')

In [224]:
#np.savetxt("task_1.csv", output_shingles, delimiter=",",  fmt='%d')

## (2) Given the set representation, compute the minhash signatures of all documents using MapReduce.

In [225]:
news_shingles = output_shingles

In [226]:
def get_prime(greater_than):
    
    def is_prime(n):
        if n % 2 == 0 and n > 2: 
            return False
        return all(n % i for i in range(3, int(np.sqrt(n)) + 1, 2))
    is_p = False
    
    
    
    while not is_p:
        greater_than += 1
        is_p = is_prime(greater_than)
        
    return greater_than

In [227]:
def get_hash_func_list(n, k=100):
    p = get_prime(n)
    func_list = []
    for a, b in zip(np.random.randint(0, n, size=k),
                   np.random.randint(0, n, size=k)):
        func_list.append(lambda x, a=a,b=b,p=p,n=n: ((a*x+b)%p)%n)
    return np.array(func_list)

In [228]:
def one_pass_minhashing(shingles, k=100):
    n = shingles.shape[0]
    hash_list = get_hash_func_list(n, k=k)
    singnature = np.full((k, shingles.shape[1]), fill_value=np.inf)
    
    for i in range(n):
        hash_value = np.array([h(i) for h in hash_list])

        for j, c in enumerate(shingles[i, :] == 1):
            if c:
                mask = singnature[:, j] > hash_value
                singnature[:, j][mask] = hash_value[mask]
            
    return singnature

In [229]:
test_input = np.array([[1,0,1,0],
                       [1,0,0,1],
                       [0,1,0,1],
                       [0,1,0,1],
                       [0,1,0,1],
                       [1,0,1,0],
                       [1,0,1,0]])
test_singnature = one_pass_minhashing(test_input, k=6)
test_singnature.astype('int')

array([[0, 1, 0, 1],
       [2, 0, 2, 0],
       [1, 0, 1, 0],
       [0, 1, 0, 1],
       [0, 3, 2, 0],
       [0, 2, 0, 2]])

In [230]:
 #np.unique((minhashing[:, 0] - minhashing[:, 2]) == 0, return_counts=True)

In [231]:
singnature = one_pass_minhashing(news_shingles, k=100)

  


In [232]:
singnature.shape

(100, 925)

In [233]:
#np.savetxt("task_2.csv", singnature, delimiter=",",  fmt='%d')

## (3) Implement the LSH algorithm by MapReduce and output the resulting candidate pairs of similar documents.

In [253]:
hashlib.sha512().block_size

128

In [250]:
def LSH(singnature, b=2):
    bucket = defaultdict(list)
    k = singnature.shape[0]
    for i, doc in enumerate(singnature.T):
        x = np.array2string(doc.astype('int'), separator='', precision=0)
        for j in range(0, k, b):
            #print(j, j+b)
            key = hashlib.sha512(x[j:j+b].encode()).hexdigest()
            bucket[key].append(i)
    return bucket

In [235]:
def remove_alone(bucket):
    remove_list = []
    for k, item in bucket.items():
        if len(item) == 1:
            remove_list.append(k)
    for k in remove_list:
        bucket.pop(k)
    return bucket

In [236]:
# bucket = LSH(test_singnature, b=2)
# bucket

In [254]:
bucket = LSH(singnature, b=5)
bucket = remove_alone(bucket)
print(len(bucket))

2889


In [257]:
for k, item in bucket.items():
    if len(item) == 2:
        #print(item)
        pass

In [245]:
news_data.loc[29]['news']

'Sens. Alan Cranston (D-Cal.) and Daniel Evans (R-Wash.) said they introduced export licensing reform legislation that could save U.S. companies hundreds of thousands of dollars annually.     "Our emphasis is two-fold: Decontrol and de-license items where such actions will not endanger our national security, and eliminate the Department of Defense\'s de facto veto authority over the licensing process," Cranston said.     "Our reforms should reduce licensing requirements by 65  to 70 pct," he told reporters. "I am convinced that a more rational...licensing process will boost exports."     U.S. export controls are intended to deny Eastern bloc countries access to technology that could further their military capabilities.     "By refocusing our control resources on higher levels of technology, technology that is truly critical, we will do a better job of preventing diversion of critical technology to our adversaries while promoting more exports," Cranston said.     "We cannot expect to co

In [246]:
news_data.loc[52]['news']

'Sens. Alan Cranston (D-Cal.) and Daniel Evans (R-Wash.) said they introduced export licensing reform legislation that could save U.S. companies hundreds of thousands of dollars annually.     "Our emphasis is two-fold: Decontrol and de-license items where such actions will not endanger our national security, and eliminate the Department of Defense\'s de facto veto authority over the licensing process," Cranston said.     "Our reforms should reduce licensing requirements by 65  to 70 pct," he told reporters. "I am convinced that a more rational...licensing process will boost exports."     U.S. export controls are intended to deny Eastern bloc countries access to technology that could further their military capabilities.     "By refocusing our control resources on higher levels of technology, technology that is truly critical, we will do a better job of preventing diversion of critical technology to our adversaries while promoting more exports," Cranston said.     "We cannot expect to co

## (4) Implement K-nearest neighbor (KNN) search using LSH and compare its performance with linear search.