# **Assignment 7**
---
# **5430 NLP | Spring 2021 | Uni: chb2132**

## **A. Write a Python program, which:**
### **1. Filters out exact and/or semantic duplicate articles from the Webhose dataset**
---
#### **Notes**
* Use **LSH** *(SimHash: Word2Vec)* to deduplicate feeds based on title
* Store entire feeds in **JSON, text or CSV file**
* Include original and deduplicated sets of titles

### **Step 1:** Deduplicate Webhose Articles *(Exact/Semantic)*
---

In [None]:
!pip install simhash

In [None]:
import json
import numpy as np
import pandas as pd
import gensim, operator

from gensim.models import KeyedVectors
from scipy import spatial

#### *Load Google Word2Vec Vector Model*
---

In [None]:
# Mount Google Drive at /content/drive; the cells below read the vector
# model and the Webhose feed file from /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Directory prefix that is prepended to every model file name.
model_path = '/content/drive/MyDrive/'


def load_model(model_name, file_name, flagBin):
    """Load word vectors stored in word2vec format.

    Args:
        model_name: Label used only in the progress messages.
        file_name: File name, appended to the module-level ``model_path``.
        flagBin: Forwarded as ``binary`` to
            ``KeyedVectors.load_word2vec_format``.

    Returns:
        Whatever ``KeyedVectors.load_word2vec_format`` returns.
    """
    print(f'Loading {model_name} model...')
    vectors = KeyedVectors.load_word2vec_format(
        model_path + file_name,
        binary=flagBin,
    )
    print(f'Finished loading {model_name} model...')
    return vectors


# Google News word2vec vectors (binary file)
model_w2v = load_model('Word2Vec', 'GoogleNews-vectors-negative300.bin.gz', True)

#### *Load Apple Webhose Newsfeeds, Subset Titles*
---

In [None]:
# Read the Apple Webhose newsfeeds; each line of the file is parsed as its
# own JSON document.
apple_feeds = []

with open('/content/drive/MyDrive/webhose_apple.json', 'r') as f:
    apple_feeds.extend(json.loads(record) for record in f)

# Keep just the title of every article.
newsfeeds = [article['title'] for article in apple_feeds]

In [None]:
# Work on the first 10800 articles and tag each one, in place, with a
# sequential integer id equal to its position in `feeds`.
feeds = list(apple_feeds[:10800])

for position, feed in enumerate(feeds):
    feed['id'] = position

#### *Define Vector Model Functions*
---

In [None]:
# check if input words in model vocabulary
def check_vocab(model, words):
    """Return the tokens from *words* that the vector model knows.

    Args:
        model: Word-vector model exposing its vocabulary mapping either as
            ``key_to_index`` (gensim >= 4) or as ``vocab`` (gensim < 4).
        words: Iterable of candidate tokens.

    Returns:
        List of the in-vocabulary tokens, whitespace-stripped, in input
        order. Empty when no token is known or *words* is empty.
    """
    # gensim 4 replaced the ``vocab`` attribute with ``key_to_index``;
    # accept either so the helper works with both library generations.
    vocab = getattr(model, 'key_to_index', None)
    if vocab is None:
        vocab = model.vocab

    # Strip the individual token. Calling ``strip`` on the whole ``words``
    # list raises AttributeError as soon as one token is in the vocabulary.
    return [word.strip() for word in words if word in vocab]

In [None]:
# calculate string similarity with model
def calc_sim(s1, s2, vects):
    """Score how similar two strings are according to a word-vector model.

    Each string is split on whitespace, reduced to its distinct
    in-vocabulary words via ``check_vocab``, and the two word collections
    are compared with ``vects.n_similarity``.

    Args:
        s1: First string.
        s2: Second string.
        vects: Word-vector model accepted by ``check_vocab`` and providing
            ``n_similarity``.

    Returns:
        The value returned by ``vects.n_similarity``, or ``0.0`` when either
        string contains no in-vocabulary word.
    """
    s1_words = set(check_vocab(vects, s1.split()))
    s2_words = set(check_vocab(vects, s2.split()))

    # n_similarity cannot average an empty collection of vectors; treat
    # "nothing to compare" as no similarity instead of raising.
    if not s1_words or not s2_words:
        return 0.0

    # Sorted lists give the model a real sequence in a deterministic order.
    return vects.n_similarity(sorted(s1_words), sorted(s2_words))

### *Define Distance and Index (SimHash)*
---

In [None]:
# Raise the 'simhash' logger threshold to CRITICAL so that lower-severity
# messages emitted through that logger are suppressed.
import logging
logging.getLogger('simhash').setLevel(logging.CRITICAL)

from simhash import Simhash, SimhashIndex

In [None]:
# Hamming-distance tolerance handed to the index as `k`.
hamming_dist = 2

# Pair every feed id (as a string) with the SimHash fingerprint of its
# title, then build the index used for the near-duplicate lookups.
obj = []
for feed in feeds:
    obj.append((str(feed['id']), Simhash(str(feed['title']))))

index = SimhashIndex(obj, k=hamming_dist)

### *Calculate and Print Duplicate Titles (SimHash)*
---

In [None]:
# Look up the near-duplicates of every feed in the SimHash index and collect
# the ids to remove so that ONE article per duplicate group survives.
# Flagging every member of a group would delete the article entirely
# instead of deduplicating it.
dd_dex = []          # ids flagged for removal, in discovery order
dupe_ct = []         # near-duplicate id list of every feed
dedupe_dex = []      # only the id lists holding more than one id
dropped_ids = set()  # same ids as dd_dex, for fast membership tests

for feed_sel in feeds:

    # calculate hash value
    feed_hash = Simhash(str(feed_sel['title']))

    # find all duplicate indices (the index was built with string ids)
    dupe_dex = index.get_near_dups(feed_hash)
    dupe_ct.append(dupe_dex)

    if len(dupe_dex) <= 1:
        continue
    dedupe_dex.append(dupe_dex)

    # A feed already flagged as a copy of an earlier feed is not a
    # representative, so it must not flag further feeds itself.
    if feed_sel['id'] in dropped_ids:
        continue

    # feed_sel is the first-seen member of its group: keep it, flag the rest.
    for e in dupe_dex:
        other_id = int(e)
        if other_id != feed_sel['id'] and other_id not in dropped_ids:
            dropped_ids.add(other_id)
            dd_dex.append(other_id)

# Sorted, unique ids of the feeds to drop.
ddp = sorted(dropped_ids)
#print(ddp)

In [None]:
# Keep only the feeds whose id is not listed in ddp.
# ddp is a list, so testing membership against it directly rescans the
# list for every feed; a set makes each test constant-time.
duplicate_ids = set(ddp)
dedupes = [feed for feed in feeds if feed['id'] not in duplicate_ids]

#print(len(dedupes))

### *Calculate SimHash/Word2Vec Similarity*
---

In [None]:
# calculate similarity with SimHash/Word2Vec
# NOTE(review): this scores only the candidates left in `dupe_dex` for the
# last feed processed (`feed_sel`) -- confirm that is the intended scope.
dedupe = []
dupe_count = 0

for dupe in dupe_dex:

    try:
        # The similarity helper defined in this file is `calc_sim`; calling
        # an undefined name inside a catch-all handler pinned every score
        # to 0. Titles are passed through str() as in the hashing cells.
        dupe_score = calc_sim(str(feed_sel['title']),
                              str(feeds[int(dupe)]['title']),
                              model_w2v)
    except Exception as err:
        # Best effort: report the failure instead of hiding it, and count
        # the pair as not similar.
        print('Similarity failed for id ' + str(dupe) + ': ' + repr(err))
        dupe_score = 0

    if dupe_score > 0.7:
        dupe_count += 1

### *Write deduplicated articles to a new JSON file*
---

In [None]:
# Serialize every surviving feed as one JSON document per line.
with open("dedupes.json", "w") as data_file:
    data_file.writelines(json.dumps(record) + "\n" for record in dedupes)

### *Read and print record count to output*
---

In [None]:
# load in deduplicated json file and read to list
# The file is opened at the relative path "dedupes.json", which is where the
# writer cell in this notebook stores it.
dedupeson = []

with open('dedupes.json', 'r') as f:
    for dedupe in f:
        dedupeson.append(json.loads(dedupe))

# print record count from dedupe json to output
# (len of the list = number of records; len of its first element would be
# the number of keys in one record)
print("Deduped Count: " + str(len(dedupeson)))