# NOTEBOOK FOR TESTING

### IMPORT LIBRARIES

In [1]:
import sqlite3  # library to import      
import time 
import re   
import nltk # library to import 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize 
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV

from collections import Counter 
from simhash import Simhash

nltk.download('stopwords')
nltk.download('wordnet') 
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')  
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utente\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utente\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utente\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\utente\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\utente\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\utente\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already u

True

### PREPROCESS TEXT

In [37]:
def preprocess_sbagliato(text):
    #   Set of stopwords 
    stop_words = set(stopwords.words('english')) 
    #   Lemmatizer 
    lemmatizer = WordNetLemmatizer()
    #   potrebbe essere utile cambiare il set di stopwords a seconda della lingua della pagina
    #   devo capire come fare
    #   Tokenize the content       
    words = re.findall(r'\w+', text.lower())

    #   Remove the stop words from the set of words 
    words = [w for w in words if w.isalpha() and w not in stop_words]

    #   Apply lemmatization (keep lemmatization or apply stemming?)
    words = [lemmatizer.lemmatize(word) for word in words]

    #   Return the set of words preprocessed 
    return words 

Example

In [None]:
text = "The man suspected of opening fire on two Minnesota legislators and their spouses on 14 June, killing one legislator and her husband, was apprehended late on Sunday night and charged with two counts of murder and two of attempted murder, the state’s governor, Tim Walz, said at a news conference. Vance Boelter, 57, is suspected of fatally shooting the Democratic state representative Melissa Hortman and her husband, Mark, at their residence early on Saturday. Boelter is also suspected of shooting the state senator John Hoffman and his wife, Yvette, at their home, seriously injuring them. “One man’s unthinkable actions have altered the state of Minnesota,” the state’s governor, Tim Walz, said at a news conference. Boelter was arrested in a rural area in Sibley County, southwest of Minneapolis, according to police, who added that he was armed when he was taken into custody. A criminal complaint unsealed Sunday night said Boelter faces two counts of second-degree murder and two counts of attempted second-degree murder in the deaths of the Hortmans and the wounding of Hoffman and his wife."

words = preprocess_sbagliato(text) 

print(words)


['man', 'suspected', 'opening', 'fire', 'two', 'minnesota', 'legislator', 'spouse', '14', 'june', 'killing', 'one', 'legislator', 'husband', 'apprehended', 'late', 'sunday', 'night', 'charged', 'two', 'count', 'murder', 'two', 'attempted', 'murder', 'state', 'governor', 'tim', 'walz', 'said', 'news', 'conference', 'vance', 'boelter', '57', 'suspected', 'fatally', 'shooting', 'democratic', 'state', 'representative', 'melissa', 'hortman', 'husband', 'mark', 'residence', 'early', 'saturday', 'boelter', 'also', 'suspected', 'shooting', 'state', 'senator', 'john', 'hoffman', 'wife', 'yvette', 'home', 'seriously', 'injuring', 'one', 'man', 'unthinkable', 'action', 'altered', 'state', 'minnesota', 'state', 'governor', 'tim', 'walz', 'said', 'news', 'conference', 'boelter', 'arrested', 'rural', 'area', 'sibley', 'county', 'southwest', 'minneapolis', 'according', 'police', 'added', 'armed', 'taken', 'custody', 'criminal', 'complaint', 'unsealed', 'sunday', 'night', 'said', 'boelter', 'face', 't

Come si può vedere, non viene fatto il **lemmatization** bene, poiché non riconosce se una parola è un sostantivo oppure un verbo.

Per sistemare questa cosa, introduco il **POS tagging**, che è il processo di assegnare a ciascuna parola di un testo il suo ruolo grammaticale. 

Sistemo quindi il codice. 

In [2]:
def get_wordnet_pos(treebank_tag): 
    if treebank_tag.startswith('J'):
        return ADJ
    elif treebank_tag.startswith('V'):
        return VERB
    elif treebank_tag.startswith('N'):
        return NOUN
    elif treebank_tag.startswith('R'):
        return ADV
    else:
        return NOUN  # default

def preprocess(text): 
    words = word_tokenize(text.lower())
    #   Set of stopwords 
    stop_words = set(stopwords.words('english')) 
    #   Lemmatizer 
    lemmatizer = WordNetLemmatizer()

    number_words = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']

    #   Remove stop words and punctuation 
    words = [w for w in words if w.isalpha() and w not in stop_words and w not in number_words]

    tagged_words = pos_tag(words)

    #   Lemmatization is done using the correct grammatical type 
    lemmatized_words = [lemmatizer.lemmatize(w, get_wordnet_pos(t)) for w, t in tagged_words]

    return lemmatized_words


Riprovo l'esempio di prima

In [3]:
text = "The man suspected of opening fire on two Minnesota legislators and their spouses on 14 June, killing one legislator and her husband, was apprehended late on Sunday night and charged with two counts of murder and two of attempted murder, the state’s governor, Tim Walz, said at a news conference. Vance Boelter, 57, is suspected of fatally shooting the Democratic state representative Melissa Hortman and her husband, Mark, at their residence early on Saturday. Boelter is also suspected of shooting the state senator John Hoffman and his wife, Yvette, at their home, seriously injuring them. “One man’s unthinkable actions have altered the state of Minnesota,” the state’s governor, Tim Walz, said at a news conference. Boelter was arrested in a rural area in Sibley County, southwest of Minneapolis, according to police, who added that he was armed when he was taken into custody. A criminal complaint unsealed Sunday night said Boelter faces two counts of second-degree murder and two counts of attempted second-degree murder in the deaths of the Hortmans and the wounding of Hoffman and his wife."

words = preprocess(text) 

print(words)

['man', 'suspect', 'opening', 'fire', 'minnesota', 'legislator', 'spouses', 'june', 'kill', 'legislator', 'husband', 'apprehend', 'late', 'sunday', 'night', 'charge', 'count', 'murder', 'attempt', 'murder', 'state', 'governor', 'tim', 'walz', 'say', 'news', 'conference', 'vance', 'boelter', 'suspect', 'fatally', 'shoot', 'democratic', 'state', 'representative', 'melissa', 'hortman', 'husband', 'mark', 'residence', 'early', 'saturday', 'boelter', 'also', 'suspect', 'shoot', 'state', 'senator', 'john', 'hoffman', 'wife', 'yvette', 'home', 'seriously', 'injure', 'man', 'unthinkable', 'action', 'alter', 'state', 'minnesota', 'state', 'governor', 'tim', 'walz', 'say', 'news', 'conference', 'boelter', 'arrest', 'rural', 'area', 'sibley', 'county', 'southwest', 'minneapolis', 'accord', 'police', 'add', 'arm', 'take', 'custody', 'criminal', 'complaint', 'unseal', 'sunday', 'night', 'say', 'boelter', 'face', 'count', 'murder', 'count', 'attempt', 'murder', 'death', 'hortmans', 'wound', 'hoffman

In [4]:
tf = Counter(words)

print(tf)

Counter({'state': 5, 'murder': 4, 'boelter': 4, 'suspect': 3, 'count': 3, 'say': 3, 'man': 2, 'minnesota': 2, 'legislator': 2, 'husband': 2, 'sunday': 2, 'night': 2, 'attempt': 2, 'governor': 2, 'tim': 2, 'walz': 2, 'news': 2, 'conference': 2, 'shoot': 2, 'hoffman': 2, 'wife': 2, 'opening': 1, 'fire': 1, 'spouses': 1, 'june': 1, 'kill': 1, 'apprehend': 1, 'late': 1, 'charge': 1, 'vance': 1, 'fatally': 1, 'democratic': 1, 'representative': 1, 'melissa': 1, 'hortman': 1, 'mark': 1, 'residence': 1, 'early': 1, 'saturday': 1, 'also': 1, 'senator': 1, 'john': 1, 'yvette': 1, 'home': 1, 'seriously': 1, 'injure': 1, 'unthinkable': 1, 'action': 1, 'alter': 1, 'arrest': 1, 'rural': 1, 'area': 1, 'sibley': 1, 'county': 1, 'southwest': 1, 'minneapolis': 1, 'accord': 1, 'police': 1, 'add': 1, 'arm': 1, 'take': 1, 'custody': 1, 'criminal': 1, 'complaint': 1, 'unseal': 1, 'face': 1, 'death': 1, 'hortmans': 1, 'wound': 1})


### SIMHASH 

In [5]:
def compute_fingerprint(text):
    words = preprocess(text)
    return Simhash(words).value

In [12]:
fp1 = compute_fingerprint(text)
print(fp1)

2420779509525056338


Adesso provo a verificare se due contenuti sono "near duplicates"

In [11]:
def hamming_distance(fp1, fp2):

    x = (fp1 ^ fp2) & ((1 << 64) - 1)
    distance = 0
    while x:
        distance += 1
        x &= x - 1

    return distance

def is_near_duplicate(fp1, fp2, threshold=5):
    
    d = hamming_distance(fp1, fp2) 
    if d <= threshold :
            return True     #   The pages are near-duplicates
    
    return False    


In [10]:
text2 = "US president Donald Trump has denied that he left a G7 leaders summit in Canada early to work on a ceasefire between Israel and Iran in a social media post in which he also derided Emmanuel Macron as “publicity seeking”. The US president said Macron had “no idea” why he was returning to Washington and that the reason was “much bigger” than a ceasefire. Earlier, Macron had suggested that Trump had made a ceasefire “offer”. G7 leaders have affirmed that “Israel has a right to defend itself”. The leaders said that they reiterate their support for the security of Israel, in a statement in which they called Iran “the principal source of regional instability and terror”. Trump also said he wanted a “real end” to the nuclear problem with Iran, with Iran “giving up entirely” on nuclear weapons, according to comments that were posted by a CBS News reporter on X. Trump made the comments during his midnight departure from Canada, where he attended the Group of Seven nations summit on Monday, the CBS News reporter said early on Tuesday.  Heavy traffic could be seen from Tehran after Trump told Iranians to ‘immediately evacuate’. The comments from the president came not long after Israel told people to evacuate a large part of Tehran ahead of a bombing campaign – similar to its tactics in Gaza where it has killed tens of thousands of civilians. The Israeli military claimed on Tuesday to have assassinated Ali Shadmani, who it identified as Iran’s wartime chief of staff, Reuters reports. The Israelis described Shadmani as Iran’s “most senior military commander”. Shadmani was appointed to his role after the assassination of Gholamali Rashid last week by Israeli forces, who was the former head of the Khatam al-Anbiya Central Headquarters. At least 47 Palestinians have been killed as they awaited aid trucks in Khan Younis in the southern Gaza Strip, the territory’s civil defence agency said in a statement. The agency added that over 200 had been wounded as a result. The reports come after five Palestinian civilians were killed and several others injured after Israeli forces opened fire on people waiting for aid in north-west Gaza city. Dr Mohammed Saqer, head of nursing at Nasser hospital in Khan Younis, said 300 casualties had arrived after being attacked by “tanks” while waiting for food. He added that the hospital could “no longer deal with such high numbers of cases”. A staffer at the Iranian state broadcaster IRIB has died after an Israeli attack on the building during a live broadcast, Iranian media has reported. Masoumeh Azimi succumbed to severe injuries caused by the shockwave from the attack, Press TV, which is owned by IRIB, reported. Several other journalists were also injured, it said. It was not possible to independently confirm the report. US defence ministry spokespeople have denied reports online that the US has joined attacks on Iran. “This is not true”, chief Pentagon spokesperson Sean Parnell wrote on X in response to one such post." 

words2 = preprocess(text2)

print(words2) 

tf2 = Counter(words2)

print(tf2)

['u', 'president', 'donald', 'trump', 'deny', 'left', 'leader', 'summit', 'canada', 'early', 'work', 'ceasefire', 'israel', 'iran', 'social', 'medium', 'post', 'also', 'deride', 'emmanuel', 'macron', 'publicity', 'seek', 'u', 'president', 'say', 'macron', 'idea', 'return', 'washington', 'reason', 'much', 'big', 'ceasefire', 'earlier', 'macron', 'suggest', 'trump', 'make', 'ceasefire', 'offer', 'leader', 'affirm', 'israel', 'right', 'defend', 'leader', 'say', 'reiterate', 'support', 'security', 'israel', 'statement', 'call', 'iran', 'principal', 'source', 'regional', 'instability', 'terror', 'trump', 'also', 'say', 'wanted', 'real', 'end', 'nuclear', 'problem', 'iran', 'iran', 'give', 'entirely', 'nuclear', 'weapon', 'accord', 'comment', 'post', 'cbs', 'news', 'reporter', 'trump', 'make', 'comment', 'midnight', 'departure', 'canada', 'attend', 'group', 'nation', 'summit', 'monday', 'cbs', 'news', 'reporter', 'say', 'early', 'tuesday', 'heavy', 'traffic', 'could', 'see', 'tehran', 'trump

In [9]:
fp2 = compute_fingerprint(text2)

print(fp2)

16729321200353243688


In [13]:
if is_near_duplicate(fp1, fp2): 
    print("They are near duplicates")
else: 
    print("They are not near duplicates")

They are not near duplicates
