In [1]:
!pip install unidecode
!pip install textblob==0.15
!pip install datetime
!pip install langdetect

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
Installing collected packages: unidecode
Successfully installed unidecode-1.3.8
Collecting textblob==0.15
  Downloading textblob-0.15.0-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting nltk>=3.1 (from textblob==0.15)
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk>=3.1->textblob==0.15)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk>=3.1->textblob==0.15)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk>=3.1->textblob==0.15)
  Using cached regex-2024.9.11-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tqdm (from nltk>=3.1->textblob==0.15)
  Using cached tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Downloading textblob-0.15.0-py2.py3-none-any.whl (631 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import unidecode
from textblob import TextBlob
import re
from datetime import datetime
from langdetect import detect

In [3]:
def normalize_unicode(text):
    return unidecode.unidecode(text)

In [4]:
def normalize_language(text):
    blob = TextBlob(text)
    if detect(text)!= 'en':
        return str(blob.translate(from_lang=detect(text),to='en'))
    return text

In [5]:
def normalize_dates(text):
 # This regex pattern might need adjustments for specific formats
    dates = re.findall(r'\d{2}/\d{2}/\d{4}', text)
    normalized_dates = [datetime.strptime(date,'%d/%m/%Y').strftime('%Y-%m-%d') for date in dates]
    for date in set(dates):
        text = text.replace(date, next(iter(normalized_dates)))
    return text

In [6]:
def normalize_numbers(text):
# Example normalization: convert all integers to 'number'
    return re.sub(r'\b\d+\b', 'number', text)

In [7]:
def replace_synonyms(text, synonym_dict):
 # Example synonyms
    for word, synonym in synonym_dict.items():
        text = re.sub(r'\b' + word + r'\b', synonym, text)
    return text

In [8]:
original_text_english = "He said, 'I am flying to N.Y. on 04/07/2023 and back on 12/07/2023. I can't wait!'"
original_text_italian="Ciao, sono Mario e sono nato il 12/04/1990. Quest'anno saro' a N.Y per visitare la città"
synonyms = {'N.Y.': 'New York', "can't": 'cannot'}
list_text=[original_text_english,original_text_italian]
# Applying normalization functions
for original_text in list_text:
    normalized_text = normalize_unicode(original_text)
    normalized_text = normalize_language(normalized_text)
    normalized_text = normalize_dates(normalized_text)
    normalized_text = normalize_numbers(normalized_text)
    normalized_text = replace_synonyms(normalized_text, synonyms)
    print("Original Text:", original_text)
    print("Normalized Text:", normalized_text)

Original Text: He said, 'I am flying to N.Y. on 04/07/2023 and back on 12/07/2023. I can't wait!'
Normalized Text: He said, 'I am flying to N.Y. on number-number-number and back on number-number-number. I cannot wait!'
Original Text: Ciao, sono Mario e sono nato il 12/04/1990. Quest'anno saro' a N.Y per visitare la città
Normalized Text: Hi, I'm Mario and I was born on number-number-number. This year I will be in New Yorkto visit the city


In [12]:
## errore beautiful soup pagina 8

In [11]:
!pip install beautifulsoup4

Collecting beautifulsoup4
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Using cached soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Using cached beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
Using cached soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.12.3 soupsieve-2.6


In [13]:
from bs4 import BeautifulSoup

In [14]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [15]:
def clean_nested_html(text):
    soup = BeautifulSoup(text, "html.parser")
    for script in soup(["script", "style"]): # remove all script and style tags
        script.extract()
    return soup.get_text(separator=' ')

In [16]:
def preserve_html_entities(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.decode(formatter="html")

In [17]:
def adjust_spacing(text):
    clean_text = ' '.join(text.split())
    return clean_text

In [18]:
def quality_check(text, original_text):
    if len(text) >= len(original_text):
        return False # indicates potential data loss or incorrect removal
    return True

In [19]:
sample_html ="<html><head><title>Test</title></head><body><h1>Heading</h1><p>This is a <b>bold</b> paragraph.</p><script>console.log('ignore this');</script></body></html>"
clean_text = remove_html_tags(sample_html)
clean_text= clean_nested_html(clean_text)
clean_text=preserve_html_entities(clean_text)
clean_text = adjust_spacing(clean_text)
print("Quality check result:",quality_check(text=clean_text,original_text=sample_html))
print("Original HTML Content:", sample_html)
print("Cleaned Text:", clean_text)

Quality check result: True
Original HTML Content: <html><head><title>Test</title></head><body><h1>Heading</h1><p>This is a <b>bold</b> paragraph.</p><script>console.log('ignore this');</script></body></html>
Cleaned Text: TestHeadingThis is a bold paragraph.


In [20]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [21]:
from spellchecker import SpellChecker

In [22]:
def initialize_spell_checker():
    spell = SpellChecker()
    return spell

In [23]:
def tokenize(text):
    return text.split()

In [24]:
def find_misspelled_words(text, spell):
    tokens = tokenize(text)
    misspelled = spell.unknown(tokens)
    return misspelled

In [25]:
def correct_words(text, spell):
    misspelled = find_misspelled_words(text, spell)
    corrected_text = text
    for word in misspelled:
 # Get the one `most likely` answer
        corrected_word = spell.correction(word)
        corrected_text = corrected_text.replace(word, corrected_word)
    return corrected_text

In [26]:
def apply_contextual_correction(text, spell):
    corrected_text = correct_words(text, spell)
    # Additional context-based improvements can be coded here
    return corrected_text

In [27]:
original_text = "I ave receved an invitaion to an excusive event."
spell = initialize_spell_checker()
corrected_text = apply_contextual_correction(original_text, spell)
print("Original Text:", original_text)
print("Corrected Text:", corrected_text)

Original Text: I ave receved an invitaion to an excusive event.
Corrected Text: I have received an invitation to an exclusive event


In [None]:
## errore pagina 12

In [28]:
!pip install re

[31mERROR: Could not find a version that satisfies the requirement re (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for re[0m[31m
[0m

In [29]:
import regex as re

In [30]:
def define_punctuation():
    return re.compile(r'[\p{P}]+')

In [31]:
def remove_punctuation(text, pattern):
    return pattern.sub('', text)

In [32]:
def remove_unicode_punctuation(text):  
    unicode_punctuation = re.compile(r'[\p{S}\p{P}]+', re.UNICODE)
    return unicode_punctuation.sub('', text)

In [33]:
def clean_text(text):
    punctuation_pattern = define_punctuation()
    text = remove_punctuation(text, punctuation_pattern)
    text = remove_unicode_punctuation(text)
    return text

In [34]:
def verify_text_integrity(original, cleaned):
    if not cleaned.isalnum() and not cleaned:
        raise ValueError("Text integrity compromised: non-alphanumeric characters detected.")
    if len(cleaned)>=len(original):
        return False
    return True

In [35]:
sample_text = "Hello, world! Welcome to the universe of NLP; where processing... becomes easy?"
cleaned_text = clean_text(sample_text)
print("Text integrity passed?",verify_text_integrity(sample_text,cleaned_text))
print("Original Text:", sample_text)
print("Cleaned Text:", cleaned_text)

Text integrity passed? True
Original Text: Hello, world! Welcome to the universe of NLP; where processing... becomes easy?
Cleaned Text: Hello world Welcome to the universe of NLP where processing becomes easy


In [36]:
def to_lowercase(text):
    return text.lower()

In [37]:
def normalize_case(text, exceptions):
    words = text.split()
    normalized_words = [to_lowercase(word) if word.lower() not in exceptions else word for word in words]
    return ' '.join(normalized_words)

In [38]:
def apply_normalization(text, exceptions:set=set()):
    return normalize_case(text, exceptions)

In [39]:
def check_normalization(text,exceptions:set=None):
    if exceptions:
        text =str(set(text.split(' '))-exceptions)
    if text != to_lowercase(text):
        return False # Indicates not all text is in lowercase
    return True

In [40]:
sample_text = "This is an EXAMPLE of Mixed CASE Text."
exceptions = {'EXAMPLE'}
normalized_text = apply_normalization(sample_text, exceptions)
print("Check normalization:", check_normalization(normalized_text,exceptions=exceptions))
print("Original Text:", sample_text)
print("Normalized Text:", normalized_text)

Check normalization: True
Original Text: This is an EXAMPLE of Mixed CASE Text.
Normalized Text: this is an example of mixed case text.


In [41]:
import regex as re

In [47]:
def define_whitespace_patterns():
    return re.compile(r'\s+')

In [42]:
def remove_excess_whitespace(text):
    pattern = define_whitespace_patterns()
    return pattern.sub(' ', text).strip()

In [43]:
def remove_special_whitespace(text):
    # Replace tabs and newlines explicitly if required
    text = text.replace('\t', ' ').replace('\n', ' ')
    return remove_excess_whitespace(text)

In [44]:
def verify_text_structure(text):
    if re.search(r'^\s|\s$', text):
        raise ValueError("Text structure compromised: leading or trailing whitespace detected.")
    return text

In [45]:
def apply_whitespace_normalization(text):
    cleaned_text = remove_special_whitespace(text)
    cleaned_text = verify_text_structure(cleaned_text)
    return cleaned_text

In [48]:
sample_text = " This text\t has \n excessive \n\nwhitespace.\n "
normalized_text = apply_whitespace_normalization(sample_text)
print("Original Text:", sample_text)
print("Normalized Text:", normalized_text)

Original Text:  This text	 has 
 excessive 

whitespace.
 
Normalized Text: This text has excessive whitespace.


In [49]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached regex-2024.9.11-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached regex-2024.9.11-cp311-cp311-macosx_11_0_arm64.whl (284 kB)
Using cached click-8.1.7-py3-none-any.whl (97 kB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached tqdm-4.66.5-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, joblib, click, nltk
Successfully installed click-8.1.7 joblib-1.4.2 nltk-3.9.1 regex-2024.9.11 tqdm-4.66.5


In [50]:
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /Users/vincenzo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [51]:
def segment_sentences(text):
    return sent_tokenize(text)

In [52]:
def handle_special_cases(sentences):
    adjusted_sentences = []
    for sentence in sentences:
        if sentence.endswith(('Mr.', 'Mrs.', 'Dr.', 'Inc.')): # Add more cases as needed
            next_index = sentences.index(sentence) + 1
            if next_index < len(sentences):
                combined_sentence = sentence + ' ' + sentences[next_index]
                adjusted_sentences.append(combined_sentence)
            else:
                adjusted_sentences.append(sentence)
        else:
            if not adjusted_sentences or not adjusted_sentences[-1].endswith(('Mr.', 'Mrs.', 'Dr.', 'Inc.')):
                adjusted_sentences.append(sentence)
    return adjusted_sentences

In [53]:
def validate_segmentation(text):
    sentences = segment_sentences(text)
    sentences = handle_special_cases(sentences)
    return sentences

In [54]:
text_example = "Dr. Smith loves New York. He moved there in 2010. He founded an Hi-Tec company ACME Inc. which sells computers. He's married with Mrs. Laura Bennet. Isn't it great? Yes, he said so."
segmented_sentences = validate_segmentation(text_example)
print("Segmented Sentences:", segmented_sentences)

Segmented Sentences: ['Dr. Smith loves New York.', 'He moved there in 2010.', 'He founded an Hi-Tec company ACME Inc. which sells computers.', "He's married with Mrs. Laura Bennet.", "Isn't it great?", 'Yes, he said so.']


In [55]:
!pip install numpy
!pip install scikit-learn

Collecting numpy
  Using cached numpy-2.1.2-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
Using cached numpy-2.1.2-cp311-cp311-macosx_14_0_arm64.whl (5.4 MB)
Installing collected packages: numpy
Successfully installed numpy-2.1.2
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.14.1-cp311-cp311-

In [56]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

In [61]:
def initialize_vectorizer(vectorizer_type='count'):
    if vectorizer_type == 'tfidf':
        return TfidfVectorizer()
    else:
        return CountVectorizer()

In [57]:
def tokenize_and_encode(texts, vectorizer):
    vectorizer.fit(texts)
    return vectorizer.transform(texts)

In [58]:
def normalize_vectors(vectors):
    norm = np.linalg.norm(vectors.toarray(), axis=1, keepdims=True)
    return vectors / norm

In [59]:
def prepare_data_for_model(texts, vectorizer_type='count'):
    vectorizer = initialize_vectorizer(vectorizer_type)
    vectors = tokenize_and_encode(texts, vectorizer)
    vectors = normalize_vectors(vectors)
    return vectors

In [62]:
sample_texts = ["Hello world", " Vectorizing NLP text for Machine learning training"]
vectorized_data = prepare_data_for_model(sample_texts, 'tfidf')
print("Vectorized Data Shape:", vectorized_data.shape)
print("Vector Example\n",vectorized_data)

Vectorized Data Shape: (2, 9)
Vector Example
 <COOrdinate sparse matrix of dtype 'float64'
	with 9 stored elements and shape (2, 9)>
  Coords	Values
  (0, 1)	0.7071067811865476
  (0, 8)	0.7071067811865476
  (1, 0)	0.3779644730092273
  (1, 2)	0.3779644730092273
  (1, 3)	0.3779644730092273
  (1, 4)	0.3779644730092273
  (1, 5)	0.3779644730092273
  (1, 6)	0.3779644730092273
  (1, 7)	0.3779644730092273


In [63]:
!pip install nltk
!pip install spacy
!python -m spacy download en_core_web_md

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached regex-2024.9.11-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached regex-2024.9.11-cp311-cp311-macosx_11_0_arm64.whl (284 kB)
Using cached click-8.1.7-py3-none-any.whl (97 kB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached tqdm-4.66.5-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, joblib, click, nltk
Successfully installed click-8.1.7 joblib-1.4.2 nltk-3.9.1 regex-2024.9.11 tqdm-4.66.5
Collecting spacy
  Using cached spacy-3.8.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (27 kB)
Collectin

In [64]:
import nltk
import spacy
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/vincenzo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [65]:
def custom_tokenizer(text):
# This tokenizer will need to handle specific punctuation and formats unique to the domain.
    tokens = word_tokenize(text)
    return tokens

In [66]:
def integrate_domain_vocabulary(text, vocabulary):
    # Integrate domain-specific vocabulary into the tokenizer or text processing pipeline.
    return " ".join([vocabulary.get(word, word) for word in text.split()])

In [67]:
def apply_contextual_rules(text, rules):
    # Apply domain-specific rules that adjust text based on its context.
    for pattern, replacement in rules.items():
        text = text.replace(pattern, replacement)
    return text

In [68]:
def normalize_semantics(text, synonym_dict):
    words = text.split()
    normalized_text = ' '.join([synonym_dict.get(word, word) for word in words])
    return normalized_text

In [69]:
def recognize_entities(text, nlp_model):
    nlp = spacy.load(nlp_model)
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

In [None]:
## errore pagina 24, controllare la formattazione

In [71]:
text_example = "Dr. Watson reviewed the patient's history before prescribing Acetaminophen 500mg daily and 5 mg of Roxicodone every 10 hours."
domain_synonyms = {'Acetaminophen':'Paracetamol','Roxicodone':"Oxycodone"}
rules = {'500mg': '500 mg'}
# Applying functions
tokenized_text = custom_tokenizer(text_example)
vocabulary_integrated_text = integrate_domain_vocabulary(" ".join(tokenized_text), domain_synonyms) ## controllare la formattazione
contextually_preprocessed_text = apply_contextual_rules(vocabulary_integrated_text, rules)
normalized_text = normalize_semantics(contextually_preprocessed_text,domain_synonyms)
entities = recognize_entities(normalized_text, 'en_core_web_md') #Example of a domain-adapted model
print("Normalized Text:", normalized_text)
print("Recognized Entities:", entities)

Normalized Text: Dr. Watson reviewed the patient 's history before prescribing Paracetamol 500 mg daily and 5 mg of Oxycodone every 10 hours .
Recognized Entities: [('Watson', 'PERSON'), ('5 mg', 'QUANTITY'), ('every 10 hours', 'TIME')]


In [10]:
!pip install multiprocess

Collecting multiprocess
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill>=0.3.9 (from multiprocess)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Downloading multiprocess-0.70.17-py311-none-any.whl (144 kB)
Downloading dill-0.3.9-py3-none-any.whl (119 kB)
Installing collected packages: dill, multiprocess
Successfully installed dill-0.3.9 multiprocess-0.70.17


In [2]:
!pip install pandas numpy

Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting numpy
  Using cached numpy-2.1.2-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl (11.3 MB)
Using cached numpy-2.1.2-cp311-cp311-macosx_14_0_arm64.whl (5.4 MB)
Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)
Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-2.1.2 pandas-2.2.3 pytz-2024.2 tzdata-2024.2


In [11]:
import pandas as pd
import numpy as np
#from multiprocessing import Pool
from multiprocess import Pool

In [12]:
def create_batches(data, batch_size):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

In [13]:
def process_batch(batch):
    """Process each batch - example transformation to lowercase."""
    return batch.str.lower()

In [14]:
def consolidate_results(results):
    """Combine processed batches into one dataset."""
    return pd.concat(results, ignore_index=True)

In [15]:
def safe_process_batch(batch):
    """Process batch with error handling."""
    try:
        return process_batch(batch)
    except Exception as e:
        print(f"Error processing batch: {e}")
        return pd.DataFrame()

In [16]:
def parallel_process(data, batch_size):
    """Use parallel processing to handle batches simultaneously."""
    batches = list(create_batches(data, batch_size))
    print(batches)
    with Pool(processes=4) as pool: # Adjust number of processes based on your system
        results = pool.map(safe_process_batch, batches)
    return results

In [17]:
# Example DataFrame
data = pd.Series(["Hello WORLD", "Batch Processing is COOL", "Python is great for DATA processing","PYTHON is a powerful tool for NLP"])

batch_size = 1 # Processing one item at a time for illustration
processed_results = parallel_process(data, batch_size)
consolidated_data = consolidate_results(processed_results)

print("Processed Data:")
print(consolidated_data)

[0    Hello WORLD
dtype: object, 1    Batch Processing is COOL
dtype: object, 2    Python is great for DATA processing
dtype: object, 3    PYTHON is a powerful tool for NLP
dtype: object]
Processed Data:
0                            hello world
1               batch processing is cool
2    python is great for data processing
3      python is a powerful tool for nlp
dtype: object


In [None]:
!pip install spacy
!pip install faker

In [22]:
## errore, manca questa installazione

In [12]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import spacy
from faker import Faker

In [4]:
def identify_sensitive_data(text):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    return {ent.text: ent.label_ for ent in doc.ents if ent.label_ in ['PERSON', 'GPE', 'DATE', 'ORG']}

In [5]:
def anonymize_text(text, sensitive_info):
    faker = Faker()
    anonymized_info = {}
    for info, label in sensitive_info.items():
        if label == 'PERSON':
            anonymized_info[info] = faker.name()
        elif label == 'GPE':
            anonymized_info[info] = faker.country()
        elif label == 'DATE':
            anonymized_info[info] = faker.date()
        elif label == 'ORG':
            anonymized_info[info] = faker.company()
    for original, anon in anonymized_info.items():
        text = text.replace(original, anon)
    return text, anonymized_info

In [6]:
def validate_anonymization(anonymized_text, original_info,
anonymized_info):
    # Validation is successful if no original entity remains in the anonymized text
    for original, _ in original_info.items():
        if original in anonymized_text:
            return "Validation Failed"
    # Ensure that replacements are indeed different
    for original, anon in anonymized_info.items():
        if original == anon:
            return "Validation Failed"
    return "Validation Successful"

In [7]:
def log_anonymization_process(info):
    print("Anonymization process:", info)

In [8]:
## errore indentazione sbagliata pgina 29

In [9]:
def assess_re_identification_risk(anonymized_info):
    """ Assess risks based on how distinguishable the anonymized entities are from the originals """
    risk_levels = []
    for original, anon in anonymized_info.items():
        if original.lower() == anon.lower():
            risk_levels.append('High risk')
        else:
            risk_levels.append('Low risk')
    if 'High risk' in risk_levels:
        return "High risk of re-identification"
    return "Low risk of re-identification"

In [13]:
sample_text = "Dr. John Doe works at the University of Springfield, is an associate professor of Computer Science and was born on June 6, 1965."
sensitive_info = identify_sensitive_data(sample_text)
anonymized_text, anonymized_info = anonymize_text(sample_text, sensitive_info)
validation = validate_anonymization(anonymized_text, sensitive_info,anonymized_info)
log_anonymization_process(validation)
risk_assessment = assess_re_identification_risk(anonymized_info)

print("Original Text:", sample_text)
print("Anonymized Text:", anonymized_text)
print("Validation:", validation)
print("Risk Assessment:", risk_assessment)

Anonymization process: Validation Successful
Original Text: Dr. John Doe works at the University of Springfield, is an associate professor of Computer Science and was born on June 6, 1965.
Anonymized Text: Dr. Jacob Fuller works at Obrien Ltd, is an associate professor of Thornton, Fox and Gutierrez and was born on 2024-10-04.
Validation: Validation Successful
Risk Assessment: Low risk of re-identification
