In [10]:
from google.colab import drive
# Mount Google Drive (forcing a fresh mount) and derive the project folder
# from the mount point; later cells read and write files under `drive_path`.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT, force_remount=True)

drive_path = MOUNT_POINT + "/MyDrive/Dissertation/"

Mounted at /content/drive


In [11]:
import pandas as pd

# Load the raw user stories; the anonymisation cell later overwrites the 'text' column.
raw_csv_path = f"{drive_path}raw.csv"
df = pd.read_csv(raw_csv_path)

In [12]:
import spacy.cli
import spacy

# Fetch the large English pipeline only when it is not already installed, so
# re-running the notebook does not download and reinstall the model every time.
if not spacy.util.is_package("en_core_web_lg"):
    spacy.cli.download("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [13]:
# Load the pipeline once at module level; anonymize_story() calls this `nlp` object.
SPACY_MODEL_NAME = "en_core_web_lg"
nlp = spacy.load(SPACY_MODEL_NAME)

In [14]:
import pandas as pd
import re

# Module-level state shared by every anonymize_story() call so that the same
# original string always receives the same placeholder across the dataset.
#   entity_mapping: original text -> placeholder
#   entity_counter: label -> number of placeholders issued so far for that label
entity_mapping = dict()
entity_counter = dict.fromkeys(("PERSON", "ORG", "LOC", "EMAIL", "PHONE"), 0)

def _placeholder_for(text, label):
    """Return the dataset-wide placeholder for ``text``, creating it on first sight.

    Placeholders have the form ``<label><n>`` where ``n`` is the running count
    kept in the module-level ``entity_counter``; the text -> placeholder pairs
    are stored in the module-level ``entity_mapping``.
    """
    # Register the label even if the text is already mapped, so every label
    # encountered shows up in entity_counter.
    entity_counter.setdefault(label, 0)
    if text not in entity_mapping:
        entity_counter[label] += 1
        entity_mapping[text] = f"{label}{entity_counter[label]}"
    return entity_mapping[text]


def anonymize_story(story):
    """Replace emails, phone numbers and spaCy entities in ``story`` with placeholders.

    Emails and phone numbers are found with regular expressions; all other
    entities come from ``nlp(story).ents``. Each distinct original string is
    mapped to a placeholder such as ``ORG3`` via the module-level
    ``entity_mapping`` / ``entity_counter`` dictionaries, so the same string
    gets the same placeholder in every story.

    All replacements are applied in a single pass over the original text:
      * entity strings are only replaced where they stand alone (not inside a
        longer word), so an entity like ``one`` no longer corrupts ``someone``;
      * text that has already been replaced is never rescanned, so an entity
        like ``2`` can no longer rewrite a placeholder such as ``ORG2``;
      * longer entity strings are tried before shorter ones.

    Args:
        story: The text to anonymise. Non-string values (e.g. None or NaN from
            an empty CSV cell) are returned unchanged.

    Returns:
        The anonymised text, or ``story`` itself when it is not a string.
    """
    global entity_mapping, entity_counter

    # Missing cells arrive as None/NaN and cannot be parsed; pass them through.
    if not isinstance(story, str):
        return story

    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    phone_pattern = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"

    doc = nlp(story)

    # Regex alternatives in priority order; dict keys de-duplicate while
    # preserving insertion order.
    alternatives = {}

    for email in re.findall(email_pattern, story):
        _placeholder_for(email, "EMAIL")
        alternatives[re.escape(email)] = None

    # Emails take precedence over phone numbers: mask each address with a
    # character the phone pattern cannot match, so digits inside an address
    # are not reported as a phone number.
    email_masked = re.sub(email_pattern, "@", story)
    for phone in re.findall(phone_pattern, email_masked):
        _placeholder_for(phone, "PHONE")
        alternatives[re.escape(phone)] = None

    entity_texts = {}
    for ent in doc.ents:
        if not ent.text:
            continue
        _placeholder_for(ent.text, ent.label_)
        entity_texts[ent.text] = None

    # Longest first, so "New York City" wins over "New York" at the same position.
    for text in sorted(entity_texts, key=len, reverse=True):
        # Only guard edges that are word characters; a lookaround next to
        # punctuation would wrongly block legitimate matches.
        left_guard = r"(?<!\w)" if re.match(r"\w", text) else ""
        right_guard = r"(?!\w)" if re.search(r"\w\Z", text) else ""
        alternatives[left_guard + re.escape(text) + right_guard] = None

    if not alternatives:
        return story

    combined = re.compile("|".join(alternatives))
    # Every alternative is an escaped literal that is a key of entity_mapping,
    # so the matched text can be looked up directly.
    return combined.sub(lambda match: entity_mapping[match.group()], story)

# Run the anonymiser over every user story and overwrite the raw 'text' column
# with the result, then display the column as a quick visual check.
anonymised_text = df['text'].apply(anonymize_story)
df['text'] = anonymised_text

df['text']

0       As an app developer aiming for low-resource en...
1       As a community member, I want to create a ORG2...
2       As a existing Solid user, I would like to use ...
3       As a ORG3, I would like it to be easy for my u...
4       As a ORG3, I would like to easily register new...
                              ...                        
2081    As a ORG176 employee, I want to be able to acc...
2082    As a ORG176 employee, I want the system to sta...
2083    As a ORG176 employee, I want a platform that c...
2084    As a ORG176 employee, I want the software to b...
2085    As a ORG176 employee, I want to be able to tak...
Name: text, Length: 2086, dtype: object

In [15]:
# Number of distinct original strings that have been assigned a placeholder.
len(entity_mapping)

317

In [16]:
# Inspect which original strings were replaced (useful for spotting false positives).
# NOTE(review): the keys are the raw, pre-anonymisation strings, so this cell's
# output should be cleared before the notebook is shared.
entity_mapping.keys()

dict_keys(['Solid', 'Pod/RDF', 'Solid Identity Provider', 'Pod Providers', 'IdP', 'one', 'compus', 'webid', 'WebID', 'RDF', 'Turtle', 'HR', 'Project', '\ufeffAs', 'Data', '12-19-2017', 'Broker', 'UX', '2', 'DABS', 'FABS', 'Homepage', '3', 'publishStatus', 'DevOps', 'Validate', 'FPDS', 'DB-2213', 'GTAS', 'USAspending', 'Agency', 'Tech Thursday', 'FundingAgencyCode after FABS', 'FSRS', 'daily', 'PPoPCongressionalDistrict', 'zero', 'Financial Assistance', 'CFDA', 'SAM', '00FORGN', 'P&P', 'DAIMS', 'FREC', 'NASA', 'DUNS', 'ActionTypes', 'the Atom Feed', 'the SAMPLE FILE', 'PPoPZIP', '4', 'two', 'IG', '2007', 'Excel', 'Search for Information', 'County', 'ProspectiveApplicant', 'PreApplication Assistance', 'Customer', 'first', 'Submit Application', 'the Draft Proffers', 'Final Action', 'the Comprehensive Plan', 'the Zoning Ordinance', 'Applicant', 'Develop a Staff Report', 'Capture a Hearing Decision', 'Process the Appeals', 'the Plan Review Staff', 'Review Plans', 'Review the Code Modificati

In [17]:
# Per-label count of placeholders issued; labels beyond the five initial ones
# are added by anonymize_story as it encounters them.
entity_counter

{'PERSON': 16,
 'ORG': 177,
 'LOC': 2,
 'EMAIL': 0,
 'PHONE': 0,
 'CARDINAL': 21,
 'DATE': 31,
 'PRODUCT': 19,
 'WORK_OF_ART': 9,
 'FAC': 5,
 'GPE': 13,
 'ORDINAL': 3,
 'LAW': 6,
 'TIME': 9,
 'NORP': 3,
 'MONEY': 2,
 'QUANTITY': 1}

In [18]:
# Persist the anonymised frame next to the raw file (the index is written as the first column).
anonymised_csv_path = f"{drive_path}anonymised.csv"
df.to_csv(anonymised_csv_path)