In [11]:
import spacy
import pandas as pd

In [12]:
# Load spaCy's "en_core_web_sm" pipeline once; contains_person_name_from_ngrams
# reads this module-level object to run named-entity recognition.
nlp = spacy.load("en_core_web_sm")


In [13]:
def generate_ngrams(text, n_range=(3, 6)):
    """Return every contiguous character n-gram of ``text`` for the given sizes.

    Args:
        text: String to slice.
        n_range: ``(min_n, max_n)`` inclusive bounds on the n-gram length.
            Sizes below 1 are skipped, because slicing with them would yield
            empty or misaligned substrings rather than n-grams.

    Returns:
        A list of substrings ordered by size (smallest first) and, within a
        size, by start position. Duplicates are kept. The list is empty when
        ``text`` is shorter than the smallest requested size.
    """
    length = len(text)
    # Clamp both ends: never below 1 character, never longer than the text.
    smallest = max(1, n_range[0])
    largest = min(n_range[1], length)
    return [
        text[start:start + size]
        for size in range(smallest, largest + 1)
        for start in range(length - size + 1)
    ]

def contains_person_name_from_ngrams(ngrams):
    """Report whether spaCy tags any of the given n-grams as a person name.

    Each distinct n-gram is run through the module-level ``nlp`` pipeline and
    its entities are checked for the ``PERSON`` label.

    Args:
        ngrams: Iterable of strings to check.

    Returns:
        1 as soon as one n-gram yields a ``PERSON`` entity, otherwise 0.
    """
    # A repeated n-gram cannot change the answer, so drop repeats (keeping
    # first-seen order) before paying for entity recognition.
    unique_ngrams = list(dict.fromkeys(ngrams))
    # nlp.pipe processes the texts as a batched stream instead of one full
    # pipeline call per n-gram; docs are yielded as they are produced, so the
    # early return below still works.
    for doc in nlp.pipe(unique_ngrams):
        if any(ent.label_ == 'PERSON' for ent in doc.ents):
            return 1
    return 0

In [14]:
def extract_features(email):
    """Compute the username-based features for one e-mail address.

    The username is the text before the first ``@`` (the whole string when
    there is no ``@``).

    Args:
        email: E-mail address as a string.

    Returns:
        A 3-tuple ``(length_of_username, num_special_chars,
        has_person_name_ngram)``: the username length, the number of
        non-alphanumeric characters in it, and the 0/1 result of
        ``contains_person_name_from_ngrams`` on the username's n-grams.
    """
    local_part = email.partition('@')[0]
    # Anything that is not a letter or digit counts as "special".
    special_chars = [ch for ch in local_part if not ch.isalnum()]
    person_flag = contains_person_name_from_ngrams(generate_ngrams(local_part))
    return len(local_part), len(special_chars), person_flag

In [15]:
def label_user(length_of_username, num_special_chars, has_person_name_ngram):
    """Turn the three username features into a 0/1 label.

    Args:
        length_of_username: Number of characters in the username.
        num_special_chars: Count of non-alphanumeric characters in it.
        has_person_name_ngram: 1 when a person name was detected, else 0.

    Returns:
        1 when a person name was detected and the username is not both
        longer than 10 characters and carrying more than 2 special
        characters; 0 in every other case.
    """
    # The name check comes first so the size checks are only evaluated when a
    # name was actually found.
    if has_person_name_ngram == 1 and not (
        length_of_username > 10 and num_special_chars > 2
    ):
        return 1
    return 0

In [16]:
# CSV that is read into `df` below; the processing loop reads its 'user_ids' column.
input_csv_file = 'testing.csv'
# Destination CSV for the labeled feature table.
output_csv_file = 'email_features_labeled.csv'

In [17]:
# Read the input CSV; each value in its 'user_ids' column is later passed to
# extract_features as an e-mail address.
df = pd.read_csv(input_csv_file)

In [18]:
# Build one row per address: the raw e-mail, its three features, then the label.
data = []
for email in df['user_ids']:
    features = extract_features(email)
    length_of_username, num_special_chars, has_person_name_ngram = features
    label = label_user(*features)
    data.append([email, *features, label])

In [10]:
# Assemble the per-address rows into a table, show it, then persist it as CSV.
# NOTE(review): the saved output under this cell looks stale -- it shows
# label 1 for rows that label_user, as defined above, would label 0, and the
# cell's execution count precedes the earlier cells. Re-run the notebook from
# a fresh kernel before trusting it.
labeled_df = pd.DataFrame(
    data,
    columns=[
        'email',
        'length_of_username',
        'num_special_chars',
        'has_person_name_ngram',
        'label',
    ],
)

print(labeled_df)

labeled_df.to_csv(output_csv_file, index=False)

                               email  length_of_username  num_special_chars  \
0  bohdan.ravlyk.mbdpts.2023@lpnu.ua                  25                  3   
1                 axel@nystruktur.no                   4                  0   
2        raymond@sttinternational.nl                   7                  0   

   has_person_name_ngram  label  
0                      1      1  
1                      0      1  
2                      1      1  
