The objective of this notebook is to make a first approach to and analyze the punctuation and special characters of the messages from the dataset and how to handle them in each case.

In [None]:
import pandas as pd
import re
from string import punctuation

In [None]:
df = pd.read_csv("../data/bronze/spam.csv")

### Special chars

In [None]:
def find_special_characters(text):
    return re.findall(r'[^a-zA-Z0-9\s]', text) 

all_special_characters = df['Message'].apply(find_special_characters).explode().dropna()


unique_special_characters = all_special_characters.unique()
unique_special_characters

### Emojis

In [None]:
import emoji

def extract_emojis(text):
    if isinstance(text, str):  
        return [char for char in text if char in emoji.EMOJI_DATA]
    return []

df['Extracted_Emojis'] = df['Message'].apply(extract_emojis)
rows_with_emojis = df[df['Extracted_Emojis'].apply(len) > 0]
rows_with_emojis

In [27]:
emoticon_pattern = re.compile(r"""
    [:;=Xx]           
    [-~]?             
    [\)\]\(\[dDpP/]   
""", re.VERBOSE)

def extract_emoticons(text):
    if isinstance(text, str):  
        return emoticon_pattern.findall(text)
    return []

unique_emoticons = set([emoticon for sublist in df['Message'].apply(extract_emoticons) for emoticon in sublist])

unique_emoticons 

{':(',
 ':)',
 ':-(',
 ':-)',
 ':-/',
 ':-D',
 ':-P',
 ':/',
 ':D',
 ';)',
 ';-(',
 ';-)',
 ';D',
 '=)',
 '=/',
 '=D',
 'XD',
 'XP',
 'x/',
 'xd',
 'xp'}

### Rows with char

In [None]:
def row_with_char(char):
    matching_rows = df[df['Message'].str.contains(re.escape(char))]
    print(f"Rows containing '{char}':")
    for index, row in matching_rows.iterrows():
        print(f"Row {index}: {row['Message']}")


In [None]:
pd.set_option('display.max_colwidth', None)
row_with_char("*")

In [None]:
row_with_char(">")

In [None]:
row_with_char("/")

In [None]:
row_with_char("@")

### First approach

First approach:

- ',', '.', '(', ')', '&': replace with space 
- "'": replace with empty string
- '!','?': replace with space
- '£': replace with "pound" and add other common currency names
- '*': replace with space
- '>': replace with space
- '/', '+': replace with space
- ':', '=': replace with space
- '-': replace with space
- 'ú': still don't know
- '‘', 'ü':, ';': replace with space
- '#', 
- '"': replace with space
- '@': they can belong to emails, ats...
- '$': replace with dollar, 
- 'Ü':
- '\x91', '\x92', '\x93', '\x94', '\x96': replace with empty string
- '~', '|', '_', '–', '<', '…', '\\', 'è', '^', , '“': replace with spaces
- '%': replace with "percetage"
- '[', ']', '’', , '»', '—', 'é', 'É', 'ì','鈥', '┾', '〨', '¡': replace with space

keywords: cash, xxx
websites


###  Clean text function

In [None]:

def clean_text(text):
    special_replacements = {
        r"£": "pound",
        r"\$": "dollar",
        r"\€": "euro",
        r"%": "percentage"}
    
    emoticon_pattern = re.compile(r"""
    [:;=Xx]           
    [-~]?             
    [\)\]\(\[dDpP/]   
    """, re.VERBOSE)
    
    for pattern, replacement in special_replacements.items():
        text = re.sub(pattern, replacement, text)
    text = re.sub(emoticon_pattern, 'emoji', text)
    text = text.lower()
    text = re.sub('<[^<>]+>', ' ', text)
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub('[0-9]+', 'number', text)
    text = re.sub('[^\s]+@[^\s]+', 'emailaddr', text)
    text = text.translate(str.maketrans('', '', punctuation))
    
    # text = re.sub(r'(http\S+|@\S+|\d+)', '', text)
    # text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # text = re.sub(r'\s+', ' ', text).strip()
    # text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
    # que elimine dos espacios o tres por uno
    return text

In [None]:
df_cleaned = df.copy() 

df_cleaned['Message']=df_cleaned['Message'].apply(clean_text)

In [None]:
all_special_characters = df_cleaned['Message'].apply(find_special_characters).explode().dropna()

unique_special_characters = all_special_characters.unique()
unique_special_characters

In [28]:
indices = [960, 2807, 3376, 4575, 4824]
rows = df_cleaned.loc[indices]
rows

Unnamed: 0,Category,Message,Extracted_Emojis,Extracted_Emoticons,Extracted_Emoticons_Text
960,ham,where,[],[],
2807,ham,can a not,[],[],
3376,ham,emoji,[],[:)],:)
4575,ham,emoji but your not here,[],[:(],:(
4824,ham,emoji emoji,[],"[:-), :-)]",":-), :-)"


In [None]:
row_with_char("-")

In [None]:
row_with_char("ü")

In [None]:
row_with_char("è")

In [None]:
row_with_char("ú")

In [None]:
row_with_char("é")

In [None]:
row_with_char("“")

In [None]:
row_with_char("»")

In [None]:
row_with_char('ì')

- "ü": replace with you
- "è": replace with empty string
- "ú": replace with empty string
- "é": replace with empty string
- "“": replace with empty string
- "»": replace with empty string
- 'ì': replace with i
- "\x91", "\x92", "\x93", "\x96": replace with empty string

The rest replace with empty string

In [None]:
def clean_text_2(text):
    special_replacements = {
        r"£": "pound",
        r"\$": "dollar",
        r"\€": "euro",
        r"%": "percentage", 
        r"ì": "i",
        r"ü": "you",
        }
    
    emoticon_pattern = re.compile(r"""
    [:;=Xx]           
    [-~]?             
    [\)\]\(\[dDpP/]   
    """, re.VERBOSE)
    
    for pattern, replacement in special_replacements.items():
        text = re.sub(pattern, replacement, text)
    text = re.sub(emoticon_pattern, 'emoji', text)
    text = text.lower()
    text = re.sub('<[^<>]+>', ' ', text)
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub('[0-9]+', 'number', text)
    text = re.sub('[^\s]+@[^\s]+', 'emailaddr', text)
    text = text.translate(str.maketrans('', '', punctuation))
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
df_cleaned_v2 = df.copy()

df_cleaned_v2['Message']=df_cleaned_v2['Message'].apply(clean_text_2)
df_cleaned_v2['Message']

In [None]:
all_special_characters = df_cleaned_v2['Message'].apply(find_special_characters).explode().dropna()


unique_special_characters = all_special_characters.unique()
unique_special_characters