# TBA 3102 - Text Analytics
## Practical Lab 07 - Text Classification
### Question 1 - Text Preprocessing
Student: Nicky Ng <br>
GitHub User: [ahjimomo](https://github.com/ahjimomo) <br>
Student Number: A0194330L

## Libraries

In [1]:
# Data Wrangling
import numpy as np
import pandas as pd

# Text preprocessing
import nltk                     # Text/Sentence Tokenizer + other NLPs
from nltk.corpus import wordnet # Cognitive Synonyms
import unicodedata              # Accented characters
import re                       # Regex
from textblob import Word       # Spelling Correction
import spacy                    # web sm dictionary

# Provided map from class
from contractions import CONTRACTION_MAP

# Lemmatization
nlp = spacy.load('en_core_web_sm')

# Stopwords
stopword_list = nltk.corpus.stopwords.words('english')

# Display DF
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('max_colwidth', None)

## 1a & b. Clean & preprocess raw SMS dataframe

In [9]:
## Helper functions
# 1. Data Quality Report
def data_quality_report(df):
    
    if isinstance(df, pd.core.frame.DataFrame):
        
        descriptive_statistics = df.describe(include = 'all')
        data_types = pd.DataFrame(df.dtypes, columns=['Data Type']).transpose()
        missing_value_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values']).transpose()
        present_value_counts = pd.DataFrame(df.count(), columns=['Present Values']).transpose()
        data_report = pd.concat([descriptive_statistics, data_types, missing_value_counts, present_value_counts], axis=0)
        
        return data_report
    
    else:
    
        return None

# 2. Text to sentence Tokenizer (NLTK.sent_tokenizer)
def tokenize_text_to_sentences(text):
    
    sentences = nltk.sent_tokenize(text)
    
    return sentences

# 3. Sentence to Word Tokenizer (NLTK.word_tokenizer)
def tokenize_sentence_to_words(sentence):
    
    words = nltk.word_tokenize(sentence)
    
    return words

# 4. Accented Characters
def remove_accented_chars(text):

    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    return text

# 5. Contractions
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):

    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE|re.DOTALL)
    
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
    
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    
    return expanded_text

# 6. Special Characters
def remove_special_characters(text, remove_digits=False):
    
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    text = text.replace('[', '').replace(']', '')
    
    return text

# 7. Repeated Characters
def remove_repeated_characters(tokens):
    
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'
    
    def replace(old_word):
        
        if wordnet.synsets(old_word):
            
            return old_word
            
        new_word = repeat_pattern.sub(match_substitution, old_word)
        return replace(new_word) if new_word != old_word else new_word
    
    correct_tokens = [replace(word) for word in tokens]
    
    return correct_tokens

# 8. Correcting Spellings
def correct_spelling(word_tokens):
    
    for i in range(len(word_tokens)):
    
        w = Word(word_tokens[i])
        word_tokens[i] = str(w.correct())
    
    return word_tokens

# 9. Stopwords removal
def remove_stopword(tokens, is_lower_case=False):
    
    for index in range(len(tokens)):    
        
        if is_lower_case:

            if tokens[index] in stopword_list:

                tokens[index] = ''

        else:

            if tokens[index].lower() in stopword_list:

                tokens[index] = ''
    
    return tokens

# 10. Lemmatization of tokens
def lemmatize_tokens(tokens):
    
    for index in range(len(tokens)):
        
        tokens[index] = nlp(tokens[index])                

        if tokens[index][0].lemma_ != '-PRON-':

            tokens[index] = tokens[index][0].lemma_
        
        else:
        
            tokens[index] = tokens[index][0].text
    
    return tokens

In [3]:
# Helper functiont to exclude additional stop words
def exclude_stopwords(stopword_exclusion_list):
    for exclude in stopword_exclusion_list:
        stopword_list.remove(exclude)
        
# Exclude negation terms: "no" & "not"
exclude_stopwords(['not', 'no'])

In [10]:
# Main function to pre-process corpus
def normalize_corpus(dataframe, raw_column, clean_column,
                        html_stripping=False,
                        accented_char_removal=True, contraction_expansion=True,
                        text_lower_case=True, extra_newlines_removal=True, extra_whitespace_removal=True,
                        special_char_removal=True, remove_digits=True, repeating_char_removal=True,
                        spelling_correction=True, lemmatize=True, stop_word_removal=True):
    
    dataframe[clean_column] = ''
    
    for i in range(len(dataframe)):
        
        text = dataframe.loc[i, raw_column]
        
        if html_stripping:
            
            text = strip_html_tags(text)
            
        if accented_char_removal:
            
            text = remove_accented_chars(text)
        
        if contraction_expansion:
            
            text = expand_contractions(text)
        
        if text_lower_case:
            
            text = text.lower()
        
        if extra_newlines_removal:
            
            text = re.sub(r'[\r|\n|\r\n]+', ' ', text)
        
        if extra_whitespace_removal:
            
            text = re.sub(' +', ' ', text)
        
        if special_char_removal:
            
            text = remove_special_characters(text, remove_digits)
            
        # tokenize into words
        word_tokens = tokenize_sentence_to_words(text)
        
        if repeating_char_removal:
            
            word_tokens = remove_repeated_characters(word_tokens)
            
        if spelling_correction:
            
            word_tokens = correct_spelling(word_tokens)
        
        if lemmatize:
            
            word_tokens = lemmatize_tokens(word_tokens)
        
        if stop_word_removal:
            
            word_tokens = remove_stopword(word_tokens, text_lower_case)
        
        word_tokens = [word_token for word_token in word_tokens if word_token != '']
        text = ' '.join(word_tokens)
        
        dataframe.loc[i, clean_column] = text
    
    return dataframe

In [5]:
# Set random state for any random-related functions for reproducibility 
random_state = 42

In [6]:
# Import & review dataset
raw_df = pd.read_csv('./data/sms.tsv', sep = '\t')
raw_df.info() # No missing data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Label    5572 non-null   object
 1   SMSText  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
# Review some examples of raw data and corrections to be made
raw_df.sample(20, random_state = random_state)

Unnamed: 0,Label,SMSText
3245,ham,Squeeeeeze!! This is christmas hug.. If u lik my frndshp den hug me back.. If u get 3 u r cute:) 6 u r luvd:* 9 u r so lucky;) None? People hate u:
944,ham,And also I've sorta blown him off a couple times recently so id rather not text him out of the blue looking for weed
1044,ham,Mmm thats better now i got a roast down me! id b better if i had a few drinks down me 2! Good indian?
2484,ham,Mm have some kanji dont eat anything heavy ok
812,ham,So there's a ring that comes with the guys costumes. It's there so they can gift their future yowifes. Hint hint
2973,ham,Sary just need Tim in the bollox &it hurt him a lot so he tol me!
2991,ham,"Love isn't a decision, it's a feeling. If we could decide who to love, then, life would be much simpler, but then less magical"
2942,ham,My supervisor find 4 me one lor i thk his students. I havent ask her yet. Tell u aft i ask her.
230,ham,Dear good morning now only i am up
1181,ham,I'm in chennai velachery:)


Based on the sample, it seems like the provided SMSes dataset are already in their own sentences, and there are a number of preprocessing to perform, including:
* Correcting accented characters
* Expanding contractions
* Dealing with different cases (lowercasing)
* Removal of extra whitelines & whitespaces to be safe
* Removal of digits & special characters
* Removal of repeating characters
* Correction of spelling
* Lemmatization of words
* Removal of stopwords
* Tokenize

In [11]:
# Preprocess raw data
cleaned_df = normalize_corpus(raw_df, 'SMSText', 'Cleaned_SMSText')

# Review and check overview
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Label            5572 non-null   object
 1   SMSText          5572 non-null   object
 2   Cleaned_SMSText  5572 non-null   object
dtypes: object(3)
memory usage: 130.7+ KB


In [12]:
# Review raw & cleaned data
cleaned_df.sample(20, random_state = random_state)

Unnamed: 0,Label,SMSText,Cleaned_SMSText
3245,ham,Squeeeeeze!! This is christmas hug.. If u lik my frndshp den hug me back.. If u get 3 u r cute:) 6 u r luvd:* 9 u r so lucky;) None? People hate u:,squeeze christmas hug u like frndshp den hug I back u get u r cut u r love u r lucky none people hate u
944,ham,And also I've sorta blown him off a couple times recently so id rather not text him out of the blue looking for weed,also I sort blow couple time recently I rather not text blue look weed
1044,ham,Mmm thats better now i got a roast down me! id b better if i had a few drinks down me 2! Good indian?,mm well I get roast I I b well I drink I good indian
2484,ham,Mm have some kanji dont eat anything heavy ok,mm anti eat anything heavy ok
812,ham,So there's a ring that comes with the guys costumes. It's there so they can gift their future yowifes. Hint hint,ring come gun costume gift future yowife hint hint
2973,ham,Sary just need Tim in the bollox &it hurt him a lot so he tol me!,say need tim blood hurt lot I
2991,ham,"Love isn't a decision, it's a feeling. If we could decide who to love, then, life would be much simpler, but then less magical",love not decision feel could decide love life would much simple less magical
2942,ham,My supervisor find 4 me one lor i thk his students. I havent ask her yet. Tell u aft i ask her.,supervisor find I one I student I ask yet tell u aft I ask
230,ham,Dear good morning now only i am up,dear good morning I
1181,ham,I'm in chennai velachery:),I chennai velachery


## 1b. Remove empty documents (if any)

In [13]:
# Check number of empty column
cleaned_df['Cleaned_SMSText'].isnull().sum()

0

In [15]:
# Drop empty documents
dropna_df = cleaned_df.dropna()

# Review new document & check if empty documents has been dropped
dropna_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Label            5572 non-null   object
 1   SMSText          5572 non-null   object
 2   Cleaned_SMSText  5572 non-null   object
dtypes: object(3)
memory usage: 130.7+ KB


In [17]:
# Check len of df lesser than >1 again
dropna_df[dropna_df['Cleaned_SMSText'].str.len() < 1].head()

Unnamed: 0,Label,SMSText,Cleaned_SMSText
261,ham,Yup,
276,ham,Thanx...,
960,ham,Where @,
1191,ham,We're done...,
1277,ham,Can do lor...,


In [18]:
dropna_df[dropna_df['Cleaned_SMSText'].str.len() < 1].count()

Label              13
SMSText            13
Cleaned_SMSText    13
dtype: int64

In [19]:
# Drop the empty cells based on length of text
dropna_df = dropna_df[(dropna_df['Cleaned_SMSText'].str.len() < 1) == False]

# Recheck length
dropna_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5559 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Label            5559 non-null   object
 1   SMSText          5559 non-null   object
 2   Cleaned_SMSText  5559 non-null   object
dtypes: object(3)
memory usage: 173.7+ KB


## 1d. Generate Quality Report

In [20]:
# Generate data quality report
sms_report = data_quality_report(dropna_df)

# Print report
sms_report

Unnamed: 0,Label,SMSText,Cleaned_SMSText
count,5559,5559,5559
unique,2,5157,5047
top,ham,"Sorry, I'll call later",sorry I call later
freq,4812,30,30
Data Type,object,object,object
Missing Values,0,0,0
Present Values,5559,5559,5559


## 1e. Export Dataframe in csv as "sms_cleaned.csv"

In [21]:
dropna_df.to_csv('./data/sms_cleaned.csv', index = False)