# TBA 3102 - Text Analytics
## Practical Lab 08 - Text Summarization and Topic Models (I)
### Question 1 - Data Preprocessing
Student: Nicky Ng <br>
GitHub User: [ahjimomo](https://github.com/ahjimomo) <br>
Student Number: A0194330L

## Libraries

In [1]:
# Data Wrangling
import numpy as np
import pandas as pd

# Text preprocessing
import nltk                     # Text/Sentence Tokenizer + other NLPs
from nltk.corpus import wordnet # Cognitive Synonyms
import unicodedata              # Accented characters
import re                       # Regex
from textblob import Word       # Spelling Correction
import spacy                    # web sm dictionary
from bs4 import BeautifulSoup

# Provided map from class
from contractions import CONTRACTION_MAP

# Lemmatization: spaCy pipeline 'en_core_web_sm', loaded once and used by lemmatize_tokens
nlp = spacy.load('en_core_web_sm')

# Stopwords: NLTK's 'english' stopword list, held at module level.
# exclude_stopwords() removes entries from this same object in place.
stopword_list = nltk.corpus.stopwords.words('english')

# Pre-defined methods
import tba3102

# Tokenizer & Lemmatizer
# wtk: regex tokenizer that keeps runs of word characters (\w+), so punctuation is dropped
# wnl: NLTK WordNet lemmatizer (used by normalize_corpus and normalize_corpus_2)
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()

# Display DF
from IPython.core.display import HTML
# Lift every pandas display limit so wide / long DataFrames are shown in full
for _display_option in ('display.max_columns', 'display.max_rows',
                        'display.width', 'max_colwidth'):
    pd.set_option(_display_option, None)
# Inject CSS so <pre> output keeps its whitespace and does not wrap in the rendered notebook.
# NOTE(review): `display` is not imported in this file; it is presumably the name the
# notebook environment injects — confirm before running this as a plain script.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

## 1. Text Preprocessing

In [13]:
## Helper functions
# 1. Data Quality Report
def data_quality_report(df):
    """Build a one-table data quality summary for a DataFrame.

    The report stacks ``df.describe(include='all')`` on top of three extra
    rows labelled ``Data Type``, ``Missing Values`` and ``Present Values``.

    Returns the combined DataFrame, or ``None`` when ``df`` is not a pandas
    DataFrame.
    """
    # Guard clause: anything that is not a DataFrame yields no report.
    if not isinstance(df, pd.core.frame.DataFrame):
        return None

    def as_row(per_column_values, label):
        # Turn a per-column Series into a single row carrying the given label.
        return pd.DataFrame(per_column_values, columns=[label]).transpose()

    report_parts = [
        df.describe(include='all'),
        as_row(df.dtypes, 'Data Type'),
        as_row(df.isnull().sum(), 'Missing Values'),
        as_row(df.count(), 'Present Values'),
    ]
    return pd.concat(report_parts, axis=0)

    
def normalize_corpus_2(df, raw_column, new_column):
    """Tokenize and normalize every document of ``df[raw_column]`` in place.

    Each document is lower-cased, split with the module-level ``wtk``
    tokenizer, stripped of purely numeric tokens, lemmatized with ``wnl``,
    and finally filtered to tokens longer than one character that are not in
    ``stopword_list``.

    Args:
        df: DataFrame to modify; it may carry any index.
        raw_column: name of the column holding the raw text.
        new_column: name of the column that receives the token lists.

    Returns:
        None. ``df[new_column]`` is set to a list of tokens per row, or to
        ``None`` for rows that end up with no tokens or whose raw value is
        not a string (e.g. NaN).
    """
    # Snapshot as a set for O(1) membership tests; taken per call so that
    # earlier edits to stopword_list are honoured.
    stopwords = set(stopword_list)
    norm_papers = []

    # Iterate over the values rather than df.loc[i, ...]: integer positions
    # are only valid labels when the index happens to be 0..n-1.
    for paper in df[raw_column]:

        if not isinstance(paper, str):
            norm_papers.append(None)
            continue

        stripped_tokens = (token.strip() for token in wtk.tokenize(paper.lower()))
        lemmas = (wnl.lemmatize(token) for token in stripped_tokens
                  if not token.isnumeric())
        paper_tokens = [token for token in lemmas
                        if len(token) > 1 and token not in stopwords]

        # An empty token list is stored as None, not as [].
        norm_papers.append(paper_tokens or None)

    df[new_column] = norm_papers
        

# 2. Text to sentence tokenizer (nltk.sent_tokenize)
def tokenize_text_to_sentences(text):
    """Split ``text`` into sentences with ``nltk.sent_tokenize`` and return them."""
    return nltk.sent_tokenize(text)

# 3. Sentence to word tokenizer (nltk.word_tokenize)
def tokenize_sentence_to_words(sentence):
    """Split ``sentence`` into word tokens with ``nltk.word_tokenize`` and return them."""
    return nltk.word_tokenize(sentence)

# 4. Accented Characters
def remove_accented_chars(text):
    """Return ``text`` with accented characters reduced to plain ASCII.

    The text is decomposed with Unicode NFKD normalization, after which every
    character that cannot be encoded as ASCII (combining marks, non-Latin
    scripts, ...) is silently dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode('utf-8', 'ignore')

# 5. Contractions
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in ``text`` and strip any remaining apostrophes.

    Args:
        text: the string to process.
        contraction_mapping: dict mapping a contraction to its expansion.

    Returns:
        The text with every known contraction replaced by its expansion
        (matched case-insensitively, keeping the first character as typed)
        and with all remaining ``'`` characters removed.
    """
    if not contraction_mapping:
        # An empty alternation would match the empty string everywhere;
        # with nothing to expand, only the apostrophe clean-up applies.
        return re.sub("'", "", text)

    # Longest keys first so that a short key cannot shadow a longer key that
    # starts with it; re.escape keeps keys literal.
    ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
        flags=re.IGNORECASE | re.DOTALL)

    # Fallback for matches whose casing fits neither the exact key nor its
    # lower-cased form (e.g. a capitalised key matched in lower case).
    case_insensitive_lookup = {key.lower(): value
                               for key, value in contraction_mapping.items()}

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower())
                                or case_insensitive_lookup.get(match.lower()))
        if not expanded_contraction:
            # No usable expansion: leave the matched text untouched.
            return match
        # Preserve the casing of the first character as it appeared in the text.
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)

    return expanded_text

# 6. Special Characters
def remove_special_characters(text, remove_digits=False):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: the string to clean.
        remove_digits: when True, digits are removed as well.

    Returns:
        The cleaned string. Whitespace is kept as-is.
    """
    # The range must be A-Z: the former A-z also admitted the six ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), so they survived cleaning.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'

    return re.sub(pattern, '', text)

# 7. Repeated Characters
def remove_repeated_characters(tokens):
    """Collapse doubled characters in tokens that WordNet does not recognise.

    A token for which ``wordnet.synsets`` returns something is kept as-is.
    Otherwise a doubled character is collapsed and the result is checked
    again, until the token is recognised or nothing is left to collapse.

    Returns a new list with one (possibly shortened) token per input token.
    """
    doubled_char = re.compile(r'(\w*)(\w)\2(\w*)')

    def collapse(word):
        # Iterative form of "shorten, then re-check against WordNet".
        while not wordnet.synsets(word):
            shortened = doubled_char.sub(r'\1\2\3', word)
            if shortened == word:
                break
            word = shortened
        return word

    return [collapse(token) for token in tokens]

# 8. Correcting Spellings
def correct_spelling(word_tokens):
    """Replace each token with TextBlob's spelling correction, in place.

    The passed list is modified and also returned.
    """
    for position, token in enumerate(word_tokens):
        word_tokens[position] = str(Word(token).correct())

    return word_tokens

# 9. Stopwords removal
def remove_stopword(tokens, is_lower_case=False):
    """Blank out stopwords in ``tokens``, in place.

    Tokens found in ``stopword_list`` are replaced by ``''`` (not removed, so
    positions are preserved). When ``is_lower_case`` is False each token is
    lower-cased before the lookup. The passed list is modified and returned.
    """
    for position, token in enumerate(tokens):
        # Only lower-case for the lookup when the caller has not done so already.
        candidate = token if is_lower_case else token.lower()
        if candidate in stopword_list:
            tokens[position] = ''

    return tokens

# 10. Lemmatization of tokens
def lemmatize_tokens(tokens):
    """Replace each token with the lemma spaCy assigns to it, in place.

    Every token is run through the module-level ``nlp`` pipeline on its own
    and the lemma of the first resulting spaCy token is stored. Tokens for
    which spaCy produces no tokens at all (e.g. an empty string) are left
    unchanged rather than raising IndexError.

    The passed list is modified and also returned.
    """
    for index, token in enumerate(tokens):

        doc = nlp(token)

        if len(doc) == 0:
            # Nothing to lemmatize; keep the original token in the list.
            continue

        first = doc[0]

        # NOTE(review): '-PRON-' is presumably the placeholder lemma that older
        # spaCy releases assign to pronouns — confirm against the installed version.
        tokens[index] = first.lemma_ if first.lemma_ != '-PRON-' else first.text

    return tokens

# 11. Stripping HTML 
def strip_html_tags(text):
    """Return the visible text of an HTML fragment.

    If BeautifulSoup finds no tag in ``text`` the input is returned unchanged.
    Otherwise ``<iframe>`` and ``<script>`` elements are dropped, the
    remaining text is extracted, and each run of CR/LF characters is
    collapsed into a single newline.
    """
    soup = BeautifulSoup(text, "html.parser")

    if not soup.find():
        # No markup at all: leave plain text untouched.
        return text

    for element in soup(['iframe', 'script']):
        element.extract()

    stripped_text = soup.get_text()

    # Character class of CR and LF only. The former '[\r|\n|\r\n]+' also
    # matched a literal '|', turning pipes in the text into newlines.
    return re.sub(r'[\r\n]+', '\n', stripped_text)

In [3]:
# Main function to pre-process corpus
def normalize_corpus(dataframe, raw_column, clean_column,
                        html_stripping=False,
                        accented_char_removal=True, contraction_expansion=True,
                        text_lower_case=True, extra_newlines_removal=True, extra_whitespace_removal=True,
                        special_char_removal=True, remove_digits=True, repeating_char_removal=True,
                        spelling_correction=True, lemmatize=True, stop_word_removal=True):
    """Clean every document of ``dataframe[raw_column]`` into ``clean_column``.

    The enabled steps run in this fixed order: HTML stripping, accent
    removal, contraction expansion, lower-casing, newline removal, whitespace
    collapsing, special-character removal, word tokenization, repeated
    character removal, spelling correction, lemmatization, stopword removal.
    The surviving tokens are joined with single spaces.

    Args:
        dataframe: DataFrame to modify; it may carry any index.
        raw_column: name of the column holding the raw text.
        clean_column: name of the column that receives the cleaned text.
        remove_digits: forwarded to ``remove_special_characters``.
        (all other flags): switch the step of the same name on or off.

    Returns:
        The same ``dataframe`` object, with ``clean_column`` added or
        overwritten. Rows whose raw value is not a string (e.g. NaN) get an
        empty string.
    """

    def _clean_document(text):
        # Runs one document through the enabled steps and returns the cleaned string.
        if not isinstance(text, str):
            return ''

        if html_stripping:
            text = strip_html_tags(text)

        if accented_char_removal:
            text = remove_accented_chars(text)

        if contraction_expansion:
            text = expand_contractions(text)

        if text_lower_case:
            text = text.lower()

        if extra_newlines_removal:
            # CR/LF only; the former '[\r|\n|\r\n]+' also replaced literal '|'.
            text = re.sub(r'[\r\n]+', ' ', text)

        if extra_whitespace_removal:
            text = re.sub(' +', ' ', text)

        if special_char_removal:
            text = remove_special_characters(text, remove_digits)

        # tokenize into words
        word_tokens = tokenize_sentence_to_words(text)

        if repeating_char_removal:
            word_tokens = remove_repeated_characters(word_tokens)

        if spelling_correction:
            word_tokens = correct_spelling(word_tokens)

        if lemmatize:
            # Purely numeric tokens are dropped as part of this step.
            word_tokens = [wnl.lemmatize(token) for token in word_tokens if not token.isnumeric()]

        if stop_word_removal:
            # Tokens are already lower-case exactly when text_lower_case was applied.
            word_tokens = remove_stopword(word_tokens, text_lower_case)

        # remove_stopword blanks tokens instead of deleting them; drop the blanks here.
        return ' '.join(word_token for word_token in word_tokens if word_token != '')

    # Iterate over the values and assign the column once. The former
    # dataframe.loc[i, ...] with i in range(len(dataframe)) treated positions
    # as labels, which fails (or silently appends rows) on a non-default index.
    dataframe[clean_column] = [_clean_document(text) for text in dataframe[raw_column]]

    return dataframe

In [4]:
# Helper function to remove the given terms from the stopword list (so they are kept in the text)
def exclude_stopwords(stopword_exclusion_list):
    """Remove the given terms from the module-level ``stopword_list``.

    Terms that are not in the list are ignored, so the call is safe to repeat
    (e.g. when a notebook cell is re-run); previously a missing term raised
    ValueError.

    Args:
        stopword_exclusion_list: iterable of terms that should no longer be
            treated as stopwords.
    """
    exclusions = set(stopword_exclusion_list)
    # Slice assignment edits the existing list object in place, so every
    # reference to stopword_list sees the change.
    stopword_list[:] = [word for word in stopword_list if word not in exclusions]
        
# Exclude negation terms: "no" & "not"
# exclude_stopwords(['not', 'no'])

In [5]:
# a. Import the raw Kaggle dataset and print its column summary
#    (non-null counts and dtypes)
raw_df = pd.read_csv('./data/voted-kaggle-dataset.csv')
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2150 entries, 0 to 2149
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        2150 non-null   object
 1   Subtitle     2046 non-null   object
 2   Owner        2150 non-null   object
 3   Votes        2150 non-null   int64 
 4   Versions     2145 non-null   object
 5   Tags         1608 non-null   object
 6   Data Type    2150 non-null   object
 7   Size         2150 non-null   object
 8   License      2150 non-null   object
 9   Views        2145 non-null   object
 10  Download     2135 non-null   object
 11  Kernels      1206 non-null   object
 12  Topics       1558 non-null   object
 13  URL          2150 non-null   object
 14  Description  2145 non-null   object
dtypes: int64(1), object(14)
memory usage: 252.1+ KB


In [6]:
# Data quality report for the raw data (descriptive statistics, dtypes, missing / present counts)
data_quality_report(raw_df)

Unnamed: 0,Title,Subtitle,Owner,Votes,Versions,Tags,Data Type,Size,License,Views,Download,Kernels,Topics,URL,Description
count,2150,2046,2150,2150.0,2145,1608,2150,2150,2150,2145,2135,1206,1558,2150,2145
unique,2116,2019,1269,,1194,962,5,777,6,1826,947,164,24,2138,2008
top,Titanic,"25k+ matches, players & teams attributes for European Professional Football",Rachael Tatman,,"Version 1,2017-08-21",machine learning\npre-trained model,CSV,2 MB,CC0,259 views,7 downloads,2 kernels,0 topics,https://www.kaggle.com/pytorch/resnet50,This dataset does not have a description yet.
freq,6,3,82,,16,31,1593,99,845,6,18,211,1141,2,96
mean,,,,24.011628,,,,,,,,,,,
std,,,,64.788465,,,,,,,,,,,
min,,,,2.0,,,,,,,,,,,
25%,,,,4.0,,,,,,,,,,,
50%,,,,8.0,,,,,,,,,,,
75%,,,,19.0,,,,,,,,,,,


In [19]:
# b. Drop records whose Description is missing (NaN) and renumber the rows from 0.
# NOTE: despite its name, `dedup_df` is not de-duplicated here; only rows with a
#       missing Description are removed.
dedup_df = raw_df.dropna(subset = ['Description']).reset_index(drop = True)
dedup_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2145 entries, 0 to 2144
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        2145 non-null   object
 1   Subtitle     2041 non-null   object
 2   Owner        2145 non-null   object
 3   Votes        2145 non-null   int64 
 4   Versions     2145 non-null   object
 5   Tags         1604 non-null   object
 6   Data Type    2145 non-null   object
 7   Size         2145 non-null   object
 8   License      2145 non-null   object
 9   Views        2145 non-null   object
 10  Download     2135 non-null   object
 11  Kernels      1206 non-null   object
 12  Topics       1558 non-null   object
 13  URL          2145 non-null   object
 14  Description  2145 non-null   object
dtypes: int64(1), object(14)
memory usage: 251.5+ KB


In [20]:
# Inspect 3 random rows of the updated dataframe to check whether additional
# stopwords need to be removed (no random_state is set, so the sample differs per run)
dedup_df.sample(3)

Unnamed: 0,Title,Subtitle,Owner,Votes,Versions,Tags,Data Type,Size,License,Views,Download,Kernels,Topics,URL,Description
1982,DVLA Driving Licence Dataset,Driving licence data March 2016,mariakatosvich,2,"Version 1,2016-10-12",,Other,2 MB,ODbL,"2,728 views",148 downloads,,,https://www.kaggle.com/qwikfix/dvla-driving-licence-dataset,"These data sets contain data on current driving licences issued by the Driver and Vehicle Licensing Agency (DVLA). The DVLA is responsible for issuing driving licences in Great Britain (GB). Driving licences issued in Northern Ireland are the responsibility of the Northern Ireland Driver & Vehicle Agency and are outside the scope of this release.\nDVLA’s drivers database changes constantly as the Agency receives driving licence applications and other information that updates the records of individual drivers. Therefore, it is only possible only to provide a snapshot of the state of the record at a particular time.\nContact DVLA for Further information about driving licensing which can be found at: https://www.gov.uk/browse/driving/driving-licences"
2022,prostate.csv,,tvscitechtalk,2,"Version 1,2017-10-14",,CSV,9 KB,Other,202 views,34 downloads,,0 topics,https://www.kaggle.com/tvscitechtalk/prostatecsv,This dataset does not have a description yet.
1679,IBM HR,,E.Nikumanesh.Germany,3,"Version 1,2017-10-15",,CSV,223 KB,Other,778 views,127 downloads,,0 topics,https://www.kaggle.com/esmaeil391/ibm-hr,This dataset does not have a description yet.


In [23]:
# c. Clean the descriptions with the default pipeline settings. Negation terms are
#    NOT excluded from the stopword list (the exclude_stopwords call is commented out).
# NOTE: normalize_corpus writes the new column into `dedup_df` itself and returns that
#       same DataFrame, so `cleaned_df` and `dedup_df` refer to one object.
cleaned_df = normalize_corpus(dedup_df, 'Description', 'Cleaned_Description')
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2145 entries, 0 to 2144
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Title                2145 non-null   object
 1   Subtitle             2041 non-null   object
 2   Owner                2145 non-null   object
 3   Votes                2145 non-null   int64 
 4   Versions             2145 non-null   object
 5   Tags                 1604 non-null   object
 6   Data Type            2145 non-null   object
 7   Size                 2145 non-null   object
 8   License              2145 non-null   object
 9   Views                2145 non-null   object
 10  Download             2135 non-null   object
 11  Kernels              1206 non-null   object
 12  Topics               1558 non-null   object
 13  URL                  2145 non-null   object
 14  Description          2145 non-null   object
 15  Cleaned_Description  2145 non-null   object
dtypes: int

In [24]:
# Count missing (NaN) values in the cleaned column
cleaned_df['Cleaned_Description'].isnull().sum()

0

In [25]:
# Show rows whose cleaned text is an empty string (length 0); these are not
# counted by isnull() because an empty string is not NaN
cleaned_df[cleaned_df['Cleaned_Description'].str.len() < 1].head()

Unnamed: 0,Title,Subtitle,Owner,Votes,Versions,Tags,Data Type,Size,License,Views,Download,Kernels,Topics,URL,Description,Cleaned_Description
1326,European Soccer Database,"25k+ matches, players & teams attributes for European Professional Football",paosheng,6,"Version 1,2016-11-23",,CSV,6 KB,ODbL,"3,072 views",375 downloads,2 kernels,,https://www.kaggle.com/paosheng/european-soccer-database,歐洲足球資料庫 背景:歐洲足球 內容:歐洲足球分析,
1785,Shanghai stock composite index,Shanghai stock composite index,R1q3,3,"Version 1,2017-09-12",finance\ninternet,CSV,542 KB,CC0,241 views,33 downloads,,0 topics,https://www.kaggle.com/ruanqian/shanghai-stock-composite-index,上证综合指数前复权日线数据,


In [26]:
# Remove rows whose cleaned description is empty. The threshold is `> 0`
# (not `> 1`) so that a valid one-character description is kept and only
# truly empty strings are dropped.
cleaned_df2 = cleaned_df[cleaned_df['Cleaned_Description'].str.len() > 0]
cleaned_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2143 entries, 0 to 2144
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Title                2143 non-null   object
 1   Subtitle             2039 non-null   object
 2   Owner                2143 non-null   object
 3   Votes                2143 non-null   int64 
 4   Versions             2143 non-null   object
 5   Tags                 1603 non-null   object
 6   Data Type            2143 non-null   object
 7   Size                 2143 non-null   object
 8   License              2143 non-null   object
 9   Views                2143 non-null   object
 10  Download             2133 non-null   object
 11  Kernels              1205 non-null   object
 12  Topics               1557 non-null   object
 13  URL                  2143 non-null   object
 14  Description          2143 non-null   object
 15  Cleaned_Description  2143 non-null   object
dtypes: int

In [27]:
# Re-run the data quality report on the filtered, cleaned data
data_quality_report(cleaned_df2)

Unnamed: 0,Title,Subtitle,Owner,Votes,Versions,Tags,Data Type,Size,License,Views,Download,Kernels,Topics,URL,Description,Cleaned_Description
count,2143,2039,2143,2143.0,2143,1603,2143,2143,2143,2143,2133,1205,1557,2143,2143,2143
unique,2110,2013,1264,,1194,959,5,774,6,1824,947,164,24,2131,2006,1993
top,Titanic,Collected from Project Gutenberg [text],Rachael Tatman,,"Version 1,2017-08-21",machine learning\npre-trained model,CSV,2 MB,CC0,259 views,7 downloads,2 kernels,0 topics,https://www.kaggle.com/pytorch/resnet101,This dataset does not have a description yet.,dataset description yet
freq,6,2,82,,16,31,1586,99,844,6,18,210,1140,2,96,96
mean,,,,24.058796,,,,,,,,,,,,
std,,,,64.888328,,,,,,,,,,,,
min,,,,2.0,,,,,,,,,,,,
25%,,,,4.0,,,,,,,,,,,,
50%,,,,8.0,,,,,,,,,,,,
75%,,,,20.0,,,,,,,,,,,,


In [28]:
# Save the cleaned data as CSV.
# NOTE(review): `index=False` is not passed, so the DataFrame index is written as an
# unnamed first column — confirm whether downstream readers expect that column.
cleaned_df2.to_csv('./data/voted-kaggled-dataset-cleaned.csv')