In [1]:
import string   # contains a public variable with all ASCII punctuation characters
import nltk

# list of all stopwords such as 'and', 'the', 'is', etc.
nltk.download('stopwords')  

# WordNet is a lexical database of English words that groups words into sets of synonyms, while also recording semantic relationships between words such as "is-a", "part-of", and "opposite-of" relationships.
nltk.download('wordnet')    

# Open Multilingual WordNet (omw) links hand created wordnets and automatically created wordnets for different languages.
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk import ngrams

# Used to tokenize the text; i.e. create a dictionary mapping words to integers. The dictionary can be used to create a term-document matrix.
from gensim.corpora import Dictionary

from gensim.test.utils import datapath

from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser
from gensim.models.phrases import ENGLISH_CONNECTOR_WORDS

import spacy
from textacy import extract

import re
import os

def read_txt_to_string(file_path):
    with open(file_path, 'r') as file:
        return file.read()
    
def combine_text_files_to_list(input_directory):

    txt_files = [os.path.join(input_directory, file) for file in os.listdir(input_directory) if file.endswith(".txt")]
    corpus = []

    for txt_file in txt_files:
        
        try:
            # Read the entire file as a string and add the string to the corpus
            with open(txt_file, 'r', encoding='utf-8') as file:
                file_content = file.read()  
                corpus.append(file_content)  
                
        except Exception as e:
            print(f"An error occurred while reading {txt_file}: {e}")
    
    return corpus

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\syeda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\syeda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\syeda\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
data = combine_text_files_to_list("../../Dataset/Parsed_Lectures")
print("Corpus combined successfully as a list of strings.")

Corpus combined successfully as a list of strings.


In [29]:
print(len(data))
print(data[0][:500])

162
 Learning Objectives•  Explain why it is important to understand and use correct terminology.            •          Define: computer, software, memory, data, memory size/data size, cloud            •          Explain "Big Data" and describe data growth in the coming years.            •          Compare and contrast: digital versus analog            •          Briefly explain how integers, doubles, and strings are encoded.            •          Explain why ASCII table is required for character en


## Initial Experiments

### Experimenting with textacy's extract() function

In [None]:
nlp = spacy.load("en_core_web_sm")
# Add custom stop words, mostly including header and footer information like names of instructors, name of university, etc.

nlp.Defaults.stop_words |= {"ubc", "mds", "lecture", "lab", "assignments", "example", "page"}

doc = nlp(data[0].lower())

In [157]:
ngrams = [
    ngram.text.replace(" ", "_") 
        for ngram in extract.ngrams(doc, n = 2, min_freq = 4, filter_punct = True, filter_nums = True, exclude_pos=["PROPN", "ORG", "DATE", "X"]) 
        if not ngram.text.__contains__("=") 
            and not ngram.text.__contains__("@") 
            and not ngram.text.__contains__("$")
]


In [158]:
unique_ngrams = set(ngrams)
print(unique_ngrams)
print(len(unique_ngrams))

{'format_painter', 'visual_basic', 'cell_value', 'data_analysis', 'pivot_table', 'ubco_master', 'returns_true', 'quick_access', 'goal_seek', 'access_toolbar', 'operating_system', 'relative_path', 'records_found', 'representing_data', 'text_file', 'basic_editor', 'file_system', 'row_labels', 'data_values', 'forecast_outline', 'ex_delete', 'import_random', 'clothing_jacket', 'pivot_tables', 'overall_revenue', 'regression_equation', 'immediate_window', 'version_control', 'text_files', 'grand_total', 'syntax_errors', 'external_new', 'macro_security', 'r_session', 'selection_font', 'learning_studio', 'computer_system', 'api_key', 'file_encoding', 'current_directory', 'data_set', 'cell_styles', 'selecting_cells', 'command_line', 'aggregate_functions', 'machine_learning', 'following_statements', 'analysis_sheet', 'org_eclipse', 'import_module', 'shortcut_key', 'memory_size', 'analysis_toolpak', 'data_size', 'binary_file', 'data_science', 'file_encodings', 'conditional_formatting', 'computing_

### Continuing preprocessing

In [167]:
def clean_with_spacy(doc):

    spacy_parser = spacy.load("en_core_web_sm")
    # Add custom stop words that are likely to appear in all topic models
    spacy_parser.Defaults.stop_words |= {"ubc", "mds", "lecture", "lab", "assignments", "example", "page", "ex", "import"}
    
    spacy_doc = spacy_parser(doc.lower())

    ngrams = [
        ngram.text.replace(" ", "_")    # ngrams are separated by spaces, so we replace them with underscores
        for ngram in extract.ngrams(spacy_doc, n = 2, min_freq = 4, filter_punct = True, filter_nums = True, exclude_pos=["PROPN", "ORG", "DATE", "X"]) 
        if not ngram.text.__contains__("=") 
            and not ngram.text.__contains__("@") 
            and not ngram.text.__contains__("$")
    ]
    
    # Remove stopwords, punctuation, and numeric tokens
    tokens = [token.lemma_ 
                for token in spacy_doc 
                if not token.is_stop and not token.is_punct and not token.is_digit and token.is_alpha # Keep only words that are not stop words
                and token.text not in ["_", "+", "=", "\n","-","*","<",">"]                           # Remove special characters
                and not token.lemma_ == "datum"]                                                      # Do not lemmatize anything related to data                              
    
    return tokens + ngrams

In [168]:
cleaned_data530 = clean_with_spacy(data[0])

### Final Preprocessed Module

In [169]:
print(f"Number of words after cleaning: {len(cleaned_data530)}")
print(cleaned_data530[-200:])

Number of words after cleaning: 11524
['visual_basic', 'visual_basic', 'basic_editor', 'immediate_window', 'immediate_window', 'immediate_window', 'immediate_window', 'immediate_window', 'immediate_window', 'data_analysis', 'visual_basic', 'basic_editor', 'visual_basic', 'basic_editor', 'object_browser', 'ubco_master', 'data_science', 'conditional_formatting', 'data_analysis', 'data_analysis', 'cell_styles', 'clothing_jacket', 'cell_styles', 'clothing_jacket', 'clothing_jacket', 'selecting_cells', 'selecting_cells', 'selecting_cells', 'cell_styles', 'conditional_formatting', 'cell_styles', 'conditional_formatting', 'cell_styles', 'selecting_cells', 'cell_value', 'following_statements', 'cell_styles', 'clothing_jacket', 'aggregate_functions', 'data_values', 'aggregate_functions', 'conditional_formatting', 'cell_styles', 'aggregate_functions', 'aggregate_functions', 'cell_styles', 'clothing_jacket', 'clothing_jacket', 'clothing_jacket', 'aggregate_functions', 'following_statements', 'agg

## Separating n-gram extraction from preprocessing function

In [196]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])  

# Add custom stop words 
nlp.Defaults.stop_words |= {"ubc", "mds", "lecture", "lab", "assignments", "example", "british","columbia", "introduction" ,"page", "file", "question", "ex", "import", "jeffrey", "andrews", "irene", "vrbik", "shan", "du", "ifeoma", "adaji", "gema", "rodrigues", "fatemeh", "fard", "emelie", "gustafsson", "heinz", "bauschke", "travis", "douglas", "jones", "dave", "xiaoping", "shi", "khalad", "hasan", "ladan", "tazik", "ramon", "lawrence", "chu", "miller", "casey", "ritish", "smith", "lee", "university", "ιc", "jan", "feb", "mar", "tn", "pu", "xn", "ee", "sa", "fa", "toys", "bat", "clothing", "apples", "jacknife", "jacket", "following", "treatment", "let", "return", "returns", "true", "nh", "λy", "𝑘th", "ll", "lll", "calibri", "york", "florida", "illinois", "texas", "francisco", "quartersales", "quarterpivot", "food", "wind", "steak", "xlsx", "phd", "na", "kkt", "dur", "earlier", "city", "street", "false"}

In [197]:
def clean_without_ngrams(doc):

    spacy_doc = nlp(doc.lower())

    # Remove stopwords, punctuation, and numeric tokens
    tokens = [
        token.text 
        for token in spacy_doc 
        if not token.is_stop and not token.is_punct and not token.is_digit and token.is_alpha       # Keep only words that are not stop words
            and token.text not in ["_", "+", "=", "\n","-","*","<",">"]                             # Remove special characters       
            and not len(token.text) == 1                                                            # Remove single character words
            # and token.pos_ in ["NOUN", "ADJ", "VERB", "ADV"]                                        # Keep only nouns, adjectives, verbs, and adverbs
    ]    
                                                                           
    return tokens

In [198]:
data_words = [clean_without_ngrams(doc) for doc in data]

In [199]:
bigram = Phrases(data_words, min_count=10, threshold=20) 
trigram = Phrases(bigram[data_words], threshold=20) 

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)    

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [200]:
data_words_bigrams = make_bigrams(data_words)
data_words_trigrams = make_trigrams(data_words)

In [201]:
bigrams = []

for data_words_bigrams_doc in data_words_bigrams:    
    for word in data_words_bigrams_doc:
        if "_" in word:
            bigrams.append(word)

In [202]:
print(f"Number of ngrams: {len(bigrams)}")
unique_bigrams = set(bigrams)
print(f"Number of unique ngrams: {len(unique_bigrams)}")
print(unique_bigrams)

Number of ngrams: 11118
Number of unique ngrams: 528
{'categor_product', 'styles_cells', 'employee_eno', 'er_relational', 'misclassification_rate', 'gradient_descent', 'collaboration_githubremote', 'styles_format', 'read_write', 'binary_tree', 'markov_chains', 'pip_install', 'bi_ai', 'ename_emp', 'social_media', 'prior_parameters', 'computing_environments', 'main_branch', 'numpy_array', 'projection_operators', 'db_develop', 'nonempty_optimal', 'experimental_design', 'hidden_states', 'primal_feasibility', 'dynamic_programming', 'xml_document', 'eno_pno', 'method_alternating', 'residual_ss', 'multinomial_logistic', 'rr_lasso', 'credit_card', 'circle_radius', 'converges_point', 'garner_hbr', 'high_level', 'title_salary', 'cost_revenue', 'analyst_analyst', 'convolutional_neural', 'motivating_review', 'linked_lists', 'paste_format', 'summary_statistics', 'non_negative', 'performance_indices', 'plot_load', 'negative_binomial', 'relational_mapping', 'likelihood_estimation', 'standard_deviatio

In [203]:
trigrams = []

for data_words_bigrams_doc in data_words_trigrams:    
    for word in data_words_bigrams_doc:
        if "_" in word:
            trigrams.append(word)

In [204]:
print(f"Number of ngrams: {len(trigrams)}")
unique_trigrams = set(trigrams)
print(f"Number of unique ngrams: {len(unique_trigrams)}")
print(unique_trigrams)

Number of ngrams: 17304
Number of unique ngrams: 1625


These are bad, just use bigrams

In [205]:
def lemmatize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    tokens = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        sent_tokens = []
        for token in doc: 
            if "_" in token.text:
                sent_tokens.append(token.text)
            else:
                if token.pos_ in allowed_postags:
                    sent_tokens.append(token.lemma_)
                    
        sent_tokens = [token.replace("datum", "data") for token in sent_tokens]
        tokens.append(sent_tokens)

    return tokens

In [206]:
data_lemmatized = lemmatize(data_words_bigrams)

In [207]:
print(len(data_lemmatized))
print(data_lemmatized[0][:20] + data_lemmatized[0][-20:])

162
['learn', 'explain', 'important', 'understand', 'use', 'correct', 'terminology', 'define', 'computer', 'software', 'memory', 'data', 'memory', 'size', 'data', 'size', 'cloud', 'explain', 'big', 'data', 'program', 'create', 'data', 'analyst', 'understand', 'different', 'way', 'represent', 'data', 'critical', 'necessary', 'transform', 'data', 'format', 'excel', 'analysis', 'ubco_master', 'data', 'science', 'data']


In [208]:
sum = 0
for doc in data_lemmatized:
    sum += len(doc)

print(f"Total number of words in the cleaned corpus: {sum}")

Total number of words in the cleaned corpus: 151017


Confirming that all the bigrams from earlier were retained

In [209]:
bigrams = []

for data_lemmatized_doc in data_lemmatized:    
    for word in data_lemmatized_doc:
        if "_" in word:
            bigrams.append(word)

In [210]:
print(f"Number of ngrams: {len(bigrams)}")
unique_bigrams = set(bigrams)
print(f"Number of unique ngrams: {len(unique_bigrams)}")
print(unique_bigrams)

Number of ngrams: 11118
Number of unique ngrams: 528
{'categor_product', 'styles_cells', 'employee_eno', 'er_relational', 'misclassification_rate', 'gradient_descent', 'collaboration_githubremote', 'styles_format', 'read_write', 'binary_tree', 'markov_chains', 'pip_install', 'bi_ai', 'ename_emp', 'social_media', 'prior_parameters', 'computing_environments', 'main_branch', 'numpy_array', 'projection_operators', 'db_develop', 'nonempty_optimal', 'experimental_design', 'hidden_states', 'primal_feasibility', 'dynamic_programming', 'xml_document', 'eno_pno', 'method_alternating', 'residual_ss', 'multinomial_logistic', 'rr_lasso', 'credit_card', 'circle_radius', 'converges_point', 'garner_hbr', 'high_level', 'title_salary', 'cost_revenue', 'analyst_analyst', 'convolutional_neural', 'motivating_review', 'linked_lists', 'paste_format', 'summary_statistics', 'non_negative', 'performance_indices', 'plot_load', 'negative_binomial', 'relational_mapping', 'likelihood_estimation', 'standard_deviatio