In [115]:
import string   # contains a public variable with all ASCII punctuation characters
import nltk

# Stop-word lists: common function words such as 'and', 'the', 'is', etc.
nltk.download('stopwords')

# WordNet is a lexical database of English that groups words into sets of
# synonyms and records semantic relationships between them, such as
# "is-a", "part-of", and "opposite-of".
nltk.download('wordnet')

# Open Multilingual WordNet (omw) links hand-created and automatically
# created wordnets for different languages.
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk import ngrams

# Used to tokenize the text; i.e. create a dictionary mapping words to integers. The dictionary can be used to create a term-document matrix.
from gensim.corpora import Dictionary

from gensim.test.utils import datapath

from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser
from gensim.models.phrases import ENGLISH_CONNECTOR_WORDS

import spacy
from textacy import extract

import re

def read_txt_to_string(file_path, encoding="utf-8"):
    """Read an entire text file and return its contents as a single string.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding;
            pass another codec name if the file was written differently.

    Returns:
        The complete contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid for ``encoding``.
    """
    # An explicit encoding keeps the decoded text identical across operating
    # systems instead of silently following the local default code page.
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\syeda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\syeda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\syeda\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Load the parsed DATA530 slide text into a single string.
data530 = read_txt_to_string('Parsed_Slides/DATA530.txt')
# Uncomment to preview the first 200 characters:
# print(data530[:200])

### Experimenting with textacy's `extract.ngrams()` function

In [None]:
# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Register custom stop words: mostly header and footer text such as the
# university name, course labels, and page markers.
nlp.Defaults.stop_words.update(
    {"ubc", "mds", "lecture", "lab", "assignments", "example", "page"}
)

# Parse the lowercased slide text.
doc = nlp(data530.lower())

In [157]:
# Bigrams extracted from the parsed slides, with spaces replaced by
# underscores; any bigram containing "=", "@", or "$" is skipped.
# NOTE(review): assigning to `ngrams` rebinds the name imported earlier via
# `from nltk import ngrams`.
ngrams = [
    bigram.text.replace(" ", "_")
    for bigram in extract.ngrams(
        doc,
        n=2,
        min_freq=4,
        filter_punct=True,
        filter_nums=True,
        exclude_pos=["PROPN", "ORG", "DATE", "X"],
    )
    if not any(symbol in bigram.text for symbol in "=@$")
]


In [158]:
# De-duplicate the extracted bigrams, then show the distinct terms followed
# by how many there are.
unique_ngrams = set(ngrams)
print(unique_ngrams, len(unique_ngrams), sep="\n")

{'format_painter', 'visual_basic', 'cell_value', 'data_analysis', 'pivot_table', 'ubco_master', 'returns_true', 'quick_access', 'goal_seek', 'access_toolbar', 'operating_system', 'relative_path', 'records_found', 'representing_data', 'text_file', 'basic_editor', 'file_system', 'row_labels', 'data_values', 'forecast_outline', 'ex_delete', 'import_random', 'clothing_jacket', 'pivot_tables', 'overall_revenue', 'regression_equation', 'immediate_window', 'version_control', 'text_files', 'grand_total', 'syntax_errors', 'external_new', 'macro_security', 'r_session', 'selection_font', 'learning_studio', 'computer_system', 'api_key', 'file_encoding', 'current_directory', 'data_set', 'cell_styles', 'selecting_cells', 'command_line', 'aggregate_functions', 'machine_learning', 'following_statements', 'analysis_sheet', 'org_eclipse', 'import_module', 'shortcut_key', 'memory_size', 'analysis_toolpak', 'data_size', 'binary_file', 'data_science', 'file_encodings', 'conditional_formatting', 'computing_

### Continuing preprocessing

In [167]:
def clean_with_spacy(doc, custom_stop_words=None, ngram_size=2, min_freq=4):
    """Turn raw text into a flat list of lemmas followed by underscore-joined n-grams.

    The text is lowercased and parsed with spaCy's ``en_core_web_sm`` model.
    Two term lists are built from the parsed text and concatenated:

    1. the lemma of every alphabetic, non-stop-word token whose lemma is not
       ``"datum"``;
    2. every n-gram returned by ``textacy.extract.ngrams`` that does not
       contain ``=``, ``@`` or ``$``, with spaces replaced by underscores.

    Args:
        doc: The raw text to clean. Despite the name this is a ``str``, not a
            spaCy ``Doc``.
        custom_stop_words: Iterable of extra stop words to register before
            parsing. ``None`` (the default) uses the built-in list of terms
            that are likely to appear in all topic models.
        ngram_size: Passed to ``extract.ngrams`` as ``n``. Defaults to 2.
        min_freq: Passed to ``extract.ngrams`` as ``min_freq``. Defaults to 4.

    Returns:
        list[str]: Token lemmas first, then the n-gram strings. Duplicates
        are kept.

    Note:
        The extra stop words are merged in place into
        ``spacy_parser.Defaults.stop_words``; presumably they stay registered
        for the rest of the kernel session -- confirm if that matters.
    """
    if custom_stop_words is None:
        # Custom stop words that are likely to appear in all topic models.
        custom_stop_words = {"ubc", "mds", "lecture", "lab", "assignments", "example", "page", "ex", "import"}

    spacy_parser = spacy.load("en_core_web_sm")
    spacy_parser.Defaults.stop_words |= set(custom_stop_words)

    spacy_doc = spacy_parser(doc.lower())

    # NOTE(review): "ORG" and "DATE" look like entity labels rather than
    # part-of-speech tags, so they may never match here -- confirm.
    candidate_ngrams = extract.ngrams(
        spacy_doc,
        n=ngram_size,
        min_freq=min_freq,
        filter_punct=True,
        filter_nums=True,
        exclude_pos=["PROPN", "ORG", "DATE", "X"],
    )

    # n-gram words are separated by spaces, so join them with underscores and
    # skip any n-gram that contains "=", "@" or "$".
    unwanted_symbols = ("=", "@", "$")
    ngram_terms = [
        ngram.text.replace(" ", "_")
        for ngram in candidate_ngrams
        if not any(symbol in ngram.text for symbol in unwanted_symbols)
    ]

    # Keep lemmas of alphabetic, non-stop-word tokens. `is_alpha` already
    # rules out punctuation, digits and symbol-only tokens such as "_" or
    # "\n", so no separate checks for those are needed.
    # Tokens whose lemma is "datum" are dropped entirely.
    # NOTE(review): the previous comment here said "Do not lemmatize anything
    # related to data", but the filter removes these tokens rather than
    # keeping them unlemmatized -- confirm which behavior is intended.
    tokens = [
        token.lemma_
        for token in spacy_doc
        if token.is_alpha and not token.is_stop and token.lemma_ != "datum"
    ]

    return tokens + ngram_terms

In [168]:
# Clean the DATA530 text into a list of token lemmas followed by bigrams.
cleaned_data530 = clean_with_spacy(data530)

In [169]:
# Report how many terms remain after cleaning, then show the last 200 of them.
# clean_with_spacy appends its bigrams after the token lemmas, so this slice
# comes from the end of that combined list.
print(f"Number of words after cleaning: {len(cleaned_data530)}")
print(cleaned_data530[-200:])

Number of words after cleaning: 11524
['visual_basic', 'visual_basic', 'basic_editor', 'immediate_window', 'immediate_window', 'immediate_window', 'immediate_window', 'immediate_window', 'immediate_window', 'data_analysis', 'visual_basic', 'basic_editor', 'visual_basic', 'basic_editor', 'object_browser', 'ubco_master', 'data_science', 'conditional_formatting', 'data_analysis', 'data_analysis', 'cell_styles', 'clothing_jacket', 'cell_styles', 'clothing_jacket', 'clothing_jacket', 'selecting_cells', 'selecting_cells', 'selecting_cells', 'cell_styles', 'conditional_formatting', 'cell_styles', 'conditional_formatting', 'cell_styles', 'selecting_cells', 'cell_value', 'following_statements', 'cell_styles', 'clothing_jacket', 'aggregate_functions', 'data_values', 'aggregate_functions', 'conditional_formatting', 'cell_styles', 'aggregate_functions', 'aggregate_functions', 'cell_styles', 'clothing_jacket', 'clothing_jacket', 'clothing_jacket', 'aggregate_functions', 'following_statements', 'agg