In [19]:
import nltk
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import re
from collections import Counter
from nltk import word_tokenize
from spacy.lang.en import English
from wordcloud import WordCloud

def matching_curly_braces(text):
    """Return True if every curly brace in text is matched and correctly ordered.

    Args:
        text: the string to check; characters other than '{' and '}' are ignored.

    Returns:
        True when each '}' closes an earlier, still-open '{' and no '{' is
        left open at the end of the text; False otherwise. Text without any
        braces (including the empty string) counts as balanced.
    """
    depth = 0  # number of '{' currently open
    for c in text:
        if c == '{':
            depth += 1
        elif c == '}':
            if depth == 0:
                # A closing brace with nothing open can never be repaired by
                # later characters, so the text is unbalanced.
                return False
            depth -= 1
    return depth == 0


def remove_curly_braces(text):
    """Remove every brace-enclosed span (braces included) from text.

    Nested groups are removed together with their outermost group, so
    "a{b{c}d}e" becomes "ae".

    Malformed input is handled without raising:
      * at a '}' that has no matching '{', stripping stops and the rest of
        the text (from that '}' onwards) is kept unchanged;
      * a '{' that is never closed is kept, together with everything after it.

    Args:
        text: the string to strip.

    Returns:
        A new string with all complete brace groups removed.
    """
    kept = []           # pieces of text that lie outside any brace group
    depth = 0           # current brace nesting level
    group_start = 0     # index of the '{' that opened the current outermost group
    for i, c in enumerate(text):
        if c == '{':
            if depth == 0:
                group_start = i
            depth += 1
        elif c == '}':
            if depth == 0:
                # Unmatched closing brace: stop stripping and keep the
                # remainder verbatim.
                kept.append(text[i:])
                break
            depth -= 1
        elif depth == 0:
            kept.append(c)
    if depth > 0:
        # The outermost group was never closed, so it is not a complete span;
        # leave it in place instead of failing.
        kept.append(text[group_start:])
    return "".join(kept)

def remove_smart_quotes(text):
    """Replace curly double quotes in text with straight double quotes.

    Despite the name, nothing is removed: each opening or closing curly
    double quote is swapped for a plain '"' and the result is returned.
    """
    straight_quote = "\""
    for curly_quote in ("“", "”"):
        text = text.replace(curly_quote, straight_quote)
    return text

def clean_data(input_name, output_name):
    """Clean a raw text file and write one normalized sentence per line.

    Processing steps:
      1. Read the whole input file.
      2. If its curly braces are balanced, drop every brace-enclosed span
         (used to get rid of embedded markup/script fragments). Unbalanced
         text is processed as is.
      3. Expand a few abbreviations, turn hyphens into spaces and insert a
         space at letter/digit and lower/upper-case boundaries.
      4. Split into sentences on '.', '?', '!' or a newline.
      5. Tokenize each sentence, lowercase it, and drop non-alphanumeric
         tokens and stop words (spaCy's English list plus a few
         markup-related terms).
      6. Write every non-empty result as one line of output_name.

    Args:
        input_name: path of the raw text file. "data.txt" is opened with the
            platform default encoding; any other file is opened as UTF-8.
        output_name: path of the file to create or overwrite (opened with the
            platform default encoding).
    """
    # Only "data.txt" is read with the platform default encoding; this keeps
    # the two original read paths while sharing the rest of the logic.
    encoding = None if input_name == "data.txt" else "utf8"
    with open(input_name, "r", encoding=encoding) as input_file:
        text = input_file.read()

    # remove text enclosed in curly braces to remove random html code; when
    # the braces do not balance the text is kept unchanged (it must stay a
    # string for the substitutions below).
    if matching_curly_braces(text):
        text = remove_curly_braces(text)

    # The dots are escaped so only the literal abbreviations are rewritten;
    # an unescaped "." would match any character.
    text = re.sub(r"U\.S\.", "United States ", text)
    text = re.sub(r"p\.m\.", "pm ", text)
    text = re.sub(r"a\.m\.", "am ", text)
    text = re.sub(r"E\.A\.T\.", "eat ", text)
    text = re.sub("-", " ", text)
    # put a space between a letter and a digit, and between camelCase words
    text = re.sub(r'([A-Za-z])(\d)', r'\1 \2', text)
    text = re.sub(r'(\d)([A-Za-z])', r'\1 \2', text)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

    nlp = spacy.load('en_core_web_sm')

    # split text into sentences on a period, question mark, exclamation mark,
    # or an actual newline character
    sentences = re.split(r'[.?!\n]', text)

    # Work on a copy so the shared spaCy default stop-word set is not
    # permanently extended by this call.
    stopwords = set(nlp.Defaults.stop_words)
    html_stopwords = ["var","https", "csrftoken", "userdata", "csmllty54qx20erutnfcgs839jd2y", "const", "saml","getitem", "firebaseat", "firebaseapp", "json"]
    stopwords.update(html_stopwords)

    # write to output file
    with open(output_name, "w") as output_file:
        for sentence in sentences:
            # remove punctuation and stopwords from the sentence
            tokens = [
                word.lower()
                for word in word_tokenize(sentence)
                if word.isalnum() and word.lower() not in stopwords
            ]
            if tokens:
                output_file.write(" ".join(tokens) + "\n")

# Run the cleaning step: reads "data.txt" and writes "cleaned_data.txt".
# The commented-out calls are alternative input/output file pairs.
# clean_data("all_data.txt", "all_cleaned_data.txt")
clean_data("data.txt", "cleaned_data.txt")
# clean_data("test.txt", "cleaned_test.txt")
# clean_data("data2.txt", "cleaned_data2.txt")

In [20]:
# Generate extra sample sentences: for each chosen word, take the hyponyms of
# its first WordNet synset and substitute each one for the first occurrence of
# the chosen word in every cleaned line that contains it.

import nltk
from nltk.corpus import wordnet as wn
chosen_words = ["students", "program", "pacific", "learn", "university"]

# The cleaned corpus is the same for every chosen word, so read it once.
# Trailing newlines are dropped here so each generated sample is written as
# exactly one line (no blank line after it).
with open("cleaned_data.txt", "r") as input_file:
    cleaned_lines = [line.rstrip("\n") for line in input_file]

# Open the output a single time, in write mode, so re-running this cell
# regenerates the file instead of appending duplicates of earlier runs.
with open("generated_data.txt", "w") as output_file:
    for chosen_word in chosen_words:
        definitions = wn.synsets(chosen_word)
        if not definitions:
            # no WordNet entry for this word: nothing to substitute
            continue

        # first lemma name of every hyponym of the first synset
        simple_names = [
            hyponym.lemma_names()[0]
            for hyponym in definitions[0].hyponyms()
        ]

        for line in cleaned_lines:
            # only the first occurrence in the line is replaced
            # (substring match, so it also hits longer words containing it)
            index = line.find(chosen_word)
            if index == -1:
                continue
            prefix = line[:index]
            suffix = line[index + len(chosen_word):]
            for simple_name in simple_names:
                output_file.write(prefix + simple_name + suffix + "\n")
    

In [21]:
# Parse one example sentence with a small hand-written CFG, print the parse,
# save it to a text file and show it as an SVG tree.

# we will need this to tokenize the input
import nltk
from nltk import word_tokenize
# a package for visualizing parse trees
import svgling

# to use svgling we need to disable NLTK's normal visualization functions
svgling.disable_nltk_png()

# an NLTK CFG grammar
# NOTE: Adj and Adv appear on right-hand sides but have no rules of their own
# in this grammar, so the productions that use them can never complete.
grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N N |Det N PP | Pro | Adj N
Pro -> 'I' |'you'|'we'
VP -> V NP | VP PP | VP Adv | V V
Det -> 'an' | 'my' | 'the' | 'these'|'your'
N -> 'permit' | 'areas' | 'parking' |'links' | 'information' |'children'
V -> 'saw'|'watched'|'have'|'can'|'park'|'are'|'provided'
P -> 'for'|'to'|'where'|'here'
""")

# example sentence that can be parsed with the grammar we've defined
example = "these links are provided for your information"
# with open("cleaned_data.txt", "r") as input_file:
#     example = input_file.readlines()[10]
sent = word_tokenize(example)

# create a chart parser based on the grammar above
parser = nltk.ChartParser(grammar)
# parse the sentence
trees = list(parser.parse(sent))
if not trees:
    # fail with a clear message instead of an IndexError on trees[0]
    raise ValueError(f"the grammar produced no parse for: {example!r}")

# print a text-formatted parse tree
print(trees[0])

# save the text-formatted parse tree
with open("CFG_Parsed_Sentence.txt", "w") as output_file:
    output_file.write(str(trees[0]))

# show an SVG formatted parse tree; this has to be the last expression of the
# cell, otherwise the notebook does not display it
trees[0]

(S
  (NP (Det these) (N links))
  (VP
    (VP (V are) (V provided))
    (PP (P for) (NP (Det your) (N information)))))


In [22]:
import spacy
from spacy.lang.en import English


def _phrase_patterns(label, phrases):
    """Build one entity-ruler pattern dict per phrase, all sharing the same label."""
    return [{"label": label, "pattern": phrase} for phrase in phrases]


# Blank English pipeline with a rule-based entity ruler added to it
nlp = English()
ruler = nlp.add_pipe("entity_ruler")

# Phrase patterns, grouped by the slot label they fill
student_patterns = _phrase_patterns(
    "STUDENT", ["undergraduate", "graduate", "international", "new student"])
event_patterns = _phrase_patterns(
    "EVENT", ["orientation", "parking"])
vehicle_patterns = _phrase_patterns(
    "VEHICLE", ["car", "bus", "shuttle", "bike", "bicycle"])
campus_patterns = _phrase_patterns(
    "CAMPUS", ["stockton", "san francisco", "sacramento"])
building_patterns = _phrase_patterns(
    "BUILDING",
    ["biology", "chemistry", "pharmacy", "computer science", "library", "cafeteria"])

# Location phrases that share a meaning also share an "id";
# "a short drive" carries no id.
location_patterns = [
    {"label": "LOCATION", "pattern": phrase, "id": pattern_id}
    for phrase, pattern_id in [
        ("near here", "nearby"),
        ("close by", "nearby"),
        ("near me", "nearby"),
        ("walking distance", "short_walk"),
        ("short walk", "short_walk"),
    ]
]
location_patterns.append({"label": "LOCATION", "pattern": "a short drive"})

# Register the groups with the ruler, one group at a time
for pattern_group in (
    student_patterns,
    event_patterns,
    campus_patterns,
    vehicle_patterns,
    building_patterns,
    location_patterns,
):
    ruler.add_patterns(pattern_group)

# Run the pipeline on a sample question and list the entities it found
doc = nlp("where can i find a parking lot close by the new student orientation on the stockton campus?")
found_entities = [(ent.text, ent.label_) for ent in doc.ents]
print(found_entities)

[('parking', 'EVENT'), ('close by', 'LOCATION'), ('new student', 'STUDENT'), ('orientation', 'EVENT'), ('stockton', 'CAMPUS')]


In [24]:
from spacy import displacy

# Highlight colour for each custom entity label
colors = {
    "STUDENT": "#ea7e7e",
    "EVENT": "#baffc9",
    "VEHICLE": "#abcdef",
    "BUILDING": "#aaffbb",
    "CAMPUS": "#ffffaa",
    "LOCATION": "#fedcba",
}
# Show exactly the labels that have a colour, in the order listed above
options = {"ents": list(colors), "colors": colors}
render_settings = {"style": "ent", "options": options}

# First render: get the markup back as a string and save it to disk.
# NOTE(review): displaCy's "ent" style is documented as producing HTML markup
# rather than SVG, so the ".svg" extension may be misleading -- confirm.
svg = displacy.render(doc, jupyter=False, **render_settings)
with open("slot_filling_and_visualization.svg", "w") as output_file:
    output_file.write(svg)

# Second render: display the highlighted entities inline in the notebook
displacy.render(doc, jupyter=True, **render_settings)