<a href="https://colab.research.google.com/github/boyuan5022/boyuan5022.github.io/blob/master/SpaCy_N_Grams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Sentence Splitter

In [None]:
import spacy
import re
import nltk

# NLTK data used in this notebook: the stop-word list, the WordNet data behind
# WordNetLemmatizer, and the Punkt models behind nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' rather than
# the pickled 'punkt' package; fetch both so word_tokenize works on either.
nltk.download('punkt_tab')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
# Shared lemmatizer and spaCy pipeline used by sentence_splitter.
wnl = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")

###################
#sentence_splitter
###################
#tokenizes a sentence, applies stopwords, and detects n-grams.
#input: sentence-string to tokenize 
#input: stop_words-words to remove from sentence
#output: list of strings
def sentence_splitter(sentence,stop_words):
    """Tokenize a sentence, keeping noun chunks and named entities together.

    Pieces that come from a spaCy noun chunk or named entity are returned with
    a trailing "|yes" marker; every other token is returned as-is. Stop words
    and punctuation are removed and the text is lower-cased before the final
    tokenization.

    Args:
        sentence: Text to tokenize.
        stop_words: Words to remove from the result. The argument itself is
            not modified.

    Returns:
        List of token strings. Multi-word phrases are returned with spaces
        between their words.
    """
    # Use the caller-supplied stop words (not a notebook global) and work on a
    # copy so the caller's list is never mutated.
    stop_words = list(stop_words) + ["'s"]
    # Sets give constant-time membership tests in the loops below.
    # The punctuation characters are: backslash [ ] " ( ) { } ? ! . , < > / ; : + '
    stop_words_with_punctuation = set(stop_words) | set('\\[]"(){}?!.,<>/;:+\'')
    more_stop_words = set(stop_words) | {'more', 'lot', 'less', 'few', 'some', 'little', 'many'}

    #Algorithm
    #1.Run sentence in spaCy to create doc, a spaCy model for natural language processing.
    #2.Use the built-in model functionality to create a list of Entities (a well known object like "Barack Obama") and Chunks (multiple words that are used to describe an object like "the black cat")
    #3.Before Lemmatizer and Stopwords, we need to do this additional step. If a chunk implies ownership i.e. "Barack Obama's black cat", we must separate "Barack Obama's" from "black cat".
    #3 continued. If a chunk contains a numeric descriptor i.e. "5 Black Cats", we must separate "5" from "black cats".
    #3 continued. The rules above only apply if the chunk is not an entity i.e. "The 3 Musketeers" must be kept together.
    #4.Apply Lemmatizer to list of words
    #5.Remove Stopwords from list of words.

    doc = nlp(sentence)
    ents = [ent.text for ent in doc.ents if " " in ent.text]
    chunks = [chunk.text for chunk in doc.noun_chunks if " " in chunk.text]

    #longest first, so that if a bigger chunk contains a smaller one, the big one gets resolved first
    ents.sort(key=len, reverse=True)
    chunks.sort(key=len, reverse=True)

    #step 3 of the algorithm: refine each chunk
    #"|" separates a chunk into smaller groups; words inside a group are connected with "_"
    for chunk in chunks:
        #if one entity owns another, we must separate the two
        new_chunk = chunk.replace("'s", "'s|")
        #create groups for entities
        protected = set()
        for ent in ents:
            if ent in chunk:
                joined = ent.replace(" ", "_")
                protected.add(joined)
                # re.escape: entity text is data, not a pattern. A function
                # replacement keeps backslashes in the text literal.
                new_chunk = re.sub(r"\b" + re.escape(ent) + r"\b",
                                   lambda _match, _group="|" + joined + "|": _group,
                                   new_chunk)
        #commas in English represent a division of entities
        #(continue from new_chunk so the grouping done above is not thrown away)
        new_chunk = new_chunk.replace(",", "|")
        #create groups for stopwords and special words like few, more, a lot, a little, some
        for word in new_chunk.split():
            if word.lower() in more_stop_words:
                new_chunk = re.sub(r"\b" + re.escape(word) + r"\b",
                                   lambda _match, _group="|" + word + "|": _group,
                                   new_chunk)
        #create groups for numbers, leaving entity groups such as "The_3_Musketeers" intact
        new_chunk = "|".join(
            piece if piece in protected else re.sub(r"(\d+)", r"|\g<1>|", piece)
            for piece in new_chunk.split("|")
        )
        #the words not grouped yet are now grouped and we replace the chunk in the sentence with our new grouped chunk
        sentence = sentence.replace(chunk, " ".join([" "+wnl.lemmatize(phrase).strip().replace(" ","_")+"|yes " for phrase in new_chunk.split("|")]))

    #add |yes and clean multi word ents
    for ent in ents:
        sentence = sentence.replace(ent, " "+ent.replace(" ", "_").replace("'s","")+"|yes ")

    #add |yes to single word ents
    for ent in doc.ents:
        # Entities containing parentheses were skipped by the original code
        # (presumably to avoid regex errors); they are still skipped so the
        # output for such input does not change.
        if " " not in ent.text and "(" not in ent.text and ")" not in ent.text:
            sentence = re.sub(r"\b" + re.escape(ent.text) + r"\b",
                              lambda _match, _tagged=" "+ent.text+"|yes ": _tagged,
                              sentence)

    # Empty groups produced above show up as a bare "|yes" token; drop those
    # together with stop words and punctuation.
    return [wnl.lemmatize(word.replace("_"," ")) for word in nltk.word_tokenize(sentence.lower()) if word.replace("|yes","") not in stop_words_with_punctuation and word!="|yes"]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Count Word in Document

In [None]:
from collections import Counter
import pandas as pd
from string import ascii_lowercase
import numpy as np
#for non jupyter notebook environment use "from tqdm import tqdm" instead
from tqdm.notebook import tqdm
import csv

###################
#count_words_in_document
###################
#applies sentence_splitter to the cells of one column of an excel file and counts word occurrences
#input: input_file_path-path of input file
#input: output_file_path-path of output file
#input: column_letter-Excel-style letter of the column to read (e.g. "g")
#output: no output directly returned. Creates or overwrites the file at output_file_path
def count_words_in_document(input_file_path,output_file_path,column_letter):
    """Count the tokens sentence_splitter finds in one column of an Excel file.

    Args:
        input_file_path: Path of the Excel file to read.
        output_file_path: Path of the CSV file to write (overwritten if it
            already exists).
        column_letter: Excel-style column letter(s) of the column to read,
            e.g. "g", "G" or "aa". The letter is converted to a position in
            the DataFrame returned by pandas.read_excel ("a" is the first
            column).

    Returns:
        None. The result is written to output_file_path as CSV with the
        columns word, entity ("yes"/"no") and frequency.

    Raises:
        KeyError: If column_letter is empty or contains a character other
            than a-z / A-Z.
        IndexError: If the sheet has fewer columns than column_letter asks for.
    """
    count = Counter()
    stop_words_input = stopwords.words('english')+["if","'d","'m","n't","'ve","'re","this",'these',"'ll"]
    stop_words_input = sorted(stop_words_input,key=len, reverse=True)

    #Converts Excel column letters to a 1-based column number (a=1, z=26, aa=27, ...)
    letter_values = {letter: index for index, letter in enumerate(ascii_lowercase, start=1)}
    letters = str(column_letter).strip().lower()
    if not letters:
        raise KeyError(column_letter)
    column_number = 0
    for letter in letters:
        column_number = column_number * 26 + letter_values[letter]

    data_table = pd.read_excel(input_file_path)
    predsentences = data_table[data_table.columns[column_number-1]].to_list()
    # pd.isna catches every missing-value marker (any NaN float, None, NaT);
    # an identity test against np.nan misses NaN values that are not that
    # exact object and would let the text "nan" be counted as a word.
    pred_sentences = ["" if pd.isna(x) else " ".join(str(x).splitlines()) for x in predsentences]

    #run function on a concatenated string of 100 sentences to improve run time
    for i in tqdm(range(0,len(pred_sentences),100)):
        upper_bound = min(i+100,len(pred_sentences))
        pred_sentences_chunk = " . ".join(pred_sentences[i:upper_bound])
        count.update(sentence_splitter(pred_sentences_chunk,stop_words_input))

    # Explicit encoding so non-ASCII words do not depend on the platform locale.
    with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['word','entity','frequency']
        writer = csv.writer(csvfile)
        writer.writerow(fieldnames)
        for key, value in count.items():
            word = key.replace("|yes","")
            #optional, remove entries that are only characters or numbers
            if key=="|yes" or word.isdigit() or re.match(r'^[_\W]+$', word):
                continue
            # Only a trailing "|yes" is the entity marker; any other "|" is
            # part of the word, so every row keeps exactly three columns.
            if key.endswith("|yes"):
                writer.writerow([key[:-len("|yes")], "yes", value])
            else:
                writer.writerow([key, "no", value])

Test

In [None]:
from collections import Counter
# Smoke test: run sentence_splitter on five sample texts and print the tokens.
count=Counter()
# NLTK English stop words plus a few extra contractions/determiners, longest first.
stop_words_input = stopwords.words('english')+["if","'d","'m","n't","'ve","'re","this",'these',"'ll"]
stop_words_input.sort(key=len, reverse=True)
sentence_input1="I love this dress. i'd get it in both colors if i could! the cut and fit is beautiful, i'd suggest sizing down if you feel like it's too boxy or lacks shape. the bottom skirt is round enough to where you can twirl ( #1 thing to look for in a dress ;) ) and it's just overall a classic pretty dress. my only complaint is that the overlay cut out seems a little bit delicate and i'm afraid it will be ruined after a few wears but it seems to be holding up fine so far and isn't incredibly delicate like"
sentence_input2="The fabric and detailing of this dress is of superior quality, but unfortunately it runs huge-- you definitely need to wear a tank or cami underneath. i am 5'9 145lbs with massive shoulders/smaller bust and i got the xs petite!"
sentence_input3="Barack Obama quickly walked his 3 dogs, Nancy, Mary Jane, and Rover last night."
sentence_input4="Barack Obama's dog Nancy was taken out for a walk in front of the white house last night."
sentence_input5="I love how different this dress is in terms of the design. it catches light beautifully. the short, biased hem makes the dress fun and flirty. but, this dress is completely sheer. there is no lining which is very disappointing. i do have a short, nude slip that makes this dress work for me. you will need a slip, for sure! unless you are rhianna. the sizing runs a little large, i probably could size down given the loose cut but overall the proportions are flattering showing off my legs. i will re"
# One printed token list per sample, in order.
for sample in (sentence_input1, sentence_input2, sentence_input3, sentence_input4, sentence_input5):
    print(sentence_splitter(sample,stop_words_input))

['love', 'dress|yes', 'get', 'colors|yes', 'could', 'cut|yes', 'fit', 'beautiful', 'suggest', 'sizing', 'feel', 'like', 'boxy', 'lack', 'shape', 'bottom skirt|yes', 'round', 'enough', 'twirl', ' ', '#', '1|yes', 'thing|yes', 'look', 'dress|yes', 'overall', 'classic pretty dress|yes', 'complaint|yes', 'overlay|yes', 'cut', 'seems', 'little', 'bit', 'delicate', 'afraid', 'ruined', 'wears|yes', 'seems', 'holding', 'fine', 'far', 'incredibly', 'delicate', 'like']
['fabric|yes', 'detailing', 'dress|yes', 'superior quality|yes', 'unfortunately', 'run', 'huge', '--', 'definitely', 'need', 'wear', 'tank|yes', 'cami', 'underneath', '5|yes', '9|yes', '145|yes', 'lb|yes', 'massive shoulders|yes', 'smaller bust|yes', 'got', 'xs petite|yes']
['barack obama|yes', 'quickly', 'walked', '3|yes', 'dogs|yes', 'nancy|yes', 'mary jane|yes', 'rover|yes', 'last night|yes']
["barack obama's|yes", 'dog nancy|yes', 'taken', 'walk|yes', 'front', 'the white house|yes', 'last night|yes']
['love', 'different', 'dre

Full Demo

In [None]:
from google.colab import drive, files
# Mount Google Drive so the workbook and the output CSV below are reachable.
drive.mount('/content/drive')
# Count the words in column "g" of the reviews workbook, then pass the
# resulting CSV to files.download.
worddata_csv = "/content/drive/MyDrive/Colab Notebooks/worddata.csv"
count_words_in_document("/content/drive/MyDrive/Colab Notebooks/Reviews.xlsx",worddata_csv,"g")
files.download(worddata_csv)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


HBox(children=(FloatProgress(value=0.0, max=235.0), HTML(value='')))




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>