[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kentonishi/nlp-preprocessing-demo/blob/master/solution.ipynb)

# yay, let's demo what we learned today!

In [3]:
# prerequisites
!pip3.8 install nltk # library for natural language processing

Collecting nltk
  Using cached https://files.pythonhosted.org/packages/aa/b8/09ac15436591cefc0adc882798d5cf629f13addae0495b20b682219e3afe/nltk-3.6.5-py3-none-any.whl
Collecting regex>=2021.8.3 (from nltk)
  Using cached https://files.pythonhosted.org/packages/b6/53/fea3a3ffaa05b7787bfd359ddeb0f42e5da925da34c0bef158c566248e37/regex-2021.11.10-cp38-cp38-macosx_10_9_x86_64.whl
Installing collected packages: regex, nltk
  Found existing installation: regex 2021.4.4
    Uninstalling regex-2021.4.4:
      Successfully uninstalled regex-2021.4.4
Successfully installed nltk-3.6.5 regex-2021.11.10
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


# Import the libraries we'll be using (code that other people wrote and that we can reuse)

In [7]:
# import libraries we need
import re  # regular expressions for splitting strings
import nltk  # natural language toolkit
from nltk import word_tokenize  # tokenize sentences
from nltk.corpus import stopwords  # list of stopwords
from nltk.stem.porter import PorterStemmer  # stemming algorithm

# Download the tokenizer data that word_tokenize() needs.
# Older NLTK releases load the 'punkt' resource; newer releases look for
# 'punkt_tab' instead, so fetch both to keep the notebook runnable on either.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/anish/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# first step: preprocessing!

In [8]:
# Sample sentence that the preprocessing steps below operate on.
TEXT = (
    "LEC Hacks is an incredibly good hackathon and we are definitely not sponsored to say this."
)

# Split the text into a list of words (tokens)

In [12]:
# Break the sentence into individual tokens (words and punctuation marks).
tokens = word_tokenize(TEXT)
print(tokens)

# Keep only the purely alphabetic tokens, which drops punctuation such as '.'.
words = list(filter(str.isalpha, tokens))
print(words)

['LEC', 'Hacks', 'is', 'an', 'incredibly', 'good', 'hackathon', 'and', 'we', 'are', 'definitely', 'not', 'sponsored', 'to', 'say', 'this', '.']
['LEC', 'Hacks', 'is', 'an', 'incredibly', 'good', 'hackathon', 'and', 'we', 'are', 'definitely', 'not', 'sponsored', 'to', 'say', 'this']


# Remove stop words (very common words such as "is", "an", "the" that carry little meaning on their own).

In [16]:
# Fetch the NLTK resources used so far; already-downloaded packages are
# reported as up-to-date rather than fetched again.
for resource in ("punkt", "stopwords"):
    nltk.download(resource)

# English stop-word list shipped with NLTK
stop_words = stopwords.words("english")
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package punkt to /Users/anish/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/anish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# let's remove these words
# NLTK's stop-word list is all lowercase, so compare the lowercased token;
# otherwise a capitalized stop word (e.g. "We" at the start of a sentence)
# would slip through the filter.
# A set makes each membership check O(1) instead of scanning the whole list.
stop_word_set = set(stop_words)
words = [word for word in words if word.lower() not in stop_word_set]  # only select non-stopwords
print(words)

['LEC', 'Hacks', 'incredibly', 'good', 'hackathon', 'definitely', 'sponsored', 'say']


# we can also use stemming, to convert words to their base form: 

## For example, "eating" => "eat". Words with no suffix to strip (like "LEC") are left as they are, apart from being lowercased.

In [31]:
# A single stemmer instance; it is reused when the whole word list is stemmed.
porter = PorterStemmer()

NO_STEM_WORD = "LEC"  # nothing to strip: only the capitalization changes
STEM_WORD = "EATING"  # has a suffix, so the stemmer reduces it to its stem

# Print the stem of each sample word, in the order defined above.
for sample in (NO_STEM_WORD, STEM_WORD):
    print(porter.stem(sample))

lec
eat


In [32]:
# Replace every remaining word with its Porter stem.
words = list(map(porter.stem, words))
print(words)

['lec', 'hack', 'incred', 'good', 'hackathon', 'definit', 'sponsor', 'say']


# part 2 : exploring word embeddings!

In [50]:
# first install what we'll need 
!pip install gensim 
!pip3.8 install gensim 



# Load the Stanford GloVe vectors (100-dimensional)

In [51]:
# The downloader is a submodule and must be imported explicitly; a bare
# `import gensim` does not guarantee that `gensim.downloader` is available.
import gensim.downloader

# Load the pretrained 'glove-wiki-gigaword-100' vectors by name
# (fetched over the network by gensim's downloader).
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-100')

# try it out 

In [53]:
# Look up the embedding of a single word and show its dimensionality.
WORD = 'word'
word_vector = glove_vectors[WORD]
word_vector.shape

(100,)

# check most similar words (basically prove this works.)

In [49]:
# List the words closest to the query word, each with its similarity score.
QUERY_WORD = "dank"
glove_vectors.most_similar(QUERY_WORD)

[('dingy', 0.7442813515663147),
 ('windowless', 0.6911330223083496),
 ('claustrophobic', 0.683182954788208),
 ('musty', 0.6824417114257812),
 ('grimy', 0.6645137071609497),
 ('filthy', 0.6520333290100098),
 ('cramped', 0.6409538388252258),
 ('dreary', 0.613743782043457),
 ('subterranean', 0.6083458662033081),
 ('drafty', 0.6067347526550293)]