In [1]:
# https://www.nltk.org
# https://www.tutorialspoint.com/natural_language_toolkit/natural_language_toolkit_tutorial.pdf

import pathlib

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import os

In [None]:
# only first time
# nltk.download()

### Tokenizing Text

#### word_tokenize function
- splits a string into word and punctuation tokens

In [6]:
# word_tokenize lives in the nltk.tokenize module; it could also be imported directly with
#   from nltk.tokenize import word_tokenize
# Here it is reached through the already-imported top-level nltk package.

bama_string = (
    "On January 11, 2021, Saban's Alabama Crimson Tide defeated "
    "the Ohio State Buckeyes 52-24 to win the National Championship."
)
my_tokens = nltk.tokenize.word_tokenize(bama_string)
my_tokens

['On',
 'January',
 '11',
 ',',
 '2021',
 ',',
 'Saban',
 "'s",
 'Alabama',
 'Crimson',
 'Tide',
 'defeated',
 'the',
 'Ohio',
 'State',
 'Buckeyes',
 '52-24',
 'to',
 'win',
 'the',
 'National',
 'Championship',
 '.']

#### WordPunctTokenizer class
- splits all punctuation into separate tokens

In [10]:
# WordPunctTokenizer is a class, so create an instance first and then call .tokenize().
# bama_string comes from the word_tokenize cell earlier in the notebook.
# Unlike word_tokenize, punctuation is always split off into its own token:
# "Saban's" -> 'Saban', "'", 's'   and   "52-24" -> '52', '-', '24'.
tknzr = nltk.tokenize.WordPunctTokenizer()
my_tokens = tknzr.tokenize(bama_string)
my_tokens

['On',
 'January',
 '11',
 ',',
 '2021',
 ',',
 'Saban',
 "'",
 's',
 'Alabama',
 'Crimson',
 'Tide',
 'defeated',
 'the',
 'Ohio',
 'State',
 'Buckeyes',
 '52',
 '-',
 '24',
 'to',
 'win',
 'the',
 'National',
 'Championship',
 '.']

#### sent_tokenize function
- splits a text/paragraph into sentences

In [12]:
# A three-sentence paragraph, written one sentence per line for readability;
# adjacent literals are concatenated into a single string.
para_string = (
    'I took the dog for a walk. '
    'After five minutes, it began to rain. '
    'We ran back to the house.'
)
# sent_tokenize returns one list entry per sentence.
my_tokens = nltk.tokenize.sent_tokenize(para_string)
my_tokens

['I took the dog for a walk.',
 'After five minutes, it began to rain.',
 'We ran back to the house.']

#### RegexpTokenizer class 
- gives complete control over how to tokenize the text

In [14]:
# example: don't split contractions/possessives like "Saban's"
# The pattern matches runs of word characters and apostrophes, so each match is one token
# and all other punctuation is dropped.
# Use a raw string: "\w" is not a valid Python string escape, and the non-raw form emits an
# invalid-escape warning on current Python versions. The regex itself is unchanged.
tknzr = nltk.tokenize.RegexpTokenizer(r"[\w']+")
my_tokens = tknzr.tokenize(bama_string)
my_tokens

['On',
 'January',
 '11',
 '2021',
 "Saban's",
 'Alabama',
 'Crimson',
 'Tide',
 'defeated',
 'the',
 'Ohio',
 'State',
 'Buckeyes',
 '52',
 '24',
 'to',
 'win',
 'the',
 'National',
 'Championship']

In [23]:
# example: always tokenize on whitespace
# With gaps=True the pattern describes the separators between tokens rather than the
# tokens themselves, so punctuation stays attached to the neighbouring word.
# Use a raw string: "\s" is not a valid Python string escape, and the non-raw form emits an
# invalid-escape warning on current Python versions. The regex itself is unchanged.
tknzr = nltk.tokenize.RegexpTokenizer(r'\s+', gaps=True)
my_tokens = tknzr.tokenize(bama_string)
my_tokens

['On',
 'January',
 '11,',
 '2021,',
 "Saban's",
 'Alabama',
 'Crimson',
 'Tide',
 'defeated',
 'the',
 'Ohio',
 'State',
 'Buckeyes',
 '52-24',
 'to',
 'win',
 'the',
 'National',
 'Championship.']

#### TweetTokenizer class 
- best tokenizer for emojis (each emoji becomes its own token)

In [24]:
# Sample social-media text: emojis directly attached to words, one straight apostrophe
# ("It's") and one typographic apostrophe ("he’s").
emoji_string = "LeBron had himself a night🔥💯 It's INSANE that he’s doing this at 36👏😤"

# TweetTokenizer is a class: instantiate it, then tokenize.
tknzr = nltk.tokenize.TweetTokenizer()
my_tokens = tknzr.tokenize(emoji_string)
my_tokens

['LeBron',
 'had',
 'himself',
 'a',
 'night',
 '🔥',
 '💯',
 "It's",
 'INSANE',
 'that',
 'he',
 '’',
 's',
 'doing',
 'this',
 'at',
 '36',
 '👏',
 '😤']

## Stopwords  
Words that are present in text but contribute little to the meaning of a sentence (e.g. "the", "a", "in").

#### English Stopwords

In [32]:
# NLTK ships preloaded stopword lists for many languages; load the English one
english_stops = nltk.corpus.stopwords.words("english")
# the full list is too long to display, so peek at the first five entries only
english_stops[:5]

['i', 'me', 'my', 'myself', 'we']

In [34]:
# example: remove stopwords
# The stopword entries are lowercase (e.g. 'i', 'me', 'my'), so compare case-insensitively;
# a case-sensitive test lets capitalised stopwords such as 'I' slip through.
# The original casing of the kept words is preserved.
words = ['I', 'am', 'a', 'student', 'at', 'the', 'University', 'in', 'Switzerland']
english_stop_set = set(english_stops)  # set membership is O(1) per lookup
[word for word in words if word.lower() not in english_stop_set]

['student', 'University', 'Switzerland']

## Word Replacement