In [1]:
# https://www.nltk.org
# https://www.tutorialspoint.com/natural_language_toolkit/natural_language_toolkit_tutorial.pdf
# https://scikit-learn.org/stable/modules/feature_extraction.html

import pathlib

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import os

# **NLTK**

In [2]:
# Run once per environment: uncomment to download the NLTK data (tokenizer models, stopwords, WordNet)
# nltk.download()

### Tokenizing Text

#### word_tokenize function
- splits a string into tokens

In [3]:
# Equivalent direct import (word_tokenize is a function in nltk.tokenize):
# from nltk.tokenize import word_tokenize

# Sample sentence reused by the tokenizer examples that follow.
bama_string = 'On January 11, 2021, Saban\'s Alabama Crimson Tide defeated the Ohio State Buckeyes 52-24 to win the National Championship.'
# word_tokenize separates punctuation but keeps "'s" and "52-24" as single tokens.
my_tokens = nltk.tokenize.word_tokenize(bama_string)
my_tokens

['On',
 'January',
 '11',
 ',',
 '2021',
 ',',
 'Saban',
 "'s",
 'Alabama',
 'Crimson',
 'Tide',
 'defeated',
 'the',
 'Ohio',
 'State',
 'Buckeyes',
 '52-24',
 'to',
 'win',
 'the',
 'National',
 'Championship',
 '.']

#### WordPunctTokenizer class
- splits all punctuation into separate tokens

In [4]:
# WordPunctTokenizer breaks on every punctuation character, so "Saban's"
# becomes three tokens and "52-24" becomes '52', '-', '24'.
tknzr = nltk.tokenize.WordPunctTokenizer()
my_tokens = tknzr.tokenize(bama_string)
my_tokens

['On',
 'January',
 '11',
 ',',
 '2021',
 ',',
 'Saban',
 "'",
 's',
 'Alabama',
 'Crimson',
 'Tide',
 'defeated',
 'the',
 'Ohio',
 'State',
 'Buckeyes',
 '52',
 '-',
 '24',
 'to',
 'win',
 'the',
 'National',
 'Championship',
 '.']

#### sent_tokenize function
- split text/paragraph into sentences

In [5]:
# Three-sentence paragraph; sent_tokenize returns one string per sentence.
para_string = 'I took the dog for a walk. After five minutes, it began to rain. We ran back to the house.'
my_tokens = nltk.tokenize.sent_tokenize(para_string)
my_tokens

['I took the dog for a walk.',
 'After five minutes, it began to rain.',
 'We ran back to the house.']

#### RegexpTokenizer class 
- gives complete control over how to tokenize the text

In [6]:
# example: don't split contractions/possessives like "Saban's"
# The pattern matches runs of word characters and apostrophes. It is written
# as a raw string so that \w reaches the regex engine verbatim; in a plain
# literal "\w" is an invalid escape sequence and triggers a warning.
tknzr = nltk.tokenize.RegexpTokenizer(r"[\w']+")
my_tokens = tknzr.tokenize(bama_string)
my_tokens

['On',
 'January',
 '11',
 '2021',
 "Saban's",
 'Alabama',
 'Crimson',
 'Tide',
 'defeated',
 'the',
 'Ohio',
 'State',
 'Buckeyes',
 '52',
 '24',
 'to',
 'win',
 'the',
 'National',
 'Championship']

In [7]:
# example: always tokenize on whitespace
# gaps=True means the pattern describes the separators rather than the tokens,
# so punctuation stays attached to the neighbouring word ('11,', '52-24').
# Raw string: a plain '\s' literal is an invalid escape sequence and warns.
tknzr = nltk.tokenize.RegexpTokenizer(r'\s+', gaps=True)
my_tokens = tknzr.tokenize(bama_string)
my_tokens

['On',
 'January',
 '11,',
 '2021,',
 "Saban's",
 'Alabama',
 'Crimson',
 'Tide',
 'defeated',
 'the',
 'Ohio',
 'State',
 'Buckeyes',
 '52-24',
 'to',
 'win',
 'the',
 'National',
 'Championship.']

#### TweetTokenizer class 
- best tokenizer for emojis

In [8]:
# Tweet-style text mixing emojis with a straight (') and a curly (’) apostrophe.
emoji_string = 'LeBron had himself a night🔥💯 It\'s INSANE that he’s doing this at 36👏😤'
# TweetTokenizer emits each emoji as its own token. Note in the output that
# "It's" stays whole while "he’s" (curly apostrophe) is split into three tokens.
tknzr = nltk.tokenize.TweetTokenizer()
my_tokens = tknzr.tokenize(emoji_string)
my_tokens

['LeBron',
 'had',
 'himself',
 'a',
 'night',
 '🔥',
 '💯',
 "It's",
 'INSANE',
 'that',
 'he',
 '’',
 's',
 'doing',
 'this',
 'at',
 '36',
 '👏',
 '😤']

## Stopwords  
Words that are present in text but do not contribute to the meaning of a sentence

#### English Stopwords

In [9]:
# NLTK bundles stopword lists for many languages; load the English one.
english_stops = nltk.corpus.stopwords.words('english')

# The full list is long, so preview just the first five entries.
english_stops[:5]

['i', 'me', 'my', 'myself', 'we']

In [10]:
# example: remove stopwords
# The NLTK stopword list is lowercase, so compare on the lowercased token;
# a case-sensitive check lets capitalised stopwords such as 'I' slip through.
# The original casing of the surviving words is preserved.
words = ['I', 'am', 'a', 'student', 'at', 'the', 'University', 'in', 'Switzerland']
# A set gives constant-time membership tests instead of scanning the list.
english_stop_set = set(english_stops)
[word for word in words if word.lower() not in english_stop_set]

['student', 'University', 'Switzerland']

## Stemming and Lemmatization

### Stemming  
- Technique used to extract the base form of words by removing affixes (root stem)
- Looks at form of the word

#### PorterStemmer class 

In [11]:
# Porter stemmer: strips the '-ing' suffix and restores the final 'e' here.
word_stemmer = nltk.stem.PorterStemmer()
word_stemmer.stem('writing')

'write'

#### LancasterStemmer class 

In [12]:
# Lancaster stemmer: reduces 'written' to the truncated stem 'writ',
# which is not itself a dictionary word.
word_stemmer = nltk.stem.LancasterStemmer()
word_stemmer.stem('written')

'writ'

#### RegexpStemmer class  
- Takes in a single regular expression and removes every substring of the word that matches it (anchor the pattern with `^` or `$` to restrict it to prefixes or suffixes)

In [13]:
# The pattern 'ing' is unanchored, so both the leading and the trailing 'ing'
# of the made-up word 'ingwriting' are removed, leaving 'writ'.
word_stemmer = nltk.stem.RegexpStemmer('ing')
word_stemmer.stem('ingwriting')

'writ'

#### SnowballStemmer class  
- works with multiple languages

In [14]:
# Snowball stemmer configured for French; the result comes back lowercased
# with the trailing 'a' removed.
French_stemmer = nltk.stem.SnowballStemmer('french')
French_stemmer.stem('Bonjoura')

'bonjour'

### Lemmatization
- Technique used to extract the base form of words by finding root word
- Looks at meaning of the word

In [15]:
# No part-of-speech argument is given, so the word is looked up as a noun
# (NLTK's default); 'believes' is therefore mapped to the noun 'belief'.
word_lemmatizer = nltk.stem.WordNetLemmatizer()
word_lemmatizer.lemmatize('believes')

'belief'

- Difference versus stemming

In [16]:
# Re-create the Lancaster stemmer (word_stemmer was rebound to a RegexpStemmer
# earlier) and stem the same word for contrast: the stem 'believ' is not a
# real word, whereas the lemmatizer returned 'belief'.
word_stemmer = nltk.stem.LancasterStemmer()
word_stemmer.stem('believes')

'believ'

## Word Replacement

## Distance Metrics

#### edit_distance(s1, s2)  
- Calculates the number of characters that need to be substituted, inserted, or deleted to transform s1 into s2
- Possible to weight substitution edits differently (default cost is 1)

In [17]:
# Two box-score sentences built on the same template; they differ only in the
# player name, the numbers and the result.
s1 = 'Kevin Durant scored 37 points, grabbed 8 rebounds, and had 6 assists in a loss'
s2 = 'Stephen Curry scored 30 points, grabbed 4 rebounds, and had 11 assists in a win'

# Character-level (Levenshtein) edit distance from s1 to s2 with default costs.
# The result variable shares its name with the nltk function but does not
# shadow it, because the function is always reached through the nltk module.
edit_distance = nltk.edit_distance(s1, s2)
edit_distance

17

#### edit_proportion

In [18]:
# Express the distance relative to the length of the source string (s1),
# i.e. edits per source character, rounded to four decimals.
s1_to_s2_edits = nltk.edit_distance(s1, s2)
edit_proportion = round(s1_to_s2_edits / len(s1), 4)
edit_proportion

0.2179

In [19]:
# Two more sentences: another box-score line and an unrelated one for contrast.
s3 = 'Javale McGee scored 10 points, and grabbed 5 rebounds in a loss'
s4 = 'The square root of 49 is 7'

player_list = [s1, s2, s3, s4]
# Every sentence, the target itself included, is compared with the second entry.
target_player = player_list[1]

for comparison_player in player_list:
    edit_distance = nltk.edit_distance(comparison_player, target_player)
    # Distance as a percentage of the source sentence's length.
    edit_percent = round(100*edit_distance/len(comparison_player), 2)
    # One print call per comparison; sep='\n' puts each field on its own line.
    print(
        'Transform',
        f'  - {comparison_player}',
        'Into',
        f'  - {target_player}',
        f'     - edit distance: {edit_distance}',
        f'     - edit percent: {edit_percent}',
        '*'*75,
        sep='\n',
    )

Transform
  - Kevin Durant scored 37 points, grabbed 8 rebounds, and had 6 assists in a loss
Into
  - Stephen Curry scored 30 points, grabbed 4 rebounds, and had 11 assists in a win
     - edit distance: 17
     - edit percent: 21.79
***************************************************************************
Transform
  - Stephen Curry scored 30 points, grabbed 4 rebounds, and had 11 assists in a win
Into
  - Stephen Curry scored 30 points, grabbed 4 rebounds, and had 11 assists in a win
     - edit distance: 0
     - edit percent: 0.0
***************************************************************************
Transform
  - Javale McGee scored 10 points, and grabbed 5 rebounds in a loss
Into
  - Stephen Curry scored 30 points, grabbed 4 rebounds, and had 11 assists in a win
     - edit distance: 39
     - edit percent: 61.9
***************************************************************************
Transform
  - The square root of 49 is 7
Into
  - Stephen Curry scored 30 points, grabbe

# **sklearn.feature_extraction.text**

## Vectorization
- The general process of turning a collection of text documents into numerical feature vectors
- Tokenization, counting, normalization process
- Documents are described by word occurrences, while ignoring positional information of the words in the document

In [20]:
# Learn the vocabulary and IDF weights from the four sentences and encode them
# in one step; X is the resulting document-term matrix (one row per sentence).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(player_list)

In [21]:
# Show the sparse-matrix summary (shape, dtype, stored elements) rather than
# the individual values.
X

<4x26 sparse matrix of type '<class 'numpy.float64'>'
	with 41 stored elements in Compressed Sparse Row format>