### NLP Refresher

In [1]:
text = ["System of the World. By Isaac Newton", "   Snow Crash  .  By Neal Stephenson ",
       " AFROFUTURISM. by     Ytasha L. Womack "]

In [2]:
strip_whitespace = [string.strip() for string in text]

In [3]:
strip_whitespace

['System of the World. By Isaac Newton',
 'Snow Crash  .  By Neal Stephenson',
 'AFROFUTURISM. by     Ytasha L. Womack']

In [4]:
strip_whitespace2 = [string.strip() for string in strip_whitespace]
strip_whitespace2

['System of the World. By Isaac Newton',
 'Snow Crash  .  By Neal Stephenson',
 'AFROFUTURISM. by     Ytasha L. Womack']

In [5]:
remove_periods = [string.replace(".","") for string in strip_whitespace]

In [6]:
remove_periods

['System of the World By Isaac Newton',
 'Snow Crash    By Neal Stephenson',
 'AFROFUTURISM by     Ytasha L Womack']

In [7]:
upper = [string.upper() for string in strip_whitespace]

In [8]:
upper

['SYSTEM OF THE WORLD. BY ISAAC NEWTON',
 'SNOW CRASH  .  BY NEAL STEPHENSON',
 'AFROFUTURISM. BY     YTASHA L. WOMACK']

In [9]:
import re

In [10]:
xs = [re.sub(r"[a-zA-Z]", "X", string) for string in strip_whitespace]

In [11]:
xs

['XXXXXX XX XXX XXXXX. XX XXXXX XXXXXX',
 'XXXX XXXXX  .  XX XXXX XXXXXXXXXX',
 'XXXXXXXXXXXX. XX     XXXXXX X. XXXXXX']

**REGEX TUTORIAL**

https://www.analyticsvidhya.com/blog/2015/06/regular-expression-python/

### Scraping

In [12]:
import requests
from bs4 import BeautifulSoup

In [13]:
url = 'https://www.analyticsvidhya.com/blog/2015/06/regular-expression-python/'

In [14]:
req = requests.get(url)

In [15]:
req

<Response [200]>

In [16]:
soup = BeautifulSoup(req.text, 'html.parser')

In [20]:
soup.text[40:50]

'lar Expres'

In [20]:
soup.find('h2')

<h2 class="site-outline">Learn everything about Analytics</h2>

In [21]:
heads = soup.find_all('h2')

In [22]:
len(heads)

6

### Basic NLP

In [28]:
from nltk.tokenize import word_tokenize
import nltk

In [30]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/NYCMath/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [31]:
pgraph = soup.find('p').text

In [32]:
tokes = word_tokenize(pgraph)

In [33]:
tokes

['In',
 'last',
 'few',
 'years',
 ',',
 'there',
 'has',
 'been',
 'a',
 'dramatic',
 'shift',
 'in',
 'usage',
 'of',
 'general',
 'purpose',
 'programming',
 'languages',
 'for',
 'data',
 'science',
 'and',
 'machine',
 'learning',
 '.',
 'This',
 'was',
 'not',
 'always',
 'the',
 'case',
 '–',
 'a',
 'decade',
 'back',
 'this',
 'thought',
 'would',
 'have',
 'met',
 'a',
 'lot',
 'of',
 'skeptic',
 'eyes',
 '!']

In [37]:
sy = ['.', ',', '!', '-', '?','*', '–']

In [38]:
for word in tokes:
    if word not in sy:
        print(word)
    else:
        _

In
last
few
years
there
has
been
a
dramatic
shift
in
usage
of
general
purpose
programming
languages
for
data
science
and
machine
learning
This
was
not
always
the
case
a
decade
back
this
thought
would
have
met
a
lot
of
skeptic
eyes


In [39]:
from nltk.tokenize import sent_tokenize

In [40]:
sent_tokenize(pgraph)[0]

'In last few years, there has been a dramatic shift in usage of general purpose programming languages for data science and machine learning.'

In [41]:
from nltk.corpus import stopwords

In [42]:
stop_words = stopwords.words('english')

In [43]:
stop_words[:6]

['i', 'me', 'my', 'myself', 'we', 'our']

In [44]:
[word for word in tokes if word not in stop_words]

['In',
 'last',
 'years',
 ',',
 'dramatic',
 'shift',
 'usage',
 'general',
 'purpose',
 'programming',
 'languages',
 'data',
 'science',
 'machine',
 'learning',
 '.',
 'This',
 'always',
 'case',
 '–',
 'decade',
 'back',
 'thought',
 'would',
 'met',
 'lot',
 'skeptic',
 'eyes',
 '!']

In [45]:
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [46]:
#stemming
from nltk.stem.porter import PorterStemmer

In [47]:
porter = PorterStemmer()

In [48]:
[porter.stem(word) for word in tokes]

['In',
 'last',
 'few',
 'year',
 ',',
 'there',
 'ha',
 'been',
 'a',
 'dramat',
 'shift',
 'in',
 'usag',
 'of',
 'gener',
 'purpos',
 'program',
 'languag',
 'for',
 'data',
 'scienc',
 'and',
 'machin',
 'learn',
 '.',
 'thi',
 'wa',
 'not',
 'alway',
 'the',
 'case',
 '–',
 'a',
 'decad',
 'back',
 'thi',
 'thought',
 'would',
 'have',
 'met',
 'a',
 'lot',
 'of',
 'skeptic',
 'eye',
 '!']

In [49]:
from nltk import pos_tag

In [50]:
text_tagged = pos_tag(tokes)

In [51]:
text_tagged

[('In', 'IN'),
 ('last', 'JJ'),
 ('few', 'JJ'),
 ('years', 'NNS'),
 (',', ','),
 ('there', 'EX'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('a', 'DT'),
 ('dramatic', 'JJ'),
 ('shift', 'NN'),
 ('in', 'IN'),
 ('usage', 'NN'),
 ('of', 'IN'),
 ('general', 'JJ'),
 ('purpose', 'NN'),
 ('programming', 'NN'),
 ('languages', 'NNS'),
 ('for', 'IN'),
 ('data', 'NNS'),
 ('science', 'NN'),
 ('and', 'CC'),
 ('machine', 'NN'),
 ('learning', 'NN'),
 ('.', '.'),
 ('This', 'DT'),
 ('was', 'VBD'),
 ('not', 'RB'),
 ('always', 'RB'),
 ('the', 'DT'),
 ('case', 'NN'),
 ('–', 'VBZ'),
 ('a', 'DT'),
 ('decade', 'NN'),
 ('back', 'RB'),
 ('this', 'DT'),
 ('thought', 'NN'),
 ('would', 'MD'),
 ('have', 'VB'),
 ('met', 'VBN'),
 ('a', 'DT'),
 ('lot', 'NN'),
 ('of', 'IN'),
 ('skeptic', 'JJ'),
 ('eyes', 'NNS'),
 ('!', '.')]

In [52]:
[word for word, tag in text_tagged if tag in ['NN', 'NNS']]

['years',
 'shift',
 'usage',
 'purpose',
 'programming',
 'languages',
 'data',
 'science',
 'machine',
 'learning',
 'case',
 'decade',
 'thought',
 'lot',
 'eyes']

In [56]:
tweets = ["we are more worried about what we can lose than what we feel",
         "it's really cool to say I hate you. But it's not cool to say I love you. Love has a stigma",
         "Instead of doing what you feel you just do what other people think you should do"]

In [57]:
tagged_tweets = []
for tweet in tweets:
    tweet_tag = pos_tag(word_tokenize(tweet))
    tagged_tweets.append([tag for word, tag in tweet_tag])

In [58]:
tagged_tweets[2][:5]

['RB', 'IN', 'VBG', 'WP', 'PRP']

In [59]:
from sklearn.preprocessing import MultiLabelBinarizer

In [60]:
one_hot_multi = MultiLabelBinarizer()

In [61]:
one_hot_multi.fit_transform(tagged_tweets)

array([[0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
       [1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0],
       [0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1]])

In [62]:
one_hot_multi.classes_

array(['.', 'CC', 'DT', 'IN', 'JJ', 'MD', 'NN', 'NNS', 'PRP', 'RB', 'RBR',
       'TO', 'VB', 'VBG', 'VBP', 'VBZ', 'WP'], dtype=object)

### CountVectorizer

In [86]:
import numpy as np

In [87]:
from sklearn.feature_extraction.text import CountVectorizer

In [88]:
text_data = np.array(['I like Cardi B. ', 'Tribeca is a strange place.', ' Germany is where they make volkswagen cars.'])

In [89]:
count = CountVectorizer()

In [90]:
bag_of_words = count.fit_transform(text_data)

In [92]:
count.get_feature_names()

['cardi',
 'cars',
 'germany',
 'is',
 'like',
 'make',
 'place',
 'strange',
 'they',
 'tribeca',
 'volkswagen',
 'where']

In [68]:
bag_of_words

<3x12 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [69]:
bag_of_words.toarray()

array([[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [71]:
count.get_feature_names()

['cardi',
 'cars',
 'germany',
 'is',
 'like',
 'make',
 'place',
 'strange',
 'they',
 'tribeca',
 'volkswagen',
 'where']

In [78]:
count_2gram = CountVectorizer(ngram_range = (1, 2), stop_words="english", 
                             vocabulary=['cardi'])

In [79]:
bag = count_2gram.fit_transform(text_data)

In [80]:
bag.toarray()

array([[1],
       [0],
       [0]])

### Tfidf

In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [82]:
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

In [83]:
feature_matrix

<3x12 sparse matrix of type '<class 'numpy.float64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [84]:
feature_matrix.toarray()

array([[0.70710678, 0.        , 0.        , 0.        , 0.70710678,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.40204024, 0.        ,
        0.        , 0.52863461, 0.52863461, 0.        , 0.52863461,
        0.        , 0.        ],
       [0.        , 0.38988801, 0.38988801, 0.29651988, 0.        ,
        0.38988801, 0.        , 0.        , 0.38988801, 0.        ,
        0.38988801, 0.38988801]])

In [85]:
tfidf.vocabulary_

{'cardi': 0,
 'cars': 1,
 'germany': 2,
 'is': 3,
 'like': 4,
 'make': 5,
 'place': 6,
 'strange': 7,
 'they': 8,
 'tribeca': 9,
 'volkswagen': 10,
 'where': 11}