## Tokenization

In [1]:
from spacy.lang.en import English
nlp = English()

text = """Stimulate your mind as you test your typing speed with this standard English paragraph typing test. Watch your typing speed and accuracy increase as you learn about a variety of new topics! Over 40 typing test selections available."""
my_doc = nlp(text)

token_list = []# Create list of word tokens
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['Stimulate', 'your', 'mind', 'as', 'you', 'test', 'your', 'typing', 'speed', 'with', 'this', 'standard', 'English', 'paragraph', 'typing', 'test', '.', 'Watch', 'your', 'typing', 'speed', 'and', 'accuracy', 'increase', 'as', 'you', 'learn', 'about', 'a', 'variety', 'of', 'new', 'topics', '!', 'Over', '40', 'typing', 'test', 'selections', 'available', '.']


## Sentence Tokens

In [2]:
nlp = English()

nlp.add_pipe('sentencizer')

text = """Stimulate your mind as you test your typing speed with this standard English paragraph typing test. Watch your typing speed and accuracy increase as you learn about a variety of new topics! Over 40 typing test selections available."""
doc = nlp(text)
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

['Stimulate your mind as you test your typing speed with this standard English paragraph typing test.', 'Watch your typing speed and accuracy increase as you learn about a variety of new topics!', 'Over 40 typing test selections available.']


## Cleaning Text Data: Removing Stopwords

In [3]:
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
print('Number of stop words: %d' % len(spacy_stopwords))
print('First ten stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 326
First ten stop words: ['really', 'go', 'several', 'five', 'yours', 'may', 'wherever', 'who', 'above', 'amount', 'hence', '‘ve', 'make', 'through', 'a', 'sometime', "'ve", 'to', 'whenever', 'becoming']


## Removing Stopwords from Our Data

In [4]:
from spacy.lang.en.stop_words import STOP_WORDS

#Implementation of stop words:
filtered_sent=[]
doc = nlp(text)
for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)

Filtered Sentence: [Stimulate, mind, test, typing, speed, standard, English, paragraph, typing, test, ., Watch, typing, speed, accuracy, increase, learn, variety, new, topics, !, 40, typing, test, selections, available, .]


## Lemmatization

In [5]:
lem = nlp("Stimulate mind test typing")
for word in lem:
    print(word.text,word.lemma_)

Stimulate 
mind 
test 
typing 


## Part of Speech (POS) Tagging

In [6]:
import en_core_web_sm
nlp = en_core_web_sm.load()
docs = nlp(u"Stimulate mind test typing.")

for word in docs:
    print(word.text,word.pos_)

Stimulate VERB
mind NOUN
test NOUN
typing VERB
. PUNCT


## Entity Detection

In [7]:
from spacy import displacy

nytimes= nlp(u"""Stimulate your mind as you test your typing speed with this standard English paragraph typing test. Watch your typing speed and accuracy increase as you learn about a variety of new topics! Over 40 typing test selections available.""")

entities=[(i, i.label_, i.label) for i in nytimes.ents]
entities

[(English, 'LANGUAGE', 389)]

In [8]:
displacy.render(nytimes, style = "ent",jupyter = True)

## Dependency Parsing

In [9]:
docp = nlp (" Stimulate your mind as you test your typing speed with this standard.")

for chunk in docp.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

your mind mind dobj Stimulate
you you nsubj test
your your nsubj typing
speed speed dobj typing
this standard standard pobj with


In [10]:
displacy.render(docp, style="dep", jupyter= True)

## Word Vector Representation

In [11]:
import en_core_web_sm
nlp = en_core_web_sm.load()
mango = nlp(u'mango')
print(mango.vector.shape)
print(mango.vector)

(96,)
[-0.7061122  -1.432946    0.24227908  0.65981305 -0.20285594 -0.33635628
 -1.4245119  -0.11146493 -0.56221634  0.30030626 -0.19000375 -0.08635557
  1.3099953   1.3799536   0.02685273  1.5109317  -0.7333338   0.80945396
  0.29014236 -0.2684871  -0.7413075  -0.75340056  1.5254198  -0.6160394
  0.37298858  0.3126852  -0.68583    -0.75191927  0.58086467 -1.0955325
  0.8663806  -1.9158287  -0.05129775 -0.20604813  0.28277478 -2.0198557
 -0.01264121  0.36663294 -1.2550777   1.6548666  -0.8567238  -0.9216614
  0.29520363  0.01230142 -0.42903072 -0.49667066 -0.25612816 -1.3058069
  1.8100014   0.511529    0.03404026  0.70565414  0.42585483 -0.83498126
  0.5538809   0.57170045 -1.1014041   0.336201    0.07782169  0.5464127
 -0.06026492 -0.57346153  0.68430394 -1.0217377  -0.11573872 -0.9308227
 -0.85589594  0.55057144  1.3896189  -0.5574838   0.19777791  0.31532818
 -0.37644482  0.38533604  0.02513811 -0.29302767 -0.23319077  0.88431716
  0.61514163 -1.1896809   1.3120098   0.4991181  -0.

## Text Classification

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [13]:
df = pd.read_csv ("amazon_alexa.tsv", sep="\t")

In [14]:
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [15]:
df.shape

(3150, 5)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3150 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


In [17]:
df.feedback.value_counts()

1    2893
0     257
Name: feedback, dtype: int64

## Tokening the Data With spaCy

In [18]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

punctuations = string.punctuation
nlp = spacy.load("en_core_web_sm")
stop_words = spacy.lang.en.stop_words.STOP_WORDS
parser = English()

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    return mytokens

In [19]:
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

## Vectorization Feature Engineering (TF-IDF)

In [20]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [21]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [26]:
print(tfidf_vector)

TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x000001A5FBA0D790>)
