# Word tokenizer

## sklearn

In [1]:
# Sample sentence mixing repeated punctuation, parentheses, a contraction and
# quoted speech, used as input for the tokenizers below.
test = "This is a test!!! This is another test! Test (Test) !Let's go. \"Yeah, right\" "
print(test)

This is a test!!! This is another test! Test (Test) !Let's go. "Yeah, right" 


In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Grab the callable a default CountVectorizer uses to turn raw text into tokens
# and run it on the sample. As the output shows, tokens come back lower-cased,
# punctuation is discarded, and the single-character word "a" is not emitted.
analyze = CountVectorizer().build_analyzer()
analyze(test)

['this',
 'is',
 'test',
 'this',
 'is',
 'another',
 'test',
 'test',
 'test',
 'let',
 'go',
 'yeah',
 'right']

## spaCy

In [None]:
from fastai.nlp import *
import spacy

In [24]:
# Load the small English pipeline by its package name. The bare 'en' shortcut
# link is no longer supported as of spaCy v3 and raises an OSError there.
# The model must be installed once beforehand, e.g.:
#     python -m spacy download en_core_web_sm
spacy_tok = spacy.load('en_core_web_sm')

In [25]:
# Materialise the tokens produced by running the spaCy pipeline on the sample;
# unlike the sklearn analyzer, punctuation and the clitic 's are kept as tokens.
list(spacy_tok(test))

[This,
 is,
 a,
 test,
 !,
 !,
 !,
 This,
 is,
 another,
 test,
 !,
 Test,
 (,
 Test,
 ),
 !,
 Let,
 's,
 go,
 .,
 ",
 Yeah,
 ,,
 right,
 "]

# Bag of words (basic)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Three-document toy corpus for the bag-of-words demo.
# NOTE: this rebinds `test`, which earlier in the notebook held a single string.
test = [
    "This is a test!!! This is another test! Test !Let's go. \"Yeah, right\" ",
    (
        "Started one sentence, and replaces 1 word with another "
        "(‘cats are cute’ to ‘justice are cute’). "
        "Then they made labels 1 if it was unchanged, and 0 if it was changed"
    ),
    (
        "However, we may find ourselves in a part of the weight space that isn't "
        "very resilient - that is, small changes to the weights may result in big "
        "changes to the loss. We want to encourage our model to find parts of the "
        "weight space that are both accurate and stable"
    ),
]

In [9]:
# Learn the vocabulary from the corpus and count token occurrences per document
# in a single step. `vec` and `temp` are inspected by the cells that follow.
vec = CountVectorizer()
temp = vec.fit_transform(test)

In [10]:
# (number of documents, number of distinct tokens in the learned vocabulary)
temp.shape

(3, 58)

In [11]:
# Learned mapping from each token to its integer column index in the count matrix.
vec.vocabulary_

{'this': 46,
 'is': 16,
 'test': 41,
 'another': 2,
 'let': 21,
 'go': 12,
 'yeah': 57,
 'right': 35,
 'started': 40,
 'one': 27,
 'sentence': 36,
 'and': 1,
 'replaces': 32,
 'word': 56,
 'with': 55,
 'cats': 6,
 'are': 3,
 'cute': 9,
 'to': 47,
 'justice': 19,
 'then': 44,
 'they': 45,
 'made': 23,
 'labels': 20,
 'if': 14,
 'it': 18,
 'was': 51,
 'unchanged': 48,
 'changed': 7,
 'however': 13,
 'we': 52,
 'may': 24,
 'find': 11,
 'ourselves': 29,
 'in': 15,
 'part': 30,
 'of': 26,
 'the': 43,
 'weight': 53,
 'space': 38,
 'that': 42,
 'isn': 17,
 'very': 49,
 'resilient': 33,
 'small': 37,
 'changes': 8,
 'weights': 54,
 'result': 34,
 'big': 4,
 'loss': 22,
 'want': 50,
 'encourage': 10,
 'our': 28,
 'model': 25,
 'parts': 31,
 'both': 5,
 'accurate': 0,
 'stable': 39}

In [16]:
# List the vocabulary ordered by column index.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# `.tolist()` keeps the displayed result a plain list of Python strings.
vec.get_feature_names_out().tolist()


['accurate',
 'and',
 'another',
 'are',
 'big',
 'both',
 'cats',
 'changed',
 'changes',
 'cute',
 'encourage',
 'find',
 'go',
 'however',
 'if',
 'in',
 'is',
 'isn',
 'it',
 'justice',
 'labels',
 'let',
 'loss',
 'made',
 'may',
 'model',
 'of',
 'one',
 'our',
 'ourselves',
 'part',
 'parts',
 'replaces',
 'resilient',
 'result',
 'right',
 'sentence',
 'small',
 'space',
 'stable',
 'started',
 'test',
 'that',
 'the',
 'then',
 'they',
 'this',
 'to',
 'unchanged',
 'very',
 'want',
 'was',
 'we',
 'weight',
 'weights',
 'with',
 'word',
 'yeah']

In [15]:
# Densify the count matrix returned by fit_transform so it can be inspected:
# one row per document, one column per vocabulary index, each cell a token count.
temp.toarray()

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 0,
        0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 2, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 1, 1, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
        1, 1, 0, 1, 1, 0, 0, 2, 0, 0, 0, 1, 1, 0],
       [1, 1, 0, 1, 1, 1, 0, 0, 2, 0, 1, 2, 0, 1, 0, 2, 1, 1, 0, 0, 0, 0,
        1, 0, 2, 1, 2, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 2, 1, 0, 0, 3, 4,
        0, 0, 0, 4, 0, 1, 1, 0, 2, 2, 1, 0, 0, 0]])

# TF-IDF

In [12]:
import pandas as pd

In [13]:
import numpy as np

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer over unigrams and bigrams, with fractional lower/upper
# document-frequency bounds. NOTE: this rebinds `vec` from the bag-of-words section.
vec = TfidfVectorizer(
    min_df=0.2,
    max_df=0.5,
    ngram_range=(1, 2),
)

In [21]:
# Same three documents as the bag-of-words section.
test = [
    "This is a test!!! This is another test! Test !Let's go. \"Yeah, right\" ",
    (
        "Started one sentence, and replaces 1 word with another "
        "(‘cats are cute’ to ‘justice are cute’). "
        "Then they made labels 1 if it was unchanged, and 0 if it was changed"
    ),
    (
        "However, we may find ourselves in a part of the weight space that isn't "
        "very resilient - that is, small changes to the weights may result in big "
        "changes to the loss. We want to encourage our model to find parts of the "
        "weight space that are both accurate and stable"
    ),
]
# A second, tiny corpus of short review-like phrases (not used by the cells shown here).
texts = [
    "good movie",
    "not a good movie",
    "did not like",
    "i like it",
    "good one",
]

In [24]:
# Fit the TF-IDF vectorizer on the corpus and transform it in one step, then
# display the result (a sparse matrix, per the repr below).
features = vec.fit_transform(test)
features

<3x129 sparse matrix of type '<class 'numpy.float64'>'
	with 129 stored elements in Compressed Sparse Row format>

In [23]:
# Tabulate the TF-IDF weights: one row per document, one column per n-gram.
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`. `toarray()` is used (as in the bag-of-words
# section) so pandas receives a plain ndarray rather than the legacy
# `np.matrix` that `todense()` returns.
pd.DataFrame(features.toarray(), columns=vec.get_feature_names_out())

Unnamed: 0,accurate,accurate and,and if,and replaces,and stable,another cats,another test,are both,are cute,big,...,weight,weight space,weights,weights may,with,with another,word,word with,yeah,yeah right
0,0.0,0.0,0.0,0.0,0.0,0.0,0.182574,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.182574,0.182574
1,0.0,0.0,0.125988,0.125988,0.0,0.125988,0.0,0.0,0.251976,0.0,...,0.0,0.0,0.0,0.0,0.125988,0.125988,0.125988,0.125988,0.0,0.0
2,0.085749,0.085749,0.0,0.0,0.085749,0.0,0.0,0.085749,0.0,0.085749,...,0.171499,0.171499,0.085749,0.085749,0.0,0.0,0.0,0.0,0.0,0.0
