In [15]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import TweetTokenizer
import ast


In [4]:
# Sample strings chosen to stress different tokenizer behaviours: a URL,
# a slash-joined pair, a run of emoji, possessive/contraction apostrophes,
# an @-mention and a hyphenated compound.
compare_list = [
    'https://t.co/9z2J3P33Uc',
    'laugh/cry',
    # Real emoji code points; this literal had been saved as mojibake
    # (UTF-8 bytes mis-decoded as Windows-1252).
    '😬😭😓🤢🙄😱',
    "world's problems",
    "@datageneral",
    "It's interesting",
    "don't spell my name right",
    'all-nighter',
]

In [5]:
# Baseline: NLTK's default word tokenizer.
# Each sample string is tokenized exactly once; the stored result is then
# printed, so the displayed tokens and the saved tokens cannot diverge.
word_token = [word_tokenize(sent) for sent in compare_list]
for tokens in word_token:
    print(tokens)

['https', ':', '//t.co/9z2J3P33Uc']
['laugh/cry']
['😬😭😓🤢🙄😱']
['world', "'s", 'problems']
['@', 'datageneral']
['It', "'s", 'interesting']
['do', "n't", 'spell', 'my', 'name', 'right']
['all-nighter']


In [6]:
# WordPunctTokenizer splits text into runs of word characters and runs of
# punctuation, so every punctuation run becomes its own token.
punct_tokenizer = WordPunctTokenizer()
# Tokenize each sample once, then print the stored result.
punct_tokens = [punct_tokenizer.tokenize(sent) for sent in compare_list]
for tokens in punct_tokens:
    print(tokens)

['https', '://', 't', '.', 'co', '/', '9z2J3P33Uc']
['laugh', '/', 'cry']
['😬😭😓🤢🙄😱']
['world', "'", 's', 'problems']
['@', 'datageneral']
['It', "'", 's', 'interesting']
['don', "'", 't', 'spell', 'my', 'name', 'right']
['all', '-', 'nighter']


In [13]:
# RegexpTokenizer with a catch-all pattern: [\W\w] matches any character
# (word or non-word, whitespace included), so each whole string comes back
# as a single token.
# Raw string: '\W' and '\w' are not valid Python string escapes, and a
# non-raw literal triggers an invalid-escape warning on newer Pythons.
pattern = RegexpTokenizer(r'[\W\w]+')
# Tokenize each sample once, then print the stored result.
regexp = [pattern.tokenize(sent) for sent in compare_list]
for tokens in regexp:
    print(tokens)

['https://t.co/9z2J3P33Uc']
['laugh/cry']
['😬😭😓🤢🙄😱']
["world's problems"]
['@datageneral']
["It's interesting"]
["don't spell my name right"]
['all-nighter']


In [14]:
# Split on whitespace: with gaps=True the pattern describes the separators
# between tokens rather than the tokens themselves.
# Raw string: '\s' is not a valid Python string escape, and a non-raw
# literal triggers an invalid-escape warning on newer Pythons.
pattern = RegexpTokenizer(r'[\s]', gaps=True)
# Tokenize each sample once, then print the stored result.
whiteexp = [pattern.tokenize(sent) for sent in compare_list]
for tokens in whiteexp:
    print(tokens)

['https://t.co/9z2J3P33Uc']
['laugh/cry']
['😬😭😓🤢🙄😱']
["world's", 'problems']
['@datageneral']
["It's", 'interesting']
["don't", 'spell', 'my', 'name', 'right']
['all-nighter']


In [18]:
# TweetTokenizer with default settings, applied to the same sample strings.
tc = TweetTokenizer()
# Tokenize each sample once, then print the stored result.
tt = [tc.tokenize(sent) for sent in compare_list]
for tokens in tt:
    print(tokens)

['https://t.co/9z2J3P33Uc']
['laugh', '/', 'cry']
['😬', '😭', '😓', '🤢', '🙄', '😱']
["world's", 'problems']
['@datageneral']
["It's", 'interesting']
["don't", 'spell', 'my', 'name', 'right']
['all-nighter']


In [19]:
# Gather every tokenizer's results side by side:
# one column per tokenizer, one row per sample string.
token_di = {
    'Word_Tokenizer': word_token,
    'Word_Punkt_Tokenizer': punct_tokens,  # holds the WordPunctTokenizer results
    'RegexpTokenizer': regexp,
    'RegexpTokenizer_whitespace': whiteexp,
    'TweetTokenizer': tt,
}
df = pd.DataFrame(data=token_di)

In [23]:
# Show up to the first 10 rows of the comparison table (one row per sample string).
df.head(10)

Unnamed: 0,Word_Tokenizer,Word_Punkt_Tokenizer,RegexpTokenizer,RegexpTokenizer_whitespace,TweetTokenizer
0,"[https, :, //t.co/9z2J3P33Uc]","[https, ://, t, ., co, /, 9z2J3P33Uc]",[https://t.co/9z2J3P33Uc],[https://t.co/9z2J3P33Uc],[https://t.co/9z2J3P33Uc]
1,[laugh/cry],"[laugh, /, cry]",[laugh/cry],[laugh/cry],"[laugh, /, cry]"
2,[😬😭😓🤢🙄😱],[😬😭😓🤢🙄😱],[😬😭😓🤢🙄😱],[😬😭😓🤢🙄😱],"[😬, 😭, 😓, 🤢, 🙄, 😱]"
3,"[world, 's, problems]","[world, ', s, problems]",[world's problems],"[world's, problems]","[world's, problems]"
4,"[@, datageneral]","[@, datageneral]",[@datageneral],[@datageneral],[@datageneral]
5,"[It, 's, interesting]","[It, ', s, interesting]",[It's interesting],"[It's, interesting]","[It's, interesting]"
6,"[do, n't, spell, my, name, right]","[don, ', t, spell, my, name, right]",[don't spell my name right],"[don't, spell, my, name, right]","[don't, spell, my, name, right]"
7,[all-nighter],"[all, -, nighter]",[all-nighter],[all-nighter],[all-nighter]


In [22]:
# Print a summary of the frame: index range, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
Word_Tokenizer                8 non-null object
Word_Punkt_Tokenizer          8 non-null object
RegexpTokenizer               8 non-null object
RegexpTokenizer_whitespace    8 non-null object
TweetTokenizer                8 non-null object
dtypes: object(5)
memory usage: 448.0+ bytes
