In [1]:
import nltk
import re
import string

In [2]:
# Three small sample documents used throughout the notebook; together they
# contain contractions, punctuation, a currency amount and markup-like symbols.
corpus = [
    "The brown fox wasn't that quick and he couldn't win the race",
    "Hey that's a great deal! I just bought a phone for $199",
    "@@You'll (learn) a **lot** in the book. Python is an amazing language!@@",
]

**Function for Tokenization of Sentences and Words**

In [8]:
def tokenize_text(raw_text,language="english"):
    """Split raw text into sentences, then split each sentence into word tokens.

    Args:
        raw_text: The text to tokenize.
        language: Language name passed to both NLTK tokenizers
            (defaults to "english").

    Returns:
        A list with one entry per sentence; each entry is the list of word
        tokens produced by ``nltk.word_tokenize`` for that sentence.
    """
    sentences=nltk.sent_tokenize(raw_text,language=language)
    # Forward `language` to the word tokenizer as well, so that a non-default
    # language is applied consistently at both tokenization levels.
    return [nltk.word_tokenize(sentence,language=language) for sentence in sentences]

In [9]:
# Tokenize every document: the result is nested as document -> sentence -> tokens.
tokens_list=list(map(tokenize_text,corpus))
tokens_list

[[['The',
   'brown',
   'fox',
   'was',
   "n't",
   'that',
   'quick',
   'and',
   'he',
   'could',
   "n't",
   'win',
   'the',
   'race']],
 [['Hey', 'that', "'s", 'a', 'great', 'deal', '!'],
  ['I', 'just', 'bought', 'a', 'phone', 'for', '$', '199']],
 [['@',
   '@',
   'You',
   "'ll",
   '(',
   'learn',
   ')',
   'a',
   '**lot**',
   'in',
   'the',
   'book',
   '.'],
  ['Python', 'is', 'an', 'amazing', 'language', '!'],
  ['@', '@']]]

**Removing Special Characters After Tokenization**

In [39]:
def remove_characters_after_tokenization(tokens):
    """Strip punctuation characters from each token and drop emptied tokens.

    Args:
        tokens: An iterable of string tokens.

    Returns:
        A list of the tokens with every character from ``string.punctuation``
        removed; tokens that become empty strings are left out.
    """
    punctuation_class='[{}]'.format(re.escape(string.punctuation))
    cleaned_tokens=[]
    for token in tokens:
        stripped=re.sub(punctuation_class,'',token)
        # A token made only of punctuation collapses to '' and is discarded.
        if stripped:
            cleaned_tokens.append(stripped)
    return cleaned_tokens

In [41]:
# Strip punctuation from every token, then drop sentences that end up with no
# tokens at all (e.g. the trailing '@@' sentence of the third document).
# The emptiness filter must run per sentence; filtering only the outer list of
# documents leaves empty sentence lists behind. Documents that end up with no
# sentences are dropped as well.
filtered_list=list(filter(None,[
    [tokens for tokens in map(remove_characters_after_tokenization,sentence_tokens) if tokens]
    for sentence_tokens in tokens_list
]))
filtered_list

[[['The',
   'brown',
   'fox',
   'was',
   'nt',
   'that',
   'quick',
   'and',
   'he',
   'could',
   'nt',
   'win',
   'the',
   'race']],
 [['Hey', 'that', 's', 'a', 'great', 'deal'],
  ['I', 'just', 'bought', 'a', 'phone', 'for', '199']],
 [['You', 'll', 'learn', 'a', 'lot', 'in', 'the', 'book'],
  ['Python', 'is', 'an', 'amazing', 'language']]]

**Removing Special Characters Before Tokenization**

In [49]:
def remove_characters_before_tokenization(sentence,keep_apostrophes=False):
    """Remove special characters from a raw sentence before it is tokenized.

    Args:
        sentence: The raw text; leading and trailing whitespace is stripped first.
        keep_apostrophes: When True, only a fixed set of symbols is removed, so
            apostrophes and sentence punctuation survive. When False, every
            character other than ASCII letters, digits and spaces is removed.

    Returns:
        The filtered string.
    """
    trimmed=sentence.strip()
    # Extend the first character class to remove further symbols.
    # NOTE: inside [...] the `|` is a literal pipe, not alternation, so pipe
    # characters are removed too.
    unwanted=r'[?|$|&|*|%|@|(|)|~]' if keep_apostrophes else r'[^0-9a-zA-Z ]'
    return re.sub(unwanted,r'',trimmed)

In [50]:
# Default mode: keep only ASCII letters, digits and spaces in each raw document.
filtered_list=list(map(remove_characters_before_tokenization,corpus))
filtered_list

['The brown fox wasnt that quick and he couldnt win the race',
 'Hey thats a great deal I just bought a phone for 199',
 'Youll learn a lot in the book Python is an amazing language']

In [53]:
# Same cleaning step, but with apostrophes kept so that contractions remain
# recognisable for the expansion step.
cleaned_corpus=[
    remove_characters_before_tokenization(document,keep_apostrophes=True)
    for document in corpus
]
cleaned_corpus

["The brown fox wasn't that quick and he couldn't win the race",
 "Hey that's a great deal! I just bought a phone for 199",
 "You'll learn a lot in the book. Python is an amazing language!"]

**Contraction Map**
* The contraction map below lists many contractions together with their expanded forms; it can be used to replace contractions in text.

In [52]:
# Maps each contraction to its expanded form. Keys are lower-case, except the
# "I..." forms, which are listed in both capitalised and lower-case variants.
# Some contractions are ambiguous (e.g. "he'd" can mean "he had" or "he would");
# the map holds a single expansion for each.
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [60]:
def expand_contractions(sentence,contraction_map):
    """Replace every contraction found in `sentence` with its expanded form.

    Matching is case-insensitive. An exact-case key is preferred; otherwise the
    lookup falls back to a case-insensitive match. When the matched text starts
    with an upper-case letter, the expansion is capitalised as well.

    Args:
        sentence: The text in which contractions are expanded.
        contraction_map: Mapping of contraction -> expansion.

    Returns:
        The sentence with all known contractions expanded. The sentence is
        returned unchanged when the map has no usable keys.
    """
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so "can't" must not be tried before "can't've".
    ordered_keys=sorted((key for key in contraction_map if key),key=len,reverse=True)
    if not ordered_keys:
        # An empty alternation would match the empty string at every position.
        return sentence
    contraction_pattern=re.compile('({})'.format('|'.join(re.escape(key) for key in ordered_keys)),flags=re.IGNORECASE|re.DOTALL)
    lowercase_map={key.lower():expansion for key,expansion in contraction_map.items()}
    def expand_match(contraction):
        match=contraction.group()
        expanded_contraction=contraction_map.get(match) or lowercase_map.get(match.lower())
        if not expanded_contraction:
            # No usable expansion: leave the original text untouched.
            return match
        # Carry over only the capitalisation. Copying the matched first
        # character would corrupt expansions that start with a different
        # character (e.g. "ain't" -> "is not", "'cause" -> "because").
        if match[0].isupper():
            expanded_contraction=expanded_contraction[0].upper()+expanded_contraction[1:]
        return expanded_contraction
    return contraction_pattern.sub(expand_match,sentence)

In [62]:
# Expand contractions in the apostrophe-preserving cleaned corpus.
expanded_corpus = [
    expand_contractions(document,CONTRACTION_MAP)
    for document in cleaned_corpus
]
expanded_corpus

['The brown fox was not that quick and he could not win the race',
 'Hey that is a great deal! I just bought a phone for 199',
 'You will learn a lot in the book. Python is an amazing language!']

**Case Conversion**
* upper() and lower() functions can be used.

In [64]:
# Lower-case the first document; str.lower() returns a new string and leaves
# the entry in `corpus` unchanged.
corpus[0].lower()

"the brown fox wasn't that quick and he couldn't win the race"

**Removing Stop Words**
* Stop words are very common words such as "a", "the" and "me" that appear frequently in text.
* A ready-made list of stop words is available from the nltk library: nltk.corpus.stopwords.words("english")

In [65]:
def remove_stopwords(tokens):
    """Return the tokens that are not English stop words.

    The NLTK stop-word list is lower-case, so each token is lower-cased for the
    comparison only; this way capitalised stop words such as "The" or "I" are
    removed too. Tokens that are kept retain their original casing.

    Args:
        tokens: An iterable of string tokens.

    Returns:
        A list of the tokens that are not in the NLTK English stop-word list.
    """
    # A set gives constant-time membership checks instead of scanning the list
    # once per token.
    stopword_set=set(nltk.corpus.stopwords.words("english"))
    return [token for token in tokens if token.lower() not in stopword_set]

In [70]:
# Tokenize the contraction-expanded corpus. Tokenizing cleaned_corpus here
# would put the unexpanded contraction pieces ("n't", "'s", "'ll") back into
# the token lists.
expanded_corpus_tokens=[tokenize_text(text) for text in expanded_corpus]
expanded_corpus_tokens_wo_stopws=[[remove_stopwords(tokens) for tokens in sentence_tokens] for sentence_tokens in expanded_corpus_tokens]
expanded_corpus_tokens

[[['The',
   'brown',
   'fox',
   'was',
   'not',
   'that',
   'quick',
   'and',
   'he',
   'could',
   'not',
   'win',
   'the',
   'race']],
 [['Hey', 'that', 'is', 'a', 'great', 'deal', '!'],
  ['I', 'just', 'bought', 'a', 'phone', 'for', '199']],
 [['You', 'will', 'learn', 'a', 'lot', 'in', 'the', 'book', '.'],
  ['Python', 'is', 'an', 'amazing', 'language', '!']]]