# https://machinelearningmastery.com/clean-text-machine-learning-python/

In [1]:
import pandas as pd
import numpy as np

In [1]:
# Load the whole book into memory and split it into whitespace-delimited words.
filename = 'Metamorphosis.txt'
# A context manager guarantees the file handle is closed once the text is read,
# even if read() raises.
with open(filename, 'rt') as file:
    text = file.read()
# split into words by white space
words = text.split()

# Now we need to split this text into words, and here we have 3 options:
### 1- Split by Whitespace (issue: punctuation is preserved, e.g. “wasn’t” and “armour-like”)
#### >>> words = text.split()

### 2- Split using regex (issue: “What’s” becomes two words, “What” and “s”)
#### >>> import re
#### >>> words = re.split(r'\W+', text)

### 3- Split by Whitespace and Remove Punctuation (“What’s” has become “Whats”, but “armour-like” has become “armourlike”.)
#### >>> import string
#### >>> table = str.maketrans('', '', string.punctuation)
#### >>> stripped = [w.translate(table) for w in words]

In [33]:
import string

# Translation table that deletes every character listed in string.punctuation.
table = str.maketrans('', '', string.punctuation)
# Apply the table to each whitespace-delimited word; words are cleaned, never dropped.
stripped = [token.translate(table) for token in words]

print(len(stripped))
# 25186
# same as print(len(words))

print(len(set(stripped)))
# 3352
# fewer than print(len(set(words))) <which is 4595>

25186
3352


# Normalizing Case

### Note: This means that the vocabulary will shrink in size, but some distinctions are lost (e.g. “Apple” the company vs “apple” the fruit is a commonly used example or ‘Bush’ is different than ‘bush’, while ‘Another’ has usually the same sense as ‘another’).
### One more thing to consider: a word that is capitalized in the middle of a paragraph should not be lowercased, because it was very likely capitalized for a specific reason, e.g. US vs us.

In [34]:
# Lowercase every punctuation-stripped word so that case variants collapse together.
lower = list(map(str.lower, stripped))

# Tokenization and Cleaning with NLTK

## Split into sentences

#### A useful first step is to split the text into sentences.

Some modeling tasks prefer input to be in the form of paragraphs or sentences, such as word2vec. You could first split your text into sentences, split each sentence into words, then save each sentence to file, one per line.

NLTK provides the sent_tokenize() function to split text into sentences.

In [41]:
from nltk import sent_tokenize
# Split the raw text into sentences with NLTK and show the first one.
sentences = sent_tokenize(text)
print(sentences[0])

The Project Gutenberg EBook of Metamorphosis, by Franz Kafka
Translated by David Wyllie.


### Running the example, we can see that although the document is split into sentences, each sentence still preserves the new line from the artificial wrap of the lines in the original document.

## Split into Words

In [59]:
from nltk.tokenize import word_tokenize
# Tokenize the raw text with NLTK and preview the first 100 tokens.
tokens = word_tokenize(text)
print(tokens[:100])

['The', 'Project', 'Gutenberg', 'EBook', 'of', 'Metamorphosis', ',', 'by', 'Franz', 'Kafka', 'Translated', 'by', 'David', 'Wyllie', '.', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', '.', 'You', 'may', 'copy', 'it', ',', 'give', 'it', 'away', 'or', 're-use', 'it', 'under', 'the', 'terms', 'of', 'the', 'Project', 'Gutenberg', 'License', 'included', 'with', 'this', 'eBook', 'or', 'online', 'at', 'www.gutenberg.net', '**', 'This', 'is', 'a', 'COPYRIGHTED', 'Project', 'Gutenberg', 'eBook', ',', 'Details', 'Below', '**', '**', 'Please', 'follow', 'the', 'copyright', 'guidelines', 'in', 'this', 'file', '.', '**', 'Title', ':', 'Metamorphosis', 'Author', ':', 'Franz', 'Kafka', 'Translator', ':', 'David', 'Wyllie', 'Release', 'Date', ':', 'August', '16']


### It splits tokens based on white space and punctuation. For example, commas and periods are taken as separate tokens. Contractions are split apart (e.g. “What’s” becomes “What” “‘s“). Quotes are kept, and so on.

## Filter Out Punctuation
### We can filter out all tokens that we are not interested in, such as all standalone punctuation.

In [60]:
# Re-tokenize the raw text so this cell does not depend on the previous one.
tokens = word_tokenize(text)
# Lowercase each token once, then keep only the purely alphabetic ones
# (drops standalone punctuation, numbers and tokens such as 're-use').
words = [token for token in map(str.lower, tokens) if token.isalpha()]
print(words[:100])

['the', 'project', 'gutenberg', 'ebook', 'of', 'metamorphosis', 'by', 'franz', 'kafka', 'translated', 'by', 'david', 'wyllie', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'this', 'is', 'a', 'copyrighted', 'project', 'gutenberg', 'ebook', 'details', 'below', 'please', 'follow', 'the', 'copyright', 'guidelines', 'in', 'this', 'file', 'title', 'metamorphosis', 'author', 'franz', 'kafka', 'translator', 'david', 'wyllie', 'release', 'date', 'august', 'ebook', 'first', 'posted', 'may', 'last', 'updated', 'may', 'language', 'english', 'start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'metamorphosis', 'copyright']


# Filter out Stop Words (and Pipeline)

In [50]:
from nltk.corpus import stopwords
# Collect the English stop words into a set and display them.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
stop_words = set(stopwords.words('english'))
print(stop_words)

{'no', 'be', 's', 'was', 'some', "doesn't", 'doing', "needn't", 'been', 'she', 'they', 'for', 'yours', 're', 'it', 'on', 'below', 'under', "don't", 'yourselves', 'above', 'own', "shan't", 'haven', 'won', 'her', 'these', "haven't", "shouldn't", 'my', 'didn', 'there', 'an', 'y', 'as', 'those', 'again', 'if', 'ourselves', 'of', 'most', 'between', 'them', 'are', 'other', 'until', 'than', 'had', 'wouldn', "weren't", 'having', 'its', "that'll", 'how', 'i', 'you', "you'll", 'll', 'against', 'hers', 'by', 'up', 'herself', 'all', 'while', 'same', "isn't", 'that', 'is', 'both', 'to', 'we', "hadn't", "aren't", 'isn', 'should', 'their', 'our', 'hadn', 'couldn', 'himself', 'once', 'so', 'him', "it's", 'now', 'whom', 'being', 'just', 'did', 'will', 't', 'itself', 'theirs', 'when', 'has', 'wasn', "wasn't", "mustn't", 'over', 'with', 'not', 'out', 'more', 'aren', 'before', 'doesn', 'shouldn', 'ma', 'his', 'mightn', 'nor', 'hasn', "mightn't", 'after', 'do', 'myself', 'your', 'only', "you'd", 'needn', "

In [61]:
# Drop every word that appears in the stop-word set, then preview the result.
words = [token for token in words if token not in stop_words]
print(words[:100])

['project', 'gutenberg', 'ebook', 'metamorphosis', 'franz', 'kafka', 'translated', 'david', 'wyllie', 'ebook', 'use', 'anyone', 'anywhere', 'cost', 'almost', 'restrictions', 'whatsoever', 'may', 'copy', 'give', 'away', 'terms', 'project', 'gutenberg', 'license', 'included', 'ebook', 'online', 'copyrighted', 'project', 'gutenberg', 'ebook', 'details', 'please', 'follow', 'copyright', 'guidelines', 'file', 'title', 'metamorphosis', 'author', 'franz', 'kafka', 'translator', 'david', 'wyllie', 'release', 'date', 'august', 'ebook', 'first', 'posted', 'may', 'last', 'updated', 'may', 'language', 'english', 'start', 'project', 'gutenberg', 'ebook', 'metamorphosis', 'copyright', 'c', 'david', 'wyllie', 'metamorphosis', 'franz', 'kafka', 'translated', 'david', 'wyllie', 'one', 'morning', 'gregor', 'samsa', 'woke', 'troubled', 'dreams', 'found', 'transformed', 'bed', 'horrible', 'vermin', 'lay', 'back', 'lifted', 'head', 'little', 'could', 'see', 'brown', 'belly', 'slightly', 'domed', 'divided',

# Stem Words

### Stemming refers to the process of reducing each word to its root or base.
### For example “fishing,” “fished,” “fisher” all reduce to the stem “fish.”


In [62]:
from nltk.stem.porter import PorterStemmer

# Reduce each remaining word to its Porter stem and preview the first 100.
porter = PorterStemmer()
stemmed = list(map(porter.stem, words))
print(stemmed[:100])

['project', 'gutenberg', 'ebook', 'metamorphosi', 'franz', 'kafka', 'translat', 'david', 'wylli', 'ebook', 'use', 'anyon', 'anywher', 'cost', 'almost', 'restrict', 'whatsoev', 'may', 'copi', 'give', 'away', 'term', 'project', 'gutenberg', 'licens', 'includ', 'ebook', 'onlin', 'copyright', 'project', 'gutenberg', 'ebook', 'detail', 'pleas', 'follow', 'copyright', 'guidelin', 'file', 'titl', 'metamorphosi', 'author', 'franz', 'kafka', 'translat', 'david', 'wylli', 'releas', 'date', 'august', 'ebook', 'first', 'post', 'may', 'last', 'updat', 'may', 'languag', 'english', 'start', 'project', 'gutenberg', 'ebook', 'metamorphosi', 'copyright', 'c', 'david', 'wylli', 'metamorphosi', 'franz', 'kafka', 'translat', 'david', 'wylli', 'one', 'morn', 'gregor', 'samsa', 'woke', 'troubl', 'dream', 'found', 'transform', 'bed', 'horribl', 'vermin', 'lay', 'back', 'lift', 'head', 'littl', 'could', 'see', 'brown', 'belli', 'slightli', 'dome', 'divid', 'arch', 'stiff', 'section']


# Tips for Cleaning Text for Word Embedding

Recently, the field of natural language processing has been moving away from bag-of-word models and word encoding toward word embeddings.

The benefit of word embeddings is that they encode each word into a dense vector that captures something about its relative meaning within the training text.

This means that variations of words like case, spelling, punctuation, and so on will automatically be learned to be similar in the embedding space. In turn, this can mean that the amount of cleaning required from your text may be less and perhaps quite different to classical text cleaning.

For example, it may no longer make sense to stem words or remove punctuation for contractions.

Tomas Mikolov is one of the developers of word2vec, a popular word embedding method. He suggests only very minimal text cleaning is required when learning a word embedding model.