<a href="https://colab.research.google.com/github/bipinKrishnan/fastai_course/blob/master/text_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from torchvision.datasets.utils import download_and_extract_archive
from torch.utils.data import DataLoader, Dataset

from pathlib import Path

In [None]:
url = 'https://download.pytorch.org/tutorial/data.zip'
download_and_extract_archive(url, '.')

In [None]:
path = '/content/data/names/'

names, targets, vocabs

In [None]:
class Preprocess:
  def __init__(self, path):
    self.path = path

  def get_names_targets(self):
    self.corpus = []
    for files in Path(self.path).glob('*.txt'):
      with open(files) as f:
        for l in f.readlines():
          self.corpus.append((l.split('\n')[0], files.stem))

    return self.corpus

  def get_targets(self):
    targets = []
    for files in Path(self.path).glob('*.txt'):
      targets.append(files.stem)

    return targets

  def get_vocab(self):
    vocab = set()

    for name, target in self.corpus:
      vocab.add(name)

    return list(vocab)

In [None]:
class LoadDataset(Dataset):
  def __init__(self, names_targets, names_vocab, target_vocab):
    self.name_target = names_targets
    self.name_vocab = names_vocab
    self.target_vocab = target_vocab
  
  def __getitem__(self, idx): 
    item = self.name_target[idx]
    return self.name_vocab.index(item[0]), self.target_vocab.index(item[1])

  def __len__(self): return len(self.name_target)

In [None]:
pre = Preprocess(path)

data = pre.get_names_targets()
target_vocab = pre.get_targets()
name_vocab = pre.get_vocab()

In [None]:
ds = LoadDataset(data, name_vocab, target_vocab)

In [None]:
for x, y in ds:
  print(x, y)
  break

5823 0


In [None]:
dls = DataLoader(ds, 16, shuffle=True)

In [None]:
for data, label in dls:
  print(data.shape, label.shape)
  print(data[0], label[0],'\n', name_vocab[data[0].item()], target_vocab[label[0].item()])
  break

torch.Size([16]) torch.Size([16])
tensor(1468) tensor(8) 
 Jigailo Russian


# Spacy library

In [4]:
text = "In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth."

In [1]:
import spacy

In [3]:
spacy_nlp = spacy.load('en_core_web_sm')

In [5]:
doc = spacy_nlp(text)

In [None]:
[token.text for token in doc]

In [14]:
text1 = "ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456"
doc1 = spacy_nlp(text1)

[token.text for token in doc1]

['ConcateStringAnd123',
 'ConcateSepcialCharacter_!@',
 '#',
 '!',
 '@#$%^&*()_+',
 '0123456']

In [16]:
text2 = "Let’s go to N.Y.!"
doc2 = spacy_nlp(text2)

[token.text for token in doc2]

['Let', '’s', 'go', 'to', 'N.Y.', '!']

# NLTK

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
nltk.word_tokenize(text)

In [None]:
nltk.word_tokenize(text1)

In [27]:
nltk.sent_tokenize(text), nltk.sent_tokenize(text1), nltk.sent_tokenize(text2)

(['In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning).',
  'A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer.',
  'A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.'],
 ['ConcateStringAnd123 ConcateSepcialCharacter_!',
  '@# !',
  '@#$%^&*()_+ 0123456'],
 ['Let’s go to N.Y.!'])

In [None]:
#parts of speech tagging
tokens = nltk.word_tokenize(text)
nltk.pos_tag(tokens)

# Stop word removal

### NLTK

### Remove Stopwords

We can remove stopwords while performing the following tasks:

Text Classification
  * Spam Filtering
  * Language Classification
  * Genre Classification
  * Caption Generation
  * Auto-Tag Generation

 
### Avoid Stopword Removal

   * Machine Translation
   * Language Modeling
   * Text Summarization
   * Question-Answering problems


In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

In [57]:
tokens = nltk.word_tokenize(text)
stop_word = set(stopwords.words('english'))

In [None]:
[token for token in tokens if token not in stop_word]

In [None]:
t = "He determined to drop his litigation with the monastry and relinguish his claims to the wood-cuting and \n fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had \n indeed the vaguest idea where the wood and river in question were."

to = nltk.word_tokenize(t)
[token for token in to if token not in stop_word]

# Text normalization(stemming & lemmatization)

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

In [78]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
ps.stem('going'), lemmatizer.lemmatize('going', pos='v') #lemmatizes only the specified parts of speech

('go', 'go')

In [None]:
[ps.stem(token) for token in tokens if token not in stop_word]

In [79]:
lemma = []

for token in tokens:
  if token not in stop_word:
    word = lemmatizer.lemmatize(token, pos='n')
    word = lemmatizer.lemmatize(word, pos='v')
    word = lemmatizer.lemmatize(word, pos='a')

    lemma.append(word)

In [None]:
lemma

### Spacy lemmatization

In [82]:
nlp = spacy.load('en_core_web_sm')

In [None]:
doc = nlp(text)

[token.lemma_ for token in doc if token.text not in stop_word]