# Tokenization

In [1]:
import numpy as np
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize

In [2]:
# Two-sentence sample string fed to the sentence and word tokenizers below.
text = "Hello this a sample text. Where are you from?"

In [3]:
# Split the sample text into sentences using the tokenizer imported at the top
# of the notebook (the same function that nltk.sent_tokenize refers to).
tokens_sents = sent_tokenize(text)
print(tokens_sents)

['Hello this a sample text.', 'Where are you from?']


In [4]:
# Split the sample text into word-level tokens; the recorded output shows
# punctuation marks coming back as separate tokens.
tokens_words = word_tokenize(text)
print(tokens_words)

['Hello', 'this', 'a', 'sample', 'text', '.', 'Where', 'are', 'you', 'from', '?']


# Stemming

In [5]:
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [6]:
# Porter stemmer instance; it is reused by the next stemming cell.
ps = PorterStemmer()

# Plain string literal: the parentheses around it were redundant.
word = "civilization"

# Last expression of the cell, so the notebook displays the stem.
ps.stem(word)

'civil'

In [7]:
# Stem a capitalized plural; the recorded output is the lowercase 'worker'.
ps.stem("Workers")

'worker'

In [8]:
# Snowball stemmer configured for English; the language is the first
# positional argument of the constructor.
stemmer = SnowballStemmer("english")

# Last expression of the cell, so the notebook displays the stem.
stemmer.stem("Construction")

'construct'

In [9]:
# Stem an adverb with the Snowball stemmer; the recorded output is 'random'.
stemmer.stem("Randomly")

'random'

# Lemmatization

In [10]:
from nltk.stem import WordNetLemmatizer 
# Single lemmatizer instance, reused by the lemmatization cell that follows.
lemmatizer = WordNetLemmatizer()

In [24]:
# Lemmatize two plural nouns and print one lemma per line. A single print call
# with a newline separator produces the same text as two separate prints.
print(*map(lemmatizer.lemmatize, ["workers", "beaches"]), sep="\n")

worker
beach


# POS Tagging

In [22]:
# Sentence to tag; this rebinds the notebook-wide `text` variable.
text = "The striped bats are hanging on their feet for best"

# Tokenize with the directly imported word_tokenize (the same function that
# nltk.word_tokenize refers to).
tokens = word_tokenize(text)

# Two positional arguments: print inserts its own separator after the label,
# which is why the recorded output has two spaces after the colon.
print("Parts of Speech: ", nltk.pos_tag(tokens))

Parts of Speech:  [('The', 'DT'), ('striped', 'JJ'), ('bats', 'NNS'), ('are', 'VBP'), ('hanging', 'VBG'), ('on', 'IN'), ('their', 'PRP$'), ('feet', 'NNS'), ('for', 'IN'), ('best', 'JJS')]


# Stop Words Removal

In [13]:
from nltk.corpus import stopwords

In [25]:
# German sample sentence for the stop-word removal demo; this rebinds the
# notebook-wide `text` variable used by earlier sections.
text = "Ich habe ein bisschen deutsch lernen"

In [26]:
# Lowercase before tokenizing so the tokens can be matched against the
# stop-word list in the next cell (assumes that list is lowercase -- confirm).
tokens = word_tokenize(text.lower())

In [27]:
# Load the German stop-word list. The sample sentence is German, so the
# variable is named after the language it actually holds (it was previously
# called `english_stopwords`). A set gives constant-time membership tests in
# the filter below instead of scanning a list for every token.
german_stopwords = set(stopwords.words('german'))

# Keep the (already lowercased) tokens that are not stop words, preserving
# their original order.
tokens_wo_stopwords = [t for t in tokens if t not in german_stopwords]

print("Text without stop words:", " ".join(tokens_wo_stopwords))

Text without stop words: bisschen deutsch lernen


# Term Frequency & Inverse Document Frequency

---> term frequency(t, d) = number of times term t occurs in document d / total number of words in document d

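---> inverse document frequency(t) = log(total number of documents / number of documents containing term t)

Note: scikit-learn's `TfidfVectorizer` uses a smoothed variant by default, idf(t) = ln((1 + N) / (1 + df(t))) + 1, and L2-normalises each document vector, so the values printed below will not match the textbook formula exactly.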

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# Three short example documents that form the corpus for the TF-IDF demo.
d0 = 'Nothing ever goes as planned in this accursed world.'
d1 = 'The longer you live, the more you realize that the only things that truly exist in this reality are merely pain, suffering and futility.'
d2 = 'Listen, everywhere you look in this world, wherever there is light, there will always be shadows to be found as well.'

# Corpus in document order: row i of the printed TF-IDF output corresponds to
# string[i] (row 0 holds d0's terms, row 1 holds d1's, ...).
string = [d0, d1, d2]

In [19]:
# Vectorizer with default settings (no arguments are passed).
tfidf = TfidfVectorizer()

# Fit on the three documents and transform them in one call. `result` is what
# the next cell prints as "(row, column)  weight" entries.
result = tfidf.fit_transform(string)

In [20]:
# Show the term -> column-index mapping learned by the vectorizer. The label
# and the mapping go through one print call, separated by a newline.
print('\nWord indexes:', tfidf.vocabulary_, sep='\n')

# Show the TF-IDF weights; each printed entry is "(document row, term column)  weight".
print('\ntf-idf value:', result, sep='\n')


Word indexes:
{'nothing': 21, 'ever': 6, 'goes': 11, 'as': 4, 'planned': 24, 'in': 12, 'this': 33, 'accursed': 0, 'world': 39, 'the': 30, 'longer': 17, 'you': 40, 'live': 16, 'more': 20, 'realize': 26, 'that': 29, 'only': 22, 'things': 32, 'truly': 35, 'exist': 8, 'reality': 25, 'are': 3, 'merely': 19, 'pain': 23, 'suffering': 28, 'and': 2, 'futility': 10, 'listen': 15, 'everywhere': 7, 'look': 18, 'wherever': 37, 'there': 31, 'is': 13, 'light': 14, 'will': 38, 'always': 1, 'be': 5, 'shadows': 27, 'to': 34, 'found': 9, 'well': 36}

tf-idf value:
  (0, 39)	0.29048754376040503
  (0, 0)	0.38195621126357737
  (0, 33)	0.22558949136203243
  (0, 12)	0.22558949136203243
  (0, 24)	0.38195621126357737
  (0, 4)	0.29048754376040503
  (0, 11)	0.38195621126357737
  (0, 6)	0.38195621126357737
  (0, 21)	0.38195621126357737
  (1, 10)	0.17957271370263975
  (1, 2)	0.17957271370263975
  (1, 28)	0.17957271370263975
  (1, 23)	0.17957271370263975
  (1, 19)	0.17957271370263975
  (1, 3)	0.17957271370263975
  