In [1]:
import nltk
# Fetch NLTK data. It is called with no arguments; the recorded output below
# shows it reading the nltk_data package index.
# NOTE(review): presumably this opens NLTK's interactive downloader, which would
# stall an unattended "Restart & Run All" - consider naming the required
# packages explicitly once the exact list is confirmed.
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [67]:
import nltk
from nltk.tokenize import sent_tokenize
# Sample paragraph of three sentences. The second sentence contains a long run
# of spaces; the recorded output shows sentence splitting leaves it untouched.
# `text` is also read by the word-tokenizing cell that follows.
text = "Statistics skills, and programming skills are equally important for analytics. Statistics skills, and              domain knowledge are important for analytics. I like reading books and travelling."
# Split the paragraph into a list with one string per sentence.
sent_tokenize_list = sent_tokenize(text)
print(sent_tokenize_list)

['Statistics skills, and programming skills are equally important for analytics.', 'Statistics skills, and              domain knowledge are important for analytics.', 'I like reading books and travelling.']


In [68]:
#Word tokenizing
#Splits the sample `text` defined in the previous cell into tokens; the recorded
#output shows punctuation marks come back as separate tokens.
from nltk.tokenize import word_tokenize
print(word_tokenize(text))

['Statistics', 'skills', ',', 'and', 'programming', 'skills', 'are', 'equally', 'important', 'for', 'analytics', '.', 'Statistics', 'skills', ',', 'and', 'domain', 'knowledge', 'are', 'important', 'for', 'analytics', '.', 'I', 'like', 'reading', 'books', 'and', 'travelling', '.']


In [69]:
#Remove stop words
from nltk.corpus import stopwords
#Function to remove stopwords
def remove_stopwords(text, lang='english'):
    """Return ``text`` with stop words removed.

    The text is split with ``nltk.word_tokenize`` and every token whose
    lowercase form appears in NLTK's stop-word list for ``lang`` is dropped.
    The surviving tokens are joined with single spaces, so the original
    spacing of ``text`` is not preserved.

    Args:
        text: String to filter.
        lang: Language name handed to ``stopwords.words``; defaults to
            ``'english'``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    words = nltk.word_tokenize(text)
    # Build a set once: membership tests become constant-time instead of
    # rescanning the whole stop-word list for every token.
    lang_stopwords = set(stopwords.words(lang))
    stopwords_removed = [w for w in words if w.lower() not in lang_stopwords]
    return " ".join(stopwords_removed)
print(remove_stopwords("This is a sample        English sentence"))

sample English sentence


In [70]:
#Remove punctuations
import string
#Function to remove punctuations
def remove_punctuations(text):
    """Return ``text`` with punctuation tokens removed.

    The text is split with ``nltk.word_tokenize``; any token made up solely of
    characters from ``string.punctuation`` is dropped and the remaining tokens
    are joined with single spaces (the original spacing is not preserved).

    Args:
        text: String to filter.

    Returns:
        The non-punctuation tokens joined by single spaces.
    """
    punctuation_chars = set(string.punctuation)
    words = nltk.word_tokenize(text)
    # Check every character of the token. A plain `w in string.punctuation` is
    # a substring test against one long string, so multi-character punctuation
    # tokens such as "..." or "--" would slip through.
    punt_removed = [
        w for w in words
        if not all(ch in punctuation_chars for ch in w)
    ]
    return " ".join(punt_removed)
print(remove_punctuations("This is a sample English      sentence, with punctuations!"))

This is a sample English sentence with punctuations


In [71]:
#Remove whitespace
#Function to remove whitespace
def remove_whitespace(text):
    """Collapse each run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace disappears too, because ``str.split``
    without arguments yields only the non-whitespace pieces.
    """
    pieces = text.split()
    return " ".join(pieces)
text = "This is a sample English sentence, \n    with whitespace\n\n\n and\n numbers 1234!"
print(remove_whitespace(text))

This is a sample English sentence, with whitespace and numbers 1234!


# Stemming and Lemmatization

In [72]:
#Simple Stemming
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer
#Porter Stemmer
from nltk.tokenize import word_tokenize
# Run several made-up inflections of "python" through the Porter stemmer and
# print one stem per line under a banner.
example_words = ["Python", "pythoner", "pythoning", "pythoned", "pythonly"]
ps = PorterStemmer()
print("\n-------Porter Stemmer-------")
print("\n".join(ps.stem(word) for word in example_words))


-------Porter Stemmer-------
python
python
python
python
pythonli


In [73]:
#Lancaster Stemmer
# The same made-up inflections of "python", this time through the Lancaster
# stemmer; one stem is printed per line under a banner.
example_words = ["Python", "pythoner", "pythoning", "pythoned", "pythonly"]
ls = LancasterStemmer()
print("\n-------Lancaster Stemmer-------")
# 'pythonly' is also converted to 'python' here
print("\n".join(ls.stem(word) for word in example_words))


-------Lancaster Stemmer-------
python
python
python
python
python


In [74]:
#Lemmatizer
#Simple Lemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Words lemmatized without naming a part of speech.
for plain_word in ('cats', 'cacti', 'geese', 'children', 'plays', 'rocks', 'python'):
    print(lemmatizer.lemmatize(plain_word))

# Adjectives: pos="a" is for adjective.
for adjective in ("better", 'best'):
    print(lemmatizer.lemmatize(adjective, pos="a"))

# The same word without a part of speech, then as a verb ("v" is for verb).
print(lemmatizer.lemmatize('run'))
print(lemmatizer.lemmatize('run', "v"))

cat
cactus
goose
child
play
rock
python
good
best
run
run


# N-grams

In [75]:
from nltk.util import ngrams
from collections import Counter
#Function to extract n-grams from text
def get_ngrams(text,n):
    """Return the word-level n-grams of ``text`` as space-joined strings.

    ``text`` is tokenized with ``nltk.word_tokenize`` and the tokens are handed
    to ``ngrams`` together with ``n``; each resulting group of tokens is joined
    with a single space.
    """
    tokens = nltk.word_tokenize(text)
    return [' '.join(gram) for gram in ngrams(tokens, n)]
text = "This is a sample English sentence"
# Print the 1-gram through 6-gram lists of the sample sentence.
for size in range(1, 7):
    print("{}-gram: ".format(size), get_ngrams(text, size))

1-gram:  ['This', 'is', 'a', 'sample', 'English', 'sentence']
2-gram:  ['This is', 'is a', 'a sample', 'sample English', 'English sentence']
3-gram:  ['This is a', 'is a sample', 'a sample English', 'sample English sentence']
4-gram:  ['This is a sample', 'is a sample English', 'a sample English sentence']
5-gram:  ['This is a sample English', 'is a sample English sentence']
6-gram:  ['This is a sample English sentence']


# Parts of Speech (PoS) Tagging

In [76]:
from nltk import chunk
# Tokenize the sample sentence and tag every token; the recorded output shows
# the result is a list of (token, tag) pairs.
tagged_sent = nltk.pos_tag(nltk.word_tokenize("This is a sample English sentence"))
print(tagged_sent)
# Build a tree from the tagged tokens with NLTK's ne_chunk.
# NOTE(review): presumably this is named-entity chunking - confirm in the NLTK docs.
tree = chunk.ne_chunk(tagged_sent)
# NOTE(review): draw() likely opens a separate GUI window and may block the
# cell until that window is closed - confirm before running unattended.
tree.draw() #this will draw the sentence tree

[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('English', 'JJ'), ('sentence', 'NN')]


In [77]:
#To get help about tags
#Prints the description and example words for the given tag (here 'JJ', the tag
#assigned to 'sample' and 'English' in the previous cell's output)
nltk.help.upenn_tagset('JJ')

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


# Bag of Words (BoW)

In [87]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
#function to read every file in a folder into a dict of parallel lists (texts and file names)
def CorpusFromDir(dir_path, encoding=None):
    """Read every regular file in ``dir_path`` into an in-memory corpus.

    Sub-directories (for example Jupyter's ``.ipynb_checkpoints``) are skipped
    rather than passed to ``open``.

    Args:
        dir_path: Directory whose files are read as text.
        encoding: Text encoding passed to ``open``; ``None`` (the default)
            keeps the platform's default encoding.

    Returns:
        A dict with two parallel lists: ``'docs'`` holds the full text of each
        file and ``'ColNames'`` holds the matching file names, sorted by name.
    """
    # List the directory a single time so texts and names cannot get out of
    # step, and sort because os.listdir returns entries in arbitrary order.
    file_names = sorted(
        name for name in os.listdir(dir_path)
        if os.path.isfile(os.path.join(dir_path, name))
    )
    texts = []
    for name in file_names:
        # The context manager closes each handle as soon as it has been read.
        with open(os.path.join(dir_path, name), encoding=encoding) as handle:
            texts.append(handle.read())
    # ColNames is a real list, not a one-shot map iterator, so callers can
    # read it more than once.
    return dict(docs=texts, ColNames=file_names)
# NOTE(review): absolute, machine-specific path - point this at your own data folder.
docs = CorpusFromDir("C:/Users/Ahmed Khan/Desktop/Data")
#Initialize
vectorizer = CountVectorizer()
# Learn the vocabulary and count term occurrences: one row per document.
doc_vec = vectorizer.fit_transform(docs.get('docs'))
# get_feature_names() no longer exists in current scikit-learn releases, where
# get_feature_names_out() replaces it; use whichever this installation has.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
#Create DataFrame with one row per term and one column per file
df = pd.DataFrame(
    doc_vec.toarray().transpose(),
    index=feature_names,
    # Materialise the names so this also works when ColNames is an iterator.
    columns=list(docs.get('ColNames')),
)
print(df)

             Doc_1.txt  Doc_2.txt  Doc_3.txt
analytics            1          1          0
and                  1          1          1
are                  1          1          0
books                0          0          1
domain               0          1          0
equally              1          0          0
for                  1          1          0
important            1          1          0
knowledge            0          1          0
like                 0          0          1
programming          1          0          0
reading              0          0          1
skills               2          1          0
statistics           1          1          0
travelling           0          0          1


In [88]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
# NOTE(review): absolute, machine-specific path - point this at your own data folder.
docs = CorpusFromDir('C:/Users/Ahmed Khan/Desktop/Data')
# Learn the vocabulary and compute TF-IDF weights: one row per document.
doc_vec = vectorizer.fit_transform(docs.get('docs'))
# get_feature_names() no longer exists in current scikit-learn releases, where
# get_feature_names_out() replaces it; use whichever this installation has.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
#Create DataFrame with one row per term and one column per file
df = pd.DataFrame(
    doc_vec.toarray().transpose(),
    index=feature_names,
    # Materialise the names so this also works when ColNames is an iterator.
    columns=list(docs.get('ColNames')),
)
print(df)

             Doc_1.txt  Doc_2.txt  Doc_3.txt
analytics     0.276703   0.315269   0.000000
and           0.214884   0.244835   0.283217
are           0.276703   0.315269   0.000000
books         0.000000   0.000000   0.479528
domain        0.000000   0.414541   0.000000
equally       0.363831   0.000000   0.000000
for           0.276703   0.315269   0.000000
important     0.276703   0.315269   0.000000
knowledge     0.000000   0.414541   0.000000
like          0.000000   0.000000   0.479528
programming   0.363831   0.000000   0.000000
reading       0.000000   0.000000   0.479528
skills        0.553405   0.315269   0.000000
statistics    0.276703   0.315269   0.000000
travelling    0.000000   0.000000   0.479528
