In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Read the whole book into memory. The context manager closes the file handle
# as soon as the read finishes (or fails), instead of leaving `f` open for the
# lifetime of the kernel.
with open("data.txt", "r", encoding="ISO-8859-1") as f:
    text = f.read()

In [4]:
# Peek at the first 1,000 characters; the output shows the Project Gutenberg
# header and front matter come before the novel itself.
text[:1000]

"The Project Gutenberg EBook of Man to Man, by Jackson Gregory\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.org\n\n\nTitle: Man to Man\n\nAuthor: Jackson Gregory\n\nRelease Date: July 29, 2006 [EBook #18933]\n\nLanguage: English\n\n\n*** START OF THIS PROJECT GUTENBERG EBOOK MAN TO MAN ***\n\n\n\n\nProduced by Al Haines\n\n\n\n\n\n\n\n\n\n\n[Frontispiece: The blazing heat was such that men and horses and steers\nsuffered terribly.]\n\n\n\n\n\n\nMAN TO MAN\n\n\nBY\n\nJACKSON GREGORY\n\n\n\nAUTHOR OF\n\nJUDITH OF BLUE LAKE RANCH, THE BELLS OF SAN JUAN, SIX FEET FOUR, ETC.\n\n\n\n\nILLUSTRATED BY\n\nJ. G. SHEPHERD\n\n\n\n\n\nGROSSET & DUNLAP\n\nPUBLISHERS -------- NEW YORK\n\n\n\n\nCOPYRIGHT, 1920, BY\n\nCHARLES SCRIBNER'S SONS\n\n\nPublished October, 1920\n\n\n\n\nCONTENTS\n\n\nCHAPTER

In [5]:
# Split the raw text on newline characters into physical lines.
# NOTE(review): `b` is rebound to the TF-IDF DataFrame further down, so this
# list is no longer reachable once that cell has run.
b = text.split("\n")

In [6]:
# Number of newline-delimited lines in the file.
len(b)

9770

In [7]:
# First five lines; empty strings correspond to blank lines in the text.
b[:5]

['The Project Gutenberg EBook of Man to Man, by Jackson Gregory',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  You may copy it, give it away or',
 're-use it under the terms of the Project Gutenberg License included']

In [8]:
# Split the whole raw text into sentences.
# NOTE(review): the Gutenberg header, title page and table of contents are not
# stripped first, so they end up as "sentences" too (see the first rows of df).
sent_tokens = sent_tokenize(text)

In [9]:
# Number of sentences found by the tokenizer.
len(sent_tokens)

5560

In [10]:
# One row per tokenized sentence, held in a single "sentences" column.
df = pd.DataFrame(sent_tokens, columns=["sentences"])

In [11]:
# Preview the first rows; they are front matter (license text, title page,
# table of contents) rather than prose from the novel.
df.head()

Unnamed: 0,sentences
0,"The Project Gutenberg EBook of Man to Man, by ..."
1,"You may copy it, give it away or\nre-use it un..."
2,MAN TO MAN\n\n\nBY\n\nJACKSON GREGORY\n\n\n\nA...
3,ILLUSTRATED BY\n\nJ. G. SHEPHERD\n\n\n\n\n\nGR...
4,MISS BLUE CLOAK KNOWS WHEN SHE'S BEAT\n III.


In [12]:
# One row per sentence, one column so far.
df.shape

(5560, 1)

In [13]:
# English stopwords from NLTK, used by cleaning() to filter tokens.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be installed
# already — no nltk.download call is visible in this notebook.
stop_words = stopwords.words("english")

In [14]:
# Single lemmatizer instance, reused by cleaning() for every token.
lem = WordNetLemmatizer()

In [15]:
def cleaning(data):
    """Normalize one piece of raw text into a space-joined string of lemmas.

    Steps: lowercase and word-tokenize, keep only purely alphabetic tokens,
    drop stopwords, lemmatize what remains, and join with single spaces.

    Relies on the notebook-level ``stop_words`` and ``lem`` objects.

    Parameters
    ----------
    data : str
        Raw text (in this notebook: one sentence from ``df["sentences"]``).

    Returns
    -------
    str
        The cleaned tokens joined by spaces; ``""`` when no token survives.
    """
    # Build a set per call so each membership test is a hash lookup instead of
    # a scan over the stopword list; building it here (rather than once at
    # module level) keeps the function in sync if stop_words is rebound.
    stop_set = set(stop_words)

    # 1. Tokenize. Lowercasing first means the stopword comparison below is
    #    effectively case-insensitive with respect to the input text.
    tokens = word_tokenize(data.lower())

    # 2. Keep alphabetic tokens only. str.isalpha() rejects any token with a
    #    non-letter character, so besides punctuation this also drops numbers
    #    and hyphenated / apostrophe-bearing tokens as a whole (e.g. "re-use"
    #    disappears entirely rather than becoming "reuse").
    # 3. Drop stopwords.
    # 4. Lemmatize. No POS is passed, so NLTK's default applies
    #    (presumably noun — confirm against the NLTK docs).
    lemmas = [
        lem.lemmatize(tok)
        for tok in tokens
        if tok.isalpha() and tok not in stop_set
    ]

    # 5. Join back into one string so vectorizers can consume it directly.
    return " ".join(lemmas)

In [16]:
# Cleaned, lemmatized version of every sentence (one string per row).
df["sentences_2"] = df["sentences"].map(cleaning)

In [17]:
# Compare raw vs. cleaned text for the first ten sentences.
df.head(10)

Unnamed: 0,sentences,sentences_2
0,"The Project Gutenberg EBook of Man to Man, by ...",project gutenberg ebook man man jackson gregor...
1,"You may copy it, give it away or\nre-use it un...",may copy give away term project gutenberg lice...
2,MAN TO MAN\n\n\nBY\n\nJACKSON GREGORY\n\n\n\nA...,man man jackson gregory author judith blue lak...
3,ILLUSTRATED BY\n\nJ. G. SHEPHERD\n\n\n\n\n\nGR...,illustrated shepherd grosset dunlap publisher ...
4,MISS BLUE CLOAK KNOWS WHEN SHE'S BEAT\n III.,miss blue cloak know beat iii
5,NEWS OF A LEGACY\n IV.,news legacy iv
6,TERRY BEFORE BREAKFAST\n V. HOW STEVE PACK...,terry breakfast steve packard came home vi
7,BANK NOTES AND A BLIND MAN\n VII.,bank note blind man vii
8,THE OLD MOUNTAIN LION COMES DOWN FROM THE NORT...,old mountain lion come north viii
9,IN RED CREEK TOWN\n IX.,red creek town ix


In [18]:
# Token-list form of the cleaned text: split each string on whitespace.
df["sentences_3"] = df["sentences_2"].map(str.split)

In [19]:
# Check the new token-list column next to the cleaned strings.
df.head()

Unnamed: 0,sentences,sentences_2,sentences_3
0,"The Project Gutenberg EBook of Man to Man, by ...",project gutenberg ebook man man jackson gregor...,"[project, gutenberg, ebook, man, man, jackson,..."
1,"You may copy it, give it away or\nre-use it un...",may copy give away term project gutenberg lice...,"[may, copy, give, away, term, project, gutenbe..."
2,MAN TO MAN\n\n\nBY\n\nJACKSON GREGORY\n\n\n\nA...,man man jackson gregory author judith blue lak...,"[man, man, jackson, gregory, author, judith, b..."
3,ILLUSTRATED BY\n\nJ. G. SHEPHERD\n\n\n\n\n\nGR...,illustrated shepherd grosset dunlap publisher ...,"[illustrated, shepherd, grosset, dunlap, publi..."
4,MISS BLUE CLOAK KNOWS WHEN SHE'S BEAT\n III.,miss blue cloak know beat iii,"[miss, blue, cloak, know, beat, iii]"


In [20]:
# Part-of-Speech (POS) Tagging

In [21]:
# Tag every token list with its part of speech. pos_tag_sents handles the
# whole batch in one call, which avoids setting the tagger up again for each
# row the way a per-row nltk.pos_tag call does; the per-sentence result is the
# same list of (token, tag) tuples.
# NOTE(review): the tokens are already lowercased, stopword-filtered and
# lemmatized, so the tagger sees little of the original context (the preview
# shows "shepherd" and "miss" tagged JJ) — consider tagging the raw sentences.
df["sentences_4"] = nltk.pos_tag_sents(df["sentences_3"].tolist())

In [22]:
# Inspect the (token, tag) pairs produced for the first rows.
df.head()

Unnamed: 0,sentences,sentences_2,sentences_3,sentences_4
0,"The Project Gutenberg EBook of Man to Man, by ...",project gutenberg ebook man man jackson gregor...,"[project, gutenberg, ebook, man, man, jackson,...","[(project, NN), (gutenberg, NN), (ebook, NN), ..."
1,"You may copy it, give it away or\nre-use it un...",may copy give away term project gutenberg lice...,"[may, copy, give, away, term, project, gutenbe...","[(may, MD), (copy, VB), (give, VB), (away, RP)..."
2,MAN TO MAN\n\n\nBY\n\nJACKSON GREGORY\n\n\n\nA...,man man jackson gregory author judith blue lak...,"[man, man, jackson, gregory, author, judith, b...","[(man, NN), (man, NN), (jackson, NN), (gregory..."
3,ILLUSTRATED BY\n\nJ. G. SHEPHERD\n\n\n\n\n\nGR...,illustrated shepherd grosset dunlap publisher ...,"[illustrated, shepherd, grosset, dunlap, publi...","[(illustrated, VBN), (shepherd, JJ), (grosset,..."
4,MISS BLUE CLOAK KNOWS WHEN SHE'S BEAT\n III.,miss blue cloak know beat iii,"[miss, blue, cloak, know, beat, iii]","[(miss, JJ), (blue, JJ), (cloak, NN), (know, V..."


In [23]:
# Vectorization

In [24]:
# 1. Count Vectorization

In [25]:
# Vectorizers expect an iterable of strings, so pass a single text column (a pd.Series), not the whole pd.DataFrame.

In [26]:
# Vectorizer input: one cleaned string per sentence.
# NOTE(review): no train/test split is visible in this notebook, so despite
# the name this is the full corpus.
X_train = df["sentences_2"]

In [27]:
# Bag-of-words vectorizer with all-default parameters.
vectorizer = CountVectorizer()

In [28]:
# Learn the vocabulary from the cleaned sentences; nothing is transformed yet.
vectorizer.fit(X_train)

CountVectorizer()

In [29]:
# First few vocabulary terms. get_feature_names_out() replaces
# get_feature_names(), which newer scikit-learn releases no longer provide;
# .tolist() keeps the plain-list display.
vectorizer.get_feature_names_out()[:5].tolist()

['aback', 'abandoned', 'abide', 'abiding', 'ability']

In [30]:
# Vocabulary size. Uses get_feature_names_out(), the replacement for the
# get_feature_names() method that newer scikit-learn releases no longer provide.
len(vectorizer.get_feature_names_out())

5974

In [31]:
# Encode each sentence as token counts over the fitted vocabulary.
# The result is only made dense in the next cell (via .toarray()).
X_train_count = vectorizer.transform(X_train)

In [32]:
# Dense table of the counts, one column per vocabulary term.
# get_feature_names_out() replaces get_feature_names(), which newer
# scikit-learn releases no longer provide.
# NOTE: .toarray() materializes the full sentences x vocabulary matrix in memory.
a = pd.DataFrame(X_train_count.toarray(), columns=vectorizer.get_feature_names_out())

In [33]:
# Preview the count table; the visible entries are almost all 0.
a.head()

Unnamed: 0,aback,abandoned,abide,abiding,ability,able,aboard,abreast,abrupt,abruptly,...,yielded,yielding,yonder,york,young,youngest,yourse,youth,youthful,zest
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Rows = sentences, columns = vocabulary terms.
a.shape

(5560, 5974)

In [35]:
# 2. TF-IDF Vectorization

In [36]:
# TF-IDF vectorizer with all-default parameters.
tf_idf_vectorizer = TfidfVectorizer()

In [37]:
# Fit on the same cleaned sentences that the count vectorizer was fitted on.
tf_idf_vectorizer.fit(X_train)

TfidfVectorizer()

In [38]:
# Encode each sentence as TF-IDF weights over the fitted vocabulary.
X_train_tf_idf = tf_idf_vectorizer.transform(X_train)

In [39]:
# Dense table of the TF-IDF weights, one column per vocabulary term.
# get_feature_names_out() replaces get_feature_names(), which newer
# scikit-learn releases no longer provide.
# NOTE(review): this rebinds `b`, which earlier held the list of raw text lines.
b = pd.DataFrame(X_train_tf_idf.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())

In [40]:
# Same layout as the count table, with float weights instead of integer counts.
b.head()

Unnamed: 0,aback,abandoned,abide,abiding,ability,able,aboard,abreast,abrupt,abruptly,...,yielded,yielding,yonder,york,young,youngest,yourse,youth,youthful,zest
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.242012,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
