# 1.Data Preparation / Pre-processing

In [1]:
import pandas as pd

In [2]:
# verses = pd.read_csv('verses.csv')
# data = pd.read_csv('file1.csv', on_bad_lines='skip')
df = pd.read_csv('verses.csv', sep='|', header=[0])
df.head(10)

Unnamed: 0,verse,context
0,1:1,"The Revelation of Jesus Christ, which God gave..."
1,1:2,"Who barerecord of the word of God, and of the ..."
2,1:3,"Blessed is he that readeth, and they that hear..."
3,1:4,John to the seven churches which are in Asia: ...
4,1:5,"And from JesusChrist, who is the faithful witn..."
5,1:6,And hath made uskings and priests unto God and...
6,1:7,"Behold, he cometh with clouds; and every eye s..."
7,1:8,"I am Alpha and Omega, the beginning and the en..."
8,1:9,"I John, who also am your brother, and companio..."
9,1:10,I was in the Spirit on the Lord


## 1.1 Remove Punctuation

In [3]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
#define punctuation
punct = string.punctuation
text = "Can you use a @smiley face, 'emoji' as a : period instead?"
no_punct = ""

for char in text:
    if char not in punct:
        no_punct = no_punct + char
        
# display the unpuncted string
no_punct

'Can you use a smiley face emoji as a  period instead'

In [5]:
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

df['context_nopunc'] = df['context'].apply(lambda x: remove_punct(x))
df.head()

Unnamed: 0,verse,context,context_nopunc
0,1:1,"The Revelation of Jesus Christ, which God gave...",The Revelation of Jesus Christ which God gave ...
1,1:2,"Who barerecord of the word of God, and of the ...",Who barerecord of the word of God and of the t...
2,1:3,"Blessed is he that readeth, and they that hear...",Blessed is he that readeth and they that hear ...
3,1:4,John to the seven churches which are in Asia: ...,John to the seven churches which are in Asia G...
4,1:5,"And from JesusChrist, who is the faithful witn...",And from JesusChrist who is the faithful witne...


## 1.2 Lowercase

In [6]:
'nlp'=='NLP'.lower()

True

In [7]:
#add lower to teh remove_punc function

#list comprhansion 
#lambda function
#adding join to join chars into words 
def remove_punct(text):
    
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

df['context_nopunc'] = df['context'].apply(lambda x: remove_punct(x.lower()))
df.head()

Unnamed: 0,verse,context,context_nopunc
0,1:1,"The Revelation of Jesus Christ, which God gave...",the revelation of jesus christ which god gave ...
1,1:2,"Who barerecord of the word of God, and of the ...",who barerecord of the word of god and of the t...
2,1:3,"Blessed is he that readeth, and they that hear...",blessed is he that readeth and they that hear ...
3,1:4,John to the seven churches which are in Asia: ...,john to the seven churches which are in asia g...
4,1:5,"And from JesusChrist, who is the faithful witn...",and from jesuschrist who is the faithful witne...


## 1.3 Tokenization

In [8]:
import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re

In [9]:
text = "Can you use a smiley face emoji as a  period instead"
tokens = re.split('\W+', text)
tokens

['Can',
 'you',
 'use',
 'a',
 'smiley',
 'face',
 'emoji',
 'as',
 'a',
 'period',
 'instead']

In [10]:
def tokenize(text):
    tokens = word_tokenize(text)
    return tokens
df['context_tokenized'] = df['context_nopunc'].apply(lambda x: tokenize(x))
df.head()

Unnamed: 0,verse,context,context_nopunc,context_tokenized
0,1:1,"The Revelation of Jesus Christ, which God gave...",the revelation of jesus christ which god gave ...,"[the, revelation, of, jesus, christ, which, go..."
1,1:2,"Who barerecord of the word of God, and of the ...",who barerecord of the word of god and of the t...,"[who, barerecord, of, the, word, of, god, and,..."
2,1:3,"Blessed is he that readeth, and they that hear...",blessed is he that readeth and they that hear ...,"[blessed, is, he, that, readeth, and, they, th..."
3,1:4,John to the seven churches which are in Asia: ...,john to the seven churches which are in asia g...,"[john, to, the, seven, churches, which, are, i..."
4,1:5,"And from JesusChrist, who is the faithful witn...",and from jesuschrist who is the faithful witne...,"[and, from, jesuschrist, who, is, the, faithfu..."


## 1.4 stop words

In [11]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

In [12]:
stopwords_En = nltk.corpus.stopwords.words('english')

In [13]:
stopwords_En

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [14]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopwords_En]
    return text

df['context_nostop'] = df['context_tokenized'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,verse,context,context_nopunc,context_tokenized,context_nostop
0,1:1,"The Revelation of Jesus Christ, which God gave...",the revelation of jesus christ which god gave ...,"[the, revelation, of, jesus, christ, which, go...","[revelation, jesus, christ, god, gave, unto, s..."
1,1:2,"Who barerecord of the word of God, and of the ...",who barerecord of the word of god and of the t...,"[who, barerecord, of, the, word, of, god, and,...","[barerecord, word, god, testimony, jesus, chri..."
2,1:3,"Blessed is he that readeth, and they that hear...",blessed is he that readeth and they that hear ...,"[blessed, is, he, that, readeth, and, they, th...","[blessed, readeth, hear, words, thisprophecy, ..."
3,1:4,John to the seven churches which are in Asia: ...,john to the seven churches which are in asia g...,"[john, to, the, seven, churches, which, are, i...","[john, seven, churches, asia, grace, unto, you..."
4,1:5,"And from JesusChrist, who is the faithful witn...",and from jesuschrist who is the faithful witne...,"[and, from, jesuschrist, who, is, the, faithfu...","[jesuschrist, faithful, witness, first, begott..."


## 1.4 Stemming

In [15]:
import nltk
ps= nltk.PorterStemmer();

In [16]:
dir(ps)

['MARTIN_EXTENSIONS',
 'NLTK_EXTENSIONS',
 'ORIGINAL_ALGORITHM',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_apply_rule_list',
 '_contains_vowel',
 '_ends_cvc',
 '_ends_double_consonant',
 '_has_positive_measure',
 '_is_consonant',
 '_measure',
 '_replace_suffix',
 '_step1a',
 '_step1b',
 '_step1c',
 '_step2',
 '_step3',
 '_step4',
 '_step5a',
 '_step5b',
 'mode',
 'pool',
 'stem',
 'vowels']

In [17]:
print(ps.stem('play'))
print(ps.stem('playing'))
print(ps.stem('played'))
print(ps.stem('plays'))

play
play
play
play


In [18]:
#Over-Steming 
#Over-steaming occurs when two words are stemmed from the same root of different stems. 

print(ps.stem('universal'))
print(ps.stem('university'))
print(ps.stem('universe'))

univers
univers
univers


In [19]:
#Under-Steming
#Under-stemming occurs when two words are stemmed from the same root of not a different stems

print(ps.stem('alumnus'))
print(ps.stem('alumni'))
print(ps.stem('alumnae'))

alumnu
alumni
alumna


In [20]:
def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

df['context_stemmed'] = df['context_nostop'].apply(lambda x: stemming(x))
df.head(10)

Unnamed: 0,verse,context,context_nopunc,context_tokenized,context_nostop,context_stemmed
0,1:1,"The Revelation of Jesus Christ, which God gave...",the revelation of jesus christ which god gave ...,"[the, revelation, of, jesus, christ, which, go...","[revelation, jesus, christ, god, gave, unto, s...","[revel, jesu, christ, god, gave, unto, shewunt..."
1,1:2,"Who barerecord of the word of God, and of the ...",who barerecord of the word of god and of the t...,"[who, barerecord, of, the, word, of, god, and,...","[barerecord, word, god, testimony, jesus, chri...","[barerecord, word, god, testimoni, jesu, chris..."
2,1:3,"Blessed is he that readeth, and they that hear...",blessed is he that readeth and they that hear ...,"[blessed, is, he, that, readeth, and, they, th...","[blessed, readeth, hear, words, thisprophecy, ...","[bless, readeth, hear, word, thispropheci, kee..."
3,1:4,John to the seven churches which are in Asia: ...,john to the seven churches which are in asia g...,"[john, to, the, seven, churches, which, are, i...","[john, seven, churches, asia, grace, unto, you...","[john, seven, church, asia, grace, unto, youan..."
4,1:5,"And from JesusChrist, who is the faithful witn...",and from jesuschrist who is the faithful witne...,"[and, from, jesuschrist, who, is, the, faithfu...","[jesuschrist, faithful, witness, first, begott...","[jesuschrist, faith, wit, first, begotten, the..."
5,1:6,And hath made uskings and priests unto God and...,and hath made uskings and priests unto god and...,"[and, hath, made, uskings, and, priests, unto,...","[hath, made, uskings, priests, unto, god, fath...","[hath, made, usk, priest, unto, god, father, g..."
6,1:7,"Behold, he cometh with clouds; and every eye s...",behold he cometh with clouds and every eye sha...,"[behold, he, cometh, with, clouds, and, every,...","[behold, cometh, clouds, every, eye, shall, se...","[behold, cometh, cloud, everi, eye, shall, see..."
7,1:8,"I am Alpha and Omega, the beginning and the en...",i am alpha and omega the beginning and the end...,"[i, am, alpha, and, omega, the, beginning, and...","[alpha, omega, beginning, ending, saith, thelo...","[alpha, omega, begin, end, saith, thelord, com..."
8,1:9,"I John, who also am your brother, and companio...",i john who also am your brother and companion ...,"[i, john, who, also, am, your, brother, and, c...","[john, also, brother, companion, tribulationan...","[john, also, brother, companion, tribulationan..."
9,1:10,I was in the Spirit on the Lord,i was in the spirit on the lord,"[i, was, in, the, spirit, on, the, lord]","[spirit, lord]","[spirit, lord]"


## 1.4 Lemmatization

In [21]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Sc\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [22]:
import nltk

wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

In [23]:
dir(wn)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'lemmatize']

In [24]:
print(wn.lemmatize('universal'))
print(wn.lemmatize('university'))
print(wn.lemmatize('universe'))

universal
university
universe


In [25]:
print(wn.lemmatize('alumnus'))
print(wn.lemmatize('alumni'))
print(wn.lemmatize('alumnae'))

alumnus
alumnus
alumna


In [26]:
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

df['context_lemmatized'] = df['context_nostop'].apply(lambda x: lemmatizing(x))

df.head(10)

Unnamed: 0,verse,context,context_nopunc,context_tokenized,context_nostop,context_stemmed,context_lemmatized
0,1:1,"The Revelation of Jesus Christ, which God gave...",the revelation of jesus christ which god gave ...,"[the, revelation, of, jesus, christ, which, go...","[revelation, jesus, christ, god, gave, unto, s...","[revel, jesu, christ, god, gave, unto, shewunt...","[revelation, jesus, christ, god, gave, unto, s..."
1,1:2,"Who barerecord of the word of God, and of the ...",who barerecord of the word of god and of the t...,"[who, barerecord, of, the, word, of, god, and,...","[barerecord, word, god, testimony, jesus, chri...","[barerecord, word, god, testimoni, jesu, chris...","[barerecord, word, god, testimony, jesus, chri..."
2,1:3,"Blessed is he that readeth, and they that hear...",blessed is he that readeth and they that hear ...,"[blessed, is, he, that, readeth, and, they, th...","[blessed, readeth, hear, words, thisprophecy, ...","[bless, readeth, hear, word, thispropheci, kee...","[blessed, readeth, hear, word, thisprophecy, k..."
3,1:4,John to the seven churches which are in Asia: ...,john to the seven churches which are in asia g...,"[john, to, the, seven, churches, which, are, i...","[john, seven, churches, asia, grace, unto, you...","[john, seven, church, asia, grace, unto, youan...","[john, seven, church, asia, grace, unto, youan..."
4,1:5,"And from JesusChrist, who is the faithful witn...",and from jesuschrist who is the faithful witne...,"[and, from, jesuschrist, who, is, the, faithfu...","[jesuschrist, faithful, witness, first, begott...","[jesuschrist, faith, wit, first, begotten, the...","[jesuschrist, faithful, witness, first, begott..."
5,1:6,And hath made uskings and priests unto God and...,and hath made uskings and priests unto god and...,"[and, hath, made, uskings, and, priests, unto,...","[hath, made, uskings, priests, unto, god, fath...","[hath, made, usk, priest, unto, god, father, g...","[hath, made, uskings, priest, unto, god, fathe..."
6,1:7,"Behold, he cometh with clouds; and every eye s...",behold he cometh with clouds and every eye sha...,"[behold, he, cometh, with, clouds, and, every,...","[behold, cometh, clouds, every, eye, shall, se...","[behold, cometh, cloud, everi, eye, shall, see...","[behold, cometh, cloud, every, eye, shall, see..."
7,1:8,"I am Alpha and Omega, the beginning and the en...",i am alpha and omega the beginning and the end...,"[i, am, alpha, and, omega, the, beginning, and...","[alpha, omega, beginning, ending, saith, thelo...","[alpha, omega, begin, end, saith, thelord, com...","[alpha, omega, beginning, ending, saith, thelo..."
8,1:9,"I John, who also am your brother, and companio...",i john who also am your brother and companion ...,"[i, john, who, also, am, your, brother, and, c...","[john, also, brother, companion, tribulationan...","[john, also, brother, companion, tribulationan...","[john, also, brother, companion, tribulationan..."
9,1:10,I was in the Spirit on the Lord,i was in the spirit on the lord,"[i, was, in, the, spirit, on, the, lord]","[spirit, lord]","[spirit, lord]","[spirit, lord]"


## Zipf's Law

In [27]:
def terms_frequency (list):
    dictionary={}      # store terms in a dictionary (key,value)
    counted_list=[]      # fill in this list with words that are already counted
    for term in list:
        if term not in counted_list:
            freq=list.count(term)
            dictionary[term]=freq
            counted_list.append(term)

    return sorted(dictionary.items(), key=lambda x:x[1], reverse=True)

In [36]:
print(len(df), df['context_stemmed'][0])

4517 ['revel', 'jesu', 'christ', 'god', 'gave', 'unto', 'shewunto', 'servant', 'thing', 'must', 'shortli', 'come', 'pass', 'sentand', 'signifi', 'angel', 'unto', 'servant', 'john']


In [35]:
list = []
for i in range(len(df)):
    list += df['context_stemmed'][i]
terms_freq = terms_frequency(list)
terms_freq

[('shall', 1233),
 ('lord', 976),
 ('unto', 920),
 ('thi', 730),
 ('thou', 684),
 ('god', 612),
 ('thee', 442),
 ('ye', 408),
 ('upon', 332),
 ('said', 278),
 ('hath', 277),
 ('come', 274),
 ('man', 271),
 ('day', 268),
 ('say', 267),
 ('let', 264),
 ('one', 257),
 ('peopl', 218),
 ('hand', 213),
 ('saith', 213),
 ('also', 210),
 ('earth', 203),
 ('land', 203),
 ('israel', 192),
 ('thing', 189),
 ('us', 185),
 ('hast', 185),
 ('name', 170),
 ('great', 169),
 ('son', 166),
 ('behold', 161),
 ('hous', 156),
 ('word', 154),
 ('came', 152),
 ('heaven', 149),
 ('shalt', 145),
 ('go', 145),
 ('men', 145),
 ('like', 144),
 ('jesu', 142),
 ('king', 142),
 ('even', 141),
 ('made', 135),
 ('way', 135),
 ('ever', 133),
 ('make', 130),
 ('therefor', 128),
 ('holi', 127),
 ('prais', 124),
 ('work', 122),
 ('children', 122),
 ('forth', 122),
 ('give', 120),
 ('know', 119),
 ('thu', 118),
 ('among', 117),
 ('thereof', 116),
 ('offer', 116),
 ('citi', 115),
 ('neither', 115),
 ('side', 115),
 ('gate',

In [31]:
print(len(df))

4517


In [32]:
import matplotlib.pyplot as plt
x, y = zip(*terms_freq)
plt.xticks(rotation=90)
plt.plot(x, y)
plt.show()

KeyboardInterrupt: 

## Create function to remove punctuation, tokenize, remove stopwords, and stem

In [None]:
### Create function to remove punctuation, tokenize, remove stopwords, and stem

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
     #tokens = re.split('\W+', text)
    tokens = word_tokenize(text)
    text = " ".join([ps.stem(word) for word in tokens if word not in stopwords_En])
    return text
df=df[['verse','context']]
df['cleaned_text'] = df['context'].apply(lambda x: clean_text(x))
df.head(10)

## 2. Vectorizing text data

In [None]:
sentences = ["good movie", "not a good movie", "did not like", "i like it"]

### 2.1: Count vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
features_cv = vectorizer.fit_transform(sentences)  #Sparse Matrix

print(features_cv.shape)
print('Sparse Matrix :\n', features_cv)

features_cv = pd.DataFrame(features_cv.toarray())
features_cv.columns = vectorizer.get_feature_names()
features_cv

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# To create a Count Vectorizer, we simply need to instantiate one.
# There are special parameters we can set here when making the vectorizer, but
# for the most basic example, it is not needed.


vectorizer = CountVectorizer()
features_CountVec = vectorizer.fit_transform(df['cleaned_text'])  #Sparse_Matrix
print(features_CountVec.shape)
print('Sparse_Matrix :\n', features_CountVec)

# كان----- sparce matrice------ حولناها ل ماتريس عادىة----
features_CountVec = pd.DataFrame(features_CountVec.toarray())
features_CountVec.columns = vectorizer.get_feature_names()
features_CountVec

### 4.2.2: Vectorizing Data: N-Grams

In [None]:
# ngram = term -> 2terms -> 3terms
ngram_vect = CountVectorizer(ngram_range=(1,3))
features_ngram = ngram_vect.fit_transform(df['cleaned_text'])

print(features_ngram.shape)
print('Sparse Matrix :\n', features_ngram)

features_ngram = pd.DataFrame(features_ngram.toarray())
features_ngram.columns = ngram_vect.get_feature_names()
features_ngram

In [None]:
# الفرق بيناتهم انك تحسب وزن الكلمة و ليس عدد التكررا فقط
#  w = TF * IDF

### 4.2.3: Vectorizing Raw Data: TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

#tfidf = TfidfVectorizer(min_df=1)
tfidf = TfidfVectorizer( ngram_range=(2,2))
features_tfidf = tfidf.fit_transform(df['cleaned_text'])

print(features_tfidf.shape)
print('Sparse Matrix :\n', features_tfidf)

features_tfidf = pd.DataFrame(features_tfidf.toarray())
features_tfidf.columns = tfidf.get_feature_names()
features_tfidf.head(10)