# NLP 101 - Web Scraping, Tokenization and Data Representation with NLTK.

# Problem Statement
## Part 1


In [1]:
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd
from nltk import pos_tag
import io

In [2]:
url = "https://en.wikipedia.org/wiki/Natural_language_processing"
r = requests.get(url)
html_text = r.text
soup = BeautifulSoup(html_text)

In [3]:
for script in soup(["script", "style"]):
    script.extract() 

In [4]:
sample = soup.get_text()

In [5]:
stopword = stopwords.words("english")

In [7]:
sample_stop = [x for x in sample.lower().split() if x not in stopword]

In [8]:
d ={}
for x in sample_stop:
    if(x not in d.keys()):
        d[x] = 1
    else:
        d[x] += 1

In [9]:
stem_words = []
stemmer = PorterStemmer()
for x in list(d.keys()):
    stem_words.append(stemmer.stem(x))

In [10]:
lemmatize_words = []
lemmatizer = WordNetLemmatizer()
for x in list(d.keys()):
    lemmatize_words.append(lemmatizer.lemmatize(x))

In [11]:
words = list(d.keys())
freq = list(d.values())

In [12]:
final_dict = {"Word":words, "Stemmed Word":stem_words, "Lemmatized Words":lemmatize_words, "Frequency":freq}

In [13]:
df = pd.DataFrame(final_dict)

In [14]:
print("Total Terms before stopword removal:",len(sample.split()))
print("Total Terms after stopword removal:",sum(freq))
print("Unique Stemmed Words:",len(list(set(stem_words))))
print("Unique Lemmatized Word:",len(list(set(lemmatize_words))))
df.head()

Total Terms before stopword removal: 4621
Total Terms after stopword removal: 3103
Unique Stemmed Words: 1553
Unique Lemmatized Word: 1679


Unnamed: 0,Word,Stemmed Word,Lemmatized Words,Frequency
0,natural,natur,natural,48
1,language,languag,language,71
2,processing,process,processing,21
3,-,-,-,1
4,wikipedia,wikipedia,wikipedia,3


## Part 2

In [15]:
stopword.append("language")

In [16]:
sample_stop = [x for x in sample.lower().split() if x not in stopword]

In [17]:
d ={}
for x in sample_stop:
    if(x not in d.keys()):
        d[x] = 1
    else:
        d[x] += 1

In [18]:
words = list(d.keys())
freq = list(d.values())

In [19]:
final_dict = {"Word":words, "Frequency":freq}

In [20]:
df = pd.DataFrame(final_dict)

In [21]:
print("Total Terms before stopword removal:",len(sample.split()))
print("Total Terms after stopword removal:",sum(freq))
df.head()

Total Terms before stopword removal: 4621
Total Terms after stopword removal: 3032


Unnamed: 0,Word,Frequency
0,natural,48
1,processing,21
2,-,1
3,wikipedia,3
4,"wikipedia,",1


In [22]:
removed_words = [x for x in stopword if x in sample.lower().split()]

In [23]:
removed_words_pos = pos_tag(removed_words)

In [24]:
removed_words_pos

[('i', 'NN'),
 ('we', 'PRP'),
 ('you', 'PRP'),
 ('your', 'PRP$'),
 ('it', 'PRP'),
 ('its', 'PRP$'),
 ('they', 'PRP'),
 ('their', 'PRP$'),
 ('what', 'WP'),
 ('which', 'WDT'),
 ('who', 'WP'),
 ('this', 'DT'),
 ('that', 'IN'),
 ('these', 'DT'),
 ('those', 'DT'),
 ('is', 'VBZ'),
 ('are', 'VBP'),
 ('was', 'VBD'),
 ('were', 'VBD'),
 ('be', 'VB'),
 ('been', 'VBN'),
 ('being', 'VBG'),
 ('have', 'VB'),
 ('has', 'VBZ'),
 ('had', 'VBN'),
 ('do', 'RP'),
 ('a', 'DT'),
 ('an', 'DT'),
 ('the', 'DT'),
 ('and', 'CC'),
 ('but', 'CC'),
 ('if', 'IN'),
 ('or', 'CC'),
 ('because', 'IN'),
 ('as', 'IN'),
 ('until', 'IN'),
 ('while', 'IN'),
 ('of', 'IN'),
 ('at', 'IN'),
 ('by', 'IN'),
 ('for', 'IN'),
 ('with', 'IN'),
 ('about', 'NN'),
 ('between', 'IN'),
 ('into', 'IN'),
 ('through', 'IN'),
 ('during', 'IN'),
 ('before', 'RB'),
 ('after', 'IN'),
 ('to', 'TO'),
 ('from', 'IN'),
 ('up', 'RP'),
 ('in', 'IN'),
 ('out', 'RP'),
 ('on', 'IN'),
 ('over', 'IN'),
 ('under', 'IN'),
 ('further', 'JJ'),
 ('then', 'RB'),
 (

### Part 3

In [16]:
import requests
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
import io
url_1 = "https://en.wikipedia.org/wiki/Natural_language_processing"
url_2 = "https://en.wikipedia.org/wiki/Machine_learning"
r = requests.get(url_1)
html_text = r.text
soup = BeautifulSoup(html_text)
for script in soup(["script", "style"]):
    script.extract()
content_1 = soup.get_text()
r = requests.get(url_2)
html_text = r.text
soup = BeautifulSoup(html_text)
for script in soup(["script", "style"]):
    script.extract()
content_2 = soup.get_text()
with io.open('nlp.doc', mode = 'w+',encoding="utf-8") as file_1:
    file_1.write(content_1)
with io.open('ml.doc', mode = 'w+',encoding="utf-8") as file_2:
    file_2.write(content_2)
stopword = stopwords.words("english")
content_1_stop = [x for x in content_1.lower().split() if x not in stopword]
content_2_stop = [x for x in content_2.lower().split() if x not in stopword]
all_words = content_1_stop + content_2_stop
all_words = list(set(all_words))
bool_1 = []
bool_2 = []
for x in all_words:
    if x in content_1_stop:
        bool_1.append(1)
    else:
        bool_1.append(0)
    if x in content_2_stop:
        bool_2.append(1)
    else:
        bool_2.append(0)
final_dict = {"Word":all_words, "Doc 1":bool_1, "Doc 2":bool_2}
df = pd.DataFrame(final_dict)
df.head()

Unnamed: 0,Word,Doc 1,Doc 2
0,outputs,0,1
1,model.,0,1
2,"representation.""",0,1
3,function,1,1
4,"door""",1,0


In [17]:
query=input("Enter query:")
for q in query.lower().split():
    try:
        f1=df['Doc 1'][all_words.index(q)]
        print("Presence of \""+ query +"\" in Doc 1:",f1)
    except ValueError:
        print("Presence of \"" + query + "\" in Doc 1:",0)
    try:
        f2=df['Doc 2'][all_words.index(q)]
        print("Presence of \""+ query +"\" in Doc 2:",f2)
    except ValueError:
        print("Presence of \"" + query + "\" in Doc 2:",0)
d1 = {}
d2 = {}
for x in all_words:
        if x in content_1_stop:
            for y in content_1_stop:
                if x == y:
                    if x not in d1.keys():
                        d1[x] = 1
                    else:
                        d1[x] += 1
        else:
            d1[x] = 0
        if x in content_2_stop:
            for y in content_2_stop:
                if x == y:
                    if x not in d2.keys():
                        d2[x] = 1
                    else:
                        d2[x] += 1
        else:
            d2[x] = 0
final_dict = {"Word":all_words, "Freq in Doc 1": list(d1.values()), "Freq in Doc 2": list(d2.values())}
df = pd.DataFrame(final_dict)
df

Enter query:function
Presence of "function" in Doc 1: 1
Presence of "function" in Doc 2: 1


Unnamed: 0,Word,Freq in Doc 1,Freq in Doc 2
0,outputs,0,3
1,model.,0,3
2,"representation.""",0,1
3,function,1,6
4,"door""",1,0
...,...,...,...
4574,observed,0,2
4575,developed,2,1
4576,contain,0,4
4577,at&t,0,1


In [18]:
word = []
pos = []
for x in all_words:
    i = 0
    if x in content_1_stop:
        for y in content_1_stop:
            i += 1
            if x == y:
                word.append(x)
                pos.append(content_1_stop.index(x,i-1))
final_dict = {"Word":word, "Position":pos}
df = pd.DataFrame(final_dict)
df.head()

Unnamed: 0,Word,Position
0,function,2152
1,"door""",1703
2,"""why",241
3,opposite,1775
4,earliest-used,341


In [19]:
word = []
pos = []
for x in all_words:
    i = 0
    if x in content_2_stop:
        for y in content_2_stop:
            i += 1
            if x == y:
                word.append(x)
                pos.append(content_2_stop.index(x,i-1))
final_dict = {"Word":word, "Position":pos}
df = pd.DataFrame(final_dict)
df.head()

Unnamed: 0,Word,Position
0,outputs,1553
1,outputs,1572
2,outputs,1580
3,model.,2517
4,model.,3622


## Thankyou