In [2]:
from tqdm import tqdm
import concurrent.futures
import requests
from bs4 import BeautifulSoup
import mysql.connector
import time

In [3]:
url = 'https://www.35mmc.com/26/09/2016/leica-c2-zoom-review/'

In [4]:
webReq = requests.get(url)
soup_obj = BeautifulSoup(webReq.text, "html.parser")
if soup_obj.find('article').text != None:
    raw_text = soup_obj.find('article').text
    # get title
    get_title = soup_obj.find('h1', {'class':'entry-title'}).text

In [5]:
import re
import spacy
from nltk.corpus import stopwords

class TextCleaner:
    """Designed for Inverted Indexing"""
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.stop_words = set(stopwords.words('english'))

    def normalize(self, raw_text):
        """Remove special characters and lowercase text"""
        return re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", raw_text.lower())

    def remove_stopwords(self, raw_text):
        """Remove stopwords"""
        words = [word for word in raw_text.split() if word not in self.stop_words]
        return " ".join(words)

    def lemmatize(self, raw_text):
        """Perform lemmatization"""
        doc = self.nlp(raw_text)
        return [token.lemma_ for token in doc]

    def clean(self, raw_text):
        """Clean text by normalizing, removing stopwords, and lemmatizing, Returning a list of token"""
        raw_text = self.normalize(raw_text)
        raw_text = self.remove_stopwords(raw_text)
        return self.lemmatize(raw_text)

In [6]:
clean = TextCleaner()
clean.clean(webReq.text)

['doctype',
 'html',
 'html',
 'lang',
 'en',
 'we',
 'head',
 'meta',
 'charset',
 'utf',
 '8',
 'script',
 'datum',
 'ezscrex',
 'false',
 'datum',
 'cfasync',
 'false',
 'datum',
 'pagespeed',
 'defer',
 'var',
 'ez',
 'ez',
 'ez',
 'stms',
 'date',
 'ez',
 'evt',
 'ez',
 'script',
 'ez',
 'ck',
 'ez',
 'ck',
 'ez',
 'template',
 'ez',
 'template',
 'isorig',
 'true',
 'ez',
 'queue',
 'function',
 'var',
 'e',
 '0',
 '0',
 'n',
 '1',
 'r',
 '0',
 'function',
 'e',
 'n',
 'r',
 'var',
 'l',
 'name',
 'e',
 'funcname',
 'parameter',
 'null',
 'n',
 'null',
 'n',
 'instanceof',
 'array',
 'n',
 'n',
 'isblock',
 'blockedby',
 'r',
 'deletewhencomplete',
 'iserror',
 '1',
 'iscomplete',
 '1',
 'isinitialize',
 '1',
 'proceediferror',
 'istimedelay',
 '1',
 'process',
 'function',
 'u',
 'func',
 'e',
 'l',
 'isinitialize',
 '0',
 'l',
 'iscomplete',
 '0',
 'u',
 'func',
 'apply',
 'e',
 'var',
 'l',
 'funcname',
 'split',
 'n',
 'null',
 'length',
 '3',
 'n',
 '3',
 'length',
 'window'

In [4]:
get_title

'Leica C2 Zoom Review – A Mind of Its Own – by Torsten Kathke'

# Normalize Text

In [7]:
raw_text_norm = raw_text.lower()
raw_text_norm

'\n\n \n\npoint & shoot\nleica c2 zoom review – a mind of its own – by torsten kathke\nseptember 26, 2016\n\n\nthe dream is that someone will just give you a leica. the dream is alive. well, at least it is not completely dead. i was given a leica c2 zoom.\na few months ago, a friend and colleague of mine made an intervention with her mother, who had decided, after sifting through a family member’s collection of things, to keep the nikon slr but trash the lesser cameras. a co-conspirator in the visual arts, my friend pointed out that she might know someone who could use these unwanted things. this is how one monday in june, i found two cameras and several camera-related odds and ends on my desk at work.\none of the cameras was a ridiculously well-built voigtländer rangefinder from the 1950s. the other was a plastic point and shoot from the 1990s.\xa0of course i will review the plastic point and shoot first. i mean, who wants to hear yet again about quality workmanship from a period befo

# Clear Unicode

In [8]:
import re

raw_text_norm_uni = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", raw_text_norm)

raw_text_norm_uni

'     point   shoot leica c2 zoom review   a mind of its own   by torsten kathke september 26  2016   the dream is that someone will just give you a leica  the dream is alive  well  at least it is not completely dead  i was given a leica c2 zoom  a few months ago  a friend and colleague of mine made an intervention with her mother  who had decided  after sifting through a family member s collection of things  to keep the nikon slr but trash the lesser cameras  a co conspirator in the visual arts  my friend pointed out that she might know someone who could use these unwanted things  this is how one monday in june  i found two cameras and several camera related odds and ends on my desk at work  one of the cameras was a ridiculously well built voigtl nder rangefinder from the 1950s  the other was a plastic point and shoot from the 1990s  of course i will review the plastic point and shoot first  i mean  who wants to hear yet again about quality workmanship from a period before skimping wa

# Remove stopwords + tokenize

In [9]:
# with nltk
import nltk.corpus as nltkc

stopwords = set(nltkc.stopwords.words('english'))
# tokenized
raw_text_norm_uni_stopnltk_token = []

for word in raw_text_norm_uni.split():
    if word not in stopwords:
        raw_text_norm_uni_stopnltk_token.append(word)

# joined
raw_text_norm_uni_stopnltk_join = " ".join(raw_text_norm_uni_stopnltk_token)

raw_text_norm_uni_stopnltk_join

'point shoot leica c2 zoom review mind torsten kathke september 26 2016 dream someone give leica dream alive well least completely dead given leica c2 zoom months ago friend colleague mine made intervention mother decided sifting family member collection things keep nikon slr trash lesser cameras co conspirator visual arts friend pointed might know someone could use unwanted things one monday june found two cameras several camera related odds ends desk work one cameras ridiculously well built voigtl nder rangefinder 1950s plastic point shoot 1990s course review plastic point shoot first mean wants hear yet quality workmanship period skimping rule camera construction build things anymore country going tubes country matter one bothers anymore precisely everything sucks attention spans instagram internet kids internet right mean right even want hear cameras trust voigtl nder easily survive another hundred years plenty time review one later plastic point shoot hand good long electronics ke

# Stemming

In [31]:
# # import these modules
# from nltk.stem import PorterStemmer
# from nltk.tokenize import word_tokenize

# ps = PorterStemmer()

# for w in raw_text_norm_uni_stopnltk_token:
#     print(w, " : ", ps.stem(w))

import nltk
lemma = nltk.wordnet.WordNetLemmatizer()

# for token in raw_text_norm_uni_stopnltk_token:
#     print(token, " : ", lemma.lemmatize(token))

'eating'

In [10]:
import spacy
# from nltk.stem import PorterStemmer

clean_token = []
nlp = spacy.load("en_core_web_sm")

doc = nlp(raw_text_norm_uni_stopnltk_join)
for token in doc:
    # print(token, " : ", token.lemma_)
    clean_token.append(token)
    
clean_token    

[point,
 shoot,
 leica,
 c2,
 zoom,
 review,
 mind,
 torsten,
 kathke,
 september,
 26,
 2016,
 dream,
 someone,
 give,
 leica,
 dream,
 alive,
 well,
 least,
 completely,
 dead,
 given,
 leica,
 c2,
 zoom,
 months,
 ago,
 friend,
 colleague,
 mine,
 made,
 intervention,
 mother,
 decided,
 sifting,
 family,
 member,
 collection,
 things,
 keep,
 nikon,
 slr,
 trash,
 lesser,
 cameras,
 co,
 conspirator,
 visual,
 arts,
 friend,
 pointed,
 might,
 know,
 someone,
 could,
 use,
 unwanted,
 things,
 one,
 monday,
 june,
 found,
 two,
 cameras,
 several,
 camera,
 related,
 odds,
 ends,
 desk,
 work,
 one,
 cameras,
 ridiculously,
 well,
 built,
 voigtl,
 nder,
 rangefinder,
 1950s,
 plastic,
 point,
 shoot,
 1990s,
 course,
 review,
 plastic,
 point,
 shoot,
 first,
 mean,
 wants,
 hear,
 yet,
 quality,
 workmanship,
 period,
 skimping,
 rule,
 camera,
 construction,
 build,
 things,
 anymore,
 country,
 going,
 tubes,
 country,
 matter,
 one,
 bothers,
 anymore,
 precisely,
 everything,

# Full Function
#### Normalize + Unicode Removed + Stopword Removed(NLTK) + Stemming(SPACY) + Tokenized

In [6]:
import re
import nltk.corpus as nltkc
import spacy
# ------------------------------------------------
# ------------------------------------------------
def txt_cleaner(raw_text):
    raw_text_norm = raw_text.lower()
    raw_text_norm_uni = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", raw_text_norm)
    # ------------------------------------------------
    stopwords = set(nltkc.stopwords.words('english'))
    raw_text_norm_uni_stopnltk_token = []
    for word in raw_text_norm_uni.split():
        if word not in stopwords:
            raw_text_norm_uni_stopnltk_token.append(word)
    raw_text_norm_uni_stopnltk_join = " ".join(raw_text_norm_uni_stopnltk_token)
    # ------------------------------------------------
    clean_token = []
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(raw_text_norm_uni_stopnltk_join)
    for token in doc:
        # print(token, " : ", token.lemma_)
        clean_token.append(token)
    return clean_token

# finalfunction : raw text cleaning

In [1]:
import re
import spacy
from nltk.corpus import stopwords

def txt_cleaner(raw_text):
    """Normalize + Unicode Removed + Stopword Removed(NLTK) + Stemming(SPACY) + Tokenized"""
    
    # Normalize and remove special characters
    raw_text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", raw_text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in raw_text.split() if word not in stop_words]
    raw_text = " ".join(words)
    
    # Perform lemmatization
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(raw_text)
    clean_tokens = [token.lemma_ for token in doc]
    
    return clean_tokens

# Final Function in OOP Class form

In [32]:
import re
import spacy
from nltk.corpus import stopwords

class TextCleaner:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.stop_words = set(stopwords.words('english'))

    def normalize(self, raw_text):
        """Remove special characters and lowercase text"""
        return re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", raw_text.lower())

    def remove_stopwords(self, raw_text):
        """Remove stopwords"""
        words = [word for word in raw_text.split() if word not in self.stop_words]
        return " ".join(words)

    def lemmatize(self, raw_text):
        """Perform lemmatization"""
        doc = self.nlp(raw_text)
        return [token.lemma_ for token in doc]

    def clean(self, raw_text):
        """Clean text by normalizing, removing stopwords, and lemmatizing"""
        raw_text = self.normalize(raw_text)
        raw_text = self.remove_stopwords(raw_text)
        return self.lemmatize(raw_text)


In [33]:
text_cleaner = TextCleaner()
normalized_text = text_cleaner.normalize(raw_text)
no_stopwords_text = text_cleaner.remove_stopwords(normalized_text)
lemmatized_text = text_cleaner.lemmatize(no_stopwords_text)

# Unit testing

In [16]:
import unittest

class TestTextCleaner(unittest.TestCase):
    def setUp(self):
        self.text_cleaner = TextCleaner()
        self.raw_text = "This is some raw text that needs to be cleaned."
        self.normalized_text = "this is some raw text that needs to be cleaned "
        self.normalized_text_false = "this is some raw text that needs to be cleaned"
        self.no_stopwords_text = "This raw text needs cleaned."
        self.no_stopwords_text_false = "This raw text needs cleaned"
        self.lemmatized_text = ['raw', 'text', 'need', 'clean']

    def test_normalize(self):
        self.assertEqual(self.text_cleaner.normalize(self.raw_text), self.normalized_text)
        self.assertNotEqual(self.text_cleaner.normalize(self.raw_text), self.normalized_text_false)

    def test_remove_stopwords(self):
        self.assertEqual(self.text_cleaner.remove_stopwords(self.raw_text), self.no_stopwords_text)
        self.assertNotEqual(self.text_cleaner.remove_stopwords(self.raw_text), self.raw_text)

#     Cannot test due to depends on each library's algorithm
#     def test_lemmatize(self):
#         self.assertEqual(self.text_cleaner.lemmatize(self.no_stopwords_text), self.lemmatized_text)
        
    def test_clean(self):
        self.assertEqual(self.text_cleaner.clean(self.raw_text), self.lemmatized_text)
        self.assertNotEqual(self.text_cleaner.clean(self.raw_text), self.raw_text)

# if __name__ == '__main__':
#     unittest.main()
unittest.main(argv=[''], verbosity=2, exit=False)

test_clean (__main__.TestTextCleaner) ... ok
test_normalize (__main__.TestTextCleaner) ... ok
test_remove_stopwords (__main__.TestTextCleaner) ... ok

----------------------------------------------------------------------
Ran 3 tests in 1.809s

OK


<unittest.main.TestProgram at 0x1d6635bf8b0>

In [6]:
text_cleaner = TextCleaner()
normalized_text = text_cleaner.clean('This is some raw text that needs to be cleaned.')
normalized_text

['raw', 'text', 'need', 'clean']

# Run Time 

In [21]:
import time

start = time.time()

text_cleaner = TextCleaner()
text_cleaner.clean(raw_text)

end = time.time()

print("Time : ", end-start, " sec.")

Time :  1.1791625022888184  sec.


# prep. for invert index counting frequency
### kept as dictionary


In [36]:
# clean_text = ['first', 'word', 'second', 'text', 'hello', 'third']
clean_text = lemmatized_text

freq_dict = {}
for gram in clean_text:
    if gram in freq_dict:
        freq_dict[gram] += 1
    else:
        freq_dict[gram] = 1


In [53]:
from collections import Counter

freq_dict_function = Counter(lemmatized_text)

print("Total Words : ", len(freq_dict_function))
for i in freq_dict_function:
    if freq_dict_function[i] > 10:
        print(i, " : ", freq_dict_function[i])

Total Words :  1056
point  :  23
shoot  :  21
leica  :  60
c2  :  46
zoom  :  72
well  :  18
make  :  17
camera  :  55
know  :  12
could  :  11
use  :  23
one  :  21
end  :  23
plastic  :  17
even  :  11
want  :  27
good  :  11
long  :  12
maybe  :  11
auto  :  21
like  :  13
40  :  15
mm  :  32
lens  :  20
90  :  17
also  :  15
still  :  13
image  :  13
picture  :  21
would  :  20
set  :  14
minolta  :  19
wide  :  17
take  :  17
much  :  11
flash  :  11
film  :  21
