# Natural Language Processing - Part 3

In [5]:
import pandas as pd
import spacy
import re
import nltk
from bs4 import BeautifulSoup

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
imdb = pd.read_csv('/content/drive/My Drive/NLP/IMDB Dataset.csv')

# Pre Processing

In [7]:
spc_en = spacy.load('en')

def pre_processing(text):

  # remove all html tags
  soup = BeautifulSoup(text)
  string =  soup.get_text()
  
  # convert uppercase into lowercase 
  string_lower = string.lower()
  
  # remove evererything is not letters
  list_only_letters = re.findall(r'[a-z]+', string_lower)

  # set english stopwords
  stopwords = nltk.corpus.stopwords.words('english')
  stop = set(stopwords)

  # remove stopwords
  list_meaningful_words = [word for word in list_only_letters \
                          if word not in stopwords]
  # join list strings on space
  string_meaninful_words = " ".join(list_meaningful_words)

  # innitializing spacy object
  spc_letters = spc_en(string_meaninful_words)

  # lemminization
  tokens = [token.lemma_ if token.pos_ == 'VERB' else str(token) \
            for token in spc_letters]

  return " ".join(tokens)

In [10]:
%%time

imdb['review_cleaned'] = imdb['review'].apply(pre_processing)

CPU times: user 22min 35s, sys: 9.58 s, total: 22min 45s
Wall time: 22min 47s


# Bigramas

In [1]:
from gensim.models.phrases import Phrases, Phraser

In [46]:
sent = [row.split() for row in imdb['review_cleaned']]

In [48]:
%%time

phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)

CPU times: user 40.7 s, sys: 407 ms, total: 41.1 s
Wall time: 41.1 s


In [49]:
sentences = bigram[sent]

# Word 2 Vector

In [3]:
import multiprocessing

from gensim.models import Word2Vec

The slowest run took 20.48 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 730 ns per loop


In [50]:
w2v_model = Word2Vec(min_count=2, size=80, window=2)

In [51]:
%%time

w2v_model.build_vocab(sentences, progress_per=10000) 

CPU times: user 29.5 s, sys: 131 ms, total: 29.6 s
Wall time: 29.6 s


In [52]:
%%time

w2v_model.train(sentences, 
                total_examples=w2v_model.corpus_count, 
                epochs=20, 
                report_delay=1)

CPU times: user 14min 39s, sys: 2.98 s, total: 14min 42s
Wall time: 9min 16s


(105702873, 112657480)