In [0]:
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

**Preprocessing**

In [0]:
# Importing stop words
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
len(stop_words)
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [0]:
pos_map = {
'CC': 'n',
'CD': 'n',
'DT': 'n',
'EX': 'n',
'FW': 'n',
'IN': 'n',
'JJ': 'a',
'JJR': 'a',
'JJS': 'a',
'LS': 'n',
'MD': 'v',
'NN': 'n',
'NNS': 'n',
'NNP': 'n',
'NNPS': 'n',
'PDT': 'n',
'POS': 'n',
'PRP': 'n',
'PRP$': 'r',
'RB': 'r',
'RBR': 'r',
'RBS': 'r',
'RP': 'n',
'TO': 'n',
'UH': 'n',
'VB': 'v',
'VBD': 'v',
'VBG': 'v',
'VBN': 'v',
'VBP': 'v',
'VBZ': 'v',
'WDT': 'n',
'WP': 'n',
'WP$': 'n',
'WRB': 'r'
}

In [0]:
from bs4 import BeautifulSoup
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

# We initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def tokenizer(example_sent):
    # example_sent = "Hi, it's me"
    # Since all the stopwords are in lower case, we have to convert the string to lowercase first
    example_sent = example_sent.lower()

    #This removes all digits from the given text
    example_sent = re.sub(" \d+", " ", example_sent)

    #This removes name of the review writter and extra white spaces
    trim_text = example_sent.split("Written by")
    example_sent = trim_text[0].strip()

    # This was a simple tokenizer that kept the punctuation symbols
    # word_tokens = word_tokenize(example_sent)
    
    # Japanese words are kept as a single word, so we can remove them easily, but urls, emails and html tags are splitted, so we
    # have to remove them before tokenizing
    
    # Removing url, emails and html tags
    # HTML TAGS
    example_sent = BeautifulSoup(example_sent, 'lxml').text

    # EMAIL ADDRESSES
    example_sent = re.sub(r'[\w\.-]+@[\w\.-]+', ' ', example_sent)

    # URLs
    example_sent = re.sub(r'http\S+', '', example_sent)

    # Removing punctuation symbol
    tokenizer = RegexpTokenizer(r'\w+')
    word_tokens = tokenizer.tokenize(example_sent)
    # Now we have obtained the tokenized words without punctuation symbols and with stopwords

    # POS Tagging the data (the stopwords improve the accuracy of the pos tagging, so we'll remove them later)
    # This method returns a list of tuples: (word, classification)
    tags = nltk.pos_tag(word_tokens)

    # We lemmatize all the words in the text by their category
    for i, word in enumerate(word_tokens):
        # Returns the lemmatized word given its category (if the key is not part of the map, the word is considered a noun)
        word_tokens[i] = lemmatizer.lemmatize(word, pos=pos_map.get(tags[i][1] , 'n'))

    # Removing stop words
    filtered_sentence = [w for w in word_tokens if not w in stop_words]

    # In html:  <br>  -->  br
    # In email:  jua@gmail.com  --> jua, gmail, com
    # In url: https://www.youtube.com  --> http, www, youtube, com
    return filtered_sentence

In [0]:
tokenizer("This is a kind,g & 1288 two t-imes")

['kind', 'g', 'two', 'imes']

**Starting Main part of Coding here**

In [0]:
from gensim.models.wrappers import FastText
import pandas as pd
from google.colab import drive

In [0]:
drive.mount('/gdrive')
%cd /gdrive
%cd My\ Drive/
%cd Colab\ Notebooks

In [0]:
movies_dataset = pd.read_json('IMDB_reviews.json', lines=True)
movies_dataset.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


In [0]:
movies_dataset.count()

review_date       573913
movie_id          573913
user_id           573913
is_spoiler        573913
review_text       573913
rating            573913
review_summary    573913
dtype: int64

In [0]:
review_spoiler = movies_dataset[movies_dataset["is_spoiler"]==True]
review_nonspoiler = movies_dataset[movies_dataset["is_spoiler"]==False]

print(review_spoiler["is_spoiler"].count())
print(review_nonspoiler["is_spoiler"].count())
print("74% Non-Spoiler and 26% Spoiler reviews")

150924
422989
74% Non-Spoiler and 26% Spoiler reviews


In [0]:
review_text = movies_dataset[["is_spoiler","review_text"]]
review_text.head()

Unnamed: 0,is_spoiler,review_text
0,True,"In its Oscar year, Shawshank Redemption (writt..."
1,True,The Shawshank Redemption is without a doubt on...
2,True,I believe that this film is the best story eve...
3,True,"**Yes, there are SPOILERS here**This film has ..."
4,True,At the heart of this extraordinary movie is a ...


In [0]:
%%time
review_labled_file= open("reviews_labled_file.txt","w+")
for index,row in review_text.iterrows():
  tokkenize_words = tokenizer(row["review_text"])
  review_labled_file.write("__label__"+str(row["is_spoiler"])+" ")
  for words in tokkenize_words:
    review_labled_file.write(words+" ")
  review_labled_file.write("\n")

review_labled_file.close()
print(tokkenize_words)

In [0]:
#Installing FastText library
!git clone https://github.com/facebookresearch/fastText.git
%cd fastText

fatal: destination path 'fastText' already exists and is not an empty directory.
/gdrive/My Drive/Colab Notebooks/fastText


In [0]:
!sudo pip install .

Processing /gdrive/My Drive/Colab Notebooks/fastText
Building wheels for collected packages: fasttext


  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.1-cp36-cp36m-linux_x86_64.whl size=2827832 sha256=020f2f8fba55175a37f66c9934994c94c51dbe598f1bda7e8149efc01ef23e61
  Stored in directory: /tmp/pip-ephem-wheel-cache-jhs9goxt/wheels/c3/f0/49/322e1370b7c691aee976dfe8a8b1c23ef2a6fe913f1a2b480f
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.1


In [0]:
%cd ..

/gdrive/My Drive


**Applying FastText model**

In [0]:
import fasttext

In [0]:
#!head -n 20000 reviews_labled_file.txt > reviews.train
#!tail -n 5000 reviews_labled_file.txt > reviews.valid

In [0]:
#%%time
#model = fasttext.train_supervised(input='reviews.train', epoch=25,lr=0.1,wordNgrams=2)

CPU times: user 1min 1s, sys: 449 ms, total: 1min 2s
Wall time: 1min 2s


**Making Balaned dataset for testing and training**


In [0]:
%%time
reviews_balanced_labled_file = open("reviews_balanced_labled_file.txt","w+")
spoiler_reviews = review_text[review_text["is_spoiler"] == True]
non_spoiler_reviews = review_text[review_text["is_spoiler"] == False]

#spoiler_reviews.count
#non_spoiler_reviews.count
for index,(spoiler,non_spoiler) in enumerate(zip(spoiler_reviews["review_text"],non_spoiler_reviews["review_text"])):
  #print(spoiler)
  if(index >= 15000):
    tokkenize_spoiler_words = tokenizer(spoiler)
    tokkenize_non_spoiler_words = tokenizer(non_spoiler)
    reviews_balanced_labled_file.write("__label__True ")
    for words in tokkenize_spoiler_words:
      reviews_balanced_labled_file.write(words+" ")
    reviews_balanced_labled_file.write("\n")

    reviews_balanced_labled_file.write("__label__False ")
    for words in tokkenize_non_spoiler_words:
      reviews_balanced_labled_file.write(words+" ")
    reviews_balanced_labled_file.write("\n")

  if(index >= 50000):
    break

reviews_balanced_labled_file.close()

CPU times: user 15min 56s, sys: 2 s, total: 15min 58s
Wall time: 15min 59s


**Spliting balanced record between 55000 training examples and 15000 test example**

In [0]:
#Split 25000 balanced reviews into train file and 5000 in test file 
!head -n 55000 reviews_balanced_labled_file.txt > reviews_balanced.train
!tail -n 15000 reviews_balanced_labled_file.txt > reviews_balanced.valid

In [0]:
%%time
balanced_model = fasttext.train_supervised(input='reviews_balanced.train', epoch=5,lr=0.1,wordNgrams=3)

CPU times: user 54.9 s, sys: 332 ms, total: 55.2 s
Wall time: 55.4 s


**Model's Output**
15000, 0.712,0.712 = 15000 records, 0.712 precision and 0.712 recall

In [0]:
result = balanced_model.test("reviews_balanced.valid")
print(result)
f1 = (2*result[1]*result[2]) / (result[1]+result[2])
print(f1) 

(15000, 0.7122666666666667, 0.7122666666666667)
0.7122666666666666


In [0]:
balanced_model.predict("like clever movie like scary movie disposition already spent money two awful movie come hollywood year abysmal godsend first glance promising ultimately stupid disappoint forgotten proceed care late shyamalan work village trailer look promise desolate turn last century village sorrounded forest horrible creature live promise careful lately first check around net amaze see big load negative review roger ebert instance whose opinion usually respect give horribly low grade great nevertheless choose see must say quite pleasantly surprise lady gentleman nicely shot atmospheric thriller great cast good story finishing touch shyamalan cleverness could simply call brilliant compare late script hollywood vomit audience lousy review well basically two kind people want see movie first horror fan expect gruesome chilling potentially bloody tale puzzle movie fan less interested movie solve late shyamalan puzzle movie horror crowd disappoint scare movie way much characterization drama taste crowd well people twist time guessable although shyamalan still trick sleeve see seem shyamalan always live shadow masterpiece th sense people still remember get sock knock powerful end keep expect happen every follow movie bad hollywood realize twist trendy lately lot movie final twist stupid cheap illogical people today set expectation damn high especially see shyamalan name movie poster movie great atmosphere great cast fantastic mostly love clever logical whatever say consistent compare hollywood crap get serve lately good movie watch puzzle great movie well cent")

(('__label__True',), array([0.69451565]))

In [0]:
print(balanced_model['life'])

[-0.00590492  0.01446329 -0.0065431  -0.04612118  0.12874627  0.03796918
  0.05304569 -0.06361787  0.05034089  0.07864754 -0.03487376  0.01380542
  0.03031745  0.03229074 -0.07920521  0.01162964  0.02253317 -0.0042737
  0.01118704  0.00413822 -0.02564579  0.05541728 -0.00460801 -0.08184667
  0.03153966  0.00481635  0.02743126 -0.01452066  0.02139711  0.06736952
  0.00749059  0.01611351 -0.061692    0.00535904 -0.03925924 -0.0053232
 -0.03062334 -0.08202317  0.01669816  0.03659871  0.02393936  0.08669323
  0.01293368  0.03421118 -0.00031267 -0.03764828  0.058515   -0.01916988
 -0.05435836  0.02000924  0.03697843  0.00894822 -0.05317028  0.04528531
 -0.03900929 -0.1582035  -0.03711943  0.00615377  0.02943057  0.02692623
 -0.04032357  0.03548437 -0.02160002  0.00423668 -0.02201628 -0.02969143
 -0.07143614  0.01813809 -0.02329556  0.04860833 -0.08811036  0.04594233
 -0.00270451 -0.03042773 -0.01615424 -0.02413941  0.06808087  0.04424851
  0.06972113 -0.03991004 -0.04503711 -0.00320185  0.0

In [0]:
%cd Colab\ Notebooks

/gdrive/My Drive/Colab Notebooks


**Testing with whole dataset**

In [0]:
!head -n 401739 reviews_labled_file.txt > reviews_whole.train
!tail -n 172174 reviews_labled_file.txt > reviews_whole.valid

In [0]:
%%time
balanced_model = fasttext.train_supervised(input='reviews_whole.train', epoch=5,lr=0.1,wordNgrams=3)

CPU times: user 4.01 s, sys: 183 ms, total: 4.19 s
Wall time: 4.25 s


In [0]:
result = balanced_model.test("reviews_whole.valid")
print(result)
f1 = (2*result[1]*result[2]) / (result[1]+result[2])
print(f1) 

(5749, 0.7622195164376413, 0.7622195164376413)
0.7622195164376413
