# Text Pre-processing

**Tasks**
- remove html tags 

## import packages

In [40]:
import pandas as pd

# text pre-processing
from bs4 import BeautifulSoup

# text modelling 
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import spacy

import multiprocessing

## functions 

In [41]:
def preprocess_text(text):
    """ 
    Strip html markup from every document and convert what is left to lower case.
    
    Arguments
    ---------
    text (pandas.core.series.Series) A series of text documents. 
    
    Returns
    -------
    pandas.core.series.Series The cleaned documents, in input order, on a new
        default integer index (0..n-1).
        
    """
    # initialize a list for cleaned text 
    clean_text = []
    for doc in text:

        # name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed (and warns about guessing), so the
        # cleaned text could differ from one machine to the next
        soup = BeautifulSoup(doc, "html.parser")

        # append directly instead of rebinding the `text` argument inside the loop
        clean_text.append(soup.get_text().lower())

    # convert list to a pandas series 
    return pd.Series(clean_text)
    

## load data 

In [42]:
# read the labelled training reviews
train = pd.read_csv("data/Train.csv")

# split into the review text and the label
X_train, y_train = train["text"], train["label"]

In [43]:
X_train.head()

0    I grew up (b. 1965) watching and loving the Th...
1    When I put this movie in my DVD player, and sa...
2    Why do people who do not know what a particula...
3    Even though I have great interest in Biblical ...
4    Im a die hard Dads Army fan and nothing will e...
Name: text, dtype: object

In [44]:
y_train.head()

0    0
1    0
2    0
3    0
4    1
Name: label, dtype: int64

## pre-process text 


In [45]:
# pre-process the text: strip html tags and lower-case every document
# (X_train is rebound to the cleaned series; re-run the load cell to get the raw text back)
X_train = preprocess_text(X_train)

In [46]:
X_train[0]

'i grew up (b. 1965) watching and loving the thunderbirds. all my mates at school watched. we played "thunderbirds" before school, during lunch and after school. we all wanted to be virgil or scott. no one wanted to be alan. counting down from 5 became an art form. i took my children to see the movie hoping they would get a glimpse of what i loved as a child. how bitterly disappointing. the only high point was the snappy theme tune. not that it could compare with the original score of the thunderbirds. thankfully early saturday mornings one television channel still plays reruns of the series gerry anderson and his wife created. jonatha frakes should hand in his directors chair, his version was completely hopeless. a waste of film. utter rubbish. a cgi remake may be acceptable but replacing marionettes with homo sapiens subsp. sapiens was a huge error of judgment.'

## Tokenize text

In [47]:
# load the spacy english model with the tagger, parser and ner components
# disabled -- only the token strings are used below
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])

# stream the corpus through spacy and keep each document's token strings
docs = [[token.text for token in doc] for doc in nlp.pipe(X_train)]

## Build Doc2Vec Model 
Each training example for `Doc2Vec` has two parts: a single document, represented as a list of unicode strings (tokens), and a unique `tag` for that document, which can simply be an integer index. 

The data structure input into `Doc2Vec` should be a list of `TaggedDocument`. 

In [48]:
# tag each token list with its position in the corpus
tagged_docs = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(docs)
]


In [25]:
# set number of processing cores: use every CPU the machine reports
# (passed to Doc2Vec as `workers` when the model is initialized)
cores = multiprocessing.cpu_count()


In [30]:
# set model params
max_epochs = 100   # number of training passes over the corpus
vec_size = 100     # dimensionality of the learned vectors
min_count = 2      # ignore words that occur fewer than this many times
alpha = 0.025      # initial learning rate
dm = 1             # 1 selects the distributed-memory (PV-DM) algorithm
window = 10        # context window size

# initialize the model 
# `alpha` used to be defined above but never handed to the constructor;
# 0.025 is also gensim's default, so passing it keeps the same behaviour
# while making the setting above actually take effect when it is tuned
model = Doc2Vec(vector_size=vec_size,
                min_count=min_count,
                alpha=alpha,
                dm=dm,
                epochs=max_epochs,
                window=window,
                workers=cores)


The `.build_vocab` method builds the model's vocabulary. It consists of the unique words from the training corpus (those that occur at least `min_count` times) along with their frequency counts in the corpus. 

Once the vocabulary has been built, it can be accessed via `model.wv.vocab`.

In [38]:
# get the number of times "love" is used in the corpus
# NOTE(review): this reads the vocabulary, which is only filled by the
# `model.build_vocab(tagged_docs)` call in a later cell -- on a top-to-bottom
# run this cell needs to come after that one.
print("Word love is used {} times throughout the corpus.".format(model.wv.vocab['love'].count))

Word love is used 10403 times throughout the corpus.


In [31]:
# build the vocabulary from the tagged corpus
model.build_vocab(tagged_docs)

# train on the same corpus for the number of epochs configured on the model
model.train(tagged_docs,
            total_examples=model.corpus_count,
            epochs=model.epochs)

In [32]:
# save the model 
# NOTE(review): presumably requires the `results/` directory to already exist -- confirm
model.save("results/d2v.model")

## Explore the Model 

In [2]:
# load the saved model (same path the save cell writes to), so the
# exploration below can run without retraining
model = Doc2Vec.load("results/d2v.model")

In [20]:
# load the spacy model with the same components disabled as for the training corpus
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])

# tokenize a new, already lower-cased document
test_data = nlp("i love the imdb.")

# keep only the token strings
test_data_text = [tok.text for tok in test_data]

In [21]:
# look at the test_data
test_data_text

['i', 'love', 'the', 'imdb', '.']

### Infer a vector 

In [23]:
# look at the inferred vector for the test document 
test_data_vector = model.infer_vector(test_data_text)
test_data_vector

array([-1.3734988 ,  0.18976772, -0.6103211 , -0.14179629,  0.0648508 ,
        0.74232054, -0.3678494 , -0.45160022, -0.43654352,  0.34054545,
        0.09597275,  0.5183157 ,  0.6617551 , -0.03661314,  0.9086173 ,
        0.0165403 , -0.930917  , -0.24195996,  0.8548493 ,  0.170209  ,
        0.48797396, -0.13557488, -0.690034  , -0.29931948,  0.94054925,
        0.2746831 ,  1.129763  , -0.6760563 , -0.103834  , -0.6569413 ,
       -0.5803231 , -0.078127  ,  0.35254028, -0.27329898,  0.7407973 ,
        0.12073723,  0.06908166, -0.39688852, -0.90500194,  1.1494603 ,
       -0.14802633, -0.2030067 ,  0.74796444, -0.9517787 , -0.14173594,
       -0.00737411, -0.1972842 ,  0.3129382 , -0.29342565, -1.3055211 ,
       -0.12911199,  0.09247501,  0.62931174,  1.1020342 , -0.14113016,
        0.13749035, -0.13245723,  0.51655716,  0.10914865,  0.01775738,
        0.5450587 ,  0.73628545, -0.18425721,  0.36158788, -0.1964601 ,
        0.41417643,  0.30609086, -0.02259944, -0.32677224, -0.92

In [24]:
# check that the vector is 100 dimensions 
len(test_data_vector)

100

### Get doc vector for a document in training data 


In [30]:
# get doc vector for document with tag 0
model.docvecs[0]

array([ 1.0750184 , -1.9905442 , -4.4494996 , -0.53621876,  3.4369726 ,
        2.8636072 ,  1.4251504 , -0.7873193 ,  1.6152562 ,  1.7793663 ,
        3.8462756 , -2.4988034 , -0.6999853 ,  3.5884042 ,  1.1586093 ,
       -0.8732167 ,  1.7217646 , -0.17224672,  0.46198705,  0.7636378 ,
       -0.32666126, -3.6621585 , -3.4452653 ,  0.7645475 , -0.85489446,
       -0.67878807,  3.566059  , -2.6591861 , -3.8239892 , -1.3997021 ,
        2.7022707 ,  0.4104823 ,  0.5845527 ,  1.5927402 , -2.1032183 ,
        0.84438735,  3.537056  , -0.9749475 ,  1.007521  , -1.9738562 ,
        1.1589872 ,  1.7555975 , -0.5638223 , -1.5018481 ,  2.7231636 ,
       -2.4119732 , -1.0664274 ,  1.5496196 ,  3.2364523 ,  0.6432062 ,
        0.45458597,  1.8402778 , -0.77025706,  1.7505522 , -0.06603717,
        0.03137324,  0.52317595,  1.290314  ,  0.7653793 , -0.8724251 ,
       -0.8246546 ,  1.0141776 ,  0.9919102 , -1.3215085 ,  2.8690574 ,
       -0.07864825,  0.25158533,  0.14985803,  1.3841814 , -1.17

In [29]:
# get the doc vector for the document with tag 1
model.docvecs[1]

array([-0.7390024 ,  2.581647  , -4.2145786 , -2.2802477 , -0.3221571 ,
        4.741527  ,  0.17803015,  0.6950085 ,  0.13797103,  1.9404503 ,
        1.6041267 , -1.5517176 , -3.6905885 , -2.1935484 , -0.94091237,
       -0.00773263,  0.9710394 ,  0.62761796, -1.1550109 ,  0.18732233,
        0.07355768, -3.2035456 ,  0.8499388 ,  0.06021781,  0.8296846 ,
        3.2018712 , -1.2890564 ,  0.14946471,  4.286643  , -4.837212  ,
        2.058952  , -1.7170444 ,  2.4506824 ,  0.67044514, -0.9461834 ,
       -1.9214734 , -0.5804667 , -0.49450612,  0.26986015,  0.09560651,
       -3.2881393 ,  1.0870681 ,  3.6450682 ,  0.61791784,  2.1359613 ,
       -4.0119777 ,  0.6193669 , -0.42527515,  0.8767335 ,  0.06208834,
       -0.51736164,  4.9553204 ,  0.24020366, -0.29787928,  0.8699073 ,
        1.5285801 , -2.281047  ,  1.1915747 , -2.7606277 , -0.7646156 ,
        5.294057  ,  2.4270234 ,  4.6306705 ,  1.6088039 ,  1.81563   ,
       -0.19997375,  2.6723425 ,  0.26849866, -0.90769136,  5.36

### Most similar document 
This returns the tags of the most similar documents, along with their cosine similarity scores to `doc 1`. 

In [55]:
# nearest training documents to the document tagged 1,
# as (tag, cosine similarity) pairs
most_sim_docs = model.docvecs.most_similar(1)
most_sim_docs

[(25471, 0.524448812007904),
 (4513, 0.46837306022644043),
 (12130, 0.46624577045440674),
 (1437, 0.465658038854599),
 (17813, 0.4653197228908539),
 (29350, 0.456831693649292),
 (16058, 0.4560166597366333),
 (32890, 0.45247572660446167),
 (33254, 0.45109111070632935),
 (34499, 0.4502790570259094)]

In [88]:
# look at doc 1 
query_text = " ".join(tagged_docs[1].words)
query_text

"when i put this movie in my dvd player , and sat down with a coke and some chips , i had some expectations . i was hoping that this movie would contain some of the strong - points of the first movie : awsome animation , good flowing story , excellent voice cast , funny comedy and a kick - ass soundtrack . but , to my disappointment , not any of this is to be found in atlantis : milo 's return . had i read some reviews first , i might not have been so let down . the following paragraph will be directed to those who have seen the first movie , and who enjoyed it primarily for the points mentioned.when the first scene appears , your in for a shock if you just picked atlantis : milo 's return from the display - case at your local videoshop ( or whatever ) , and had the expectations i had . the music feels as a bad imitation of the first movie , and the voice cast has been replaced by a not so fitting one . ( with the exception of a few characters , like the voice of sweet ) . the actual d

In [89]:
# map each most-similar document's tag to its text, rebuilt by joining the
# document's tokens with spaces; the tag is also the document's position
# in the tagged_docs list
most_similar_texts = {
    tag: " ".join(tagged_docs[tag].words)
    for tag, _similarity in most_sim_docs
}
    

In [90]:
# convert the most similar texts into a one-column dataframe indexed by document tag
# (label the single row up front so it becomes the column name after transposing)
most_similar_df = pd.DataFrame(most_similar_texts, index=['most_similar_texts']).T
most_similar_df

Unnamed: 0,most_similar_texts
25471,"all dogs go to heaven was a quirky , funny mov..."
4513,this is a very cool movie . the ending of the ...
12130,this third pokemon movie is too abstract for y...
1437,i wish i could find some good things to say ab...
17813,it started out slow after an excellent animate...
29350,this movie is quite better than the first one ...
16058,102 dalmatians [ walt disney ] : i was n't a f...
32890,i had absolutely nothing to do the past weeken...
33254,i could n't keep from commenting after reading...
34499,they did it again : ripped off an old show 's ...


In [77]:
# most similar text 25471
# select by row tag and column label; the previous `.loc[25471][0]` relied on
# integer keys falling back to positions on a label-indexed Series, which
# pandas has deprecated
most_similar_df.loc[25471, 'most_similar_texts']

"all dogs go to heaven was a quirky , funny movie ; with good name talent who 's voices lended an adult familiarity to a cartoon basicly for kids . it was just interesting enough to be likeable by adults aside from something good for the kids to watch.unfortunately adgth2 is a valueless sequel trying to make a bit of cash rideing on the coattails of the first . charlie sheen is a passable replacement for burt reynolds in this second movie and sheena easton 's voice in a few of the movies lovely but forgettable songs makes her a worthwhile pick as a co - star for this . add dom deluise from the first movie and you 'd think this would be a decent mix to make this sequel at least relatively decent compared to the first one.unfortunately even with the addition of other good voice actors such as bebe neuwirth in the horrible role of anabelle , this movie can not be saved from the atrocious production values and animation skills ( or lack thereof ) present all over this movie . horrible edit

In [79]:
# select by row tag and column label rather than the deprecated positional `[0]`
most_similar_df.loc[4513, 'most_similar_texts']

"this is a very cool movie . the ending of the movie is a bit more defined than the play 's ending , but either way it is still a good movie ."

In [80]:
# select by row tag and column label rather than the deprecated positional `[0]`
most_similar_df.loc[12130, 'most_similar_texts']

"this third pokemon movie is too abstract for younger kids to follow and too repetitious to entertain older kids . the message of the film-- about dealing with loss-- is subverted by the return of the young girl 's father during the film 's credits . team rocket provide some amusement , but they 're not really part of the small plot , so they do n't appear very often ."

### Most similar words

In [33]:
model.wv.most_similar('love')

[('hate', 0.6397315859794617),
 ('adore', 0.5936261415481567),
 ('enjoy', 0.5934271812438965),
 ('loved', 0.5764082670211792),
 ('dislike', 0.5725849270820618),
 ('prefer', 0.535727858543396),
 ('agree', 0.5338077545166016),
 ('mean', 0.5323845744132996),
 ('recommend', 0.5302844047546387),
 ('think', 0.5271144509315491)]

In [34]:
model.wv.most_similar('violent')

[('brutal', 0.5958952903747559),
 ('graphic', 0.593812882900238),
 ('disturbing', 0.5733826756477356),
 ('gory', 0.5440038442611694),
 ('tame', 0.5431349277496338),
 ('explicit', 0.5158456563949585),
 ('violence', 0.4995356798171997),
 ('graphically', 0.48809197545051575),
 ('nasty', 0.47407007217407227),
 ('gruesome', 0.4647546410560608)]

In [82]:
model.wv.most_similar('grass')

[('stairways', 0.41978752613067627),
 ('fluorescent', 0.41299641132354736),
 ('shaven', 0.39963239431381226),
 ('movement', 0.3951754570007324),
 ('foam', 0.3854529559612274),
 ('compadres', 0.38530921936035156),
 ('pond', 0.3831974267959595),
 ('cadwell', 0.37259089946746826),
 ('jello', 0.3722376823425293),
 ('fur', 0.3714328408241272)]

In [83]:
model.wv.most_similar('cat')

[('dog', 0.5081092119216919),
 ('rabbit', 0.5076563954353333),
 ('madman', 0.4489584267139435),
 ('snake', 0.4232267737388611),
 ('grandpa', 0.4164738655090332),
 ('mickey', 0.4138184189796448),
 ('walrus', 0.40957218408584595),
 ('baby', 0.40911436080932617),
 ('"randy', 0.4075496196746826),
 ('pet', 0.40383607149124146)]

In [84]:
model.wv.most_similar('hero')

[('heroes', 0.6354663968086243),
 ('protagonist', 0.6213669180870056),
 ('heroine', 0.5825915932655334),
 ('villain', 0.5639052987098694),
 ('girlfriend', 0.46861889958381653),
 ('boyfriend', 0.45519381761550903),
 ('catchphrase', 0.4545667767524719),
 ('antagonist', 0.4537510573863983),
 ('dad', 0.45145243406295776),
 ('henchman', 0.4488012194633484)]

## Assess the Model 

The first sanity check in assessing the model is to compare each document vector learned during training with the vector produced for the same document by the model's `infer_vector` method (i.e. use the trained model to re-infer the vectors of the training documents). The inferred vectors are expected to be very close to the vectors learned during training; this check just makes sure nothing has gone badly wrong with the model. 