# Text Pre-processing

**Tasks**
- remove html tags 

## import packages

In [5]:
import pandas as pd

# text pre-processing
from bs4 import BeautifulSoup

# text modelling 
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import spacy

import multiprocessing

## functions 

In [11]:
def preprocess_text(text):
    """
    Strips html tags from every document and converts the remaining text to lower case.

    Arguments
    ---------
    text (pandas.core.series.Series) A series of text documents (each document a string).

    Returns
    -------
    pandas.core.series.Series A series of text with html tags removed & lower case letters.
        The result carries a fresh default integer index; the index of the input
        series is not preserved.

    """
    # collect the cleaned documents in a plain list, converted to a series at the end
    clean_text = []
    for doc in text:
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed and emits a warning, so the cleaned
        # text could differ between environments. "html.parser" ships with Python.
        soup = BeautifulSoup(doc, "html.parser")

        # keep only the visible text and normalise the case
        # (a separate local name avoids rebinding the `text` argument mid-loop)
        clean_doc = soup.get_text().lower()
        clean_text.append(clean_doc)

    # convert list to a pandas series
    return pd.Series(clean_text)
    

In [None]:
def docvecs_to_frame(model, vec_size=None):
    """
    Collects the document vectors of a trained model into a dataframe.

    Defined as a function so this cell is safe to run before a model exists;
    as bare statements it referenced `model` and `vec_size` before either was
    defined and raised a NameError on a fresh kernel.

    Arguments
    ---------
    model An object exposing `docvecs`, which must support `len()` and integer
        indexing (e.g. the Doc2Vec model trained later in this notebook).
    vec_size (int, optional) Number of dimensions of each document vector.
        When omitted it is taken from the width of the collected vectors.

    Returns
    -------
    pandas.core.frame.DataFrame One row per document and one column per
        dimension, with columns named dim_0, dim_1, ...

    """
    # build the dict of doc vectors, keyed by the integer document tag
    vector_dict = {i: model.docvecs[i] for i in range(len(model.docvecs))}

    # the dict yields one column per document, so transpose to one row per document
    vector_df = pd.DataFrame(vector_dict).transpose()

    if vec_size is None:
        vec_size = vector_df.shape[1]

    # set the col names to be number of dimensions
    vector_df.columns = ["dim_{0}".format(dim) for dim in range(vec_size)]

    return vector_df

## load data 

In [12]:
# read the labelled training data from disk
train = pd.read_csv("data/Train.csv")
# separate the document text (features) from the label (target)
X_train, y_train = train["text"], train["label"]

In [13]:
# preview the first five raw training documents
X_train.head()

0    I grew up (b. 1965) watching and loving the Th...
1    When I put this movie in my DVD player, and sa...
2    Why do people who do not know what a particula...
3    Even though I have great interest in Biblical ...
4    Im a die hard Dads Army fan and nothing will e...
Name: text, dtype: object

In [56]:
# report how many documents are in the training set
print("There are {} training documents.".format(len(X_train)))

There are 40000 training documents.


In [14]:
# preview the first five training labels
y_train.head()

0    0
1    0
2    0
3    0
4    1
Name: label, dtype: int64

## pre-process text 


In [15]:
# pre-process the text: strip html tags and lower-case every document
# note: this rebinds X_train to the cleaned series; the raw text is still in train.text
X_train = preprocess_text(X_train)

In [100]:
# inspect the first cleaned document
X_train[0]

'i grew up (b. 1965) watching and loving the thunderbirds. all my mates at school watched. we played "thunderbirds" before school, during lunch and after school. we all wanted to be virgil or scott. no one wanted to be alan. counting down from 5 became an art form. i took my children to see the movie hoping they would get a glimpse of what i loved as a child. how bitterly disappointing. the only high point was the snappy theme tune. not that it could compare with the original score of the thunderbirds. thankfully early saturday mornings one television channel still plays reruns of the series gerry anderson and his wife created. jonatha frakes should hand in his directors chair, his version was completely hopeless. a waste of film. utter rubbish. a cgi remake may be acceptable but replacing marionettes with homo sapiens subsp. sapiens was a huge error of judgment.'

In [103]:
# wrap the cleaned series in a single-column dataframe whose column is named "text"
X_train_clean_df = X_train.to_frame(name="text")
# preview the first few rows
X_train_clean_df.head()

Unnamed: 0,text
0,i grew up (b. 1965) watching and loving the th...
1,"when i put this movie in my dvd player, and sa..."
2,why do people who do not know what a particula...
3,even though i have great interest in biblical ...
4,im a die hard dads army fan and nothing will e...


In [104]:
# save the cleaned train data
# (with the default to_csv settings the row index is written as the first column)
X_train_clean_df.to_csv("data/X_train_clean.csv")

## Tokenize text

In [17]:
# load the spaCy English pipeline with the tagger, parser and NER components disabled
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
# stream the documents through the pipeline, keeping each one as a list of token strings
docs = [[token.text for token in doc] for doc in nlp.pipe(X_train)]

## Build Doc2Vec Model 
Each `TaggedDocument` takes 2 inputs: a single document, represented as a list of unicode strings (tokens), and a list of unique `tags` for the document. A tag can simply be an integer index.

The data structure input into `Doc2Vec` should be a list of `TaggedDocument`. 

In [18]:
# pair every tokenized document with its position in `docs` as its (single) tag
tagged_docs = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(docs)
]


In [19]:
# set number of processing cores: use every CPU reported by the machine
# (passed to Doc2Vec as `workers` when the model is initialized)
cores = multiprocessing.cpu_count()


In [20]:
# set model params
max_epochs = 100  # number of training passes over the corpus
vec_size = 100    # dimensionality of the document vectors
min_count = 2     # ignore words that occur fewer than this many times
alpha = 0.025     # initial learning rate
dm = 1            # 1 selects the distributed memory (PV-DM) training algorithm
window = 10       # maximum distance between the current and predicted word

# initialize the model
# `alpha` was previously defined but never handed to the model; it is passed
# explicitly now so the setting above is the one actually used
# (0.025 is also the library default, so the resulting model is unchanged)
model = Doc2Vec(vector_size=vec_size,
                min_count=min_count,
                dm=dm,
                alpha=alpha,
                epochs=max_epochs,
                window=window,
                workers=cores)


The `.build_vocab` method builds the vocabulary for the model. It consists of all the unique words from the training corpus, along with how often each word occurs in the corpus.

The vocabulary can be accessed via `model.wv.vocab`.

In [25]:
# build the vocabulary from the tagged training documents
model.build_vocab(tagged_docs)

In [26]:
# look up "love" in the vocabulary and report how many times it occurs in the corpus
print("Word love is used {} times throughout the corpus.".format(model.wv.vocab['love'].count))

Word love is used 10403 times throughout the corpus.


In [27]:
# train the model on the tagged documents, using the document count and the
# number of epochs already stored on the model
model.train(documents=tagged_docs, 
            total_examples=model.corpus_count, 
            epochs=model.epochs)

In [29]:
# save the trained model to disk so it can be reloaded later
model.save("results/d2v.model")

## Explore the Model 

In [2]:
# load the saved model from disk (rebinds `model`)
model = Doc2Vec.load("results/d2v.model")

In [31]:
# load spacy model (same settings as used to tokenize the training documents)
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])

# run a new, already lower-cased document through the pipeline
test_data = nlp("i love the imdb.")

# keep only the token strings of the document
test_data_text = [token.text for token in test_data]

In [32]:
# look at the tokens of the test document
test_data_text

['i', 'love', 'the', 'imdb', '.']

### Infer a vector 

In [33]:
# infer a vector for the test document from its tokens, then display it
test_data_vector = model.infer_vector(test_data_text)
test_data_vector

array([ 0.3660542 , -0.35544372, -0.16209263, -0.31187433, -0.3878265 ,
       -0.39781448,  0.86542064,  0.39633673, -0.6795637 , -0.6038303 ,
       -0.3122509 ,  0.4093655 ,  0.12963279,  0.1514839 , -0.04725248,
        0.04435014, -0.30326045,  0.55323535, -0.24522036, -0.6166701 ,
       -0.16535714, -0.6879205 ,  0.53495556,  0.6721085 , -0.26523936,
       -0.25310084,  0.6211377 ,  0.57464284, -0.27010357,  0.14269952,
       -0.49309996, -0.11419404,  1.4739265 ,  0.24999467, -0.38384798,
        0.0229602 ,  0.6399379 ,  1.1553128 , -0.6511551 , -0.31271848,
        0.0299325 , -0.08276461, -0.15362152,  0.21290593,  0.33925855,
        0.22131093,  0.12085311,  0.58260417, -0.3449641 ,  0.4908272 ,
       -0.6026803 ,  0.21922089,  0.01464912, -0.83664036, -0.48538578,
        0.2655007 ,  0.571258  , -0.4635811 , -1.1522716 , -1.2571613 ,
       -0.91198575, -0.1254218 ,  0.07205608, -0.2206613 , -0.78305876,
        0.02976534,  0.12442748, -0.6776802 ,  0.58454186, -0.21

In [34]:
# check that the inferred vector has 100 dimensions (the configured vec_size)
len(test_data_vector)

100

### Get doc vector for a document in training data 


In [35]:
# get the trained doc vector for the document with tag 0 (the first training document)
model.docvecs[0]

array([ 0.78455496, -0.3975019 , -0.33198473,  0.96559227,  2.0129318 ,
        1.8874876 ,  0.5107813 ,  3.2886736 ,  3.4330976 , -3.7912304 ,
       -0.39993343,  1.5135373 ,  0.43739882,  0.38999793, -1.2009306 ,
       -1.7992256 ,  1.1126815 ,  3.2383354 , -0.6476589 , -0.9192871 ,
        1.3353504 ,  0.20898631, -2.086641  ,  4.331556  ,  1.078577  ,
        3.1066291 ,  1.0192515 , -0.995883  , -0.25814018, -3.7610812 ,
       -1.5125705 , -3.084904  , -2.0255165 , -1.2520654 ,  2.9139643 ,
       -0.35589314,  2.7762187 ,  1.2263558 ,  0.9764989 ,  1.4696493 ,
        1.4430372 , -1.3313451 ,  1.0044692 ,  1.668386  ,  0.0148002 ,
        1.4403996 ,  0.5381073 , -1.1963979 , -3.035492  , -1.0050448 ,
       -0.22705024, -1.8456436 , -0.25627303,  1.8898085 , -1.9619877 ,
       -1.9410548 , -1.2830682 ,  0.47285038, -0.6256406 ,  2.8846972 ,
       -3.8962908 , -1.5789651 ,  0.6843119 ,  1.1497818 , -0.8217583 ,
        0.9814244 , -0.49725807,  2.9933703 ,  1.2505748 ,  0.27

In [36]:
# get the trained doc vector for the document with tag 1 (the second training document)
model.docvecs[1]

array([ 1.0276638 , -3.7865596 , -1.5838888 ,  3.1244397 , -0.8921881 ,
       -1.5585665 ,  3.3190892 ,  1.9880208 ,  2.08938   , -0.5240976 ,
       -2.2305508 ,  3.5747876 , -1.9769565 ,  1.8865669 ,  0.38790658,
        3.4950302 , -2.8666272 ,  5.3179765 ,  2.8796036 , -0.12509392,
       -4.834104  , -0.20956892, -1.3277993 ,  0.6330954 , -1.9985896 ,
        3.5359511 ,  0.0689284 , -2.369902  , -0.10371426,  1.5585757 ,
       -1.0368974 ,  1.5165955 ,  2.4063566 , -0.2352102 ,  3.966048  ,
        1.9501489 ,  5.435891  ,  1.4581848 ,  1.5568197 , -1.9583993 ,
        1.6910241 ,  0.55866873,  1.5060309 , -2.7832923 ,  0.11567657,
        3.2735307 ,  0.39607665, -1.6610876 , -1.4223877 , -1.536292  ,
       -0.59399366, -0.19992135,  0.06863879, -1.8700184 ,  0.01535263,
        0.95391506, -1.1874477 ,  0.05783149,  0.427828  , -0.26444364,
       -4.614383  , -0.66521543,  1.0540657 , -1.4417485 , -3.6814992 ,
        4.929885  , -0.8851586 ,  1.9309868 , -2.2361135 ,  0.90

### Most similar document 
This returns the tags of the documents most similar to `doc 1`, along with their cosine similarity scores.

In [37]:
# find the training documents whose vectors are closest to the document with tag 1;
# each entry is a (document tag, similarity score) pair
most_sim_docs = model.docvecs.most_similar(1)
most_sim_docs

[(21946, 0.47781458497047424),
 (25471, 0.4739196300506592),
 (33118, 0.45629557967185974),
 (34488, 0.4521036744117737),
 (17886, 0.45017537474632263),
 (4513, 0.44906243681907654),
 (25708, 0.44853517413139343),
 (26544, 0.4482578933238983),
 (18306, 0.44417983293533325),
 (34499, 0.4436340034008026)]

In [38]:
# look at doc 1: rebuild its text by joining its tokens with single spaces
query_text = " ".join(tagged_docs[1].words)
query_text

"when i put this movie in my dvd player , and sat down with a coke and some chips , i had some expectations . i was hoping that this movie would contain some of the strong - points of the first movie : awsome animation , good flowing story , excellent voice cast , funny comedy and a kick - ass soundtrack . but , to my disappointment , not any of this is to be found in atlantis : milo 's return . had i read some reviews first , i might not have been so let down . the following paragraph will be directed to those who have seen the first movie , and who enjoyed it primarily for the points mentioned.when the first scene appears , your in for a shock if you just picked atlantis : milo 's return from the display - case at your local videoshop ( or whatever ) , and had the expectations i had . the music feels as a bad imitation of the first movie , and the voice cast has been replaced by a not so fitting one . ( with the exception of a few characters , like the voice of sweet ) . the actual d

In [39]:
# build a dictionary of the most similar texts, keyed by each document's
# index in the TaggedDocuments list; every text is rebuilt by joining the
# document's tokens with single spaces
most_similar_texts = {
    doc_tag: " ".join(tagged_docs[doc_tag].words)
    for doc_tag, _similarity in most_sim_docs
}
    

In [40]:
# turn the {tag: text} dictionary into a one-column dataframe indexed by document tag
most_similar_series = pd.Series(most_similar_texts, name='most_similar_texts')
most_similar_df = most_similar_series.to_frame()
most_similar_df

Unnamed: 0,most_similar_texts
21946,very bad movie ........ and i mean very bad .....
25471,"all dogs go to heaven was a quirky , funny mov..."
33118,"since watching the trailer in "" the little mer..."
34488,"to be clear from the get go , ' the bagman ' i..."
17886,this movie is just plain silly . almost every ...
4513,this is a very cool movie . the ending of the ...
25708,i had a bit of hope for this hour long film ma...
26544,i did n't enjoy this film . i thought the acti...
18306,my main criticism with the movie is the animat...
34499,they did it again : ripped off an old show 's ...


In [44]:
# full text of the most similar document (tag 21946)
most_similar_df.loc[21946][0]

"very bad movie ........ and i mean very bad ... the plot is predictable , and it 's eally cheesy , the creativeness of the battle and the dance scenes for the time are the only reason i did n't give the movie a one , other than that ... this is def a movie one can def afford not to watch ..... i feel while watching the movie , the idea behind the movie was an interesting one tho kind of cliché .... bringing country bumpkins to the city blah blah blah , but i feel it might have been at least a little better if it just was n't so cheesy , very poorly portrayed from idea to screen , i think . the plot is somewhat predictable at times , tho the dancing i can say at times , is pretty good , the break dance battle twist was good ..... if u just pop the movie and watch the dance scenes and make up your own dialog maybe it can be a 5 ... lol"

In [45]:
# full text of the second most similar document (tag 25471)
most_similar_df.loc[25471][0]

"all dogs go to heaven was a quirky , funny movie ; with good name talent who 's voices lended an adult familiarity to a cartoon basicly for kids . it was just interesting enough to be likeable by adults aside from something good for the kids to watch.unfortunately adgth2 is a valueless sequel trying to make a bit of cash rideing on the coattails of the first . charlie sheen is a passable replacement for burt reynolds in this second movie and sheena easton 's voice in a few of the movies lovely but forgettable songs makes her a worthwhile pick as a co - star for this . add dom deluise from the first movie and you 'd think this would be a decent mix to make this sequel at least relatively decent compared to the first one.unfortunately even with the addition of other good voice actors such as bebe neuwirth in the horrible role of anabelle , this movie can not be saved from the atrocious production values and animation skills ( or lack thereof ) present all over this movie . horrible edit

In [46]:
# full text of the third most similar document (tag 33118)
most_similar_df.loc[33118][0]

'since watching the trailer in " the little mermaid ii : return to the sea " dvd , i had a feeling that this movie is gon na be great \'cause i am a huge disney fan . and guess what ? i \'m right ! this movie is a very worthy successor to the original classic " lady and the tramp".it tells the story of scamp , lady and tramp \'s mischievious son scamp , who wants to be wild and free instead of living a housedog life . though the movie might not be as good as the first one , it has a great moral that you could n\'t find anywhere else until you watch it.i admit that the movie is n\'t for everyone , but those of you who hate it , all i can say is that you do n\'t have a spirit for this and i suggest that you should n\'t go see it again . but hey ! it \'s really an awesome story , packed with brilliant animation , music , and star - studded voice talents featuring scott wolf(party of five ) and alyssa milano(charmed ) . so if you have n\'t seen the movie , why standing there ? go and grab 

### Most similar words

In [47]:
# words whose vectors are closest to 'love', with their similarity scores
model.wv.most_similar('love')

[('hate', 0.5928248167037964),
 ('loved', 0.5912954211235046),
 ('enjoy', 0.5877125263214111),
 ('mean', 0.5757228136062622),
 ('adore', 0.5671936273574829),
 ('recommend', 0.5667845010757446),
 ('think', 0.5659958124160767),
 ('suggest', 0.5496004223823547),
 ('dislike', 0.5439720153808594),
 ('liked', 0.5359171628952026)]

In [48]:
# words whose vectors are closest to 'violent', with their similarity scores
model.wv.most_similar('violent')

[('brutal', 0.6238125562667847),
 ('graphic', 0.5960907936096191),
 ('tame', 0.5844867825508118),
 ('disturbing', 0.5794614553451538),
 ('violence', 0.5529541969299316),
 ('gory', 0.5460259914398193),
 ('explicit', 0.5345040559768677),
 ('nasty', 0.5000576972961426),
 ('vicious', 0.4735429286956787),
 ('violence.the', 0.4725140929222107)]

In [49]:
# words whose vectors are closest to 'grass', with their similarity scores
model.wv.most_similar('grass')

[('compadres', 0.4567449688911438),
 ('airless', 0.4091748595237732),
 ('stairways', 0.4077175259590149),
 ('halo', 0.40057915449142456),
 ('wildebeests', 0.38299745321273804),
 ('foliage', 0.3772423267364502),
 ('marebito', 0.3742396831512451),
 ('champs', 0.37367862462997437),
 ('jello', 0.3735056519508362),
 ('fluorescent', 0.3645794987678528)]

In [50]:
# words whose vectors are closest to 'cat', with their similarity scores
model.wv.most_similar('cat')

[('dog', 0.5416181087493896),
 ('rabbit', 0.4458198547363281),
 ('sloth', 0.437106192111969),
 ('shark', 0.42957472801208496),
 ('jack', 0.4259747564792633),
 ('pet', 0.41923123598098755),
 ('twin', 0.4170091152191162),
 ('puppetmaster', 0.4132137894630432),
 ('snake', 0.411146879196167),
 ('parrot', 0.40274378657341003)]

In [51]:
# words whose vectors are closest to 'hero', with their similarity scores
model.wv.most_similar('hero')

[('heroes', 0.6207660436630249),
 ('protagonist', 0.6020461320877075),
 ('heroine', 0.5781691074371338),
 ('villain', 0.5496834516525269),
 ('everyman', 0.48411333560943604),
 ('boyfriend', 0.47233858704566956),
 ('girlfriend', 0.4598536193370819),
 ('antagonist', 0.4566743075847626),
 ('partner', 0.454791784286499),
 ('superhero', 0.4533049166202545)]

## Extract Doc Vectors 

In [95]:
# gather every trained doc vector into a dict keyed by its integer position
vector_dict = {i: model.docvecs[i] for i in range(len(model.docvecs))}

# one column name per vector dimension: dim_0, dim_1, ...
colnames = ["dim_{0}".format(dim) for dim in range(vec_size)]

In [96]:
# create a dataframe of doc vectors: the dict gives one column per document,
# so transpose it to get one row per document
vector_df = pd.DataFrame(vector_dict).transpose()
# name the columns after the dimensions (dim_0, dim_1, ...)
vector_df.columns = colnames

In [98]:
# report the shape of the doc vector dataframe: rows are documents, columns are dimensions
print("There are {0} documents; each document is represented in {1} dimensions.".format(vector_df.shape[0], vector_df.shape[1]))

There are 40000 documents; each document is represented in 100 dimensions.


In [91]:
# look at the first few documents
# each row is one movie review, each column one dimension of its doc vector
vector_df.head()

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_90,dim_91,dim_92,dim_93,dim_94,dim_95,dim_96,dim_97,dim_98,dim_99
0,0.784555,-0.397502,-0.331985,0.965592,2.012932,1.887488,0.510781,3.288674,3.433098,-3.79123,...,-1.244087,1.192884,0.891242,2.866597,8.549128,-0.478778,1.692087,2.417104,1.771905,-1.312251
1,1.027664,-3.78656,-1.583889,3.12444,-0.892188,-1.558566,3.319089,1.988021,2.08938,-0.524098,...,0.397586,2.595253,2.952577,-2.68418,1.753658,-1.200642,-0.714399,-1.44035,1.5492,-1.61397
2,0.591049,-1.170077,0.830686,3.815988,-0.730998,0.646123,4.21671,3.768117,1.581684,0.66036,...,2.918566,0.399827,-1.357417,-0.124357,0.657655,-2.304216,-3.644226,-0.174466,2.901997,0.991823
3,-1.579851,0.370804,0.325788,2.72486,0.076738,0.271273,2.576356,1.767729,-1.268051,-3.105602,...,1.757116,-3.50896,-0.034007,0.554777,2.150062,1.295318,-0.126439,2.238556,1.850339,-1.993807
4,-1.05891,-1.690887,-0.004952,-2.187342,-3.583359,2.466982,2.651749,0.484369,1.344823,-2.241074,...,-2.418538,-4.10603,2.691415,-2.924291,-0.589641,-2.774781,-1.087569,-1.415747,-0.630574,0.065931


In [99]:
# save the document vector dataset
# (with the default to_csv settings the row index is written as the first column)
vector_df.to_csv("data/train_d2v.csv")