# Loading

In [185]:
import pandas as pd
import numpy as np
import re
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.models import Word2Vec

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,topic,text
0,0,Student,In het kader van kernfusie op aarde: MAAK JE ...
1,1,InvestmentBanking,"Just so you know, this blog isn't about being ..."
2,2,Education,"I I donÃ¢ÂÂt remember his name, but I remem..."
3,3,Science,so there they were in the cotton candy shack. ...
4,4,Student,By: TIRI!!! I was sitting in the police sta...


In [4]:
df.reset_index(inplace = True)

In [5]:
df = df[['index', 'topic', 'text']]

In [6]:
df.head()

Unnamed: 0,index,topic,text
0,0,Student,In het kader van kernfusie op aarde: MAAK JE ...
1,1,InvestmentBanking,"Just so you know, this blog isn't about being ..."
2,2,Education,"I I donÃ¢ÂÂt remember his name, but I remem..."
3,3,Science,so there they were in the cotton candy shack. ...
4,4,Student,By: TIRI!!! I was sitting in the police sta...


# Processing

### Removing unwanted characters

In [7]:
df.text[50][:1500]

'Ã¢Â\x80Â\x9cIÃ¢Â\x80Â\x99m sorry, what?Ã¢Â\x80Â\x9d I mumbled as I looked up, meeting eyes with my waitress. IÃ¢Â\x80Â\x99m not entirely sure how long she had been standing there or how many times she had asked her question. Ã¢Â\x80Â\x9cOh um, right. Could I get a mocha?Ã¢Â\x80Â\x9d  Ã¢Â\x80Â\x9cSure thing, Hon. No problem,Ã¢Â\x80Â\x9d she replied in a motherly voice as she turned to head back to the counter.  Ã¢Â\x80Â\x9cOh, and a little extra chocolate, please!Ã¢Â\x80Â\x9d I called after her. It wasnÃ¢Â\x80Â\x99t that I was a chocolate freak. I just had to cover up the bite of the espresso. Too much of that, and my stomach was done.  The cafÃ\x83Â© was quaint and quiet. But then again, how much business can you expect on a Wednesday night? Sure, there was the troubled Goth in the corner, the terribly hip guy strumming his guitar in the other corner, and a pair of half-dressed, bleach-blonde creatures I can only describe as Ã¢Â\x80Â\x9cGigglyÃ¢Â\x80Â\x9d sitting a few tables to my ri

In [8]:
df['text'] = df.text.replace('[^a-zA-Z.,?: /\\\'\"]', '', regex = True)

In [9]:
df.text[50][:1500]

'Im sorry, what? I mumbled as I looked up, meeting eyes with my waitress. Im not entirely sure how long she had been standing there or how many times she had asked her question. Oh um, right. Could I get a mocha?  Sure thing, Hon. No problem, she replied in a motherly voice as she turned to head back to the counter.  Oh, and a little extra chocolate, please I called after her. It wasnt that I was a chocolate freak. I just had to cover up the bite of the espresso. Too much of that, and my stomach was done.  The caf was quaint and quiet. But then again, how much business can you expect on a Wednesday night? Sure, there was the troubled Goth in the corner, the terribly hip guy strumming his guitar in the other corner, and a pair of halfdressed, bleachblonde creatures I can only describe as Giggly sitting a few tables to my right. And then there was me. What the hell was I doing here? Ah well, the chair was comfortable enough, and I was well on my way to some mochafilled solitude  She must

In [10]:
df.head()

Unnamed: 0,index,topic,text
0,0,Student,In het kader van kernfusie op aarde: MAAK JE ...
1,1,InvestmentBanking,"Just so you know, this blog isn't about being ..."
2,2,Education,"I I dont remember his name, but I remember th..."
3,3,Science,so there they were in the cotton candy shack. ...
4,4,Student,By: TIRI I was sitting in the police statio...


### Removing stop words

In [11]:
stopwords_set = set( stopwords.words('english') + stopwords.words('spanish') + stopwords.words('dutch') )

In [12]:
list(stopwords_set)[0:5]

['la', 'zal', 'para', "it's", 'does']

In [13]:
pattern = r",*(\s*\b(?:{}))\b".format("|".join(stopwords_set))

In [14]:
def df_remove_stopwords(text):
    text = re.sub('[^a-zA-Z,.? ]', '', text)
    return re.sub(pattern, "", text).strip()

In [15]:
df['details'] = df.text.apply(df_remove_stopwords)

In [16]:
df.head()

Unnamed: 0,index,topic,text,details
0,0,Student,In het kader van kernfusie op aarde: MAAK JE ...,In kader kernfusie aarde MAAK JE EIGEN WATERS...
1,1,InvestmentBanking,"Just so you know, this blog isn't about being ...",Just know blog isnt political. If anything apo...
2,2,Education,"I I dont remember his name, but I remember th...",I I dont remember name I remember first time ...
3,3,Science,so there they were in the cotton candy shack. ...,"cotton candy shack. bo jangles, mr.johnreed, c..."
4,4,Student,By: TIRI I was sitting in the police statio...,By TIRI I sitting police station someone ca...


### Tokenizing

In [17]:
def df_sent_tokenize(text):
    return sent_tokenize(text)

In [18]:
df['details'] = df.details.apply(df_sent_tokenize)

In [19]:
df.head()

Unnamed: 0,index,topic,text,details
0,0,Student,In het kader van kernfusie op aarde: MAAK JE ...,[In kader kernfusie aarde MAAK JE EIGEN WATER...
1,1,InvestmentBanking,"Just so you know, this blog isn't about being ...","[Just know blog isnt political., If anything a..."
2,2,Education,"I I dont remember his name, but I remember th...",[I I dont remember name I remember first time...
3,3,Science,so there they were in the cotton candy shack. ...,"[cotton candy shack., bo jangles, mr.johnreed,..."
4,4,Student,By: TIRI I was sitting in the police statio...,[By TIRI I sitting police station someone c...


In [20]:
def df_word_tokenize(sentences):
    return [
        word_tokenize(sentence)
        for sentence in sentences
    ]

In [21]:
df['details'] = df.details.apply(df_word_tokenize)

In [22]:
df.head()

Unnamed: 0,index,topic,text,details
0,0,Student,In het kader van kernfusie op aarde: MAAK JE ...,"[[In, kader, kernfusie, aarde, MAAK, JE, EIGEN..."
1,1,InvestmentBanking,"Just so you know, this blog isn't about being ...","[[Just, know, blog, isnt, political, .], [If, ..."
2,2,Education,"I I dont remember his name, but I remember th...","[[I, I, dont, remember, name, I, remember, fir..."
3,3,Science,so there they were in the cotton candy shack. ...,"[[cotton, candy, shack, .], [bo, jangles, ,, m..."
4,4,Student,By: TIRI I was sitting in the police statio...,"[[By, TIRI, I, sitting, police, station, someo..."


# Extracting title

In [120]:
def df_extract_title(details):
    try:
        doc = [
            TaggedDocument(words, [index])
            for index, words in enumerate(details)
        ]

        model = Doc2Vec(epochs = 30)
        model.build_vocab(doc)
        model.train(doc, total_examples = model.corpus_count, epochs = model.epochs)
        
        counts = dict()
        for word in model.wv.key_to_index:
            counts[word] = model.wv.get_vecattr(word, 'count')
        
        counts = sorted(
            counts.items(),
            key = lambda item: item[1],
            reverse = True
        )[0:len(model.wv.index_to_key) // 2]
        
        counts = list(dict(counts).keys())
        
        negative_check = ['I', '?', '.' ,'he', 'him', 'she', 'her']
        negative = list()
        for neg in negative_check:
            if neg in model.wv.index_to_key:
                negative.append(neg)
        
        most_common = model.wv.most_similar(positive = counts, negative = negative)
        title = ' '.join([word[0] for word in most_common])
        
        return title
    except Exception as e:
        print(e)
        return np.NaN

In [121]:
df_extract_title(df.details[10])

'take book Dame place along one read Paris banks Tower'

In [124]:
df['title'] = df.details.apply(df_extract_title)

you must first build vocabulary before training the model
you must first build vocabulary before training the model
you must first build vocabulary before training the model
you must first build vocabulary before training the model


In [125]:
df.head()

Unnamed: 0,index,topic,text,details,title
0,0,Student,In het kader van kernfusie op aarde: MAAK JE ...,"[[In, kader, kernfusie, aarde, MAAK, JE, EIGEN...",nucleus enough Now percent When form In Your N...
1,1,InvestmentBanking,"Just so you know, this blog isn't about being ...","[[Just, know, blog, isnt, political, .], [If, ...",Bush North make much didnt may Iraq country Pr...
2,2,Education,"I I dont remember his name, but I remember th...","[[I, I, dont, remember, name, I, remember, fir...",Im job say We could work knew education teachi...
3,3,Science,so there they were in the cotton candy shack. ...,"[[cotton, candy, shack, .], [bo, jangles, ,, m...",else still little would make nbspnbsp jeepers ...
4,4,Student,By: TIRI I was sitting in the police statio...,"[[By, TIRI, I, sitting, police, station, someo...",talking deal murder roses saying home know goi...


# Recommending by Title

In [144]:
df.drop(['details'], axis = 1, inplace = True)

In [132]:
df.title.isnull().sum()

4

In [133]:
df.dropna(inplace = True)

In [134]:
df.isnull().sum()

index    0
topic    0
text     0
title    0
dtype: int64

In [145]:
df.head()

Unnamed: 0,index,topic,text,title
0,0,Student,In het kader van kernfusie op aarde: MAAK JE ...,nucleus enough Now percent When form In Your N...
1,1,InvestmentBanking,"Just so you know, this blog isn't about being ...",Bush North make much didnt may Iraq country Pr...
2,2,Education,"I I dont remember his name, but I remember th...",Im job say We could work knew education teachi...
3,3,Science,so there they were in the cotton candy shack. ...,else still little would make nbspnbsp jeepers ...
4,4,Student,By: TIRI I was sitting in the police statio...,talking deal murder roses saying home know goi...


In [146]:
def df_tag_row_title(row):
    try:
        return TaggedDocument(
            word_tokenize(row['title']),
            [row['index'], row['topic']]
        )
    except Exception as e:
        print(e)
        return np.NaN

In [148]:
df['details'] = df.apply(df_tag_row_title, axis = 1)

In [149]:
df.head()

Unnamed: 0,index,topic,text,title,details
0,0,Student,In het kader van kernfusie op aarde: MAAK JE ...,nucleus enough Now percent When form In Your N...,"([nucleus, enough, Now, percent, When, form, I..."
1,1,InvestmentBanking,"Just so you know, this blog isn't about being ...",Bush North make much didnt may Iraq country Pr...,"([Bush, North, make, much, didnt, may, Iraq, c..."
2,2,Education,"I I dont remember his name, but I remember th...",Im job say We could work knew education teachi...,"([Im, job, say, We, could, work, knew, educati..."
3,3,Science,so there they were in the cotton candy shack. ...,else still little would make nbspnbsp jeepers ...,"([else, still, little, would, make, nbspnbsp, ..."
4,4,Student,By: TIRI I was sitting in the police statio...,talking deal murder roses saying home know goi...,"([talking, deal, murder, roses, saying, home, ..."


In [152]:
model = Doc2Vec(epochs = 30)

In [153]:
model.build_vocab(df.details)

In [154]:
model.train(df.details, total_examples = model.corpus_count, epochs = model.epochs)

In [184]:
for i in range(10):
    words = model.wv.most_similar(model.infer_vector(word_tokenize('violent means to destroy the organization')), topn = 10 )
    for word in words:
        print(word[0], end = ' ')
    print()

South father board There point following Dan police gave mind 
board mouth mind woman South happened police father Dan heart 
town weapons technology number blood means child upon Now near 
woman happened mouth slowly young upon body round police theres 
mind lived South quite police board headed mouth others Dan 
South father upon wife ones sent taking gave woman body 
taking upon wife ones sent father body gave South states 
child number town technology means blood weapons upon individual states 
upon wife taking gave near sent South town ones students 
number weapons technology town blood means child upon states male 


In [377]:
def get_recommendations(df, model, sentence):
    words = model.wv.most_similar(model.infer_vector(word_tokenize(sentence)), topn = 10 )
    
    results = set()
    for word in words:
        results.update(list(df[df['title'].str.contains(word[0])]['title']))
        
    return list(results)

In [378]:
get_recommendations(df, model, df.title[0])

['What vehicles repeat even might significant drive suggest case given',
 'question Honor see one asked never practical What Malcolm El',
 '.... ... mom seen What time They one take home',
 'health What needs Develop barriers women Prevention nbspnbsp among nbspnbspnbspnbspnbspnbspnbspnbsp',
 'Ashlee hit Kassie D fell go people since started sat',
 'right away things Indians didnt since found social last job',
 'communicate A may every nodes wired When IP protocol information',
 'formation tell black make duty use front used officer My',
 'registration completed Certificate tax company establishment Registration government may Representative',
 'Subjects Testimony Womens Shattered Henke Suzette LifeWriting Recovery scriptotherapy understand',
 'know , Whats Had done love day .... youve time',
 'Vaughan Logan ... The may Storm know uniform music dont',
 'Had thing one done stars youve em depends Whats good',
 'gon dont moment people na .. since really another around',
 'asked people goi

In [379]:
get_recommendations(df, model, 'my new sentence')

['Geometry idea cant th something May Biology went Latin still',
 'His table thoughts front group Well hour life noticed day',
 'church fact first Detective close enough looked made two front',
 'power would George big West Wing Washington thing fact mean',
 'new In fact good favorite remember say school decade time',
 'great world thing little even In play guys fact President',
 'home stuff left Kinker came group room people time much',
 'time trying people although know say hes fact first friend',
 'restless brotherhood We love tomorrow understanding another world life group',
 'boy set makes Nights night youll better hope hear theyre',
 'March urlLink history like people loyalty Party result well Communist',
 'idea misunderstood free debate double burden dialoguing Ive But make',
 'TypeI breakfast dinner every diabetic control component wasnt read fact',
 'including Uighur international Xinjiang groups torture Tibetan Uygur Falun year',
 'might good available get information like co