In [1]:
import nltk
import pandas as pd
import numpy as np
import re

In [2]:
# Load the raw corpus as one string. The encoding is passed explicitly so the
# read does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 (the contents printed below are plain
# ASCII, which is a subset) - confirm for other corpora.
with open('data/textranking.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
# Show the raw corpus exactly as read from disk (newlines included).
print(text)

Virat smashed another ton
Worldcup Starts at 30th May 2019, in England
Narendra Modi Vists Andhra Pradesh for Election summit
Currently phase 9 elections completed in West Bengal
Temperature in India touching 50 C due to global warming


In [19]:
# Example user question to rank the documents against.
# NOTE(review): in the visible notebook this value is never used - a later
# cell reassigns `Q` before the query is vectorized.
Q = 'When worldcup starts ?'

# Word Embedding
- Bag of Word (BOW)
- TF-IDF (Term Frequency and Inverse Document Frequency)

In [4]:
# One document per line of the corpus. Blank lines (for example the empty
# string produced by a trailing newline at the end of the file) are skipped so
# they do not become empty documents with all-zero TF-IDF rows.
docs = [line for line in text.split('\n') if line.strip()]
df = pd.DataFrame(docs, columns=['Documents'])

### Text cleaning
- Lower
- Remove Special Characters

In [5]:
def textprocess(text):
    """Normalize a string before tokenization.

    The text is lower-cased, then every character that is not an ASCII
    lowercase letter or a digit (including existing whitespace and
    punctuation) is replaced by one space. Runs of replaced characters are
    not collapsed, so the result may contain consecutive or trailing spaces.

    Parameters
    ----------
    text : str
        Raw input string.

    Returns
    -------
    str
        The normalized string, same length as the lower-cased input.
    """
    lowered = text.lower()
    return re.sub(r'[^a-z0-9]', ' ', lowered)

In [6]:
# Clean every document with textprocess(); the raw text in 'Documents' is
# overwritten by its normalized form.
df['Documents'] = df['Documents'].apply(textprocess)

In [7]:
# Inspect the cleaned documents.
df

Unnamed: 0,Documents
0,virat smashed another ton
1,worldcup starts at 30th may 2019 in england
2,narendra modi vists andhra pradesh for electio...
3,currently phase 9 elections completed in west ...
4,temperature in india touching 50 c due to glob...


### Lemmatization
- Reduce each word to its root (dictionary) form, using its part-of-speech tag to pick the right lemma

In [10]:
from nltk.stem import wordnet
from nltk import pos_tag

In [9]:
# Shared WordNet lemmatizer instance; the lemmatizer() helper below calls it.
# NOTE(review): no nltk.download(...) call is visible in this notebook -
# presumably the WordNet and POS-tagger data are already installed; confirm.
lema = wordnet.WordNetLemmatizer()

In [12]:
# Take the first cleaned document (index label 0) as a sample sentence and
# display it.
sent = df.loc[0, 'Documents']
sent

'virat smashed another ton'

In [44]:
def lemmatizer(pos_tag):
    """Lemmatize one ``(word, tag)`` pair.

    The tag's leading letter selects the part of speech handed to the
    lemmatizer: ``R*`` -> ``'r'``, ``V*`` -> ``'v'``, ``J*`` -> ``'a'``;
    every other tag falls back to ``'n'``.

    Parameters
    ----------
    pos_tag : tuple
        A two-item ``(word, tag)`` pair, as produced for each token by the
        tagger used in ``joinlemma``.

    Returns
    -------
    The result of ``lema.lemmatize(word, pos=...)`` for the mapped tag.
    """
    token, tag = pos_tag
    # Checked in this order; the first matching prefix wins.
    prefix_table = (('R', 'r'), ('V', 'v'), ('J', 'a'))
    mapped_pos = next(
        (target for prefix, target in prefix_table if tag.startswith(prefix)),
        'n',
    )
    return lema.lemmatize(token, pos=mapped_pos)
    

In [54]:
def joinlemma(sent):
    """Return ``sent`` with every whitespace-separated token lemmatized.

    The sentence is split on whitespace, the tokens are POS-tagged as one
    sequence, each ``(word, tag)`` pair is passed through ``lemmatizer``,
    and the lemmas are joined back with single spaces.

    Parameters
    ----------
    sent : str
        Input sentence.

    Returns
    -------
    str
        Space-joined lemmas.
    """
    tagged_tokens = pos_tag(sent.split())
    return " ".join(map(lemmatizer, tagged_tokens))

In [56]:
# Lemmatize each cleaned document into a new 'lemm' column; 'Documents' is
# left as it was.
df['lemm'] = df['Documents'].apply(joinlemma)

In [57]:
# Compare each cleaned document with its lemmatized form.
df

Unnamed: 0,Documents,lemm
0,virat smashed another ton,virat smash another ton
1,worldcup starts at 30th may 2019 in england,worldcup start at 30th may 2019 in england
2,narendra modi vists andhra pradesh for electio...,narendra modi vists andhra pradesh for electio...
3,currently phase 9 elections completed in west ...,currently phase 9 election complete in west be...
4,temperature in india touching 50 c due to glob...,temperature in india touch 50 c due to global ...


# TF-IDF

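$tfidf = \log(TF+1) \cdot \log(\frac{N}{DF+1})$

- TF = Term Frequency (number of times the term occurs in a document)
- DF = Document Frequency (number of documents that contain the term)
- N  = Total number of documents in the corpus
- Note: scikit-learn's `TfidfVectorizer` (used below) applies a smoothed variant by default, $tf \cdot (\ln\frac{1+N}{1+DF} + 1)$, followed by L2 normalization of each row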

term = word

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [76]:
# TF-IDF vectorizer with all default settings.
# NOTE(review): per the scikit-learn docs the default token pattern keeps only
# tokens of two or more word characters, so single-character tokens such as
# "9" and "c" in the corpus would not enter the vocabulary - confirm this is intended.
tfidf = TfidfVectorizer()

In [78]:
# Fit the vectorizer on the lemmatized documents and convert the result to a
# dense array (one row per document).
X = tfidf.fit_transform(df['lemm']).toarray()

In [81]:
# The question to rank the documents against.
# NOTE(review): the query spells "world cup" as two words while the corpus
# contains the single token "worldcup", so those words cannot match each
# other - confirm whether this is intentional.
Q = 'when world cup starts?'

In [83]:
# Run the question through the same cleaning and lemmatization steps as the
# documents, then project it onto the already-fitted TF-IDF vocabulary
# (transform only - the vectorizer is not refitted).
query = joinlemma(textprocess(Q))
query_vector = tfidf.transform([query]).toarray()

In [84]:
# Check the shape of the dense query vector produced by tfidf.transform.
query_vector.shape

(1, 33)

In [85]:
from sklearn.metrics.pairwise import cosine_similarity

In [86]:
# Cosine similarity between every document row of X and the query vector.
# The result has one row per document; a larger score means a closer match.
cosine_similarity(X,query_vector)

array([[ 0.       ],
       [ 0.3664082],
       [ 0.       ],
       [ 0.       ],
       [ 0.       ]])