In [1]:
import pandas as pd

# Read the raw headlines file. The relative path is resolved against the
# kernel's current working directory.
news_data = pd.read_csv("news.csv")

# Preview the first five rows to confirm which columns were loaded.
news_data.head(n=5)


Unnamed: 0,publish_date,headline
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [2]:
# (rows, columns) of the frame as loaded, before any subsetting below.
news_data.shape

(894000, 2)

In [3]:
# Continue with only the leading rows (presumably to keep the later NLP steps fast).
# NOTE: .loc slices by label and includes the end label, so with the default
# 0..N integer index from read_csv this keeps labels 0 through 10000,
# i.e. 10,001 rows rather than 10,000.
news_data = news_data.loc[:10000]

In [4]:
# The publication date is not used by the similarity steps below; drop it.
news_data = news_data.drop(columns=["publish_date"])

In [5]:
### Text Similarity
import spacy
# Load the spaCy "en_core_web_sm" pipeline once; clean_doc() below reuses
# this shared `nlp` object for every text it processes.
nlp = spacy.load("en_core_web_sm")

def clean_doc(sent):
    """Return `sent` with every token replaced by its lemma.

    The text is run through the module-level spaCy pipeline `nlp`, and the
    lemma of each resulting token is joined back together with single spaces.

    Parameters
    ----------
    sent : str
        Raw text, e.g. a single headline.

    Returns
    -------
    str
        Space-separated lemmas, in token order.
    """
    return " ".join(tok.lemma_ for tok in nlp(sent))

# Lemmatise each headline into a new column (one clean_doc call per row).
news_data["headline_clean"] = news_data["headline"].apply(clean_doc)





In [6]:
### Bag of words representation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Bag-of-words counts. English stop words are dropped, and min_df=0.001
# (a float, so it is read as a fraction of documents) discards terms that
# appear in fewer than 0.1% of the headlines.
vec = CountVectorizer(stop_words="english", min_df=0.001)
# Alternative weighting scheme, kept for experimentation:
# vec = TfidfVectorizer(stop_words="english", min_df = 0.001)

# fit_transform learns the vocabulary and builds the document-term matrix in
# one pass over the corpus; `vec` is left fitted for transforming new text.
vectorized_data = vec.fit_transform(news_data["headline_clean"])



In [7]:
### New Test Row
# Lemmatise the query with the same clean_doc() that was applied to the headlines.
new_sentence = clean_doc("Hike in oil prices")

# transform() takes an iterable of documents, so the single query goes in a list.
sentence_list = [new_sentence]
vec_sentence = vec.transform(sentence_list)



In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Cosine similarity of every headline vector against the query vector: the
# result has one row per headline and a single column for the one query.
similarity = cosine_similarity(X=vectorized_data, Y=vec_sentence.toarray())

# Collapse that single column to 1-D so it can be stored as a score column.
news_data["similarity_score"] = similarity.flatten()

# Most similar headlines first.
news_data.sort_values("similarity_score", ascending=False)

Unnamed: 0,headline,headline_clean,similarity_score
2067,oil price approaches usd 40 mark,oil price approach usd 40 mark,0.816497
2068,oil prices contribute to massive caltex turnar...,oil price contribute to massive caltex turnaround,0.816497
6424,burning wells may not push oil price up,burn well may not push oil price up,0.707107
4663,further oil deposit found off dongara,further oil deposit find off dongara,0.707107
768,oil price hikes may weigh on rates decision,oil price hike may weigh on rate decision,0.707107
...,...,...,...
3358,pm praises bachtiar over bali investigation,pm praise bachtiar over bali investigation,0.000000
3359,police hunt service station bandit,police hunt service station bandit,0.000000
3360,police probe crash attempted bank robbery,police probe crash attempt bank robbery,0.000000
3361,powder found in returned anti terrorism kits,powder find in return anti terrorism kit,0.000000
