In [25]:
import pandas as pd
import numpy as np

In [26]:
data= pd.read_csv("/content/drive/MyDrive/india-news-headlines.csv")
data

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic
...,...,...,...
3650965,20220331,city.srinagar,J&K sacks 2 cops; 3 other employees over terro...
3650966,20220331,entertainment.hindi.bollywood,Ranbir Kapoor says 'Rishi Kapoor enjoyed his a...
3650967,20220331,city.trichy,As Covid-19 cases drop to nil in southern dist...
3650968,20220331,city.erode,Tamil Nadu sees marginal rise of Covid cases w...


In [27]:
data.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


In [28]:
data.tail

<bound method NDFrame.tail of          publish_date              headline_category  \
0            20010102                        unknown   
1            20010102                        unknown   
2            20010102                        unknown   
3            20010102                        unknown   
4            20010102                        unknown   
...               ...                            ...   
3650965      20220331                  city.srinagar   
3650966      20220331  entertainment.hindi.bollywood   
3650967      20220331                    city.trichy   
3650968      20220331                     city.erode   
3650969      20220331                     city.salem   

                                             headline_text  
0        Status quo will not be disturbed at Ayodhya; s...  
1                      Fissures in Hurriyat over Pak visit  
2                    America's unwanted heading for India?  
3                       For bigwigs; it is destinatio

In [29]:
data.shape

(3650970, 3)

In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650970 entries, 0 to 3650969
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   publish_date       int64 
 1   headline_category  object
 2   headline_text      object
dtypes: int64(1), object(2)
memory usage: 83.6+ MB


In [31]:
data.describe()

Unnamed: 0,publish_date
count,3650970.0
mean,20131930.0
std,52522.82
min,20010100.0
25%,20100420.0
50%,20140230.0
75%,20170930.0
max,20220330.0


DATA PREPROCESSING


In [32]:
data.isnull()

Unnamed: 0,publish_date,headline_category,headline_text
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
3650965,False,False,False
3650966,False,False,False
3650967,False,False,False
3650968,False,False,False


In [33]:
data.isnull().sum()

publish_date         0
headline_category    0
headline_text        0
dtype: int64

In [34]:
data.duplicated()

0          False
1          False
2          False
3          False
4          False
           ...  
3650965    False
3650966    False
3650967    False
3650968    False
3650969    False
Length: 3650970, dtype: bool

In [35]:
data.duplicated().sum()

24860

In [36]:
data.drop_duplicates(inplace=True)

In [37]:
data.duplicated().sum()

0

MODEL FITTING

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
# tfidf calculation
text_content = data['headline_text']
vector = TfidfVectorizer(max_df=0.3,         # drop words that occur in more than X percent of documents
                             #min_df=8,      # only use words that appear at least X times
                             stop_words='english', # remove stop words
                             lowercase=True, # Convert everything to lower case 
                             use_idf=True,   # Use idf
                             norm=u'l2',     # Normalization
                             smooth_idf=True # Prevents divide-by-zero errors
                            )
tfidf = vector.fit_transform(text_content)

In [40]:
df = pd.DataFrame(tfidf[0].T.todense(),index=vector.get_feature_names(),columns=["TF-IDF"])



In [41]:
data = df.sort_values('TF-IDF', ascending=False)

In [42]:
data

Unnamed: 0,TF-IDF
disturbed,0.506482
quo,0.473163
vajpayee,0.415386
ayodhya,0.395103
status,0.362839
...,...
foretells,0.000000
foretold,0.000000
forev,0.000000
foreve,0.000000


Using the TF-IDF with cosine similarity, rank the news headlines based on a user query

In [43]:
from sklearn.metrics.pairwise import cosine_similarity

In [44]:
#doc2_tfidf=vector.transform(["Status quo will not be disturbed "])
# calculate the cosine similarity between the documents
#sim = cosine_similarity(tfidf, doc2_tfidf).flatten()
#print(sim)

In [45]:
# Get user query
query = input("Enter your query: ")
doc2_tfidf=vector.transform([query])

# sort the headlines by cosine similarity and print the top results
related_headlines_indices = sim.argsort()[:-5:-1]
print("Top related headlines:")
for i in related_headlines_indices:
    print(text_content[i])

Enter your query: Status quo will not be disturbed 
Top related headlines:
Status quo will not be disturbed at Ayodhya; says Vajpayee
Bollywood acting guru comes south
She has a baby at 56
China sets up quake monitoring station at Everest
