### TF-IDF AND COSINE SIMILARITY

In [1]:
import re
import string
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

def retrieve_docs_and_clean():

  r = requests.get('https://sports.ndtv.com/fifa-world-cup-2022/news')
  soup = BeautifulSoup(r.content, 'html.parser')

  #THE FOLLOWING CODE NEED TO BE MODIFIED TO SUITE FOR THE ABOVE URL
  #link = []
  #for i in soup.find('div', {'class':'most__wrap'}).find_all('a'):
  #    i['href'] = i['href'] + '?page=all'
  #    link.append(i['href'])
  link = []
  for i in soup.find_all('a',attrs={'href': re.compile("^http")}):
      i['href'] = i['href'] + '?page=all'
      link.append(i['href'])

  

  # Retrieve Paragraphs
  documents = []
  for i in link:
      r = requests.get(i)
      soup = BeautifulSoup(r.content, 'html.parser')

      sen = []
      for i in soup.find_all('p'):
          sen.append(i.text)
      documents.append(' '.join(sen))

  # Clean Paragraphs
  documents_clean = []
  for d in documents:
      document_test = re.sub(r'[^\x00-\x7F]+', ' ', d)
      document_test = re.sub(r'@\w+', '', document_test)
      document_test = document_test.lower()
      document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
      document_test = re.sub(r'[0-9]', '', document_test)
      document_test = re.sub(r'\s{2,}', ' ', document_test)
      documents_clean.append(document_test)

  return documents_clean

In [2]:
docs = retrieve_docs_and_clean()

# Create Term-Document Matrix with TF-IDF weighting
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# Create a DataFrame
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names_out())
print(df.head())
print(df.shape)

             0         1    2    3    4    5    6    7    8    9   ...   64  \
aaron  0.000000  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
aasif  0.000000  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
aayan  0.000000  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
aaye   0.034961  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
aayi   0.031895  0.017491  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   

        65   66   67   68   69   70   71   72   73  
aaron  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
aasif  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
aayan  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
aaye   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
aayi   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 74 columns]
(3293, 74)


In [3]:
docs = retrieve_docs_and_clean()
# Create Term-Document Matrix with TF-IDF weighting
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# Create a DataFrame
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names_out())
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,64,65,66,67,68,69,70,71,72,73
aaron,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aasif,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aayan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaye,0.034961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aayi,0.031895,0.017491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
len(docs)

74

In [5]:

def get_similar_articles(q, df):
  print("query:", q)
  print("The following are articles with the highest cosine similarity values: ")
  q = [q]
  q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
  sim = {}
  for i in range(74):
    sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)

  sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
  
  for k, v in sim_sorted:
    
    if v != 0.0 :
      
      print("Similarity Values:", v)
      print(docs[k])
      print()


q1 = 'barcelona'
q2 = 'spain'
q3 = 'argentina'

get_similar_articles(q1, df)
print('-'*100)
get_similar_articles(q2, df)
print('-'*100)
get_similar_articles(q3, df)

query: barcelona
The following are articles with the highest cosine similarity values: 
Similarity Values: nan
 

Similarity Values: nan


Similarity Values: nan


Similarity Values: nan
 

----------------------------------------------------------------------------------------------------
query: spain
The following are articles with the highest cosine similarity values: 
Similarity Values: nan
 

Similarity Values: 0.0665151891938807
the group e in the fifa world cup turned out to be the group of death with japan beating both germany and spain to qualify for the round of as group winners while spain progressed as runners up due to a better goal difference champions germany were knocked out but japan s win over spain wasn t devoid of controversy their second goal has triggered a storm on social media with many suggesting that var was wrong in allowing the goal especially since it led to the germans elimination window rrcode window rrcode rrcode push function function v d o ai ai d crea

  sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)


### BM25 ranking for the document retrieval system 

In [6]:
!pip install rank_bm25

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
#importing required libraries
from rank_bm25 import *

In [8]:
#Tokenizing dorpus for calculating scores
tokenized_corpus = [doc.split() for doc in docs]
bm25 = BM25Okapi(tokenized_corpus)

In [9]:
#Function to print similer documents
def bm25_ir(q,docs):
  print("query:", q)
  print("The following are articles with the highest cosine similarity values: ")
  tokenized_query = q.split(" ")
  #calculating bm25 scores and storing value corresponding to each document in a dictionary
  doc_scores = bm25.get_scores(tokenized_query)
  sim={}
  for i in range (len(doc_scores)):
    if doc_scores[i] !=0.0:
      #print(i,doc_scores[i])
      sim[i]=doc_scores[i]
  sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
  print(sim_sorted)
  for k, v in sim_sorted[:3]:
      print("Similarity Values:", v)
      print(docs[k])
      print()

#Trying out with different quires
q1 = 'barcelona'
q2 = 'spain'
q3 = 'argentina'

bm25_ir(q1, docs)
print('-'*100)
bm25_ir(q2, docs)
print('-'*100)
bm25_ir(q3, docs)


query: barcelona
The following are articles with the highest cosine similarity values: 
[]
----------------------------------------------------------------------------------------------------
query: spain
The following are articles with the highest cosine similarity values: 
[(28, 5.901891710939172), (26, 5.666189092881767), (20, 2.9054856706669523)]
Similarity Values: 5.901891710939172
the group e in the fifa world cup turned out to be the group of death with japan beating both germany and spain to qualify for the round of as group winners while spain progressed as runners up due to a better goal difference champions germany were knocked out but japan s win over spain wasn t devoid of controversy their second goal has triggered a storm on social media with many suggesting that var was wrong in allowing the goal especially since it led to the germans elimination window rrcode window rrcode rrcode push function function v d o ai ai d createelement script ai defer true ai async true ai s