## 1-Loading and visualizing the dataset:

In [3]:
import pandas as pd

# Location of the ABC news headlines CSV. Kept in one named constant so the
# notebook can be pointed at another copy of the file by editing a single line.
DATA_PATH = '/FSM/S2/NLP_WebMining/IRProject_ABCnewsDataset/searsh/abcnews-date-text.csv'

# Load the dataset
data = pd.read_csv(DATA_PATH)

# Visualize the dataset: a bare last expression gets the notebook's rich table
# rendering, consistent with the later data.head() preview cell.
data.head()

   publish_date                                      headline_text  \
0      20030219  aba decides against community broadcasting lic...   
1      20030219     act fire witnesses must be aware of defamation   
2      20030219     a g calls for infrastructure protection summit   
3      20030219           air nz staff in aust strike for pay rise   
4      20030219      air nz strike to affect australian travellers   

                                                link  
0  https://discover.abc.net.au/index.html#/?query...  
1  https://discover.abc.net.au/index.html#/?query...  
2  https://discover.abc.net.au/index.html#/?query...  
3  https://discover.abc.net.au/index.html#/?query...  
4  https://discover.abc.net.au/index.html#/?query...  


In [4]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1244184 entries, 0 to 1244183
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   publish_date   1244184 non-null  int64 
 1   headline_text  1244184 non-null  object
 2   link           1244184 non-null  object
dtypes: int64(1), object(2)
memory usage: 28.5+ MB


## 2-Preprocessing the dataset:

In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Single PorterStemmer instance, created once and reused by every preprocess() call
stemmer = PorterStemmer()

# Preprocessing function
_STOP_WORDS = None  # lazily-built cache of the English stop-word set


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess(text):
    """Normalize a piece of text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, strip digit runs, strip every character that is
    neither a word character nor whitespace, tokenize with nltk.word_tokenize,
    drop English stop words, and Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw text (a headline or a search query).

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; an empty string
        when no token survives.

    Notes
    -----
    Requires the NLTK stop-word corpus and tokenizer resources to be installed.
    NOTE(review): punctuation is removed before stop-word filtering, so
    contractions such as "don't" become "dont" and may no longer match the
    stop-word list -- confirm this is intended.
    """
    text = text.lower()  # convert to lowercase
    text = re.sub(r'\d+', '', text)  # remove digits
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    # The stop-word set is loop-invariant: fetch the cached copy instead of
    # re-reading the corpus for each of the ~1.2M headlines.
    stop_words = _english_stop_words()
    tokens = nltk.word_tokenize(text)  # tokenization
    # Remove stop words, then stem (reduce each word to its base form)
    return ' '.join(stemmer.stem(w) for w in tokens if w not in stop_words)

# Build the cleaned-headline column by running preprocess over every raw headline
data['clean_headline_text'] = data['headline_text'].map(preprocess)


In [8]:
# Visualize preprocessed dataset: the first rows now include the
# clean_headline_text column alongside the raw headline_text
data.head()

Unnamed: 0,publish_date,headline_text,link,clean_headline_text
0,20030219,aba decides against community broadcasting lic...,https://discover.abc.net.au/index.html#/?query...,aba decid commun broadcast licenc
1,20030219,act fire witnesses must be aware of defamation,https://discover.abc.net.au/index.html#/?query...,act fire wit must awar defam
2,20030219,a g calls for infrastructure protection summit,https://discover.abc.net.au/index.html#/?query...,g call infrastructur protect summit
3,20030219,air nz staff in aust strike for pay rise,https://discover.abc.net.au/index.html#/?query...,air nz staff aust strike pay rise
4,20030219,air nz strike to affect australian travellers,https://discover.abc.net.au/index.html#/?query...,air nz strike affect australian travel


## 3-Vectorizing the preprocessed text using TF-IDF:

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TfidfVectorizer object (all default settings)
vectorizer = TfidfVectorizer()

# Vectorize preprocessed text: fit on the cleaned headlines and transform
# them in a single pass; `vectorizer` is reused later to transform queries
vectors = vectorizer.fit_transform(data['clean_headline_text'])

# Visualize vectorized dataset: print the dimensions of the resulting matrix
print(vectors.shape)

(1244184, 73270)


## 4-Building the NearestNeighbors model:

In [10]:
from sklearn.neighbors import NearestNeighbors

# Create NearestNeighbors object: 10 neighbours per query by default,
# compared with the cosine metric
knn = NearestNeighbors(n_neighbors=10, metric='cosine')

# Fit vectorized data to NearestNeighbors model
knn.fit(vectors)


NearestNeighbors(metric='cosine', n_neighbors=10)

## 5-Saving the model and vectorizer:

In [None]:
# saving the model
import pickle

# Open the target inside a context manager so the handle is flushed and closed
# even if pickling fails; a bare open() left the file object unclosed.
with open('IRmodel.pkl', 'wb') as model_file:
    pickle.dump(knn, model_file)

In [None]:
# Persist the fitted TF-IDF vectorizer so queries can be transformed later
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

## 6-Testing examples:

In [11]:
# Example free-text query, passed to search() in the next cell
query = 'football player'


def search(query, k=None):
    """Print the headlines nearest to a free-text query.

    The query goes through the same preprocess() and fitted TF-IDF vectorizer
    as the indexed headlines, then the fitted NearestNeighbors model is asked
    for its closest rows.

    Parameters
    ----------
    query : str
        Raw search text.
    k : int, optional
        Number of documents to print. None (the default) uses the
        n_neighbors value the model was constructed with.

    Returns
    -------
    None
        Results are printed: publish date, then the original headline text.
    """
    # Preprocess query
    cleaned_query = preprocess(query)

    # Vectorize query
    query_vector = vectorizer.transform([cleaned_query])

    # An all-zero vector (only stop words or terms unseen at fit time) has no
    # direction, so its neighbours would be arbitrary; say so and stop.
    if query_vector.nnz == 0:
        print(f'No indexed terms found in query: {query!r}')
        return

    # Find the k nearest neighbors to the query (distances are not needed)
    indices = knn.kneighbors(query_vector, n_neighbors=k, return_distance=False)

    # Print the most relevant documents, looking each row up only once
    for rank, index in enumerate(indices[0], start=1):
        row = data.iloc[index]
        print(f'Top document {rank} : DATE = {row["publish_date"]} ')
        print(row['headline_text'])
        print('\n')

In [12]:
# Run the example query and print its nearest headlines
search(query)

Top document 1 : DATE = 20120228 
football community mourns player


Top document 2 : DATE = 20041109 
football player murder trial begins


Top document 3 : DATE = 20080512 
sa players defend football park


Top document 4 : DATE = 20100906 
broken hill football players await best player


Top document 5 : DATE = 20091027 
football players charged over alleged rapes


Top document 6 : DATE = 20140116 
date set for football player in court


Top document 7 : DATE = 20191010 
football violence leaves players in fear


Top document 8 : DATE = 20030725 
indigenous football players to show off skills


Top document 9 : DATE = 20100708 
football team wins gold after players death


Top document 10 : DATE = 20110527 
afl player moves to american football


