# News Recommender Project by Zainab Popoola

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the news dataset; lines=True reads the file as JSON Lines (one record per line).
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)

In [3]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [4]:
# Column dtypes, non-null counts and memory footprint of the full dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209527 non-null  object        
 1   headline           209527 non-null  object        
 2   category           209527 non-null  object        
 3   short_description  209527 non-null  object        
 4   authors            209527 non-null  object        
 5   date               209527 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.6+ MB


In [5]:
# Keep only the articles dated 1 January 2021 or later.
df = df.loc[df['date'].ge(pd.Timestamp(year=2021, month=1, day=1))]

In [6]:
# Same summary again, now for the date-filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3464 entries, 0 to 3463
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   link               3464 non-null   object        
 1   headline           3464 non-null   object        
 2   category           3464 non-null   object        
 3   short_description  3464 non-null   object        
 4   authors            3464 non-null   object        
 5   date               3464 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 189.4+ KB


In [7]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [8]:
# Missing-value count per column.
df.isna().sum()

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64

In [9]:
# Build the text features on a copy so `df` keeps its original columns.
df_temp = df.copy()

In [10]:
# Abbreviated weekday and month name joined by "_" (e.g. "Fri_Sep").
df_temp["day_and_month"] = df_temp["date"].dt.strftime("%a_%b")

In [11]:
# Merge every descriptive field into one space-separated document per article.
df_temp['text'] = df_temp['headline'].str.cat(
    df_temp[['category', 'short_description', 'authors', 'day_and_month']],
    sep=" ",
)

In [12]:
# The individual fields now live inside `text`; keep only that column.
df_temp = df_temp.drop(
    columns=[
        'link',
        'date',
        'headline',
        'category',
        'short_description',
        'authors',
        'day_and_month',
    ]
)

In [13]:
# Display the single-column frame that feeds the text pipeline.
df_temp

Unnamed: 0,text
0,Over 4 Million Americans Roll Up Sleeves For O...
1,"American Airlines Flyer Charged, Banned For Li..."
2,23 Of The Funniest Tweets About Cats And Dogs ...
3,The Funniest Tweets From Parents This Week (Se...
4,Woman Who Called Cops On Black Bird-Watcher Lo...
...,...
3459,Let 2021 Be The Year You Celebrate What You're...
3460,Senate Overrides Trump's Veto Of Defense Bill ...
3461,"In Dueling New Year Messages, Trump Reflects, ..."
3462,Ex-Defense Secretary Slams Trump's Hold On GOP...


In [14]:
# NLTK stores its stop-word lists under lower-case file ids ("english");
# "English" only resolves on case-insensitive filesystems.
# A frozenset makes the per-token membership test in clean() O(1) instead of
# a linear scan over the list.
en_stopwords = frozenset(stopwords.words("english"))
stemmer = PorterStemmer()

In [15]:
def clean(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    The text is stripped of every character that is not an ASCII letter,
    a digit or a space, lower-cased, tokenized with NLTK, filtered against
    the English stop-word list and finally Porter-stemmed.

    Note that punctuation is removed rather than replaced by a space, so
    hyphenated words and ranges are fused ("17-23" -> "1723").

    Args:
        text (str): Raw text to normalize.

    Returns:
        str: The remaining stemmed tokens joined by single spaces.
    """
    # Keep all ten digits: the former class "1-9" silently dropped "0",
    # turning e.g. "2021" into "221".
    text = re.sub(r"[^A-Za-z0-9 ]", "", text)
    tokens = word_tokenize(text.lower())
    return " ".join(
        stemmer.stem(token) for token in tokens if token not in en_stopwords
    )

In [16]:
# Run every document through the cleaning pipeline.
df_temp['text'] = df_temp['text'].map(clean)

In [17]:
# Spot-check the cleaned, stemmed documents.
df_temp['text'].head()

0    4 million american roll sleev omicrontarget co...
1    american airlin flyer charg ban life punch fli...
2    23 funniest tweet cat dog week sept 1723 comed...
3    funniest tweet parent week sept 1723 parent ac...
4    woman call cop black birdwatch lose lawsuit ex...
Name: text, dtype: object

#### Using the TF-IDF vectorizer

In [18]:
# TF-IDF vectorizer with default settings; fitted on the cleaned documents in the next cell.
vectorizer = TfidfVectorizer()

In [19]:
# Learn the vocabulary from the cleaned documents and vectorize them (one row per document).
text_matrix = vectorizer.fit_transform(df_temp['text'])

In [20]:
# Keep the TF-IDF matrix sparse (CSR) instead of densifying it: NearestNeighbors
# accepts sparse input, a dense documents x vocabulary array wastes memory, and
# unlike .toarray() this cell can be re-run without raising AttributeError.
text_matrix = text_matrix.tocsr()

#### Using Nearest Neighbors

In [21]:
# Nearest-neighbour index that returns 5 neighbours per query by default.
nn = NearestNeighbors(n_neighbors=5)

In [22]:
# Index the document vectors; row i of text_matrix corresponds to row i of df_temp.
nn.fit(text_matrix)

In [23]:
def inference(text, n_neighbors=5):
    """Find the indexed articles most similar to a piece of free text.

    The text goes through the same `clean` preprocessing and the already
    fitted `vectorizer` as the indexed documents, then the fitted `nn`
    model is queried.

    Args:
        text (str): Raw query text, e.g. the body of a news article.
        n_neighbors (int): Number of recommendations to return. Defaults to 5.

    Returns:
        numpy.ndarray: Integer array with one row for the query and
        `n_neighbors` columns, holding row positions in the matrix `nn` was
        fitted on (use them with `.iloc`).
    """
    # A distinct local name avoids shadowing the global `text_matrix`.
    query_vector = vectorizer.transform([clean(text)])
    return nn.kneighbors(
        X=query_vector, n_neighbors=n_neighbors, return_distance=False
    )

**`test_text` is taken from an Al Jazeera article**

In [24]:
# Sample query: headline, summary and opening paragraph of a news story.
test_text = "Jamaica's Usain Bolt missing $12.7m in investment fraud case Lawyers of the Olympic sprinter say Bolt’s account is missing more than $12.7m from his account. Lawyers for eight-time Olympic gold medallist Usain Bolt say their client has been defrauded of $12.7m from his account with a private investment firm in Jamaica that authorities are investigating. Bolt was informed last week that his account balance at Kingston-based Stocks and Securities Ltd (SSL) had inexplicably dwindled."

In [25]:
# Row positions of the nearest indexed articles for the sample query.
inference(test_text)

array([[2216, 1063,  769, 3414,  246]], dtype=int64)

In [26]:
# Look the recommendations up straight from the model output instead of
# hand-copying indices, which go stale whenever the preprocessing or the
# model changes. inference() returns row positions, hence .iloc.
df.iloc[inference(test_text)[0]]

Unnamed: 0,link,headline,category,short_description,authors,date
2216,https://www.huffpost.com/entry/texas-democrats...,"Texas Democrats Continue Holdout, Don’t Show F...",POLITICS,More than 50 Democrats last month bolted to th...,"Acacia Coronado, Paul J. Weber, Associated Press",2021-08-07
1063,https://www.huffpost.com/entry/i-stand-with-pu...,Twitter Reportedly Bans 100 'I Stand With Puti...,POLITICS,A professor in Qatar was the first to spot the...,Mary Papenfuss,2022-03-06
769,https://www.huffpost.com/entry/ford-suv-recall...,"Ford Recalls More Than 250,000 SUVs That Can R...",U.S. NEWS,Documents posted by U.S. safety regulators say...,,2022-04-29
3414,https://www.huffpost.com/entry/better-late-tha...,'Better Late Than Never': Account Reposting Tr...,POLITICS,The @SuspendThePres account reposted Trump's w...,Jeremy Blum,2021-01-09
246,https://www.huffpost.com/entry/ex-twitter-staf...,Ex-Twitter Staffer Convicted For Sharing Priva...,U.S. NEWS,Ahmad Abouammo received cash payments and gift...,Marita Vlachou,2022-08-10
