In [1]:
import pandas as pd
import numpy as np

# Baseline Data

In [2]:
# Load the raw news dataset from the relative path "news.csv".
df=pd.read_csv("news.csv")
# Preview the first rows to see which columns were read.
df.head()

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,full_content
0,89541,,International Business Times,Paavan MATHEMA,UN Chief Urges World To 'Stop The Madness' Of ...,UN Secretary-General Antonio Guterres urged th...,https://www.ibtimes.com/un-chief-urges-world-s...,https://d.ibtimes.com/en/full/4496078/nepals-g...,2023-10-30 10:12:35.000000,UN Secretary-General Antonio Guterres urged th...,Nepal,UN Secretary-General Antonio Guterres urged th...
1,89542,,Prtimes.jp,,RANDEBOOよりワンランク上の大人っぽさが漂うニットとベストが新登場。,[株式会社Ainer]\nRANDEBOO（ランデブー）では2023年7月18日(火)より公...,https://prtimes.jp/main/html/rd/p/000000147.00...,https://prtimes.jp/i/32220/147/ogp/d32220-147-...,2023-10-06 04:40:02.000000,"RANDEBOO2023718()WEB2023 Autumn Winter \n""Nepa...",Nepal,
2,89543,,VOA News,webdesk@voanews.com (Agence France-Presse),UN Chief Urges World to 'Stop the Madness' of ...,UN Secretary-General Antonio Guterres urged th...,https://www.voanews.com/a/un-chief-urges-world...,https://gdb.voanews.com/01000000-0a00-0242-60f...,2023-10-30 10:53:30.000000,"Kathmandu, Nepal UN Secretary-General Antonio...",Nepal,
3,89545,,The Indian Express,Editorial,Sikkim warning: Hydroelectricity push must be ...,Ecologists caution against the adverse effects...,https://indianexpress.com/article/opinion/edit...,https://images.indianexpress.com/2023/10/edit-...,2023-10-06 01:20:24.000000,At least 14 persons lost their lives and more ...,Nepal,At least 14 persons lost their lives and more ...
4,89547,,The Times of Israel,Jacob Magid,"200 foreigners, dual nationals cut down in Ham...","France lost 35 citizens, Thailand 33, US 31, U...",https://www.timesofisrael.com/200-foreigners-d...,https://static.timesofisrael.com/www/uploads/2...,2023-10-27 01:08:34.000000,"Scores of foreign citizens were killed, taken ...",Nepal,


In [3]:
# Count the missing values in each column.
df.isnull().sum()


article_id          0
source_id       80880
source_name         0
author           8219
title              40
description       383
url                 0
url_to_image     5624
published_at        0
content             0
category           42
full_content    46943
dtype: int64

In [4]:
# Keep only the headline text. Selecting the wanted column (rather than
# listing every other column to drop) lets this cell be re-run without a
# KeyError and makes it independent of which other columns the CSV has.
# .copy() yields an independent frame for the later column assignment.
df = df[["title"]].copy()

In [5]:
# Drop the rows whose title is missing.
df=df.dropna(subset=["title"])
# Summarise the frame to confirm the remaining row count and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 105335 entries, 0 to 105374
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   title   105335 non-null  object
dtypes: object(1)
memory usage: 1.6+ MB


In [6]:
# Preview the title-only frame.
df.head()

Unnamed: 0,title
0,UN Chief Urges World To 'Stop The Madness' Of ...
1,RANDEBOOよりワンランク上の大人っぽさが漂うニットとベストが新登場。
2,UN Chief Urges World to 'Stop the Madness' of ...
3,Sikkim warning: Hydroelectricity push must be ...
4,"200 foreigners, dual nationals cut down in Ham..."


# Text Preprocessing

# Lowercasing

In [7]:
def to_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# Removing Punctuation

In [11]:
import re
def remove_punct(text):
    """Delete every character of ``text`` that is neither a word character
    nor whitespace.

    Word characters include letters, digits and the underscore, so
    underscores and non-ASCII letters are kept.
    """
    not_word_or_space = re.compile(r"[^\w\s]")
    return not_word_or_space.sub("", text)
    

# Removing Extra Spaces

In [12]:
def remove_space(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip()

# Removing Stop Words

In [17]:
import nltk
# Download the NLTK "stopwords" corpus, which nltk.corpus.stopwords reads below.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [18]:
from nltk.corpus import stopwords
# English stop words, held in a set for the membership test in remove_stopwords.
STOP_WORDS=set(stopwords.words("english"))

def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` that is in STOP_WORDS.

    Matching is exact (case-sensitive). The surviving tokens are re-joined
    with single spaces.
    """
    return " ".join(token for token in text.split() if token not in STOP_WORDS)

# Preprocessing Pipeline

In [20]:
def preprocess_text(text):
    """Normalise one headline before vectorisation.

    Steps, in order: lower-case, strip punctuation, collapse whitespace,
    remove stop words.

    Args:
        text: The raw headline string. ``None`` and float NaN (the marker
            pandas uses for a missing value in a text column) are treated
            as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing.
    """
    # NaN is the only float that is not equal to itself. Catching it here
    # keeps a missing title from reaching ``.lower()`` and raising.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    for step in (to_lower, remove_punct, remove_space, remove_stopwords):
        text = step(text)
    return text

In [24]:
# Replace the raw title with its preprocessed form: add the cleaned
# "headlines" column and drop "title" in one chained expression.
df = df.assign(headlines=df["title"].apply(preprocess_text)).drop(columns=["title"])
df


Unnamed: 0,headlines
0,un chief urges world stop madness climate change
1,randebooよりワンランク上の大人っぽさが漂うニットとベストが新登場
2,un chief urges world stop madness climate change
3,sikkim warning hydroelectricity push must acco...
4,200 foreigners dual nationals cut hamas assaul...
...,...
105370,done wrong party work says karnataka deputy cm...
105371,fc barcelona guarantees 776 million champions ...
105372,three hospitals ignored gravely ill fiancé you...
105373,kerbers farm bringing farm table manhattans we...


# Feature Extraction

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features over the cleaned headlines.
vectorizer = TfidfVectorizer(
    max_features=5000,   # cap on the vocabulary size
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=5,            # skip terms found in fewer than 5 headlines
    max_df=0.9,          # skip terms found in more than 90% of headlines
)
# Fit on the baseline corpus and build its document-term matrix.
X_baseline = vectorizer.fit_transform(df["headlines"])
# Mean TF-IDF weight of each feature across all headlines, as a plain ndarray.
baseline_mean = np.asarray(X_baseline.mean(axis=0))

array([[0.00021993, 0.00016431, 0.00015971, ..., 0.00034463, 0.00079342,
        0.00037945]], shape=(1, 5000))

In [33]:
import pickle
# Persist the fitted vectorizer and the baseline mean vector. Each ``with``
# block closes its file as soon as the dump finishes, so the pickle is
# flushed to disk instead of relying on garbage collection of an unclosed
# file handle.
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
with open("baseline_mean.pkl", "wb") as baseline_file:
    pickle.dump(baseline_mean, baseline_file)