In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Directory that holds the five per-category CSV files. Keeping it in a single
# constant means the notebook can be pointed at another copy of the data by
# editing one line instead of five.
DATA_DIR = 'C:/python/News_articles_classification'

df1 = pd.read_csv(f'{DATA_DIR}/entertainment_data.csv')
df2 = pd.read_csv(f'{DATA_DIR}/business_data.csv')
df3 = pd.read_csv(f'{DATA_DIR}/education_data.csv')
df4 = pd.read_csv(f'{DATA_DIR}/sports_data.csv')
df5 = pd.read_csv(f'{DATA_DIR}/technology_data.csv')

# Data Pre-Processing

In [3]:
# Stack the five per-category frames on top of each other into one DataFrame.
dfs = [df1, df2, df3, df4, df5]
df = pd.concat(objs=dfs, axis=0)

In [4]:
# Shuffle the rows (not the columns) so the categories are interleaved rather
# than left in concatenation order. The fixed seed makes the resulting row
# order reproducible from one run of the notebook to the next.
from sklearn.utils import shuffle
df = shuffle(df, random_state=42)

In [5]:
df.head()

Unnamed: 0,headlines,description,content,url,category
851,Assam Board HSLC 2024: SEBA announces Class 10...,Assam Board HSLC Exams: In addition to the Cla...,Assam Board HSLC 2024: The Board of Secondary ...,https://indianexpress.com/article/education/as...,education
1209,Animal box office collection day 14: Ranbir Ka...,Animal box office collection day 14: Ranbir Ka...,Ranbir Kapoor-starrer Animal has been in the t...,https://indianexpress.com/article/entertainmen...,entertainment
1874,WFI’s emergency general council meeting in Ayo...,The ministry on Saturday also suspended WFI's ...,The Wrestling Federation of India’s (WFI) emer...,https://indianexpress.com/article/sports/sport...,sports
1437,Indian users will be able to test AI-based Goo...,Google has launched its Search with Generative...,Google has launched its Search with Generative...,https://indianexpress.com/article/technology/a...,technology
25,"Online retailer eBay to slash 1,000 jobs, redu...","San Jose, California-based eBay is the latest ...","Online retailer eBay Inc will cut about 1,000 ...",https://indianexpress.com/article/business/com...,business


In [6]:
# Give the shuffled frame a fresh 0..n-1 index; the previous row labels are
# kept as a new "index" column.
df = df.reset_index()

In [7]:
df.head()

Unnamed: 0,index,headlines,description,content,url,category
0,851,Assam Board HSLC 2024: SEBA announces Class 10...,Assam Board HSLC Exams: In addition to the Cla...,Assam Board HSLC 2024: The Board of Secondary ...,https://indianexpress.com/article/education/as...,education
1,1209,Animal box office collection day 14: Ranbir Ka...,Animal box office collection day 14: Ranbir Ka...,Ranbir Kapoor-starrer Animal has been in the t...,https://indianexpress.com/article/entertainmen...,entertainment
2,1874,WFI’s emergency general council meeting in Ayo...,The ministry on Saturday also suspended WFI's ...,The Wrestling Federation of India’s (WFI) emer...,https://indianexpress.com/article/sports/sport...,sports
3,1437,Indian users will be able to test AI-based Goo...,Google has launched its Search with Generative...,Google has launched its Search with Generative...,https://indianexpress.com/article/technology/a...,technology
4,25,"Online retailer eBay to slash 1,000 jobs, redu...","San Jose, California-based eBay is the latest ...","Online retailer eBay Inc will cut about 1,000 ...",https://indianexpress.com/article/business/com...,business


In [8]:
df.shape

(10000, 6)

In [9]:
# Discard the columns that are not used as model inputs: the leftover
# "index" labels and the article URL.
df = df.drop(columns=["index", "url"])

### Mapping the category column for classification

In [10]:
df['category'].value_counts()

category
education        2000
entertainment    2000
sports           2000
technology       2000
business         2000
Name: count, dtype: int64

In [11]:
# Integer label assigned to each news category name.
_CATEGORY_IDS = dict(
    sports=0,
    business=1,
    entertainment=2,
    education=3,
    technology=4,
)


def map_category(category):
    """Return the integer label for a category name, or -1 if it is unknown."""
    return _CATEGORY_IDS.get(category, -1)
# Encode every row's category name as its integer label.
df['Category'] = df['category'].map(map_category)

In [12]:
# The text label is now redundant with the numeric 'Category' column.
df = df.drop(columns=['category'])

In [13]:
df.head()

Unnamed: 0,headlines,description,content,Category
0,Assam Board HSLC 2024: SEBA announces Class 10...,Assam Board HSLC Exams: In addition to the Cla...,Assam Board HSLC 2024: The Board of Secondary ...,3
1,Animal box office collection day 14: Ranbir Ka...,Animal box office collection day 14: Ranbir Ka...,Ranbir Kapoor-starrer Animal has been in the t...,2
2,WFI’s emergency general council meeting in Ayo...,The ministry on Saturday also suspended WFI's ...,The Wrestling Federation of India’s (WFI) emer...,0
3,Indian users will be able to test AI-based Goo...,Google has launched its Search with Generative...,Google has launched its Search with Generative...,4
4,"Online retailer eBay to slash 1,000 jobs, redu...","San Jose, California-based eBay is the latest ...","Online retailer eBay Inc will cut about 1,000 ...",1


In [14]:
# Merge headline, description and body into a single space-separated text
# field, then remove the three source columns.
headline_and_description = df['headlines'] + ' ' + df['description']
df['Content'] = headline_and_description + ' ' + df['content']

df = df.drop(columns=['headlines', 'description', 'content'])

In [15]:
df.head()

Unnamed: 0,Category,Content
0,3,Assam Board HSLC 2024: SEBA announces Class 10...
1,2,Animal box office collection day 14: Ranbir Ka...
2,0,WFI’s emergency general council meeting in Ayo...
3,4,Indian users will be able to test AI-based Goo...
4,1,"Online retailer eBay to slash 1,000 jobs, redu..."


### Counting the number of words in the entire dataset to get an idea of its scale

In [16]:
# Total number of whitespace-separated tokens across all articles.
total_word_count = sum(len(content.split()) for content in df['Content'])
print(total_word_count)


2646524


### Stemming

In [17]:
# Single Porter stemmer instance, reused by stemming() for every token.
ps = PorterStemmer()

In [18]:
# Cache for the English stop-word set; filled on the first call to stemming().
_STOP_WORD_CACHE = {}


def stemming(content):
    """Normalise raw article text into a string of stemmed tokens.

    Every non-letter character is replaced by a space, the text is
    lowercased and split on whitespace, NLTK English stop words are
    dropped, and each remaining word is Porter-stemmed.

    Parameters
    ----------
    content : str
        Raw text to clean.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string if no
        token survives).
    """
    # The stop-word list used to be fetched inside the comprehension's
    # condition, i.e. once per token, and then searched as a list. Load it a
    # single time and keep it as a frozenset so each membership test is a
    # hash lookup.
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    stop_words = _STOP_WORD_CACHE['english']

    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    return " ".join(ps.stem(token) for token in tokens if token not in stop_words)

In [19]:
# Clean and stem every article's text.
df['Content'] = df['Content'].map(stemming)

In [20]:
df.head()

Unnamed: 0,Category,Content
0,3,assam board hslc seba announc class datesheet ...
1,2,anim box offic collect day ranbir kapoor starr...
2,0,wfi emerg gener council meet ayodhya call mini...
3,4,indian user abl test ai base googl search feat...
4,1,onlin retail ebay slash job reduc contract san...


In [23]:
# After pre-processing, roughly 1M tokens (stop words, numbers, etc.) have been removed.
total_word_count = sum(map(len, map(str.split, df['Content'])))
print(total_word_count)


1580692


# Model building and evaluation 

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_squared_error, r2_score, mean_absolute_error

In [27]:
# Model input (cleaned text) and target (integer category label).
X, Y = df['Content'], df['Category']

In [28]:
# Hold out 20% of the rows for testing, stratified on the label, with a
# fixed seed for the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [29]:
# Vectorization
# The TF-IDF vectorizer is fitted on the training split only and then applied,
# already fitted, to the test split.
# NOTE(review): X_train / X_test are overwritten with the transformed output,
# so re-running this cell on its own would feed the already-transformed data
# back into the vectorizer -- re-run the train/test split cell first.
vc = TfidfVectorizer()
X_train = vc.fit_transform(X_train)
X_test = vc.transform(X_test)

## SVM 

### Logistic regression also performed really well, so you can try that too, but I am using SVM because it had the best accuracy

In [34]:
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.svm import SVC

In [35]:
# Train a support-vector classifier with default settings on the vectorized
# training data (fit() returns the fitted estimator).
model = SVC().fit(X_train, Y_train)

# Predict the held-out split and score the predictions.
Y_pred_cls = model.predict(X_test)
report_cls = classification_report(Y_test, Y_pred_cls)
f1_cls = f1_score(Y_test, Y_pred_cls, average='weighted')
accuracy_cls = accuracy_score(Y_test, Y_pred_cls)

# Print the results for classification
print(
    "Model: SVM",
    f"Accuracy: {accuracy_cls:.4f}",
    f"F1-Score: {f1_cls:.4f}",
    f"Classification Report:\n{report_cls}",
    "=" * 50,
    sep="\n",
)

Model: SVM
Accuracy: 0.9885
F1-Score: 0.9885
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       400
           1       0.99      0.98      0.98       400
           2       1.00      0.99      1.00       400
           3       0.99      0.99      0.99       400
           4       0.97      0.98      0.97       400

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000



In [47]:
def val_to_category(val):
    """Translate an integer class label back into its category name.

    Returns -1 when the label is not one of the five known classes.
    """
    names = ('sports', 'business', 'entertainment', 'education', 'technology')
    label_to_name = dict(enumerate(names))
    return label_to_name.get(val, -1)
    

In [52]:
def make_predictions(headlines, description, content):
    """Classify a single article and print its predicted category name.

    The three text fields are joined with single spaces, cleaned with
    stemming(), vectorized with the fitted `vc`, and classified by `model`;
    the numeric prediction is turned into a name by val_to_category().
    Nothing is returned.
    """
    combined_text = headlines + " " + description + " " + content
    features = vc.transform([stemming(combined_text)])
    predicted_id = int(model.predict(features)[0])
    print("News category is : ", val_to_category(predicted_id))

In [53]:
# Smoke test: a hand-written cricket story (typos left as typed).
make_predictions("kohli got his 50th century","kholi has scored his 50th century today at stadium","the run machine kholi hit another milestone my scoring his 50th odi centry toda at stadium")

News category is :  sports


In [54]:
# Smoke test: a hand-written business story (typos left as typed).
make_predictions("ambani earns 10M in a hour" ," Mukesh Ambani earnd 10M ruppes in a single hour","Mukesh Ambani the chairman of Reliance Industries Limited has earns around 10M ruppes every single hour this shows the that how much potential in there in indian market")

News category is :  business


In [60]:
make_predictions("Change is NCERT syllabus","","") # description and body are empty here, so the prediction is made from the headline alone

News category is :  education
