# NEWS CLASSIFICATION

In [1]:
import pandas as pd
import numpy as np
import warnings
# Silence every Python warning for the rest of the session so cell outputs stay
# uncluttered. NOTE(review): this also hides any deprecation or convergence
# warnings the libraries used below might raise — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
# Read the news articles from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='news-data.csv')
df.head(n=5)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [3]:
# (rows, columns) of the freshly loaded frame.
df.shape

(2225, 2)

## Null Value Treatment

In [4]:
# Number of missing entries in each column.
df.isnull().sum()

category    0
text        0
dtype: int64

No null values were found in the dataset, so no imputation or row removal is needed.

## Duplicated Value Treatment

In [5]:
# Keep the first occurrence of every fully identical row, then report the new size.
df = df.drop_duplicates(keep='first')
df.shape

(2126, 2)

In [6]:
# Dropping duplicates leaves gaps in the row labels, so renumber the rows 0..n-1.
# drop=True discards the old labels instead of inserting them as an extra
# 'index' column next to 'category' and 'text'.
df = df.reset_index(drop=True)

In [7]:
# How many distinct labels the classifier has to tell apart.
df['category'].nunique(dropna=False)

5

In [8]:
# Articles per category, most frequent first.
df['category'].value_counts()

sport            504
business         503
politics         403
entertainment    369
tech             347
Name: category, dtype: int64

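The classes are reasonably balanced: the largest category (sport, 504 articles) is only about 1.45 times the size of the smallest (tech, 347 articles).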

In [10]:
# Total number of space-separated tokens across all articles, before cleaning.
df['text'].str.split(' ').str.len().sum()

893008

In [None]:
# Data cleaning and preprocessing
import nltk
import re
from nltk.corpus import stopwords

# Punctuation that is replaced by a space so the words on either side stay separate.
# Written as a raw string: the previous non-raw literal relied on the invalid escape
# sequences '\[', '\]' and '\@', which newer Python versions warn about.
# NOTE(review): the previous literal ended in two single quotes followed by two double
# quotes; the single quotes closed and reopened the string (implicit concatenation),
# so the apostrophe was never part of this class. That behaviour is kept as-is here:
# apostrophes are deleted by the second pattern rather than turned into spaces.
special_char_remover = re.compile(r'[/(){}\[\]@,:;?$"]')
# Every character that is not a digit, a lowercase letter, a space, '#', '+' or '_'
# is deleted outright (no space is inserted in its place).
extra_symbol_removel = re.compile(r'[^0-9a-z #+_]')

# Fetch the stop-word corpus on first use so the cell also runs on a fresh environment.
try:
    stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_words = stopwords.words('english')

# Set copy for constant-time membership tests; `stop_words` itself stays a list.
_stop_word_set = frozenset(stop_words)


def clean_text(text):
    """Normalise one article before it is vectorised.

    Steps, in order: lowercase the text, replace the punctuation matched by
    ``special_char_remover`` with spaces, delete every character matched by
    ``extra_symbol_removel``, split on whitespace and drop English stop words.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (empty string if none remain).
    """
    text = text.lower()
    text = special_char_remover.sub(' ', text)
    text = extra_symbol_removel.sub('', text)
    return ' '.join(word for word in text.split() if word not in _stop_word_set)


df['text'] = df['text'].apply(clean_text)

In [31]:
# # Data cleaning and preprocessing (disabled variant of the cell above that additionally applies Porter stemming to each kept word)
# import nltk
# import re
# from nltk.stem.porter import PorterStemmer
# ps = PorterStemmer()
# from nltk.corpus import stopwords

# special_char_remover = re.compile('[/(){}\[\]\@,:;?$''""]')
# extra_symbol_removel = re.compile('[^0-9a-z #+_]')
# stop_words = stopwords.words('english')

# def clean_text(text):
#     text = text.lower()
#     text = special_char_remover.sub(' ',text)
#     text = extra_symbol_removel.sub('', text)
#     text = ' '.join(ps.stem(word) for word in text.split() if word not in stop_words)
#     return text
# df['text'] = df['text'].apply(clean_text)

In [32]:
# Total number of space-separated tokens across all articles, after cleaning.
df['text'].str.split(' ').str.len().sum()

471718

## Train Test Split

In [13]:
from sklearn.model_selection import train_test_split

# Features are the article texts; targets are their category labels.
X = df['text']
y = df['category']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [14]:
# Shapes of the four split parts, in the order: X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1700,), (426,), (1700,), (426,))

## Applying Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score


# TF-IDF features feeding a logistic-regression classifier, fitted on the training split.
lr = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', LogisticRegression()),
              ])

lr.fit(X_train, y_train)
y_pred1 = lr.predict(X_test)

# Ground truth first, predictions second, matching accuracy_score(y_true, y_pred).
print(f'Accuracy is : {accuracy_score(y_test, y_pred1)}')

Accuracy is : 0.9671361502347418


## Applying Naive Bayes Classifier

In [17]:
from sklearn.naive_bayes import MultinomialNB


# TF-IDF features feeding a multinomial naive Bayes classifier.
nb = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)
y_pred2 = nb.predict(X_test)

# Ground truth first, predictions second, matching accuracy_score(y_true, y_pred).
print(f'Accuracy is : {accuracy_score(y_test, y_pred2)}')

Accuracy is : 0.9624413145539906


## Applying Xgboost Classifier

In [18]:
from xgboost import XGBClassifier


# TF-IDF features feeding a gradient-boosted tree classifier.
# NOTE(review): y_train holds string category names; recent xgboost releases may
# require integer-encoded class labels — confirm against the installed version.
xgb = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', XGBClassifier()),
               ])
xgb.fit(X_train, y_train)
y_pred3 = xgb.predict(X_test)

# Ground truth first, predictions second, matching accuracy_score(y_true, y_pred).
print(f'Accuracy is : {accuracy_score(y_test, y_pred3)}')

Accuracy is : 0.9553990610328639


In [19]:
from sklearn.ensemble import RandomForestClassifier


# Random forest classifier on TF-IDF features.
# The forest is built with random bootstrapping and feature sampling, so the seed is
# fixed (same value as the train/test split) to make the reported accuracy repeatable.
rfc = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', RandomForestClassifier(random_state=42)),
               ])
rfc.fit(X_train, y_train)
y_pred4 = rfc.predict(X_test)

# Ground truth first, predictions second, matching accuracy_score(y_true, y_pred).
print(f'Accuracy is : {accuracy_score(y_test, y_pred4)}')

Accuracy is : 0.9647887323943662


In [20]:
# Spot-check the logistic-regression pipeline on a single headline.
sample_headline = 'Freak injuries rule Gary Ballance, Phil Salt out of County Championship opening round '
lr.predict([sample_headline])

array(['sport'], dtype=object)

In [21]:
# Spot-check the naive Bayes pipeline on a single headline.
sample_headline = 'One Dead, Several Injured As Violence Marks Polling Day in Kerala'
nb.predict([sample_headline])

array(['entertainment'], dtype='<U13')

In [22]:
# Second naive Bayes spot-check, again on a single headline.
sample_headline = 'Katrina Kaif tests positive for coronavirus, under home quarantine'
nb.predict([sample_headline])

array(['politics'], dtype='<U13')

In [None]:
# Classify a full paragraph with the logistic-regression pipeline.
sample_article = 'Bollywood star Katrina Kaif on Tuesday said she has tested positive for the coronavirus and is currently isolated at home. The actor, through an Instagram story, informed her fans about her COVID-19 diagnosis and also asked people, who came in contact with her in the last few days, to get themselves tested.I have tested positive for Covid-19. Have immediately isolated myself and will be under home quarantine. I’m following all safety protocols under the advice of my doctors. Requesting everyone who came in contact with me to get tested immediately too. Grateful for all your love and support. Please stay safe and take care Katrina wrote.'
lr.predict([sample_article])

In [None]:
# Classify a one-sentence news excerpt with the logistic-regression pipeline.
sample_article = 'BJP is looking for an opportunity to serve the people of Bengal while chief minister Mamata Banerjee is "intimidating and threatening voters", Prime Minister Narendra Modi alleged at a rally in Howrah on Tuesday.'
lr.predict([sample_article])

In [None]:
# Classify a one-sentence news excerpt with the logistic-regression pipeline.
sample_article = 'RBI has stated that accommodative stance will prevail as long as necessary to sustain growth on a durable basis and continue to mitigate COVID-19 disruptions, while keeping inflation under check'
lr.predict([sample_article])

In [None]:
# Classify a longer encyclopedic passage with the logistic-regression pipeline.
sample_article = 'Philosophical debates have arisen over the use of technology, with disagreements over whether technology improves the human condition or worsens it. Neo-Luddism, anarcho-primitivism, and similar reactionary movements criticize the pervasiveness of technology, arguing that it harms the environment and alienates people; proponents of ideologies such as transhumanism and techno-progressivism view continued technological progress as beneficial to society and the human condition.'
lr.predict([sample_article])

## Model Deployment

In [None]:
import pickle
from flask import Flask,render_template,url_for, request
import joblib as jb

In [None]:
# Persist the fitted logistic-regression pipeline (vectoriser + classifier) to disk.
# The context manager closes the file even if pickling raises.
with open('News-classifier.pkl', 'wb') as f:
    pickle.dump(lr, f)