## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Data Reading

In [2]:
# Load the news corpus from the CSV that sits next to the notebook and preview
# the first rows (a category label plus the article text).
df = pd.read_csv('news-data.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [3]:
df.shape

(2225, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   text      2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


In [5]:
df.isna().sum()

category    0
text        0
dtype: int64

In [6]:
# Remove rows that are exact duplicates across all columns, then show the
# resulting (rows, columns) shape.
df = df.drop_duplicates()
df.shape

(2126, 2)

In [7]:
# Rebuild a contiguous 0..n-1 index after the duplicate rows were removed.
# drop=True discards the old labels instead of inserting them as an extra
# 'index' column, which also keeps this cell safe to re-run.
df = df.reset_index(drop=True)

In [8]:
len(df.category.unique())

5

In [9]:
df.category.value_counts()

sport            504
business         503
politics         403
entertainment    369
tech             347
Name: category, dtype: int64

## Data cleaning and preprocessing

In [10]:
import nltk
import re
from nltk.corpus import stopwords

# Punctuation that is replaced by a space so the words on either side stay
# separate tokens. A raw string avoids invalid-escape warnings, and both quote
# characters are genuine members of the class: written as '' "" inside a
# single-quoted literal they were parsed as adjacent string literals, which
# silently left the apostrophe out of the pattern.
special_char_remover = re.compile(r'[/(){}\[\]@,:;?$\'"]')
# Everything that is not a digit, a lowercase letter, a space, '#', '+' or '_'
# is deleted outright.
extra_symbol_removel = re.compile(r'[^0-9a-z #+_]')
# A set gives constant-time membership tests; the plain list was scanned from
# the start for every single word of every article.
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Normalise one article before it is vectorised.

    The text is lowercased, every character matched by
    ``special_char_remover`` is replaced with a space, every character matched
    by ``extra_symbol_removel`` is deleted, and English stop words are dropped.

    Args:
        text: Article text as a ``str``.

    Returns:
        The surviving words joined by single spaces.
    """
    text = text.lower()
    text = special_char_remover.sub(' ', text)
    text = extra_symbol_removel.sub('', text)
    return ' '.join(word for word in text.split() if word not in stop_words)


df['text'] = df['text'].apply(clean_text)

## TfidfVectorizer

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram and bigram TF-IDF features with the vocabulary capped at 1500 terms.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=1500,
)

# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the
# train/test split further down, so held-out articles contribute to the
# vocabulary and IDF weights. Consider fitting on the training rows only.
X = tfidf.fit_transform(df['text']).toarray()  # dense ndarray, one row per article

In [12]:
X

array([[0.02854708, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.05243374, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.05403895, 0.        , 0.05869556, ..., 0.        , 0.        ,
        0.        ],
       [0.0370173 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [13]:
len(X[0])

1500

In [14]:
y = df.category

In [15]:
y

0                tech
1            business
2               sport
3               sport
4       entertainment
            ...      
2121         business
2122         politics
2123    entertainment
2124         politics
2125            sport
Name: category, Length: 2126, dtype: object

In [17]:
X.shape

(2126, 1500)

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

In [20]:
print(X_train.shape)
print()
print(X_test.shape)

(1700, 1500)

(426, 1500)


## LogisticRegression

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Linear baseline with scikit-learn's default hyper-parameters.
lr = LogisticRegression()

lr.fit(X_train, y_train)
y_pred1 = lr.predict(X_test)

print("******** LogisticRegression ********")
# accuracy_score is declared as (y_true, y_pred): the ground truth goes first.
print(f'Accuracy is : {accuracy_score(y_test, y_pred1)}')

******** LogisticRegression ********
Accuracy is : 0.9765258215962441


## XGBClassifier

In [19]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Current XGBoost releases reject string class labels, so the model is trained
# on integer codes and its predictions are mapped back to category names.
label_encoder = LabelEncoder()
y_train_codes = label_encoder.fit_transform(y_train)

xgb = XGBClassifier()

xgb.fit(X_train, y_train_codes)
y_pred2 = label_encoder.inverse_transform(xgb.predict(X_test))

print("******** XGBClassifier ********")
# accuracy_score is declared as (y_true, y_pred): the ground truth goes first.
print(f'Accuracy is : {accuracy_score(y_test, y_pred2)}')

******** XGBClassifier ********
Accuracy is : 0.9436619718309859


## MultinomialNB

In [20]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the (non-negative) TF-IDF features.
mnb = MultinomialNB()

mnb.fit(X_train, y_train)
y_pred3 = mnb.predict(X_test)

print("******** MultinomialNB ********")
# accuracy_score is declared as (y_true, y_pred): the ground truth goes first.
print(f'Accuracy is : {accuracy_score(y_test, y_pred3)}')

******** MultinomialNB ********
Accuracy is : 0.9530516431924883


## RandomForestClassifier

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the reported accuracy is reproducible between runs; the
# value matches the seed used for the train/test split.
rf = RandomForestClassifier(random_state=3)

rf.fit(X_train, y_train)
y_pred4 = rf.predict(X_test)

print("******** RandomForestClassifier ********")
# accuracy_score is declared as (y_true, y_pred): the ground truth goes first.
print(f'Accuracy is : {accuracy_score(y_test, y_pred4)}')

******** RandomForestClassifier ********
Accuracy is : 0.9413145539906104


## Testing on Random Text Data

In [22]:
# The vectorizer was fitted on text that went through clean_text, so new input
# must take the same route; raw text would be tokenised differently from the
# training data.
news = tfidf.transform([clean_text('IPL 2021: Royal Challengers Bangalore skipper Virat Kohli doffed his hat to AB de Villiers and Glenn Maxwell after their match-winning innings against Kolkata Knight Riders at the MA Chidambaram Stadium in Chennai.')])
pred = lr.predict(news)
print(pred)

['sport']


In [23]:
# The vectorizer was fitted on text that went through clean_text, so new input
# must take the same route; raw text would be tokenised differently from the
# training data.
news = tfidf.transform([clean_text('The simplest form of technology is the development and use of basic tools. The prehistoric discovery of how to control fire and the later Neolithic Revolution increased the available sources of food, and the invention of the wheel helped humans to travel in and control their environment. Developments in historic times, including the printing press, the telephone, and the Internet, have lessened physical barriers to communication and allowed humans to interact freely on a global scale')])
pred = lr.predict(news)
print(pred)

['tech']


In [24]:
# Same check in a single expression; the text is cleaned with clean_text first
# because the vectorizer was fitted on cleaned text.
pred = lr.predict(tfidf.transform([clean_text('IPL 2021: Royal Challengers Bangalore skipper Virat Kohli doffed his hat to AB de Villiers and Glenn Maxwell after their match-winning innings against Kolkata Knight Riders at the MA Chidambaram Stadium in Chennai.')]))
print(pred)

['sport']


## Pickle Dump

In [25]:
import pickle

# Persist the fitted classifier and vectorizer. The context managers close each
# file even when pickling raises, so no handle is leaked.
# NOTE: the vectorizer was fitted on text passed through clean_text, so whatever
# loads these files has to apply the same cleaning before calling transform.
with open('lr.pickle', 'wb') as f:
    pickle.dump(lr, f)

with open('tfidf.pickle', 'wb') as f:
    pickle.dump(tfidf, f)