In [24]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [25]:
#Data Reading
df = pd.read_csv('news-data.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [26]:
df.shape

(2225, 2)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   text      2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


In [28]:
df.isna().sum()

category    0
text        0
dtype: int64

In [29]:
df = df.drop_duplicates()
df.shape

(2126, 2)

In [30]:
df = df.reset_index() # Need to do reset of index values of removel of duplicated indexs values

In [31]:
len(df.category.unique())

5

In [32]:
df.category.value_counts()

sport            504
business         503
politics         403
entertainment    369
tech             347
Name: category, dtype: int64

In [33]:
# Data cleaning and preporcessing
import nltk
import re
from nltk.corpus import stopwords

special_char_remover = re.compile('[/(){}\[\]\@,:;?$''""]')
extra_symbol_removel = re.compile('[^0-9a-z #+_]')
stop_words = stopwords.words('english')

def clean_text(text):
    text = text.lower()
    text = special_char_remover.sub(' ',text)
    text = extra_symbol_removel.sub('', text)
    text = ' '.join((word) for word in text.split() if word not in stop_words)
    return text
df['text'] = df['text'].apply(clean_text)

## CountVectorizer

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500,  
                     ngram_range=(1, 3),#I specify in the code (1,3). This means that unigrams, bigrams, and trigrams will be taken into account while creating features.
                     min_df=3) # Minimum no of time an ngram should appear in a corpus to be used as a feature.

X = cv.fit_transform(df['text']).toarray()

In [35]:
X

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [36]:
len(X[0])

1500

In [37]:
y = df.category

In [38]:
y

0                tech
1            business
2               sport
3               sport
4       entertainment
            ...      
2121         business
2122         politics
2123    entertainment
2124         politics
2125            sport
Name: category, Length: 2126, dtype: object

In [39]:
X.shape

(2126, 1500)

In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 3)

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
lr = LogisticRegression()

lr.fit(X_train,y_train)
y_pred1 = lr.predict(X_test)

print("******** LogisticRegression ********")
print(f'Accuracy is : {accuracy_score(y_pred1,y_test)}')

******** LogisticRegression ********
Accuracy is : 0.9671361502347418


In [42]:
from xgboost import XGBClassifier
xgb = XGBClassifier()

xgb.fit(X_train,y_train)
y_pred2 = xgb.predict(X_test)

print("******** XGBClassifier ********")
print(f'Accuracy is : {accuracy_score(y_pred2,y_test)}')

******** XGBClassifier ********
Accuracy is : 0.9460093896713615


In [43]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

mnb.fit(X_train,y_train)
y_pred3 = mnb.predict(X_test)

print("******** MultinomialNB ********")
print(f'Accuracy is : {accuracy_score(y_pred3,y_test)}')

******** MultinomialNB ********
Accuracy is : 0.9530516431924883


In [44]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train,y_train)
y_pred4 = rf.predict(X_test)

print("******** RandomForestClassifier ********")
print(f'Accuracy is : {accuracy_score(y_pred4,y_test)}')

******** RandomForestClassifier ********
Accuracy is : 0.9460093896713615


In [45]:
news = cv.transform(['IPL 2021: Royal Challengers Bangalore skipper Virat Kohli doffed his hat to AB de Villiers and Glenn Maxwell after their match-winning innings against Kolkata Knight Riders at the MA Chidambaram Stadium in Chennai.'])
pred = lr.predict(news)
print(pred)

['sport']
