# ADA Project

# Text Classification using Naive Bayes

## Anubhav Gupta, DTU

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [2]:
# Read the news dataset from the CSV file into the DataFrame `df`
df = pd.read_csv('news.csv')

# Preview the first five rows (5 is the default for head)
df.head()

Unnamed: 0,headline,category,authors,short_description
0,There Were 2 Mass Shootings In Texas Last Wee...,CRIME,Melissa Jeltsen,She left her husband. He killed their childre...
1,Will Smith Joins Diplo And Nicky Jam For The ...,ENTERTAINMENT,Andy McDonald,Of course it has a song.
2,Hugh Grant Marries For The First Time At Age ...,ENTERTAINMENT,Ron Dicker,The actor and his longtime girlfriend Anna Eb...
3,Jim Carrey Blasts 'Castrato' Adam Schiff And ...,ENTERTAINMENT,Ron Dicker,The actor gives Dems an ass-kicking for not f...
4,Julianna Margulies Uses Donald Trump Poop Bag...,ENTERTAINMENT,Ron Dicker,The Dietland actress said using the bags is a...


In [1]:
#[df['category'].tolist()]

In [4]:
# Only short_description is used as the model input later, so drop the headline column
df.drop(columns='headline', inplace=True)

In [5]:
# The authors column is not used by the classifier either; remove it from df
df.drop(columns='authors', inplace=True)

In [6]:
 # Confirm that only 'category' and 'short_description' remain
 df.head()

Unnamed: 0,category,short_description
0,CRIME,She left her husband. He killed their childre...
1,ENTERTAINMENT,Of course it has a song.
2,ENTERTAINMENT,The actor and his longtime girlfriend Anna Eb...
3,ENTERTAINMENT,The actor gives Dems an ass-kicking for not f...
4,ENTERTAINMENT,The Dietland actress said using the bags is a...


In [7]:
# Number of rows per category, most frequent first
df['category'].value_counts()

 POLITICS            32739
 WELLNESS            17827
 ENTERTAINMENT       16058
 TRAVEL               9887
 STYLE & BEAUTY       9649
 PARENTING            8677
 HEALTHY LIVING       6694
 QUEER VOICES         6314
 FOOD & DRINK         6226
 BUSINESS             5937
 COMEDY               5175
 SPORTS               4884
 BLACK VOICES         4528
 HOME & LIVING        4195
 PARENTS              3955
 THE WORLDPOST        3664
 WEDDINGS             3651
 WOMEN                3490
 IMPACT               3459
 DIVORCE              3426
 CRIME                3405
 MEDIA                2815
 WEIRD NEWS           2670
 GREEN                2622
 WORLDPOST            2579
 RELIGION             2556
 STYLE                2254
 SCIENCE              2178
 WORLD NEWS           2177
 TASTE                2096
 TECH                 2082
 MONEY                1707
 ARTS                 1509
 FIFTY                1401
 GOOD NEWS            1398
 ARTS & CULTURE       1339
 ENVIRONMENT          1323
 

In [8]:
# (rows, columns) of the full dataset after the two column drops
df.shape

(200853, 2)

In [9]:
# The four classes kept for the experiment. The strings are written with the
# surrounding spaces exactly as given here because the comparison below is an
# exact match -- presumably the CSV stores the labels padded this way; verify
# against the file before "cleaning up" these literals.
categories_to_add = [' ENTERTAINMENT  ', ' POLITICS  ', ' SPORTS  ', ' SCIENCE  ']

# One sub-frame per category, in the same order as categories_to_add
dfs = [df[df['category'] == cat] for cat in categories_to_add]

In [10]:
# Stack the per-category frames into one frame; ignore_index is not passed,
# so each row keeps the index label it had in df
df_final=pd.concat(dfs)

In [11]:
# Display the combined frame (the notebook shows a truncated view)
df_final

Unnamed: 0,category,short_description
1,ENTERTAINMENT,Of course it has a song.
2,ENTERTAINMENT,The actor and his longtime girlfriend Anna Eb...
3,ENTERTAINMENT,The actor gives Dems an ass-kicking for not f...
4,ENTERTAINMENT,The Dietland actress said using the bags is a...
5,ENTERTAINMENT,It is not right to equate horrific incidents ...
...,...,...
200754,SCIENCE,Because of the overuse of antibiotics antibio...
200815,SCIENCE,Gallery
200816,SCIENCE,image 1
200817,SCIENCE,That doesn't mean Jobs lacks for fans in the ...


In [12]:
# (rows, columns) of the four-category subset
df_final.shape

(55859, 2)

In [13]:
from sklearn.preprocessing import LabelEncoder

# Replace the category strings with integer codes. The fitted encoder is kept
# in `le` so the codes can be mapped back to the original strings later.
le = LabelEncoder()
df_final['category'] = le.fit_transform(df_final['category'].values)

In [14]:
# Rows per encoded class label after label encoding
df_final['category'].value_counts()

1    32739
0    16058
3     4884
2     2178
Name: category, dtype: int64

In [15]:
# Remove rows that repeat an identical (category, short_description) pair,
# keeping the first occurrence of each
df_final = df_final.drop_duplicates()


In [16]:
# (rows, columns) of the dataset after duplicate rows were removed
df_final.shape

(49251, 2)

In [17]:
df_final.isnull().sum()  # number of missing values in each column

category             0
short_description    0
dtype: int64

In [18]:
# Rows per encoded class label once duplicates are gone
df_final['category'].value_counts()

1    29676
0    13461
3     4309
2     1805
Name: category, dtype: int64

In [19]:
# Fetch the NLTK stop-word corpus; stopwords.words('english') in process_text needs it
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\ANUBHAV
[nltk_data]     GUPTA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
from nltk.stem import PorterStemmer
# Single shared stemmer instance; process_text calls ps.stem on each kept word
ps=PorterStemmer()

In [21]:
# Cache for stop-word sets, keyed by language name, so the corpus file is read
# at most once per kernel session instead of once per word.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loading it once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def process_text(text):
    """Tokenise one document into a list of stemmed, stop-word-free words.

    Steps, in order:
      1. drop every character found in ``string.punctuation``;
      2. drop every character for which ``str.isdigit()`` is true;
      3. split on whitespace and discard words whose lower-cased form is an
         NLTK English stop word;
      4. stem each remaining word with the module-level ``ps`` stemmer.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order (duplicates kept).
    """
    # Looked up once per call (and read from disk once per session); the
    # original re-read the stop-word list for every single word.
    stop_words = _english_stopwords()
    punctuation = set(string.punctuation)

    # Punctuation and digits are removed (not replaced by a space), so
    # "don't" becomes "dont" and "a1b" becomes "ab".
    cleaned = ''.join(
        ch for ch in text
        if ch not in punctuation and not ch.isdigit()
    )

    kept_words = [word for word in cleaned.split() if word.lower() not in stop_words]

    return [ps.stem(word) for word in kept_words]


In [22]:
# Example: punctuation, digits and stop words are removed, and the remaining words are stemmed
process_text('hello 3344 [phone] visited , visits 112 worlds: 345661 on a this is number prograMIng program')

['hello', 'phone', 'visit', 'visit', 'world', 'number', 'program', 'program']

# Splitting the Data

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible. Features are the descriptions, targets the encoded labels.
X_train, X_test, y_train, y_test = train_test_split(
    df_final['short_description'],
    df_final['category'],
    test_size=0.20,
    random_state=18,
)

# Building the model

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer that tokenises with process_text; the vocabulary is
# learned from the training descriptions only. fit() returns the vectorizer
# itself, so temp_train_bow is the fitted vectorizer.
bow_vectorizer = CountVectorizer(analyzer=process_text)
temp_train_bow = bow_vectorizer.fit(X_train)

In [25]:
# Turn the training descriptions into a document-term count matrix
train_bow=temp_train_bow.transform(X_train)

In [26]:
# (number of training documents, vocabulary size)
train_bow.shape

(39400, 29927)

In [27]:
# Reuse the vectorizer already fitted on X_train rather than fitting a second,
# identical CountVectorizer on the same data: the learned vocabulary is the
# same, and the slow tokenising pass over the training text is not repeated.
# The name temp_test_bow is kept because later cells refer to it.
temp_test_bow = temp_train_bow

In [28]:
# Encode the test descriptions with the vocabulary learned from X_train
test_bow=temp_test_bow.transform(X_test)

In [29]:
# (number of test documents, vocabulary size)
test_bow.shape

(9851, 29927)

In [30]:
# Creating and Training of the Naive Bayes Model
from sklearn.naive_bayes import MultinomialNB

In [31]:
# Fit a multinomial Naive Bayes classifier (default settings) on the training counts
clf_bow=MultinomialNB().fit(train_bow,y_train)

In [32]:
# Measuring the performance of our model on train data
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [33]:
# Predict on the same data the model was fitted on, to measure training performance
pred_train_bow = clf_bow.predict(train_bow)

# Rows of the confusion matrix are true classes, columns are predicted classes
train_confusion = confusion_matrix(y_train, pred_train_bow)
train_accuracy = accuracy_score(y_train, pred_train_bow)

print('Confusion Matrix -> \n', train_confusion)
print('\nAccuracy-> ', train_accuracy)

Confusion Matrix -> 
 [[ 8660  2021     3    39]
 [  742 22916    17    54]
 [  200   710   541     9]
 [  511  1568     3  1406]]

Accuracy->  0.8508375634517766


In [60]:
# Evaluate the fitted model on the held-out test split
pred_test_bow = clf_bow.predict(test_bow)

# Rows of the confusion matrix are true classes, columns are predicted classes
test_confusion = confusion_matrix(y_test, pred_test_bow)
test_accuracy = accuracy_score(y_test, pred_test_bow)

print('Confusion Matrix -> \n', test_confusion)
print('\nAccuracy-> ', test_accuracy)

Confusion Matrix -> 
 [[1585 1125    3   25]
 [ 250 5683    3   11]
 [  60  225   56    4]
 [ 183  437    1  200]]

Accuracy->  0.7637803268703685


In [67]:
ex_docs = ['Big stars like Justin Bieber with his whole band was there and rocked the concert ,even Will Smith joined him on the stage and danced.']

# No manual preprocessing is needed here: the vectorizer was built with
# analyzer=process_text, so transform() applies it to each document.
abc = temp_test_bow.transform(ex_docs)
ex_pred = clf_bow.predict(abc)

# Human-readable name for each encoded class label; any other label prints nothing
label_names = {
    0: "Entertainment",
    1: "Politics",
    2: "Science",
    3: "Sports",
}
for i in ex_pred:
    if i in label_names:
        print(label_names[i])

Entertainment
