In [6]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\123\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
train_df = pd.read_csv('BBC News Train.csv')
train_df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [8]:
test_df = pd.read_csv('BBC News Test.csv')
test_df.head()

Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...


# Training the model

In [9]:
train_corpus = []
for i in range(0, len(train_df)):
    train_text = re.sub('[^a-zA-Z]', ' ', train_df['Text'][i])
    train_text = train_text.lower()
    train_text = train_text.split()
    ps = PorterStemmer()
    train_text = [ps.stem(word) for word in train_text if not word in set(stopwords.words('english'))]
    train_text = ' '.join(train_text)
    train_corpus.append(train_text)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(train_corpus).toarray()
y = train_df.iloc[:, 2].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [10]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X, y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [11]:
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[63  0  0  0  1]
 [ 0 63  0  0  0]
 [ 0  0 53  0  0]
 [ 0  0  0 65  0]
 [ 0  0  0  0 53]]


# Testing the BBC News Test

In [12]:
test_corpus = []
for i in range(0, len(test_df)):
    test_text = re.sub('[^a-zA-Z]', ' ', test_df['Text'][i])
    test_text = test_text.lower()
    test_text = test_text.split()
    ps = PorterStemmer()
    test_text = [ps.stem(word) for word in test_text if not word in set(stopwords.words('english'))]
    test_text = ' '.join(test_text)
    test_corpus.append(test_text)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
testing = cv.fit_transform(test_corpus).toarray()

In [13]:
test_pred = classifier.predict(testing)

In [14]:
test_pred

array(['tech', 'tech', 'business', 'tech', 'business', 'business', 'tech',
       'tech', 'business', 'sport', 'tech', 'tech', 'business',
       'business', 'business', 'entertainment', 'business', 'business',
       'entertainment', 'tech', 'tech', 'tech', 'business', 'tech',
       'tech', 'entertainment', 'politics', 'sport', 'business', 'tech',
       'tech', 'business', 'tech', 'tech', 'politics', 'business',
       'business', 'tech', 'business', 'tech', 'entertainment', 'tech',
       'entertainment', 'entertainment', 'sport', 'tech', 'entertainment',
       'tech', 'sport', 'tech', 'politics', 'tech', 'tech', 'politics',
       'tech', 'tech', 'tech', 'business', 'tech', 'tech', 'tech', 'tech',
       'tech', 'politics', 'tech', 'business', 'entertainment', 'tech',
       'business', 'tech', 'entertainment', 'entertainment', 'tech',
       'tech', 'business', 'business', 'tech', 'tech', 'politics', 'tech',
       'tech', 'tech', 'business', 'entertainment', 'politics',
       

In [17]:
test_df['Classified Categories'] = test_pred

In [24]:
test_df.head()

Unnamed: 0,ArticleId,Text,Categories,Classified Categories
0,1018,qpr keeper day heads for preston queens park r...,tech,tech
1,1319,software watching while you work software that...,tech,tech
2,1138,d arcy injury adds to ireland woe gordon d arc...,business,business
3,459,india s reliance family feud heats up the ongo...,tech,tech
4,1020,boro suffer morrison injury blow middlesbrough...,business,business
