In [1]:
from keras.preprocessing.text import text_to_word_sequence
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk import pos_tag
import pandas as pd
import numpy as np
import scipy
import nltk


In [2]:
# Load the labelled training articles from the CSV in the working directory.
data_train = pd.read_csv("BBC News Train.csv")
# Last expression of the cell: display the column names.
data_train.columns

Index(['ArticleId', 'Text', 'Category'], dtype='object')

In [3]:
# Distinct class labels, in order of first appearance in the training data.
target_category = data_train["Category"].unique().tolist()
target_category

['business', 'tech', 'politics', 'sport', 'entertainment']

In [4]:

# Cache for the stop-word set so the NLTK corpus is read once, not once per article.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading the corpus only on first use."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE["english"]


def preprocessing(train_text):
    """Normalise one raw article into a space-separated string of tokens.

    Steps, in order:
      1. coerce the input to ``str``, lowercase it and split it into words
         with Keras' ``text_to_word_sequence`` (punctuation in ``filters``
         is treated as a separator);
      2. drop NLTK English stop words;
      3. strip every digit character from the remaining text;
      4. Porter-stem each token;
      5. lemmatise each stem with WordNet, using the POS tag NLTK assigns to
         the stem on its own (falling back to noun).

    Parameters
    ----------
    train_text : object
        Raw article text. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # word tokenization using text-to-word-sequence
    train_text = str(train_text)
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    tokens = text_to_word_sequence(train_text,
                                   filters=filters,
                                   lower=True,
                                   split=" ")

    # stop word removal
    stop_words = _english_stop_words()
    kept_tokens = [tok for tok in tokens if tok not in stop_words]

    # remove numbers: digits are dropped character by character from the
    # joined text, so a token such as "3g" becomes "g" rather than vanishing
    no_digit_text = ''.join(ch for ch in ' '.join(kept_tokens) if not ch.isdigit())

    # --Stemming--
    stemmer = PorterStemmer()
    stem_text = ' '.join(stemmer.stem(word) for word in nltk.word_tokenize(no_digit_text))

    # --Lemmatization--
    # The stems are re-tokenised from the joined string (not reused as a list)
    # so the token boundaries match what word_tokenize produces on that string.
    lem_input = nltk.word_tokenize(stem_text)

    # Map the first letter of a Penn Treebank tag to the POS constant
    # lemmatize() accepts; built once per call instead of once per word.
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    # Tag every word as its own one-word sentence in a single batch. This
    # yields the same tag as nltk.pos_tag([word]) for each word, without
    # setting up a tagger for every word.
    tagged_sents = nltk.pos_tag_sents([[w] for w in lem_input]) if lem_input else []

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for tagged_sent in tagged_sents:
        word, tag = tagged_sent[0]
        wordnet_pos = tag_dict.get(tag[0].upper(), wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))

    return ' '.join(lemmas)

In [5]:
# Derive the cleaned text as a new Series instead of overwriting
# data_train["Text"]: the raw articles stay available, and re-running this
# cell no longer re-processes already stemmed text.
text = data_train["Text"].apply(preprocessing)
category = data_train['Category']


In [6]:
# Display the preprocessed articles (pandas truncates the Series in the output).
text

0       worldcom ex bos launch defenc lawyer defend fo...
1       german busi confid slide german busi confid fe...
2       bbc poll indic econom gloom citizen major nati...
3       lifestyl govern mobil choic faster well funkie...
4       enron bos m payout eighteen former enron direc...
                              ...                        
1485    doubl evict big brother model capric holbi cit...
1486    dj doubl act revamp chart show dj duo jk joel ...
1487    weak dollar hit reuter revenu medium group reu...
1488    appl ipod famili expand market appl expand ipo...
1489    santi worm make unwelcom visit thousand websit...
Name: Text, Length: 1490, dtype: object

In [7]:
# Hold out 30% of the articles for evaluation; stratifying on the label keeps
# the class proportions the same in both splits. The seed is fixed for
# reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    text,
    category,
    test_size=0.3,
    random_state=60,
    shuffle=True,
    stratify=category,
)

In [8]:
# TF-IDF features feeding a multinomial Naive Bayes classifier.
nb = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', MultinomialNB())])
nb.fit(X_train, Y_train)

test_predict = nb.predict(X_test)

# Accuracies as whole-number percentages.
train_accuracy = round(nb.score(X_train, Y_train) * 100)
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that
# order so the call stays correct if the metric is ever swapped for an
# asymmetric one.
test_accuracy = round(accuracy_score(Y_test, test_predict) * 100)


print("Naive Bayes Train Accuracy Score : {}% ".format(train_accuracy))
print("Naive Bayes Test Accuracy Score  : {}% ".format(test_accuracy))

Naive Bayes Train Accuracy Score : 99% 
Naive Bayes Test Accuracy Score  : 96% 


In [9]:
# Load the unlabelled test articles from the CSV in the working directory.
data_test = pd.read_csv("BBC News Test.csv")

In [10]:
# Display the column names of the test set.
data_test.columns

Index(['ArticleId', 'Text'], dtype='object')

In [11]:
test_id = data_test['ArticleId']
# Preprocess into a new Series rather than overwriting data_test['Text'], so
# the raw text is preserved and re-running this cell does not re-process
# already stemmed text.
test_text = data_test['Text'].apply(preprocessing)
# The fitted pipeline vectorises the text and predicts one category per article.
y_prdict = nb.predict(test_text)

In [12]:
# Pair each test ArticleId with its predicted category.
final_data = pd.DataFrame(list(zip(test_id, y_prdict)),
                          columns=['ArticleId', 'Category'])
# index=False: otherwise pandas writes the row index as an extra unnamed first
# column, so the file would not contain just ArticleId and Category.
final_data.to_csv('news_sorted.csv', index=False)

In [13]:
import pickle

filename = "news_sorting.pkl"
# Persist the fitted pipeline. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open(filename, "wb") as model_file:
    pickle.dump(nb, model_file)