# Libraries

In [1]:
import nltk
import sklearn
import numpy as np
import pandas as pd
import re 

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB

# Loading the dataset

In [2]:

train  = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')


# Functions

In [3]:
def clean(x):
    m = re.sub(r'[\n,@,:,--,.,!,?,/,\d]',' ',x)
    return(m)


def stop_word(x):
    stopWords = set(stopwords.words('english'))
    wordsFiltered = []
    for w in x:
        if w not in stopWords:
            wordsFiltered.append(w)
    return wordsFiltered

def lemmma(x):
    Wordnet_lemma= WordNetLemmatizer()
    l = []
    for i in x:
        e = Wordnet_lemma.lemmatize(i)
        l.append(e)
    return l


def pre(X):
    
    wordpunct_tokenize = WordPunctTokenizer()
    data = pd.DataFrame(X,columns=['doc'])
    data['text'] = data['doc'].apply(lambda x: clean(x))
    data['word_toc'] =  data['text'].apply(lambda x: wordpunct_tokenize.tokenize(x))
    data['stop'] =  data['word_toc'].apply(lambda x: stop_word(x))
    data['lemma'] = data['stop'].apply(lambda x: lemmma(x))
    data['string'] = data['lemma'].apply(lambda x : " ".join(str(y) for y in x))
    return data['string']


def training(X,y):
    train_X = pre(X)
    tfidf = TfidfVectorizer()
    tfidf.fit(train_X)  # fitting 

    train_X = tfidf.transform(train_X)

    clf = MultinomialNB()
    clf.fit(train_X,y)
    return clf,tfidf


# Training 

In [4]:
X = train.data
y = train.target

clf,tfidf = training(X,y)


# testing

In [5]:
X = test.data
y = test.target

test_X = pre(X)
test_X = tfidf.transform(test_X)
clf.score(test_X,y)

0.81040892193308545

#  validation

In [6]:
val = [ "The Roman Catholic Church is the largest group of Christians in the world. There are about 1 billion members, mostly in Europe, North and South America. The church goes back to about 30 A.D., to the life of Jesus Christ and his apostles. Roman Catholics believe that Jesus Christ is the son of God, and that he rose to heaven after being crucified.  They believe that Jesus brought salvation to all people on Earth.The leader of the Roman Catholic Church and successor of Jesus Christ on Earth is the Pope. He governs the church from the Vatican, a small state situated in the heart of Rome. The Pope selects cardinals and bishops to lead the church throughout the world.The Roman Catholic Church has influenced history more than any other organization. It exercised its power throughout the Middle Ages. Missionaries travelled to other continents to spread Catholicism.  Great architects created churches and cathedrals, artists drew paintings and made frescoes."]


In [7]:
valx = pre(val)
valx = tfidf.transform(valx)
pre = clf.predict(valx)

print("prediction :-  ",test.target_names[pre[0]])

prediction :-   soc.religion.christian
