Importing dependencies

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.util import ngrams
from nltk.corpus import stopwords
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

Reading the dataset

In [None]:
# Load the news dataset into a DataFrame.
# `on_bad_lines='skip'` drops malformed rows instead of raising. It replaces
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in
# pandas 2.0 (where passing it raises a TypeError).
news_df = pd.read_csv(
    "news_dataset.csv",
    engine='python',
    encoding='utf-8',
    on_bad_lines='skip',
)
news_df


In [None]:
# Pull the 'text' column out as its own Series and display it.
news_text = news_df['text']
news_text

Cleaning the text

In [None]:
# Coerce every entry to `str` so the regex cleaning that follows never receives
# a non-string value. Missing values are replaced with '' first: a plain
# `apply(str)` would turn NaN into the literal token "nan", which would then be
# treated as a real word by the vectorizer.
news_df['text'] = news_df['text'].fillna('').astype(str)

In [None]:
def clean_news_text(text):
    """Normalise one article string for bag-of-words modelling.

    Steps, in order: lowercase, replace URLs with a space, replace '@' with a
    space, strip every character that is neither a word character nor
    whitespace, strip ASCII digits, and replace newlines with a space.

    Args:
        text: The article text as a ``str``.

    Returns:
        The cleaned, lowercased text.
    """
    text = text.lower()
    # Raw strings avoid invalid-escape warnings ("\S", "\@"). `https?` also
    # removes TLS links, which the previous "http://" pattern left in place to
    # be mangled into tokens like "httpsexamplecom" by the punctuation strip.
    text = re.sub(r"https?://\S+", " ", text)
    text = re.sub(r"@", " ", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"[0-9]", "", text)
    # Replace (rather than delete) newlines so that the last word of one line
    # and the first word of the next are not fused into a single token.
    return re.sub(r"\n", " ", text)


news_df['text'] = news_df['text'].apply(clean_news_text)

news_df['text']

In [None]:
# Spot-check a single row (index label 1) of the 'text' column.
news_df.loc[1, 'text']

In [None]:
# Remove Hindi characters from the dataset.
# Hindi is written in Devanagari, Unicode block U+0900-U+097F. The previous
# pattern "[^\x900-\x97F]" did not express that range: Python reads it as the
# characters \x90, the range '0'-\x97 and 'F', negated. It therefore removed
# Hindi only by accident while also deleting every other character above
# U+0097. Match the Devanagari block explicitly instead.
news_df['text'] = news_df['text'].apply(lambda q: re.sub(r"[\u0900-\u097F]+", " ", q))

In [None]:
# Spot-check row 1 again: Hindi characters should no longer appear.
news_df.loc[1, 'text']

In [None]:
# Drop English stop words from every document.
# `stop` stays a list (as before) for any later cell that uses it; the
# frozenset copy is used for membership tests so each word is an O(1) lookup
# instead of a scan of the whole list.
stop = stopwords.words('english')
stop_lookup = frozenset(stop)
news_df['text'] = news_df['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)
news_df['text']


In [None]:
#print(news_df.text.apply(word_tokenize))

In [None]:
# Build `corpus`: one Porter-stemmed, stop-word-free string per document.
# The previous version iterated over `news_df['text']` directly, so each "word"
# was an entire article: whole articles were passed to the stemmer and compared
# against the stop-word list, and the result was a single concatenated string.
# Note that no later cell visible here consumes `corpus`.
ps = PorterStemmer()
# Built once; calling stopwords.words() inside the loop re-reads the list each time.
stop_words = set(stopwords.words('english'))
corpus = []
for document in news_df['text']:
    review = [ps.stem(word) for word in document.split() if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
# Target column for the classifier; preview the first rows.
labels = news_df['label']
labels.head()

Training and testing

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    news_df['text'],
    labels,
    test_size=0.2,
    random_state=7,
)

In [None]:
# TF-IDF features: ignore English stop words and any term whose document
# frequency exceeds 0.7.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')
# Fit on the training split only, then apply the same fitted vectorizer to the
# held-out split.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Train a Passive-Aggressive classifier on the TF-IDF features.
# The estimator shuffles the training data each epoch; without `random_state`
# the reported accuracy changes from run to run. The seed matches the one used
# for the train/test split.
pac = PassiveAggressiveClassifier(max_iter=100, random_state=7)
pac.fit(tfidf_train, y_train)
# Predict on the test set and calculate accuracy
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

In [None]:
# Confusion matrix on the held-out split, with the label order pinned to
# FAKE first, REAL second.
confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])

In [None]:
# Sanity check: classify a single document (row 1) with the trained model.
# The vectorizer was fitted on the cleaned but *unstemmed* `news_df['text']`,
# so the sample must be prepared the same way. Porter-stemming it here, as the
# previous version did, produces tokens (e.g. "presid" for "president") that
# are missing from the fitted vocabulary and are silently ignored. Stop words
# need no separate filter: the vectorizer is configured with
# stop_words='english'.
review = re.sub('[^a-zA-Z]', ' ', news_df['text'][1])
review = ' '.join(review.lower().split())
# Vectorization
val = tfidf_vectorizer.transform([review]).toarray()
# Predict
pac.predict(val)

Pickling the model

In [None]:
import pickle

# Persist the trained classifier and the fitted vectorizer.
# `with` guarantees each file is flushed and closed even if pickling fails;
# the bare `open(...)` calls used previously never closed their handles.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pac, model_file)
with open('tfidfvect.pkl', 'wb') as vect_file:
    pickle.dump(tfidf_vectorizer, vect_file)

In [None]:
# Reload both artifacts and confirm they reproduce a prediction for `review`.
# Despite the `joblib_` prefix, these objects are loaded with `pickle`.
# SECURITY: unpickling executes arbitrary code, so only load files you created
# yourself (as is the case here).
with open('model.pkl', 'rb') as model_file:
    joblib_model = pickle.load(model_file)
with open('tfidfvect.pkl', 'rb') as vect_file:
    joblib_vect = pickle.load(vect_file)
val_pkl = joblib_vect.transform([review]).toarray()
joblib_model.predict(val_pkl)