About the Dataset:

id: unique id for a news article

title: the title of a news article

author: author of the news article

text: the body text of the article (may be incomplete)

label: marks whether the news article is real or fake (1 = fake news, 0 = real news)

In [None]:
import numpy as np                                                              #numpy array
import pandas as pd                                                             #use to create dataframe
import re                                                                       #regularExpression
import nltk
import pickle
from nltk.corpus import stopwords                                               #stopwords are words which have less value ex is,the
from nltk.stem.porter import PorterStemmer                                      #for stemming purpose
from sklearn.feature_extraction.text import TfidfVectorizer                     #convert text to features
from sklearn.model_selection import train_test_split                            #help in splitting training data and test data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Read the labelled news articles (id, title, author, text, label) into a DataFrame.
TRAIN_CSV_PATH = '/content/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Report how many values are missing in each column. The result is printed
# explicitly because a bare expression that is not the last statement of a
# notebook cell is evaluated but never displayed.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Replace every missing value with an empty string so the text columns can be
# cleaned and concatenated later without any NaN handling.
df = df.fillna('')

In [None]:
# Fetch the NLTK stop-word corpus so that stopwords.words('english') can be
# resolved by the preprocessing step further down.
import nltk

STOPWORDS_PACKAGE = 'stopwords'
nltk.download(STOPWORDS_PACKAGE)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
port_stem = PorterStemmer()                       # one shared stemmer instance reused for every document

# Build the stop-word lookup once. The previous version called
# stopwords.words('english') for every single word and searched the returned
# list linearly; a frozenset built up front gives the same filtering with a
# constant-time membership test. Requires the 'stopwords' corpus to have been
# downloaded before this cell runs.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile('[^a-zA-Z]')


def stemming(content):
    """Normalise a piece of text into space-separated word stems.

    Steps, in order:
      1. every non-letter character (digits, punctuation, ...) is replaced
         by a single space,
      2. the text is lower-cased and split on whitespace,
      3. English stop words are dropped,
      4. each remaining word is reduced to its Porter stem.

    Parameters
    ----------
    content : str
        Raw text, e.g. an article title or an author name.

    Returns
    -------
    str
        The surviving stems joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    words = _NON_LETTERS.sub(' ', content).lower().split()
    stems = [port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS]
    return ' '.join(stems)

In [None]:
# Clean both metadata columns with the same stemming routine; the cleaned
# strings overwrite the raw values in place.
for column_name in ('title', 'author'):
    df[column_name] = df[column_name].apply(stemming)

In [None]:
# Build the model input by joining the cleaned title and author with a single
# space. This overwrites the 'text' column that was loaded from the CSV.
df['text'] = df['title'].str.cat(df['author'], sep=' ')

In [None]:
# Target: the real/fake label column.
y = df['label']

# Features: TF-IDF weights learned from, and computed for, the combined
# title + author strings.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so vocabulary/IDF statistics also reflect the held-out rows.
vectorizer = TfidfVectorizer()
corpus = df['text']
X = vectorizer.fit_transform(corpus)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Train a logistic-regression classifier (library defaults) on the training split.
model = LogisticRegression().fit(x_train, y_train)

# Score the held-out split: share of test rows whose predicted label matches
# the true label.
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.9740384615384615


In [None]:
# save the model using pickle
filename = 'logisticRegressionSavedModelTitleAuthor.pkl'
with open(filename, 'wb') as file:
    pickle.dump(model, file)

# The classifier only accepts TF-IDF vectors, so the fitted vectorizer is
# persisted next to it. Without it the saved model cannot score raw text
# outside of this kernel.
vectorizer_filename = 'tfidfVectorizerTitleAuthor.pkl'
with open(vectorizer_filename, 'wb') as file:
    pickle.dump(vectorizer, file)

# Reload both artifacts and classify a sample headline using only what was
# read back from disk, which proves the saved files are sufficient on their own.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook, never files from an untrusted source.
with open(filename, 'rb') as file:
    loaded_model = pickle.load(file)
with open(vectorizer_filename, 'rb') as file:
    loaded_vectorizer = pickle.load(file)

sample_headline = 'Specter of Trump Loosens Tongues, if Not Purse Strings, in Silicon Valley - The New York Times'
# Same preprocessing as the training rows: stem first, then vectorize.
new_text = loaded_vectorizer.transform([stemming(sample_headline)])
prediction = loaded_model.predict(new_text)[0]
if prediction == 1:
    print('Fake news')
else:
    print('Real news')

Real news


In [None]:
#flask code for deployment
from flask import Flask, render_template, request
import pickle
import numpy as np


app = Flask(__name__)


@app.route('/')
def index_view():
    """Serve the site root by rendering the ``index.html`` template."""
    landing_page = render_template("index.html")
    return landing_page
 
_MODEL_PATH = "logisticRegressionSavedModelTitleAuthor.pkl"
_model_cache = {}


def _get_model():
    """Unpickle the saved classifier on first use and reuse it afterwards."""
    if "model" not in _model_cache:
        # NOTE: pickle.load can execute arbitrary code; this file must be the
        # one written by this notebook, never an untrusted upload.
        with open(_MODEL_PATH, "rb") as file:
            _model_cache["model"] = pickle.load(file)
    return _model_cache["model"]


@app.route('/predict', methods = ['GET','POST'])
def predict():
    """Classify a submitted title/author pair as real or fake news.

    POST: reads the ``title`` and ``author`` form fields (a missing field is
    treated as an empty string), applies the same preprocessing used for the
    training rows, and renders ``real.html`` for label 0 or ``fake.html``
    for any other label. The template receives the label as the integer
    variable ``prediction``.

    GET: there is no form data to classify, so ``index.html`` is rendered
    instead of falling through and returning ``None``, which Flask rejects
    as an invalid response.

    Relies on ``stemming`` and the fitted ``vectorizer`` defined in earlier
    notebook cells; a standalone deployment has to ship both with the model.
    """
    if request.method != 'POST':
        # presumably index.html holds the submission form -- confirm
        return render_template("index.html")

    title = request.form.get('title') or ''
    author = request.form.get('author') or ''

    # Mirror the training pipeline: stem each field, join them with a space,
    # then turn the string into TF-IDF features. The model cannot consume
    # raw text directly.
    text = stemming(title) + ' ' + stemming(author)
    features = vectorizer.transform([text])

    # predict() returns one label per input row; take the single row's label.
    label = int(_get_model().predict(features)[0])

    if label == 0:
        return render_template("real.html", prediction=label)
    return render_template("fake.html", prediction=label)

# Launch the app through app.run() only when this code is executed as the
# entry-point module, not when it is imported from elsewhere.
if __name__ == "__main__":
    app.run()