In [26]:
pip install pandas numpy scikit-learn nltk spacy gensim




In [28]:
# Import the required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
from gensim import corpora, models
import gensim

In [30]:
# Load the news dataset from CSV into a DataFrame
# NOTE(review): '/news.csv' is an absolute path at the filesystem root — confirm it matches the runtime environment
data = pd.read_csv('/news.csv')

In [31]:
# Print the dataset's dimensions as a (rows, columns) tuple
print(data.shape)

(6335, 4)


In [32]:
# Preview the first five rows of the dataset
print(data.iloc[:5])

   Unnamed: 0                                              title  \
0        8476                       You Can Smell Hillary’s Fear   
1       10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2        3608        Kerry to go to Paris in gesture of sympathy   
3       10142  Bernie supporters on Twitter erupt in anger ag...   
4         875   The Battle of New York: Why This Primary Matters   

                                                text label  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
4  It's primary day in New York and front-runners...  REAL  


In [33]:
# Pull the target column out of the frame and preview its first five entries
labels = data['label']
print(labels.iloc[:5])

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object


In [34]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    data['text'],
    labels,
    random_state=7,
    test_size=0.2,
)

In [35]:
# Sentiment analysis: score every article with VADER and keep the 'compound' value
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
data['sentiment'] = [sid.polarity_scores(article)['compound'] for article in data['text']]

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [36]:
# Named Entity Recognition (NER)
# Loading requires the 'en_core_web_sm' model package to be installed.
nlp = spacy.load('en_core_web_sm')
# Stream the articles through nlp.pipe so spaCy processes them in batches
# instead of invoking the full pipeline once per row through apply().
# Each row still ends up with a list of (entity text, entity label) pairs.
data['entities'] = [
    [(ent.text, ent.label_) for ent in doc.ents]
    for doc in nlp.pipe(data['text'])
]


In [37]:
# Topic Modeling
def preprocess(text):
    """Tokenize ``text`` with gensim's ``simple_preprocess`` and drop gensim stopwords.

    Returns the remaining tokens as a list, in their original order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [word for word in gensim.utils.simple_preprocess(text) if word not in stopwords]

# Tokenize every article, then build the id<->token dictionary and bag-of-words corpus
texts = data['text'].apply(preprocess)
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# LDA training is randomized; fixing random_state makes the learned topics
# repeatable across runs (7 matches the seed used for the train/test split).
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=15,
    random_state=7,
)
# Store each article's topic distribution as returned by indexing the model with its bag-of-words
data['topics'] = [lda_model[doc] for doc in corpus]


In [39]:
# Build the TF-IDF vectorizer: drop English stop words and ignore terms
# that occur in more than 70% of the documents
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

In [40]:
# Fit the vectorizer on the training texts only and transform them, then
# reuse the fitted vectorizer to transform the held-out test texts
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [41]:
# Train a PassiveAggressiveClassifier on the TF-IDF training features.
# The classifier shuffles the training data between epochs, so random_state is
# fixed to make the fitted model repeatable (7 matches the train/test split seed).
pac = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pac.fit(tfidf_train, y_train)

In [42]:
# Train the PassiveAggressiveClassifier used for the evaluation below.
# NOTE(review): this cell repeats the preceding cell's training step, so `pac`
# is rebuilt here; one of the two cells can likely be removed.
# The classifier shuffles the training data between epochs, so random_state is
# fixed to make the reported metrics repeatable (7 matches the train/test split seed).
pac = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pac.fit(tfidf_train, y_train)

In [43]:
# Classify the held-out articles and report accuracy as a percentage
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 92.58%


In [44]:
# Confusion matrix: rows are true labels, columns are predicted labels,
# both ordered FAKE then REAL
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=['FAKE', 'REAL'])
print(confusion)


[[587  51]
 [ 43 586]]


In [45]:
# Preview the derived feature columns next to the article text (first five rows)
print(data.head()[['text', 'sentiment', 'entities', 'topics']])

                                                text  sentiment  \
0  Daniel Greenfield, a Shillman Journalism Fello...    -0.9994   
1  Google Pinterest Digg Linkedin Reddit Stumbleu...     0.7723   
2  U.S. Secretary of State John F. Kerry said Mon...     0.9595   
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...    -0.4242   
4  It's primary day in New York and front-runners...     0.9903   

                                            entities  \
0  [(Daniel Greenfield, PERSON), (Shillman Journa...   
1  [(Google Pinterest, ORG), (two, CARDINAL), (Pa...   
2  [(U.S., GPE), (State, ORG), (John F. Kerry, PE...   
3  [(November 9, 2016, DATE), (tonight, TIME), (D...   
4  [(New York, GPE), (Hillary Clinton, PERSON), (...   

                                              topics  
0  [(0, 0.6660551), (1, 0.20792745), (2, 0.106032...  
1  [(0, 0.23614244), (1, 0.70332634), (2, 0.05859...  
2  [(2, 0.07337513), (3, 0.12929845), (4, 0.79552...  
3                  [(0, 0.18246731), (1,