CS6120 NLP Assignment 2 - Fake News Detection by Naive Bayes and TF-IDF<br>
Wing Man, Kwok<br>
Feb 1 2023<br>

In [3]:
# Import libraries and EDA of true and fake news dataset
import numpy as np  
import pandas as pd  

path="/kaggle/input/fake-and-real-news-dataset/"
df_real = pd.read_csv(path + 'True.csv')
df_fake = pd.read_csv(path + 'Fake.csv')

# Add y_true
df_real['RealNews?'] = True
df_fake['RealNews?'] = False

# Combine true news and fake news into one single file
df = df_real.append(df_fake)

df.head()
len(df)

44898

In [None]:
# view rows of all y_true
df['RealNews?']

In [None]:
# view rows when news y_true is True
df[df['RealNews?']]

In [None]:
# view rows when news y_true is False
df[~df['RealNews?']]

In [None]:
df['title']

In [4]:
# combine title and text into a new column
df['document'] = df[['title', 'text']].agg(' '.join, axis=1)

# set print option to show full cell content to verify title and text are combined
pd.set_option('display.max_colwidth', -1)
df['document']

  """


0        As U.S. budget fight looms, Republicans flip their fiscal script WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases

In [5]:
# change all text to lower case
df['document'] = df['document'].apply(lambda x: x.lower())

Part 1: Naive Bayes “by hand”

Train a Naive Bayes classifier on the training set. Use Laplace (add-1) smoothing. Report the precision, recall, and F1 score on the test set.

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
import re

In [7]:
# Dataset cleaning

#Remove punctuation 
df['document_bow'] = df['document'].apply(lambda x: re.sub(r'[^\w\s]','',x))
#Remove numbers 
df['document_bow'] = df['document_bow'].str.replace('\d+','')
#Removes quotation marks
df['document_bow'] = df['document_bow'].apply(lambda x: re.sub(r'\"', '', x))
df.head()

df_train, df_test = train_test_split(df, test_size=0.2, shuffle=True)
df_train.to_csv("df_train.csv")
df_test.to_csv("df_test.csv")

  


In [10]:
def build_dict(dataset,dictionary):
    for bow in dataset['document_bow']: 
        words = bow.split()
        for word in words:
            if word in dictionary:
                dictionary[word] += 1
            else:
                dictionary[word] = 1
    return dictionary

In [11]:
def count_total_words(dictionary):
    return sum(dictionary.values())

In [12]:
def compute_probability(dictionary,X, prior):
    X = X.lower()
    words = X.split()
    probability = np.log(prior)
    for word in words:
        if word in dictionary:
            probability += np.log(dictionary[word])
    return probability

In [13]:
def convert_word2prob(Dictionary, size, overall_unique_key):
    dictionary = {}
    
    for word in Dictionary:
        dictionary[word] = (Dictionary[word] + 1)/(size + len(overall_unique_key))
        
    #if word not found in own dictionary but found in the other dictionary
    for word in (overall_unique_key - set(Dictionary.keys())):
        dictionary[word] = 1/(size + len(overall_unique_key))
        
    return dictionary

In [14]:
# build dictionaries
true_news_dictionary = {}
fake_news_dictionary = {}

true_news_dictionary = build_dict(df_train[df_train['RealNews?']],true_news_dictionary)   #build true news dictionary
fake_news_dictionary = build_dict(df_train[~df_train['RealNews?']],fake_news_dictionary)  #build fake news dictionary
print(len(true_news_dictionary), len(fake_news_dictionary))

# get unique words of dictionaries
true_news_unique_key = set(true_news_dictionary.keys())    #put all true news unique keyword into a set, then merge with fake news to form the dominiator
fake_news_unique_key = set(fake_news_dictionary.keys())
overall_unique_key = true_news_unique_key.union(fake_news_unique_key)
print(len(true_news_unique_key), len(fake_news_unique_key), len(overall_unique_key))

# calculate dictionary size in terms of word count
true_news_word_count = count_total_words(true_news_dictionary)   
fake_news_word_count = count_total_words(fake_news_dictionary)   

# based on word count in dictionary as input, convert word count into conditional prob with Laplace smoothing.
true_news_dictionary_prob = convert_word2prob(true_news_dictionary,true_news_word_count, overall_unique_key)
fake_news_dictionary_prob = convert_word2prob(fake_news_dictionary,fake_news_word_count, overall_unique_key)

# compute true news and fake news prior
total_count = len(df_train)

true_prior = len(df_train[df_train['RealNews?']]) / total_count
fake_prior = len(df_train[~df_train['RealNews?']]) / total_count

print(len(df_train[df_train['RealNews?']]), len(df_train[~df_train['RealNews?']]), len(df_train))
print(true_prior, fake_prior)
print(len(true_news_dictionary), len(fake_news_dictionary))

71931 161003
71931 161003 194160
17190 18728 35918
0.47859012194442896 0.521409878055571
71931 161003


In [15]:
# testing
test_sample = 'Breaking news: Today scientists have discovered moon has rised from west'
print(compute_probability(fake_news_dictionary_prob,test_sample, fake_prior))
print(compute_probability(true_news_dictionary_prob,test_sample, true_prior))

-74.48900682081488
-75.4965162913426


In [16]:
# report precision, recall, F1 score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, average_precision_score, average_precision_score
from sklearn.metrics import classification_report, plot_confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import make_scorer  
from sklearn import metrics 

y_true = df_test['RealNews?'].to_list()
y_pred = []

for word in df_test['document_bow']:
    if compute_probability(true_news_dictionary_prob,word,true_prior) > compute_probability(fake_news_dictionary_prob,word,fake_prior):
        y_pred.append(True)
    else:
        y_pred.append(False)
        
print(precision_recall_fscore_support(y_true, y_pred))        
print(classification_report(y_true, y_pred))    

(array([0.97467811, 0.95115741]), array([0.95560699, 0.97208422]), array([0.96504834, 0.96150696]), array([4753, 4227]))
              precision    recall  f1-score   support

       False       0.97      0.96      0.97      4753
        True       0.95      0.97      0.96      4227

    accuracy                           0.96      8980
   macro avg       0.96      0.96      0.96      8980
weighted avg       0.96      0.96      0.96      8980



Part 2: Tf-idf “by hand”

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:

# prepare a vector for each document in the training set using tf-idf
def calculate_tf(category, Dictionary):
    vector = pd.DataFrame()
    tfidf_dictionary = Dictionary.copy()
    tfidf_dictionary = tfidf_dictionary.fromkeys(tfidf_dictionary , 0) # default all dictionary values to 0
    
    for bow in category['document_bow']: 
        words = bow.split()
        dictionary = {}
        
        '''
        for word in words:
            if word in tfidf_dictionary:
                tfidf_dictionary[word] += 1
        '''        
        [tfidf_dictionary.update({word: tfidf_dictionary.get(word, 0) + 1}) for word in words if word in tfidf_dictionary]

                
        vector = vector.append(tfidf_dictionary, ignore_index=True)
        tfidf_dictionary = tfidf_dictionary.fromkeys(tfidf_dictionary , 0)
        
        if len(vector) % 100 == 0 and len(vector) > 99:
            print(len(vector))
    
    # after counting words, perform log to the entire dataframe to obtain term frequency
    tf = lambda x: 1 + np.log10(x)
    vector = vector.applymap(lambda x: tf(x) if x!=0 else x)
    
    return vector

# calculate TF of individual dataset
overall_dictionary = {**true_news_dictionary, **fake_news_dictionary}

X_train = calculate_tf(df_train[:1000],overall_dictionary)
X_test = calculate_tf(df_test[:1000],overall_dictionary)

X_train.to_csv('X_train.csv')  
print(X_train[:3])
print(X_test[:3])

100
200
300
400
500
600
700
800
900
1000
100
200
300
400
500
600
700
800
900
1000
    myanmar   police    arrest  buddhist      monk  over       us  embassy  \
0  0.000000  0.00000  0.000000  0.000000  0.000000  0.0   1.60206  0.00000   
1  1.778151  1.60206  1.477121  1.477121  1.477121  1.0   1.60206  1.60206   
2  0.000000  0.00000  0.000000  0.000000  0.000000  0.0   0.00000  0.00000   

    protest  newspaper  ...  publiclykeep  reallymarrakesh  \
0  0.000000  0.00000    ...  0.0           0.0               
1  1.778151  1.30103    ...  0.0           0.0               
2  0.000000  0.00000    ...  0.0           0.0               

   donationsmeanwhile  supportan  partnershipsvia  commanderinchiefno  \
0  0.0                 0.0        0.0              0.0                  
1  0.0                 0.0        0.0              0.0                  
2  0.0                 0.0        0.0              0.0                  

   newspapertwitterit  handlingthe  liestwitterwe  pictwitterco

In [43]:
#X_test.fillna(0, inplace=True)
#print(X_test[:3])

__notebook_source__.ipynb  df_test.csv	df_train.csv


In [20]:
# compute tdf and convert document into vectors

# compute sum of non zero occurence at each column
frequency_of_word_dictionary = X_train.astype(bool).sum(axis=0).to_dict()

# compute number of documents in the dataset
N = len(X_train)

# Compute idf  
idf_dictionary = {}
for word in frequency_of_word_dictionary:
    if frequency_of_word_dictionary[word] >1:
        idf_dictionary[word] = np.log10(N / frequency_of_word_dictionary[word])

# Compute tf-idf values for each word to vectorize
def compute_tfidf(dataset, idf_dictionary):
    for index, row in dataset.iterrows():
        for word in idf_dictionary:
            if row[word] >10:
                dataset.at[index, word] = (row[word] * idf_dictionary[word])  # note the .at here.  it updates cell values.  no at then it's not updating new values
    return dataset

compute_tfidf(X_train[:1000], idf_dictionary)
compute_tfidf(X_test[:1000], idf_dictionary)
print(X_train)
print(X_test)

# encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train["RealNews?"].to_list())
y_test = label_encoder.fit_transform(df_test["RealNews?"].to_list())

      myanmar   police    arrest  buddhist      monk     over        us  \
0    0.000000  0.00000  0.000000  0.000000  0.000000  0.00000  1.602060   
1    1.778151  1.60206  1.477121  1.477121  1.477121  1.00000  1.602060   
2    0.000000  0.00000  0.000000  0.000000  0.000000  0.00000  0.000000   
3    0.000000  0.00000  0.000000  0.000000  0.000000  0.00000  1.301030   
4    0.000000  0.00000  0.000000  0.000000  0.000000  1.60206  0.000000   
..        ...      ...       ...       ...       ...      ...       ...   
995  0.000000  0.00000  0.000000  0.000000  0.000000  0.00000  1.301030   
996  0.000000  1.60206  0.000000  0.000000  0.000000  0.00000  1.000000   
997  0.000000  0.00000  0.000000  0.000000  0.000000  0.00000  1.000000   
998  0.000000  1.00000  0.000000  0.000000  0.000000  1.00000  1.602060   
999  0.000000  0.00000  0.000000  0.000000  0.000000  0.00000  1.845098   

     embassy   protest  newspaper  ...  publiclykeep  reallymarrakesh  \
0    0.00000  0.000000  0.

In [21]:
print(X_test.shape)
print(X_train.shape)

# train the data with logistic regression 
clf = LogisticRegression(random_state=0).fit(X_train, y_train[:1000])

# predict
predictions = clf.predict(X_test)
print(precision_recall_fscore_support(y_test[:1000], predictions))
print(classification_report(y_test[:1000], predictions))

(1000, 194160)
(1000, 194160)
(array([1.        , 0.98148148]), array([0.98279159, 1.        ]), array([0.99132112, 0.99065421]), array([523, 477]))
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       523
           1       0.98      1.00      0.99       477

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000



Part 3: Naive Bayes and Tf-idf using Scikit-learn

In [22]:
# Naive Bayes 
from sklearn.feature_extraction.text import CountVectorizer # for word count vectorizor.
from sklearn.naive_bayes import MultinomialNB

X_train = df_train['document_bow'].values
y_train = df_train['RealNews?'].values

X_test = df_test['document_bow'].values
y_test = df_test['RealNews?'].values
print(len(X_test), len(y_test))

# create count vectorizor with in-built common English stop words. 
vector = CountVectorizer(stop_words = 'english')

# fits/transforms dataset by count vectorizor  
X_train = vector.fit_transform(X_train)
X_test = vector.transform(X_test)

# train the model with Naive Bayes
clf = MultinomialNB()
clf.fit(X_train, y_train)

# predict
predictions = clf.predict(X_test)

# report
print(classification_report(y_test, predictions))

8980 8980
              precision    recall  f1-score   support

       False       0.97      0.95      0.96      4753
        True       0.95      0.97      0.96      4227

    accuracy                           0.96      8980
   macro avg       0.96      0.96      0.96      8980
weighted avg       0.96      0.96      0.96      8980



In [23]:
# Tf-idf
from sklearn.feature_extraction.text import TfidfTransformer 

X_train = df_train['document_bow'].values
y_train = df_train['RealNews?'].values

X_test = df_test['document_bow'].values
y_test = df_test['RealNews?'].values

# create count and tf-idf vectorizor 
X_train = vector.fit_transform(X_train)

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
X_train = tfidf_transformer.fit_transform(X_train)

X_test = vector.transform(X_test)
X_test = tfidf_transformer.transform(X_test)

# train the model with Logistic Regression
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
print(X_test.shape)
print(X_train.shape)

# predict
predictions = clf.predict(X_test)

# report
print(classification_report(y_test, predictions))

(8980, 193820)
(35918, 193820)
              precision    recall  f1-score   support

       False       0.99      0.99      0.99      4753
        True       0.99      0.99      0.99      4227

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [24]:
# experiment with one of the parameters in the tfidf vectorizer - changing smooth_idf to false
X_train = df_train['document_bow'].values
y_train = df_train['RealNews?'].values

X_test = df_test['document_bow'].values
y_test = df_test['RealNews?'].values

# create count and tf-idf vectorizor 
X_train = vector.fit_transform(X_train)

tfidf_transformer=TfidfTransformer(smooth_idf=False,use_idf=True) 
X_train = tfidf_transformer.fit_transform(X_train)

X_test = vector.transform(X_test)
X_test = tfidf_transformer.transform(X_test)

# train the model with Logistic Regression
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
print(X_test.shape)
print(X_train.shape)

# predict
predictions = clf.predict(X_test)

# report
print(classification_report(y_test, predictions))

(8980, 193820)
(35918, 193820)
              precision    recall  f1-score   support

       False       0.99      0.99      0.99      4753
        True       0.99      0.99      0.99      4227

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



Smooth idf parameters when set true, add the constant 1 to numerator and denominator to prevent zero division, by default is on.  Turning smoothing off seems not imposing a big changes on the performance outcome nor error message prompts.  It could be because we have a dataset large enough so less chance of zero occurence.