In [None]:
# Adapted from: https://github.com/Shubha23/Fake-News-Detection-Text-Preprocessing-and-Classification/blob/master/fake-news-detection-text-pre-processing-using-nltk.ipynb
# and: https://github.com/manthan89-py/Fake_News_detection/blob/master/Fake%20News.ipynb

# Original data can be found here: https://www.uvic.ca/ecs/ece/isot/datasets/fake-news/index.php

# Import necessary Python libraries, modules, etc.
import re # for regular expressions
import string as st # for removing punctuation
import time # for generating timestamps 

import matplotlib.pyplot as plt # for data visualization
import nltk # for natural language processing
import numpy as np # for linear algebra
import pandas as pd # for frame processing
import sklearn # for predictive data analysis
from IPython.core.interactiveshell import InteractiveShell # to modify Jupyter notebook configuration
from IPython.display import display # for rendering outputs from inside loops/functions
from nltk.corpus import stopwords # for removing english stopwords
from nltk.stem import WordNetLemmatizer # for term stemming
from sklearn import preprocessing # for data preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer # for text vectorization
from sklearn.model_selection import train_test_split # for splitting data into test/train sets
from wordcloud import WordCloud, ImageColorGenerator
InteractiveShell.ast_node_interactivity = "all" # so that all outputs in a cell are returned (instead of last instance)

In [None]:
# Load the fake and true news CSVs into Pandas dataframes
true_news = pd.read_csv('True.csv')
fake_news = pd.read_csv('Fake.csv')

# Add column for fake/true label
true_news['label'] = 'true'
fake_news['label'] = 'fake'

# True and Fake news value counts - are they balanced?
print("Compare number of observations in true news and fake news data frames")
true_news['label'].value_counts()
print()
fake_news['label'].value_counts()
print()

# Randomly drop the surplus rows from fake_news so it has the same number of rows as true_news.
# The surplus is computed from the data instead of being hard-coded, so the classes stay
# balanced if the CSVs are ever updated. Only fake_news is trimmed: if it is not the larger
# frame, nothing is dropped (remove_n is clamped to 0).
np.random.seed(5) # fixed seed so the same rows are dropped on every run

remove_n = max(len(fake_news) - len(true_news), 0)
drop_indices = np.random.choice(fake_news.index, remove_n, replace = False)
fake_news = fake_news.drop(drop_indices)

# Check that have same number of observations now
print("True and fake datasets should have same number of samples now...")
true_news['label'].value_counts()
print()
fake_news['label'].value_counts()
print()

# Preview first 5 rows in datasets to ensure they imported properly
print("Preview of the raw datasets to ensure they imported properly:")
true_news.head()
print()
fake_news.head()
print()

In [None]:
# Combine true_news and fake_news data frames into one.
# ignore_index gives every row a unique label; without it the two source frames
# contribute overlapping index values, which makes label-based lookups ambiguous.
dfs = [true_news, fake_news]
news_data = pd.concat(dfs, ignore_index = True)

# Instantiate instance of LabelEncoder
le = preprocessing.LabelEncoder()

# Assign numerical values to column of target values.
# LabelEncoder numbers the classes in sorted order, so fake = 0 and true = 1.
news_data['target'] = le.fit_transform(news_data['label'])

# Concatenate text columns and isolate only relevant columns for analysis (i.e., text and target).
# .copy() makes news_data an independent frame so later column assignments do not
# raise SettingWithCopyWarning.
news_data['text'] = news_data['title'] + ' ' + news_data['text']
news_data = news_data[['text', 'target']].copy()

# Check that binary values were assigned correctly
print("Dimensions of data frame that will be cleaned:")
news_data.shape # data frame dimensions
print()

# head(-5) would return every row EXCEPT the last five, so build the preview explicitly
print("First and last five rows of pre-cleaned concatenated dataset:")
pd.concat([news_data.head(), news_data.tail()]) # first 5 and last 5 rows
print()

print("Null values by column:")
news_data.isnull().sum() # check for null values
print()

In [None]:
# Fetch the NLTK resources used during cleaning (downloads are skipped when already cached)
nltk.download('omw-1.4')   # multilingual Wordnet data from OMW
nltk.download('stopwords') # stopword lists
nltk.download('wordnet')   # english dictionary used by the lemmatizer

# Set of english stopwords, used for fast membership tests while cleaning
stop_words = set(stopwords.words('english'))

# Lemmatizer that reduces words to their dictionary (root) form
wnl = WordNetLemmatizer()

In [None]:
# Define function for cleaning data
def data_cleaning(row, lemmatizer = None, stopword_set = None):
    """Normalize one news article into a space-separated string of lemmatized words.

    Steps: lowercase the text, replace every non-letter character with a space,
    split on whitespace, drop stopwords, lemmatize what remains, and finally
    discard any resulting word shorter than three characters.

    Parameters
    ----------
    row : str
        Raw article text. Any non-string value (e.g. None or NaN from a missing
        title/text) is treated as having no text.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to the notebook-level ``wnl`` lemmatizer.
    stopword_set : collection of str, optional
        Words to remove before lemmatizing. Defaults to the notebook-level
        ``stop_words`` set.

    Returns
    -------
    str
        The cleaned text, or an empty string when nothing survives cleaning.
    """
    # Missing values arrive as None/NaN, which have no .lower(); return empty text instead of crashing
    if not isinstance(row, str):
        return ''

    # Fall back to the notebook-level resources only when the caller did not supply any
    if lemmatizer is None:
        lemmatizer = wnl
    if stopword_set is None:
        stopword_set = stop_words

    # Lowercase, then keep letters only (numbers and special characters become spaces)
    letters_only = re.sub('[^a-zA-Z]', ' ', row.lower())

    # Stopwords are filtered on the raw token, before lemmatization
    lemmas = [lemmatizer.lemmatize(word) for word in letters_only.split() if word not in stopword_set]

    # Only keep words with length >= 3 and join them with single spaces
    return ' '.join(word for word in lemmas if len(word) >= 3)

# Clean the data - might take a couple minutes to run.
# assign() builds a new frame rather than writing into news_data in place, which avoids
# SettingWithCopyWarning (news_data was produced by selecting columns from another frame).
news_data = news_data.assign(text = news_data['text'].apply(data_cleaning)) # 'text' column

# head(-5) would return every row EXCEPT the last five, so build the preview explicitly
print("First and last five rows after cleaning the data:")
pd.concat([news_data.head(), news_data.tail()]) # check that cleaning went as planned
print()

# Check for null values
print("Null values by column:")
news_data.isnull().sum() # want zero null values
print()

# Check number unique values in each column
print("Unique values by column:")
news_data.nunique() # number unique values in each column
print()

In [None]:
# Separate news_data into predictor and response variables
X = news_data.iloc[:, 0] # features used to predict if news is fake or true
y = news_data.iloc[:, 1] # what we're trying to predict: (whether is fake(0) or true(1))

# Split the data into training (80%) and test (20%) subsets.
# NOTE: the split is random, not stratified, so class balance in each subset is only
# approximate - the value_counts() checks below show the actual proportions.
train_data, test_data, train_target, test_target = train_test_split(X, y, random_state = 5, train_size = 0.80)

# Preview each subset to ensure data split worked correctly.
# head(-5) would return every row EXCEPT the last five, so the first/last five rows
# are concatenated explicitly.
print("The first and last five rows of training_data are:")
pd.concat([train_data.head(), train_data.tail()])
print()

print("The first and last five rows of testing_data are:")
pd.concat([test_data.head(), test_data.tail()])
print()

print("The first and last five rows of training_target are:")
pd.concat([train_target.head(), train_target.tail()])
print()

# Double check data partitioning after split - are they balanced?
print("Data partitioning of true and fake values in the training data:")
train_target.value_counts() # class counts in the training data
print()

print("Data partitioning of true and fake values in the testing data:")
test_target.value_counts() # class counts in the test data
print()

In [None]:
# Values to try for TfidfVectorizer max_features (i.e., how many of the top terms to keep)
key_words = [100, 500, 1000, 5000]

# Values to try for TfidfVectorizer ngram_range, which defines phrase length:
# (1, 1) keeps single words only, (3, 3) keeps three word phrases,
# (1, 3) keeps one, two, or three word phrases in top n terms and so on.
n_grams = [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)]

# Outer loop: phrase length (ngram_range)
for ng in n_grams:

    # Inner loop: vocabulary size (max_features keeps the top kw terms)
    for kw in key_words:

        # Create variable for storing start time
        start_time = time.time()

        # Instantiate vectorizer (text was already lowercased during cleaning, hence lowercase = False)
        vectorizer = TfidfVectorizer(max_features = kw, lowercase = False, ngram_range = ng)

        # Fit the vocabulary on the training data only, then convert the sparse result to a dense array
        vec_train_data = vectorizer.fit_transform(train_data).toarray()

        # Vectorize test data with the vocabulary learned from the training data
        vec_test_data = vectorizer.transform(test_data).toarray()

        # Column labels are the same for both frames, so look them up once
        feature_names = vectorizer.get_feature_names_out()

        # Double check dimensions
        print(f"The pre-vectorization training data shape using top {kw} words and an n-gram range of {ng} is: {train_data.shape}")
        print()
        print(f"The pre-vectorization testing data shape using top {kw} words and an n-gram range of {ng} is: {test_data.shape}")
        print()
        print(f"The post-vectorization training data shape using top {kw} words and an n-gram range of {ng} is: {vec_train_data.shape}")
        print()
        print(f"The post-vectorization testing data shape using top {kw} words and an n-gram range of {ng} is: {vec_test_data.shape}")
        print()
        
        # Store vectorized training and test data into respective dfs - this is the final data to use in training/evaluation.
        # Previews go through display(): a bare expression inside a loop body is never rendered by
        # Jupyter, and head(-5) would return every row EXCEPT the last five.
        training_data = pd.DataFrame(vec_train_data, columns = feature_names)
        print(f"The first and last five rows of final training data using top {kw} words and an n-gram range of {ng} are:") 
        display(pd.concat([training_data.head(), training_data.tail()]))
        print()

        testing_data = pd.DataFrame(vec_test_data, columns = feature_names)
        print(f"The first and last five rows of final testing data using top {kw} words and an n-gram range of {ng} are:") 
        display(pd.concat([testing_data.head(), testing_data.tail()]))
        print()

        # Print time stamp for each iteration in nested loop - can be modified to calculate runtime for each classifier
        current_time = time.time() # current time in executed code
        elapsed_time = current_time - start_time # elapsed time (i.e., how long it took to run iteration)
        print(f"The time to vectorize the data using top {kw} words and an n-gram range of {ng} is: {elapsed_time:.2f} seconds")
        print()

        # # Write final vectorized training and testing data to CSV - if you want them saved.
        # # Took me about 45 minutes to run when saving individually as CSVs. 
        # training_data.to_csv(f'training_data_{kw}words_{ng}range.csv')
        # testing_data.to_csv(f'testing_data_{kw}words_{ng}range.csv')

        # Continue here with your evaluation of traditional classification model inside the for loop...

In [None]:
# Word cloud of the top 500 words for presentation.
# Every cleaned article is merged into one long space-separated string.
text = " ".join(news_data['text'])

# Large default canvas so the high-resolution cloud stays legible
plt.rcParams['figure.figsize'] = (30, 20)

# Configure the cloud, then generate it from the merged text
cloud_options = dict(
    max_words = 500,
    width = 3000,
    height = 2000,
    background_color = 'white',
    stopwords = stop_words,
)
wordcloud = WordCloud(**cloud_options).generate(text)

# Render the image without axes
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.show()