In [113]:
import pandas as pd
from collections import Counter
import math
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import pandas as pd


## Load the data

In [118]:


# Load fake and real news datasets
pf_fake = pd.read_csv("../data/raw/PolitiFact_fake_news_content.csv")
pf_real = pd.read_csv("../data/raw/PolitiFact_real_news_content.csv")

# Concatenate them into a full dataset
df_full = pd.concat([pf_real, pf_fake], ignore_index=True)

# Append news_id from PolitiFactNews.txt
with open('../data/raw/PolitiFactNews.txt', 'r') as file:
    news_ids = [line.strip() for line in file]

df_full["news_id"] = news_ids

# Load train and test id's
with open('../data/processed/FakeNewsNet/trainIds.txt', 'r') as file:
    train_values = [line.strip() for line in file]
with open('../data/processed/FakeNewsNet/testIds.txt', 'r') as file:
    test_values = [line.strip() for line in file]

# Create train and test data sets
df_train = df_full[df_full['news_id'].isin(train_values)]
df_test = df_full[df_full['news_id'].isin(test_values)]

df_train.to_csv("../data/processed/FakeNewsNet/train_df.csv")
df_test.to_csv("../data/processed/FakeNewsNet/test_df.csv")

## Creating a TF-IDF representation.

In [50]:

# Download the stopwords dataset
nltk.download('stopwords')

def tokenize(text, stemmer, stop_words):
    """Tokenize text, stem, and remove stop words."""
    tokens = re.findall(r'\b\w+\b', text.lower())
    return [stemmer.stem(token) for token in tokens if token not in stop_words]

def compute_tfidf(df, column_name):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    # Tokenize, stem and compute term frequency for each document
    df['tf'] = df[column_name].apply(lambda x: Counter(tokenize(x, stemmer, stop_words)))

    # Create a list of all unique words in the corpus
    all_unique_words = list(set(word for tf in df['tf'] for word in tf.keys()))

    # Compute the number of documents in which each word appears
    word_document_count = Counter(word for tf in df['tf'] for word in tf.keys())

    # Compute the total number of documents
    total_documents = len(df)

    # Compute inverse document frequency for each word
    idf = {word: math.log(total_documents / count) for word, count in word_document_count.items()}

    # Compute TF-IDF for each word in each document and ensure that all words in the corpus are represented
    def calculate_tfidf(tf):
        return {word: tf.get(word, 0)/sum(tf.values()) * idf[word] for word in all_unique_words}

    df['tfidf'] = df['tf'].apply(calculate_tfidf)

    return df['tfidf']


tfidf_scores = compute_tfidf(pf_fake, 'full_text')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lasse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0      {'mud': 0.0, 'seriou': 0.0, 'beyond': 0.0, 'fl...
1      {'mud': 0.0, 'seriou': 0.0, 'beyond': 0.0, 'fl...
2      {'mud': 0.0, 'seriou': 0.0, 'beyond': 0.0, 'fl...
3      {'mud': 0.0, 'seriou': 0.0, 'beyond': 0.0, 'fl...
4      {'mud': 0.0, 'seriou': 0.0, 'beyond': 0.0, 'fl...
                             ...                        
115    {'mud': 0.0, 'seriou': 0.0, 'beyond': 0.0, 'fl...
116    {'mud': 0.0, 'seriou': 0.0, 'beyond': 0.0, 'fl...
117    {'mud': 0.0, 'seriou': 0.002206360447568847, '...
118    {'mud': 0.0, 'seriou': 0.0, 'beyond': 0.0, 'fl...
119    {'mud': 0.0, 'seriou': 0.0, 'beyond': 0.0, 'fl...
Name: tfidf, Length: 120, dtype: object
