# Real vs Satire Data Processing (Article level)

1. Read files by article
2. Lowercase texts
3. Remove stopwords and punctuation
4. Remove duplicates
5. Remove proper nouns
6. Remove all unreadable codes
7. Generate topics
8. Save as CSV files

In [None]:
pip install empath

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
from empath import Empath
from google.colab import drive
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None

In [None]:
# Fetch the NLTK resources this notebook asks for.
for corpus_name in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

# English stop-word list consulted when cleaning the article text.
stops = stopwords.words("english")

In [None]:
# Uncomment if connecting to Google Drive
# Run this cell and select your UMich Google account in the pop-up

# drive.mount('/content/gdrive')

In [None]:
# Folders holding the raw article files.
# Forward slashes are used on purpose: inside a normal string literal "\r" and
# "\t" are escape sequences (carriage return / tab), so a Windows-style
# "data\raw\...\true" literal would not name the intended folder.
# The trailing separator keeps the paths valid both when a file name is
# appended by plain string concatenation and when os.path.join is used.
real_path = "data/raw/real_satire/true/"
satire_path = "data/raw/real_satire/satire/"

# File names of each split, relative to the folders above.
true_train_list = [
    "true_train_1.txt",
    "true_train_2.txt",
    "true_train_3.txt",
    "true_train_4.txt",
    "true_train_5.txt",
    "true_train_6.txt",
]
true_test_list = ["true_test_1.txt", "true_test_2.txt"]
true_dev_list = ["true_validation_1.txt", "true_validation_2.txt"]
satire_train_list = ["satire_train.txt"]
satire_test_list = ["satire_test.txt"]
satire_dev_list = ["satire_dev.txt"]

https://huggingface.co/docs/transformers/model_doc/roberta

About the data:

"We omit headline, creation time, and author information so this work concentrates on the satire in the article body."

## Article level

In [None]:
def read_files(folder_path, file_list, real=True):
    """
    Reads in files from a folder and returns a dataframe with the text and the label

    Every file is lowercased and split into articles on the "******"
    delimiter; the chunk after the last delimiter is ignored. Within each
    article a few fixed substrings are removed or normalised, the first line
    (after an optional leading blank line) and the last line are dropped, and
    the remaining lines are joined with single spaces.
    NOTE(review): the dropped first line is presumably a header/title line --
    confirm against the raw data.

    Parameters
    ----------
    folder_path : str
        The path to the folder containing the files (with or without a
        trailing path separator)
    file_list : list
        A list of the files to read in
    real : bool
        Whether the files are real or satire

    Returns
    -------
    pandas dataframe
        One row per article with the columns "content" (the article text) and
        "label" (1 for real, 0 for satire)
    """
    # Local import keeps this cell self-contained.
    import os

    # (old, new) substitutions applied in this order to every article:
    # unreadable codes, the tokenised "u . s ." abbreviation and proper names.
    replacements = (
        ("? ?", ""),
        ("u . s .", "us"),
        ("van gaal", ""),
        ("barack obama", ""),
    )

    print("Preprocessing...")
    articles_per_file = []
    for file_name in file_list:
        # os.path.join works whether or not folder_path ends with a separator
        with open(os.path.join(folder_path, file_name), "r") as f:
            # convert all strings to lowercases
            text_string = f.read().lower()
        # split the content of the file into a list of articles
        articles_per_file.append(text_string.split("******"))

    print("Reading files...")
    # Collect the texts in a plain list and build the dataframe once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame per row.
    contents = []
    for articles in articles_per_file:
        # the chunk after the final delimiter is empty, so it is skipped
        for article in articles[:-1]:
            for old, new in replacements:
                article = article.replace(old, new)
            lines = article.split("\n")
            # skip the first line, plus the blank line in front of it if any
            first_kept = 1 if lines[0] != "" else 2
            contents.append(' '.join(lines[first_kept:-1]))

    content_df = pd.DataFrame(contents, columns=["content"])
    # create a new column to indicate whether the article is real or satire
    # 1 for real, 0 for satire
    content_df["label"] = 1 if real else 0
    kind = "real" if real else "satire"
    print("Read {} {} articles".format(len(content_df), kind))
    print("The {} article data is loaded successfully!".format(kind))
    return content_df
    

In [None]:
# Load every split, real articles first and satire second (tuple elements are
# evaluated left to right, so the files are read in the order listed).
real_train_data, satire_train_data = (
    read_files(real_path, true_train_list, real=True),
    read_files(satire_path, satire_train_list, real=False),
)
real_test_data, satire_test_data = (
    read_files(real_path, true_test_list, real=True),
    read_files(satire_path, satire_test_list, real=False),
)
real_dev_data, satire_dev_data = (
    read_files(real_path, true_dev_list, real=True),
    read_files(satire_path, satire_dev_list, real=False),
)

# Stack real and satire rows of each split into one frame with a fresh index.
test_df = pd.concat([real_test_data, satire_test_data], ignore_index=True)
dev_df = pd.concat([real_dev_data, satire_dev_data], ignore_index=True)
train_df = pd.concat([real_train_data, satire_train_data], ignore_index=True)

In [None]:
# Bar chart of how many training articles carry each label
articles_per_label = train_df.reset_index().groupby('label')['content'].count()
articles_per_label.plot(kind='bar', title="Real vs Satire", figsize=(10, 5))
plt.xlabel("Label: 1 for Real, 0 for Satire")
plt.ylabel("Number of Articles")

In [None]:
def remove_duplicate(df, column_name, remove=True):
    """
    Removes duplicate articles from the dataframe

    Duplicates are detected on `column_name` only, which is the same
    criterion used for the removal.

    Parameters
    ----------
    df : pandas dataframe
        The dataframe to remove duplicates from
    column_name : str
        The name of the column to remove duplicates from
    remove : bool
        Whether to remove duplicates or not

    Returns
    -------
    duplicate_df : pandas dataframe
        The rows whose `column_name` value already appeared in an earlier row.
        This is the only value returned when `remove` is False.
    updated_df : pandas dataframe
        Only returned when `remove` is True. A new dataframe in which every
        row whose `column_name` value occurs more than once is dropped
        (keep=False: the first occurrence is dropped as well). When there is
        nothing to drop it is a copy of `df`.
    """
    # check duplicates
    duplicate_mask = df.duplicated(subset=column_name)
    n_duplicates = int(duplicate_mask.sum())
    print("There are {} duplicates in the dataset".format(n_duplicates))
    duplicate_df = df[duplicate_mask]

    if not remove:
        print("The duplicate articles are NOT removed!")
        return duplicate_df

    if n_duplicates > 0:
        updated_df = df.drop_duplicates(subset=[column_name], keep=False, inplace=False)
        print("The duplicate articles are removed successfully!")
    else:
        # Always hand back a pair when remove=True so callers can unpack the
        # result; a copy avoids later in-place edits leaking into `df`.
        updated_df = df.copy()
        print("There is no duplicate article in the dataset")
    return duplicate_df, updated_df
    

In [None]:
# De-duplicate every frame on its "content" column.
# Train split: real, satire, combined
(duplicate_real_train_df, updated_real_train_df) = remove_duplicate(real_train_data, column_name="content", remove=True)
(duplicate_satire_train_df, updated_satire_train_df) = remove_duplicate(satire_train_data, column_name="content", remove=True)
(duplicate_train_df, updated_train_df) = remove_duplicate(train_df, column_name="content", remove=True)
# Test split: real, satire, combined
(duplicate_real_test_df, updated_real_test_df) = remove_duplicate(real_test_data, column_name="content", remove=True)
(duplicate_satire_test_df, updated_satire_test_df) = remove_duplicate(satire_test_data, column_name="content", remove=True)
(duplicate_test_df, updated_test_df) = remove_duplicate(test_df, column_name="content", remove=True)
# Dev split: real, satire, combined
(duplicate_real_dev_df, updated_real_dev_df) = remove_duplicate(real_dev_data, column_name="content", remove=True)
(duplicate_satire_dev_df, updated_satire_dev_df) = remove_duplicate(satire_dev_data, column_name="content", remove=True)
(duplicate_dev_df, updated_dev_df) = remove_duplicate(dev_df, column_name="content", remove=True)

In [None]:
def remove_stop_punct(df, col_name, updated_col_name, re):
    """
    Removes stop words and punctuations from the dataframe

    Stop words (taken from the module-level `stops` collection) are removed
    first, by dropping whitespace-separated tokens that match one exactly;
    afterwards every match of the regex `re` is deleted. The result is written
    to `updated_col_name` of `df` itself; `col_name` is left untouched.

    Parameters
    ----------
    df : pandas dataframe
        The dataframe to remove stop words and punctuations from
    col_name : str
        The name of the column to remove stop words and punctuations from
    updated_col_name : str
        The name of the column to store the updated content
    re : str
        Regex for removing punctuations

    Returns
    -------
    pandas dataframe
        The same `df` object, with `updated_col_name` added or overwritten
    """
    print("Removing stop words and punctuations...")
    # A set gives constant-time membership tests for every token.
    stop_words = set(stops)
    df[updated_col_name] = df[col_name].apply(
        lambda text: ' '.join(word for word in text.split() if word not in stop_words)
    )
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would leave punctuation in place.
    df[updated_col_name] = df[updated_col_name].str.replace(re, '', regex=True)
    print("Stop words and punctuations are removed successfully!")
    return df

In [None]:
# Matches every character that is neither a word character nor whitespace.
# Written as a raw string so that \w and \s are not parsed as (invalid)
# string escape sequences, which raise a SyntaxWarning on recent Pythons.
punct_pattern = r'[^\w\s]'

# Add a "clean_content" column to every de-duplicated frame.
updated_real_train_df = remove_stop_punct(updated_real_train_df, "content", "clean_content", punct_pattern)
updated_satire_train_df = remove_stop_punct(updated_satire_train_df, "content", "clean_content", punct_pattern)
updated_train_df = remove_stop_punct(updated_train_df, "content", "clean_content", punct_pattern)
updated_real_test_df = remove_stop_punct(updated_real_test_df, "content", "clean_content", punct_pattern)
updated_satire_test_df = remove_stop_punct(updated_satire_test_df, "content", "clean_content", punct_pattern)
updated_test_df = remove_stop_punct(updated_test_df, "content", "clean_content", punct_pattern)
updated_real_dev_df = remove_stop_punct(updated_real_dev_df, "content", "clean_content", punct_pattern)
updated_satire_dev_df = remove_stop_punct(updated_satire_dev_df, "content", "clean_content", punct_pattern)
updated_dev_df = remove_stop_punct(updated_dev_df, "content", "clean_content", punct_pattern)


In [None]:
# Fold the dev data into the test and train data.
# Each dev frame is cut positionally at a fixed row: rows before the cut go to
# the test set, rows from the cut onwards go to the train set. The train slice
# starts exactly at the cut; starting one row later (e.g. [:8396] / [8397:])
# would silently drop the row sitting at the cut position.
# NOTE(review): if a frame has fewer rows than its cut, all of its rows end up
# in the test set -- confirm the cut points against the actual dev set sizes.
dev_cut = 8396
real_dev_cut = 1550
satire_dev_cut = 9947

updated_test_df = pd.concat([updated_dev_df[:dev_cut], updated_test_df], ignore_index=True)
updated_real_test_df = pd.concat([updated_real_dev_df[:real_dev_cut], updated_real_test_df], ignore_index=True)
updated_satire_test_df = pd.concat([updated_satire_dev_df[:satire_dev_cut], updated_satire_test_df], ignore_index=True)

updated_train_df = pd.concat([updated_dev_df[dev_cut:], updated_train_df], ignore_index=True)
updated_real_train_df = pd.concat([updated_real_dev_df[real_dev_cut:], updated_real_train_df], ignore_index=True)
updated_satire_train_df = pd.concat([updated_satire_dev_df[satire_dev_cut:], updated_satire_train_df], ignore_index=True)

# Report the resulting dataset sizes
print("The number of real articles in the test dataset is {}".format(len(updated_real_test_df)))
print("The number of satire articles in the test dataset is {}".format(len(updated_satire_test_df)))


print("The number of real articles in the train dataset is {}".format(len(updated_real_train_df)))
print("The number of satire articles in the train dataset is {}".format(len(updated_satire_train_df)))

print("---------------------------------------------")
print("The number of articles in the test dataset is {}".format(len(updated_test_df)))
print("The number of articles in the train dataset is {}".format(len(updated_train_df)))

## Generate topics for each article

In [None]:
def get_topic(df):
    """
    Extracts the topic of the article

    Every text in the "clean_content" column is scored with an Empath lexicon
    and the category with the highest score is stored in a "topic" column of
    `df` itself. When several categories share the highest score, the one the
    lexicon reports first is used.

    Parameters
    ----------
    df : pandas dataframe
        The dataframe to extract the topic from; must have a "clean_content"
        column

    Returns
    -------
    pandas dataframe
        The same `df` object, with the "topic" column added or overwritten
    """
    lexicon = Empath()
    print("Getting topics...")

    def top_category(text):
        # name of the best-scoring category for one text
        scores = lexicon.analyze(text)
        return max(scores.items(), key=lambda item: item[1])[0]

    # Assign the whole column at once: element-wise `df['topic'][i] = ...` is
    # chained assignment (unreliable, and a no-op under pandas copy-on-write)
    # and only works when the index labels are exactly 0..n-1.
    df['topic'] = [top_category(text) for text in df['clean_content']]
    print("Topics are extracted successfully!")
    return df

In [None]:
# Label the train frame first, then the test frame (evaluated left to right)
updated_train_df, updated_test_df = (
    get_topic(updated_train_df),
    get_topic(updated_test_df),
)

In [None]:
# Preview the first five rows of the training frame
updated_train_df.head(n=5)

In [None]:
# Stack the train and test frames into the whole dataset
df = pd.concat([updated_train_df, updated_test_df], ignore_index=True)
sns.set_theme(style="darkgrid")
# value_counts() sorts by frequency, so the first 15 entries are the top 15 topics
top_topic_counts = df['topic'].value_counts().head(15)
top_topic_counts.plot(kind='bar', figsize=(8, 5), title='Topic Distribution')

Top 15 topics are "sports", "government", "crime", "business", "leader", "politics", "family", "play", "health", "children", "money", "war", "school", "military", "air_travel"

In [None]:
# The 15 topics to compare between real and satire articles
top_topics = [
    "sports", "government", "crime", "business", "leader",
    "politics", "family", "play", "health", "children",
    "money", "war", "school", "military", "air_travel",
]
sorted_df = df[df['topic'].isin(top_topics)]
fig, ax = plt.subplots(figsize=(12, 5))
# Bars are ordered by how often each topic occurs in the filtered data
topic_order = sorted_df.topic.value_counts().iloc[:15].index
sns.countplot(x="topic", data=sorted_df, order=topic_order, hue="label", palette="Set2")
plt.legend(loc='upper right', labels=['satire', 'real'])
plt.xticks(rotation=45)

## Save as CSV files

Uncomment the code below to save the processed data as CSV files.

In [None]:
# Forward slashes avoid backslash escapes such as "\r" and "\u" in the path strings.
# updated_train_df.to_csv("data/processed/real_satire/updated_train_df.csv", index=False)
# updated_test_df.to_csv("data/processed/real_satire/updated_test_df.csv", index=False)

----