# <b>Data Processing: Unify All Pre-Processed Datasets</b>
#### <font color = 'gray'>December 2022</font>

###<b>Initial Set-Up & Mount Google Drive</b>

In [1]:
import pandas as pd
import plotly.express as px
import nltk
from nltk.corpus import stopwords

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# # If running in Colab: connecting to Shared Google Drive
# # Run this cell and select your UMich Google account in the pop-up
# from google.colab import drive
# import sys
# drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# These paths will work if our project drive has been shared with you
GOLBECK_PATH = 'data\processed\satire_fake\full_golbeck_df'
MIHALCEA_PATH = 'data\processed\fake_real\Real-Fake-AllNewsData.xlsx'
YANG_TRAIN_PATH = 'data\processed\real_satire\updated_train_df.csv'
COMBINED_PATH = 'data\processed\combined_df.csv'

# Columns we want to keep across all datasets
KEEP_COLS = ['unique_id', 'label', 'source', 'text']
STOPS = stopwords.words("english")

###<b>Import to DataFrame and Standardize Columns</b>

In [5]:
# Golbeck Fake-Satire
golbeck_df = pd.read_csv(GOLBECK_PATH, index_col=0)

# Add metadata and standardize
golbeck_df['source'] = 'Golbeck'
golbeck_df['unique_id'] = 'golbeck-' + golbeck_df['article_id'].astype(str)
golbeck_df['label'] = golbeck_df['label'].str.lower()
golbeck_df.rename({'text': 'article_only'}, axis=1, inplace=True)
golbeck_df.drop('article_id', axis=1, inplace=True)
golbeck_df['text'] = golbeck_df['title'].astype(str) + '\n' + golbeck_df['article_only'].astype(str)
golbeck_df.sample(5)

Unnamed: 0,title,article_only,label,source,unique_id,text
408,CANCER ENDORSES REPUBLICAN HEALTHCARE PLAN,"WASHINGTON, D.C. (SatireWire.com) – Republican...",satire,Golbeck,golbeck-297,CANCER ENDORSES REPUBLICAN HEALTHCARE PLAN\nWA...
372,New Comprehensive Investigation Involving Trump,Just In:,satire,Golbeck,golbeck-136,New Comprehensive Investigation Involving Trum...
371,U.S. FISH & WILDLIFE SERIOUSLY CONSIDERING ABU...,"WASHINGTON, D.C. (SatireWire.com) – Overlooked...",satire,Golbeck,golbeck-295,U.S. FISH & WILDLIFE SERIOUSLY CONSIDERING ABU...
247,Iraq War Veteran Dies Trying to Protect Buffet...,Home just eight weeks after being wounded duri...,satire,Golbeck,golbeck-470,Iraq War Veteran Dies Trying to Protect Buffet...
271,Ted Cruz Still Leads Polls for Most Punch-able...,"CONCORD, NH—Following a strong second place fi...",satire,Golbeck,golbeck-129,Ted Cruz Still Leads Polls for Most Punch-able...


In [6]:
fig = px.bar(golbeck_df['label'].reset_index().groupby('label').count(),
             width=500, height=400)
fig.update_layout(showlegend=False, 
                  title_text='Fake-Satire Class Imbalance', title_x=0.5)
fig.show()

In [7]:
# Mihalcea Real-Fake
mihalcea_df = pd.read_csv(MIHALCEA_PATH, index_col=0)

# Add metadata and standardize
mihalcea_df['source'] = 'Mihalcea'
mihalcea_df['unique_id'] = 'mihalcea-' + mihalcea_df.index.astype(str)
mihalcea_df.rename({'Text': 'text', 'Domain': 'domain', 'Label': 'label'}, 
                   axis=1, inplace=True)
mihalcea_df['label'].replace('legit', 'real', inplace=True)

mihalcea_df.sample(5)

Unnamed: 0,text,domain,label,source,unique_id
559,Chelsea Handler Has Nothing Nice to Say About ...,celebrity,fake,Mihalcea,mihalcea-559
10,"Ellison: Democrats ""gloating"" over Republican ...",polit,fake,Mihalcea,mihalcea-10
690,Kate Middleton & Prince William Try To Save Cr...,celebrity,fake,Mihalcea,mihalcea-690
59,Obama Administration sets new rule for childre...,edu,fake,Mihalcea,mihalcea-59
365,What to expect from Carrie Fisher and Debbie ...,entmt,real,Mihalcea,mihalcea-365


In [8]:
fig = px.bar(mihalcea_df['label'].reset_index().groupby('label').count(),
             width=500, height=400)
fig.update_layout(showlegend=False, 
                  title_text='Real-Fake Class Imbalance', title_x=0.5)
fig.show()

In [9]:
# Yang Real-Satire
yang_train_df = pd.read_csv(YANG_TRAIN_PATH)

# Add metadata and standardize
yang_train_df['source'] = 'Yang'
yang_train_df['unique_id'] = 'yang-' + yang_train_df.index.astype(str)
yang_train_df.rename({'content': 'text', 'topic': 'domain'}, 
                     axis=1, inplace=True)
yang_train_df['label'].replace(0, 'satire', inplace=True)
yang_train_df['label'].replace(1, 'real', inplace=True)

yang_train_df.sample(5)

Unnamed: 0,text,label,clean_content,domain,source,unique_id
31090,"as revealed by the mail on sunday , armstrong ...",real,revealed mail sunday armstrong agreed request...,vacation,Yang,yang-31090
18864,the 21 year old has enjoyed a whirlwind few da...,real,21 year old enjoyed whirlwind days since domin...,sports,Yang,yang-18864
25465,the announcement was made after a meeting of t...,real,announcement made meeting atp board directors ...,play,Yang,yang-25465
30460,actor and politician fred thompson attends the...,real,actor politician fred thompson attends opening...,politics,Yang,yang-30460
17555,the sukhoi superjet 100 came to rest on the si...,real,sukhoi superjet 100 came rest side mount salak...,air_travel,Yang,yang-17555


In [10]:
fig = px.bar(yang_train_df['label'].reset_index().groupby('label').count(),
             width=500, height=400)
fig.update_layout(showlegend=False, 
                  title_text='Real-Satire Class Imbalance', title_x=0.5)
fig.show()

###<b>Concatenate DataFrames</b>

In [11]:
full_combined_df = pd.concat([golbeck_df, mihalcea_df, yang_train_df], axis=0)
combined_df = full_combined_df[KEEP_COLS]
combined_df.describe()

Unnamed: 0,unique_id,label,source,text
count,48709,48709,48709,48708
unique,48709,3,3,48622
top,golbeck-316,real,Yang,an excess of government buildings and restrict...
freq,1,35196,47319,2


In [12]:
# Final check for duplicates (in 'text')
num_dups = combined_df.duplicated('text').sum()

combined_df['text'] = combined_df['text'].astype(str)
print(f'There are {num_dups} duplicates.')
combined_df = combined_df[~combined_df.duplicated('text')]
print('Records with duplicate text have been removed')

# Check for NaN
combined_df = combined_df.dropna()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



There are 86 duplicates.
Records with duplicate text have been removed


In [13]:
combined_df.sample(5)

Unnamed: 0,unique_id,label,source,text
11216,yang-11216,real,Yang,"lira is worst performer in emerging markets , ..."
919,yang-919,real,Yang,chris megerian and noam n . levey contact repo...
142,mihalcea-142,fake,Mihalcea,Google computers defeat shows once again nothi...
36309,yang-36309,real,Yang,serena had to save set points in her second ro...
38485,yang-38485,satire,Yang,"thurmond , 93 , first elected to congress in 1..."


###<b>Clean Text & Remove Stop Words</b>

In [14]:
def remove_stop_punct(df, col_name, updated_col_name, re):
    """
    Removes stop words and punctuations from the dataframe

    Parameters
    ----------
    df : pandas dataframe
        The dataframe to remove stop words and punctuations from
    col_name : str
        The name of the column to remove stop words and punctuations from
    updated_col_name : str
        The name of the column to store the updated content
    re : str
        Regex
    """
    print("Removing stop words and punctuations...")
    df[updated_col_name] = df[col_name].apply(lambda x: ' '.join([word for word in x.split() if word not in (STOPS)]))
    df[updated_col_name] = df[updated_col_name].str.replace(re,'')
    print("Stop words and punctuations are removed successfully!")
    return df

In [15]:
clean_combined_df = remove_stop_punct(combined_df, 
                                      col_name='text', 
                                      updated_col_name='clean_text', 
                                      re='[^\w\s]'
                                      )

Removing stop words and punctuations...



The default value of regex will change from True to False in a future version.



Stop words and punctuations are removed successfully!


###<b>Save to CSV for Modeling</b>

In [16]:
clean_combined_df.dropna().to_csv(COMBINED_PATH, index=False)

###<b>Review Combined Dataset</b>

In [17]:
fig = px.bar(clean_combined_df['label'].reset_index().groupby('label').count(),
             width=500, height=400)
fig.update_layout(showlegend=False, 
                  title_text='Real-Fake-Satire Class Imbalance', title_x=0.5)
fig.show()

In [18]:
clean_combined_df.sample(5)

Unnamed: 0,unique_id,label,source,text,clean_text
20904,yang-20904,real,Yang,but presidential nominations are not won purel...,presidential nominations purely national polls...
10748,yang-10748,real,Yang,it is understood the prime minister will say s...,understood prime minister say sorry weekly aud...
32997,yang-32997,real,Yang,in the light and spirit ( of holocaust memoria...,light spirit holocaust memorial day speaker ...
36386,yang-36386,real,Yang,the dead spotted eagle ray lies on the deck of...,dead spotted eagle ray lies deck boat florida ...
3760,yang-3760,real,Yang,alice barker snapped her fingers and sang alon...,alice barker snapped fingers sang along videos...
