In [49]:
import pandas as pd

## GOSSIP

In [50]:
# Read the CSV file
gossip_real = pd.read_csv('../datasets/Social Media/gossipcop/gossipcop_real.csv')
gossip_fake = pd.read_csv('../datasets/Social Media/gossipcop/gossipcop_fake.csv')

In [51]:
# Perform preprocessing steps here
# drop id,news_url,tweet_ids
gossip_real = gossip_real.drop(columns=['id', 'news_url', 'tweet_ids'])
gossip_fake = gossip_fake.drop(columns=['id', 'news_url', 'tweet_ids'])

In [52]:
#add label column with value 1
gossip_real['label'] = 1
gossip_fake['label'] = 0

In [53]:
#merge the two dataframes
gossip = pd.concat([gossip_real, gossip_fake])

#rename the column title to claim
gossip = gossip.rename(columns={'title': 'claim'})

gossip.head()

Unnamed: 0,claim,label
0,Teen Mom Star Jenelle Evans' Wedding Dress Is ...,1
1,Kylie Jenner refusing to discuss Tyga on Life ...,1
2,Quinn Perkins,1
3,I Tried Kim Kardashian's Butt Workout & Am For...,1
4,Celine Dion donates concert proceeds to Vegas ...,1


## POLITIFACT

In [54]:
# Read the CSV file
politifact_real = pd.read_csv('../datasets/Social Media/gossipcop/politifact_real.csv')
politifact_fake = pd.read_csv('../datasets/Social Media/gossipcop/politifact_fake.csv')

In [55]:
#add label column with value 1
politifact_real['label'] = 1
politifact_fake['label'] = 0

In [56]:
#merge the two dataframes
politifact = pd.concat([politifact_real, politifact_fake])

#drop id,news_url,tweet_ids
politifact = politifact.drop(columns=['id', 'news_url', 'tweet_ids'])

politifact = politifact.rename(columns={'title': 'claim'})

politifact.head()

Unnamed: 0,claim,label
0,National Federation of Independent Business,1
1,comments in Fayetteville NC,1
2,"Romney makes pitch, hoping to close deal : Ele...",1
3,Democratic Leaders Say House Democrats Are Uni...,1
4,"Budget of the United States Government, FY 2008",1


In [57]:
#merge the two dataframes
socialmedia_dataset = pd.concat([gossip, politifact])

In [58]:
#drop the rows with missing values
socialmedia_dataset = socialmedia_dataset.dropna()

#drop duplicates
socialmedia_dataset = socialmedia_dataset.drop_duplicates()

In [59]:
#display details of merged_df such as shape, info, describe, head unique values in label column
print(socialmedia_dataset.shape)
print(socialmedia_dataset.info())
print(socialmedia_dataset.describe())
print(socialmedia_dataset.head())
print(socialmedia_dataset['label'].unique())

#display average claim length
socialmedia_dataset['claim_length'] = socialmedia_dataset['claim'].apply(lambda x: len(x.split()))
print(socialmedia_dataset['claim_length'].mean())

(21847, 2)
<class 'pandas.core.frame.DataFrame'>
Index: 21847 entries, 0 to 430
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   claim   21847 non-null  object
 1   label   21847 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 512.0+ KB
None
              label
count  21847.000000
mean       0.756351
std        0.429293
min        0.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        1.000000
                                               claim  label
0  Teen Mom Star Jenelle Evans' Wedding Dress Is ...      1
1  Kylie Jenner refusing to discuss Tyga on Life ...      1
2                                      Quinn Perkins      1
3  I Tried Kim Kardashian's Butt Workout & Am For...      1
4  Celine Dion donates concert proceeds to Vegas ...      1
[1 0]
11.259989929967501


In [60]:
#Remove punctuations
#Convert text to tokens
#Remove tokens of length less than or equal to 3
#Remove stopwords using NLTK corpus stopwords list to match
#Apply lemmatization

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if len(word) > 3]
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

socialmedia_dataset['claim'] = socialmedia_dataset['claim'].apply(clean_text)
socialmedia_dataset.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MONSTER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MONSTER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MONSTER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,claim,label,claim_length
0,teen star jenelle evans wedding dress availabl...,1,12
1,kylie jenner refusing discus tyga life kylie,1,10
2,quinn perkins,1,2
3,tried kardashians butt workout forever changed,1,10
4,celine dion donates concert proceeds vega shoo...,1,9


In [61]:
#display average claim length
socialmedia_dataset['claim_length'] = socialmedia_dataset['claim'].apply(lambda x: len(x.split()))
print(socialmedia_dataset['claim_length'].mean())
socialmedia_dataset = socialmedia_dataset.drop(columns=['claim_length'])

7.443768023069529


In [62]:
#save to preprocessed_datasets folder
socialmedia_dataset.to_csv('../preprocessed_datasets/socialmedia_dataset.csv', index=False)