# Reddit Climate Change - Data Preparation
Supervision: Prof. Dr. Jan Fabian Ehmke

Group members: Britz Luis, Huber Anja, Krause Felix Elias, Preda Yvonne-Nadine

Time: Summer term 2023 

Data: https://www.kaggle.com/datasets/pavellexyr/the-reddit-climate-change-dataset

In [1]:
# Loading packages
import re

import matplotlib.pyplot as plt
import pandas as pd

## Load data and pre-processing

### Data import

In [35]:
# Read the first 50,000 rows of the comments CSV and of the posts CSV
raw_comments = pd.read_csv(
    'data/the-reddit-climate-change-dataset-comments.csv',
    nrows=50000,
)
raw_posts = pd.read_csv(
    'data/the-reddit-climate-change-dataset-posts.csv',
    nrows=50000,
)

### Empty, removed and deleted entries

In [39]:
# Keep only posts with real text: discard rows whose selftext is the
# '[removed]' or '[deleted]' placeholder, then rows with no selftext at all
placeholder_rows = raw_posts['selftext'].isin(['[removed]', '[deleted]'])
clean_posts = raw_posts.loc[~placeholder_rows].dropna(subset=['selftext'])

In [40]:
# Keep only comments that have a body; rows with a missing body are dropped
has_body = raw_comments['body'].notna()
clean_comments = raw_comments.loc[has_body]

### Duplicates

In [41]:
# Remove repeated entries: rows that share both text and type count as
# duplicates, and only the first occurrence of each is kept
repeated_posts = clean_posts.duplicated(subset=["selftext", "type"])
clean_posts = clean_posts.loc[~repeated_posts]

repeated_comments = clean_comments.duplicated(subset=["body", "type"])
clean_comments = clean_comments.loc[~repeated_comments]

### Remove specific words

In [42]:
# Remove the words "climate" and "change" from the text columns of both
# datasets. Matching ignores case and is applied one word after the other.
# NOTE(review): the match is substring-based, so longer words that contain
# one of these terms (e.g. "exchange") are truncated too - confirm intended.
deleted_words = ("climate", "change")

# (DataFrame, column) pairs to clean, in the order they are processed
text_columns = (
    (clean_posts, "selftext"),
    (clean_posts, "title"),
    (clean_comments, "body"),
)

for word in deleted_words:
    for frame, column in text_columns:
        frame[column] = frame[column].str.replace(word, "", case=False)

### Bots

In [43]:
# Subreddit names treated as bot-related. A row is excluded when its
# 'subreddit.name' contains any of these strings (substring match).
bot_subreddits = [
    'memebot9000',
    'spacenewsbot',
    'removalbot',
    'u_ignorantbotkin4ho',
    'botsrights',
    'gildedbot',
    'open_bots_test',
    'rcbredditbot',
    'newsbotmarket',
    'alt_source_bot_log',
    'steamiebot',
    'modbot_staging',
    'testanimalsupportbot',
    'u_udemy_sample_bot',
    'newsbottmt',
    'bottownfriends',
    'airsoft_bot',
    'wikileaksemailbot',
    'foreveralonebots',
    'botterminator',
    'bottown2',
    'bottown_polibot',
    'bottowngarden',
    'havoc_bot',
    'botrequests',
    'sentimentviewbot',
    'newsbotscience',
    'u_commonmisspellingbot',
    'dogetipbot',
    'resistbot',
    'newsbotfunding',
    'ebolanewsbot',
    'gwcoepbot',
    'uknewsbyabot',
    'u_flamboyantbotoshrx',
    'laserlikebot',
    'repostsleuthbot',
    'interfaithbotdialogue',
    'articlebot',
    'potuswatchbot',
    'pulsarbot',
    'mimeticsbot',
    'cryptobots',
    'wutbotposts',
    'bottesting',
    'botbotread',
    'cleverbot',
    'u_anticensor_bot',
    'trollbot',
    'brokentranslatebot',
    'newsbiasbot',
    'sexpollbottest',
    'bottalks',
    'bitnewsbot',
    'gbpolbot',
    'rssbot',
    'botsscrewingup',
    'u_dicebotbtc',
    'u_userleansbot',
    'islamicstatenewsbot',
    'u_moebot12',
    'webbot',
    'pew_bot_memes',
    'u_zukbot',
    'u_bot4bot',
    'stabbot',
    'u_yangpolicyinfo_bot',
    'subredditsummarybot',
    'bot4bottesting',
    'travsbots',
    'spooktoberbot',
    'trollabot',
    'printrbot',
    'bottown1',
    'kzreminderbotsub',
    'bottown',
    'newsbotbot',
    'gifbot',
    'uvabot',
    'cfbotpolitics',
    'nwordcountbot',
    'quizzybot',
    'israelnewsbot',
    'popularnewsbot',
    'monkeynewsbot',
    'thelinkfixerbot',
    'atheismbot',
    'inspirobotbot',
    'testingground4bots',
    'buzzfeedbot',
    'isreactionarybot',
    'blenderbot',
    'talkwithgpt2bots',
    'bottown22',
    'twitter_bot',
    'autowikibot',
    'elsbot',
]

# An empty list would yield an empty pattern that matches every row, so the
# filter is skipped entirely in that case.
if bot_subreddits:
    # One alternation of the escaped names lets each frame be scanned once
    # instead of being re-sliced once per name; escaping keeps every name a
    # literal even if a future entry contains regex metacharacters.
    bot_pattern = '|'.join(re.escape(name) for name in bot_subreddits)

    # na=False makes rows without a subreddit name count as "not a bot" and
    # stay in the data; without it the mask would contain NaN and `~` fails.
    comment_is_bot = clean_comments['subreddit.name'].str.contains(
        bot_pattern, regex=True, na=False
    )
    clean_comments = clean_comments[~comment_is_bot]

    post_is_bot = clean_posts['subreddit.name'].str.contains(
        bot_pattern, regex=True, na=False
    )
    clean_posts = clean_posts[~post_is_bot]

### Convert date and time information

In [44]:
# Create new columns with date and time information (UTC), derived from the
# 'created_utc' column, which is interpreted as seconds since the Unix epoch.
# Every derived column holds formatted strings, not datetime values.
date_formats = {
    'created_date': '%Y-%m-%d',
    'created_day': '%d',
    'created_month': '%m',
    'created_year': '%Y',
    'created_time': '%H:%M:%S',
}

for frame in (clean_posts, clean_comments):
    # Convert the timestamps once per frame and reuse the result for every
    # derived column instead of re-parsing 'created_utc' for each of them
    created = pd.to_datetime(frame['created_utc'], utc=True, unit='s')
    for column, fmt in date_formats.items():
        frame[column] = created.dt.strftime(fmt)

### Output file

In [None]:
# Output CSV file with relevant data