# Reddit Climate Change - Data Preparation
Supervision: Prof. Dr. Jan Fabian Ehmke

Group members: Britz Luis, Huber Anja, Krause Felix Elias, Preda Yvonne-Nadine

Time: Summer term 2023 

Data: https://www.kaggle.com/datasets/pavellexyr/the-reddit-climate-change-dataset

In [2]:
# Loading packages
%pip install langid
from langid.langid import LanguageIdentifier, model
from utils.nltk_function_samples import check_lang
from utils.nltk_function_samples import remove_features
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import os

Note: you may need to restart the kernel to use updated packages.


## Load data and pre-processing

### Data import

In [88]:
# Loading data
raw_comments = pd.read_csv('data/the-reddit-climate-change-dataset-comments.csv')
raw_posts = pd.read_csv('data/the-reddit-climate-change-dataset-posts.csv')

### Duplicates

In [89]:
# Drop duplicates in post and comment dataset
clean_posts = raw_posts.drop_duplicates(subset=["selftext","type"])
clean_comments = raw_comments.drop_duplicates(subset=["body","type"])

### Feature removal

In [93]:
# Adjusted feature removal function
def remove_features(data_str):
    # compile regex
    url_re = re.compile('https?://(www.)?\w+\.\w+(/\w+)*/?')
    num_re = re.compile('(\\d+)')
    mention_re = re.compile('@(\w+)')
    
    # remove hyperlinks
    data_str = url_re.sub(' ', data_str)
    
    # remove @mentions
    data_str = mention_re.sub(' ', data_str)
    
    # remove numeric 'words'
    data_str = num_re.sub(' ', data_str)

    cleaned_str = data_str
    return cleaned_str

In [95]:
# Convert text columns to be of type string
clean_posts["selftext"] = clean_posts["selftext"].astype(str)
clean_posts["title"] = clean_posts["title"].astype(str)
clean_comments["body"] = clean_comments["body"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_posts["selftext"] = clean_posts["selftext"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_posts["title"] = clean_posts["title"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_comments["body"] = clean_comments["body"].astype(str)


In [96]:
# Apply remove feature function to all text data and create new columns with clean text
clean_comments["body_clean"] = clean_comments["body"].apply(remove_features)
clean_posts["title_clean"] = clean_posts["title"].apply(remove_features)
clean_posts["selftext_clean"] = clean_posts["selftext"].apply(remove_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_comments["body_clean"] = clean_comments["body"].apply(remove_features)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_posts["title_clean"] = clean_posts["title"].apply(remove_features)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_posts["selftext_clean"] = clean_posts["selftext"]

### Sort out specific words

In [111]:
# Sort out the word climate change in comment and post dataset
deleted_words = ("climate","change")

for x in deleted_words:
    clean_posts["selftext_clean"] = clean_posts["selftext_clean"].str.replace(x, "", case=False)
    clean_posts["title_clean"] = clean_posts["title_clean"].str.replace(x, "", case=False)
    clean_comments["body_clean"] = clean_comments["body_clean"].str.replace(x, "", case=False)

### Empty, removed and deleted entries

In [120]:
# Clean post data set from empty posts, removed posts, deleted posts
clean_posts = clean_posts.drop(clean_posts[clean_posts['selftext_clean'] == '[removed]'].index)
clean_posts = clean_posts.drop(clean_posts[clean_posts['selftext_clean'] == '[deleted]'].index)
clean_posts = clean_posts.dropna(subset=['selftext_clean'])
clean_posts = clean_posts.drop(clean_posts[clean_posts['title_clean'] == ' '].index) # empty title because climate change was removed

In [101]:
# Clean comment dataset from empty comments
clean_comments = clean_comments.dropna(subset=['body_clean'], how='all')

### Bots

In [103]:
bot_subreddits = ['memebot9000', 
                  'spacenewsbot', 
                  'removalbot', 
                  'u_ignorantbotkin4ho', 
                  'botsrights', 
                  'gildedbot', 
                  'open_bots_test', 
                  'rcbredditbot', 
                  'newsbotmarket', 
                  'alt_source_bot_log', 
                  'steamiebot', 
                  'modbot_staging',
                  'testanimalsupportbot', 
                  'u_udemy_sample_bot', 
                  'newsbottmt', 
                  'bottownfriends', 
                  'airsoft_bot', 
                  'wikileaksemailbot', 
                  'foreveralonebots', 
                  'botterminator', 
                  'bottown2', 
                  'bottown_polibot', 
                  'bottowngarden', 
                  'havoc_bot', 
                  'botrequests', 
                  'sentimentviewbot', 
                  'newsbotscience', 
                  'u_commonmisspellingbot', 
                  'dogetipbot', 
                  'resistbot', 
                  'newsbotfunding', 
                  'ebolanewsbot', 
                  'gwcoepbot', 
                  'uknewsbyabot', 
                  'u_flamboyantbotoshrx', 
                  'laserlikebot', 
                  'repostsleuthbot', 
                  'interfaithbotdialogue', 
                  'articlebot', 
                  'potuswatchbot', 
                  'pulsarbot', 
                  'mimeticsbot', 
                  'cryptobots', 
                  'wutbotposts', 
                  'bottesting', 
                  'botbotread', 
                  'cleverbot', 
                  'u_anticensor_bot', 
                  'trollbot', 
                  'brokentranslatebot', 
                  'newsbiasbot', 
                  'sexpollbottest', 
                  'bottalks', 
                  'bitnewsbot', 
                  'gbpolbot', 
                  'rssbot', 
                  'botsscrewingup', 
                  'u_dicebotbtc', 
                  'u_userleansbot', 
                  'islamicstatenewsbot', 
                  'u_moebot12', 
                  'webbot', 
                  'pew_bot_memes', 
                  'u_zukbot', 
                  'u_bot4bot', 
                  'stabbot', 
                  'u_yangpolicyinfo_bot', 
                  'subredditsummarybot', 
                  'bot4bottesting', 
                  'travsbots', 
                  'spooktoberbot', 
                  'trollabot', 
                  'printrbot', 
                  'bottown1', 
                  'kzreminderbotsub', 
                  'bottown', 
                  'newsbotbot', 
                  'gifbot', 
                  'uvabot', 
                  'cfbotpolitics', 
                  'nwordcountbot', 
                  'quizzybot', 
                  'israelnewsbot', 
                  'popularnewsbot', 
                  'monkeynewsbot', 
                  'thelinkfixerbot', 
                  'atheismbot', 
                  'inspirobotbot', 
                  'testingground4bots', 
                  'buzzfeedbot', 
                  'isreactionarybot', 
                  'blenderbot', 
                  'talkwithgpt2bots', 
                  'bottown22', 
                  'twitter_bot', 
                  'autowikibot', 
                  'elsbot']

for i in bot_subreddits:
    clean_comments = clean_comments[~clean_comments['subreddit.name'].str.contains(i)]
    clean_posts = clean_posts[~clean_posts['subreddit.name'].str.contains(i)]

### Language check

In [104]:
# Add language information
clean_comments["language"] = clean_comments["body_clean"].apply(check_lang)
clean_posts["language"] = clean_posts["selftext_clean"].apply(check_lang)

In [105]:
# Sorting out all not english posts and comments, assumption that post body represents language
clean_posts = clean_posts[clean_posts["language"]=="en"]
clean_comments = clean_comments[clean_comments["language"]=="en"]

### Convert date and time information

In [106]:
# Create a new columns with date and time information
clean_posts['created_date'] = pd.to_datetime(clean_posts['created_utc'], utc=True, unit='s').dt.strftime('%Y-%m-%d')
clean_posts['created_day'] = pd.to_datetime(clean_posts['created_utc'], utc=True, unit='s').dt.strftime('%d')
clean_posts['created_month'] = pd.to_datetime(clean_posts['created_utc'], utc=True, unit='s').dt.strftime('%m')
clean_posts['created_year'] = pd.to_datetime(clean_posts['created_utc'], utc=True, unit='s').dt.strftime('%Y')
clean_posts['created_time'] = pd.to_datetime(clean_posts['created_utc'], utc=True, unit='s').dt.strftime('%H:%M:%S')

clean_comments['created_date'] = pd.to_datetime(clean_comments['created_utc'], utc=True, unit='s').dt.strftime('%Y-%m-%d')
clean_comments['created_day'] = pd.to_datetime(clean_comments['created_utc'], utc=True, unit='s').dt.strftime('%d')
clean_comments['created_month'] = pd.to_datetime(clean_comments['created_utc'], utc=True, unit='s').dt.strftime('%m')
clean_comments['created_year'] = pd.to_datetime(clean_comments['created_utc'], utc=True, unit='s').dt.strftime('%Y')
clean_comments['created_time'] = pd.to_datetime(clean_comments['created_utc'], utc=True, unit='s').dt.strftime('%H:%M:%S')

### Output file

In [107]:
# Save "posts" CSV file in data folder
posts_save_path = f"data/posts_preprocessed.csv"

if not os.path.isfile(posts_save_path):
    clean_posts.to_csv('data/preprocessed_posts.csv', index=False)
    print("File saved!")
else:
    print("Warning file already exists")

In [15]:
# Reduce comment file size by dropping unnecessary columns because it got too large
clean_comments = clean_comments.drop(["type","subreddit.id","body","language"],axis=1)

In [None]:
# Due to wrong commata in the original file there are NA rows created which should be deleted
clean_comments = clean_comments.dropna(axis=0)

In [None]:
# Save "comments" CSV file in data folder
comments_save_path = f"data/comments_preprocessed.csv"

if not os.path.isfile(comments_save_path):
    clean_comments.to_csv("data/preprocessed_comments" + '.gzip', index=False, compression='gzip')
    print("File saved!")
else:
    print("Warning file already exists")