In [1]:
import os
import pandas as pd

This set of code should ingest a manifest(or set of manifests), and perform the specified cleaning. 

Returns 2 Tables: author Metadata, and Comments

In [None]:
!pwd

In [2]:
# home_directory = '/Users/ameliachu/Google Drive/Spring 2021/Text as Data/final_project'

home_directory = "/Volumes/GoogleDrive/My Drive/Spring 2021/Text as Data/final_project/"
data_directory = "data/reddit" # should never change

# Should be custom or inferred by date
manifest_fname = "curated_submission_manifest_gme_04-15-2021.csv"


In [3]:
manifest_path = f"{home_directory}/{data_directory}/{manifest_fname}"
manifest =  pd.read_csv(manifest_path)
manifest.sort_values(by="sub_created_utc").head(5)

Unnamed: 0,sub_created_utc,sub_id,sub_title
8,1611233000.0,l1xtan,GME Megathread - Lemon Party 2: Electric Boogaloo
39,1611271000.0,l2a9vf,Shitty Luck: My refutation of every point Shit...
7,1611596000.0,l4syrd,GME Megathread Part 2
14,1611767000.0,l690w0,When the $GME train finally pulls in to the st...
15,1611782000.0,l6eijh,GME Feel-Good posts MEGATHREAD! We'll have the...


In [4]:
# Determining what we have collected thus far
gme_raw_data_directory = f"{home_directory}/{data_directory}/gme/"
gme_fnames = [f for f in os.listdir(gme_raw_data_directory) if os.path.isfile(os.path.join(gme_raw_data_directory, f))]

existing_sub_ids = [fname.split('.')[0].split('_')[-1] for fname in gme_fnames if fname.startswith('comments_gme_')]

In [5]:
existing_sub_ids = [i for i in existing_sub_ids if i not in ['m16emz']]

In [6]:
gme_fname = gme_fnames[0]
gme_raw_data_path = f"{gme_raw_data_directory}{gme_fname}"
gme_raw_data = pd.read_csv(gme_raw_data_path, index_col=0)

In [None]:

QAComments.init_count(comments)

In [None]:
RedditCSVComments()

In [10]:
gme_raw_data 

Unnamed: 0,sub_id,body,score,author,created_utc
0,mfoivk,GME: Making Monday mornings exciting since 2021.,534,rweavere,1.617022e+09
1,mfoivk,My dopamine is directly correlated to GME ticker,2463,Oscell,1.617022e+09
2,mfoivk,Just bought my very first share,1743,Poetslord,1.617029e+09
3,mfoivk,"Morning, reminder that this week is a short we...",685,FistPunch_Vol_4,1.617021e+09
4,mfoivk,I think I'm going to quit my job when this moo...,656,babycrusher7,1.617022e+09
...,...,...,...,...,...
24159,mfoivk,[deleted],1,,1.617072e+09
24160,mfoivk,Whatever clown,0,Ozwaldo,1.617033e+09
24161,mfoivk,"God dammit, man. You are so fucking dumb. I li...",3,fs05,1.617074e+09
24162,mfoivk,He doesn't need to lmao. He bought 9 million s...,2,need2burn,1.617072e+09


In [7]:
class RedditCSVComments:
    def __init__(self, raw_data):
        self.raw_data = raw_data
        self.default_author_agg = {
            "sub_id": ["nunique"],
            "score": ["sum", "count"],
            "created_utc": ["min", "max"]
            }
    def get_data(self):
        return self.raw_data
        

class CleanRedditCSVComments(RedditCSVComments):
    def __init__(self, raw_data):
        # constructor
        super().__init__(self, raw_data)
        self.comment_exclusion = ['[deleted]', '[removed]']
        self.raw_data = self.remove_uninformative_comments(raw_data)
        
    def remove_uninformative_comments(self, raw):
        cleaned_data = raw_data[~raw_data['body'].isin(comment_exclusion)]
        self.cleaned_data = cleaned_data
        return cleaned_data

In [40]:
class RedditComments:
    def __init__(self, raw_data):
        self.data = raw_data
        self.default_author_agg = {
            "sub_id": ["nunique"],
            "score": ["sum", "count"],
            "created_utc": ["min", "max"]
            }

        self.comment_exclusion = ['[deleted]', '[removed]']

    def get_data(self):
        return self.data


class RedditCommentsBuilder:
    def __init__(self, raw_data):
        self.data = raw_data
        self.comment_exclusion = ['[deleted]', '[removed]']

    def remove_uninformative_comments(self):
        self.data = self.data[~self.data['body'].isin(self.comment_exclusion)]
        return self

    def remove_smiley_faces(self):
        #self.data = code to remove smiley_faces
        return self

    def build(self):
        return RedditComments(self.data)



In [44]:
reddit_comments_builder = RedditCommentsBuilder(gme_raw_data)
reddit_comments = reddit_comments_builder\
                    .remove_uninformative_comments()\
                    .remove_smiley_faces()\
                    .build()

reddit_comments.get_data()

Unnamed: 0,sub_id,body,score,author,created_utc
0,mfoivk,GME: Making Monday mornings exciting since 2021.,534,rweavere,1.617022e+09
1,mfoivk,My dopamine is directly correlated to GME ticker,2463,Oscell,1.617022e+09
2,mfoivk,Just bought my very first share,1743,Poetslord,1.617029e+09
3,mfoivk,"Morning, reminder that this week is a short we...",685,FistPunch_Vol_4,1.617021e+09
4,mfoivk,I think I'm going to quit my job when this moo...,656,babycrusher7,1.617022e+09
...,...,...,...,...,...
24157,mfoivk,Lmao the fuck is this loser complaining about ...,1,inomooshekki,1.617032e+09
24160,mfoivk,Whatever clown,0,Ozwaldo,1.617033e+09
24161,mfoivk,"God dammit, man. You are so fucking dumb. I li...",3,fs05,1.617074e+09
24162,mfoivk,He doesn't need to lmao. He bought 9 million s...,2,need2burn,1.617072e+09


In [29]:
clean_comments = CleanRedditCSVComments(gme_raw_data)

In [33]:
clean_comments.get_data()

Unnamed: 0,sub_id,body,score,author,created_utc
0,mfoivk,GME: Making Monday mornings exciting since 2021.,534,rweavere,1.617022e+09
1,mfoivk,My dopamine is directly correlated to GME ticker,2463,Oscell,1.617022e+09
2,mfoivk,Just bought my very first share,1743,Poetslord,1.617029e+09
3,mfoivk,"Morning, reminder that this week is a short we...",685,FistPunch_Vol_4,1.617021e+09
4,mfoivk,I think I'm going to quit my job when this moo...,656,babycrusher7,1.617022e+09
...,...,...,...,...,...
24157,mfoivk,Lmao the fuck is this loser complaining about ...,1,inomooshekki,1.617032e+09
24160,mfoivk,Whatever clown,0,Ozwaldo,1.617033e+09
24161,mfoivk,"God dammit, man. You are so fucking dumb. I li...",3,fs05,1.617074e+09
24162,mfoivk,He doesn't need to lmao. He bought 9 million s...,2,need2burn,1.617072e+09


In [30]:
comments = RedditCSVComments(gme_raw_data)

In [None]:
QAComments.n_unique_authors(comments, cleaned=False)

In [None]:
raw_data 

In [None]:
comments.cleaned_data

In [None]:
CleanRedditCSVComments.remove_uninformative_comments(comments)

In [None]:
QAComments.update_log(comments,sub_id='a')

Removing `[deleted]` comment rows

In [None]:
class CleanRedditCSVComments(RedditCSVComments):
    def __init__(self, raw_data):
        super().__init__(raw_data)
        self.comment_exclusion = ['[deleted]', '[removed]']
        
    def remove_uninformative_comments(self):
        raw_data = self.raw_data
        cleaned_data = raw_data[~raw_data['body'].isin(self.comment_exclusion)]
        self.cleaned_data = cleaned_data
        return cleaned_data

In [None]:
CleanRedditCSVComments

In [None]:
class RedditCSVComments:
    def __init__(self, raw_data):
        self.raw_data = raw_data
        self.default_author_agg = {'sub_id':['nunique'],
                              'score':['sum','count'], 
                                'created_utc':['min','max']}

class QAComments(RedditCSVComments):
    def __init__(self, raw_data):
        super().__init__(raw_data)
        self.log = []
        
    def init_count(self):
        raw_data = self.raw_data
        return raw_data.count()[0]

    def n_unique_authors(self, cleaned=True):
        if cleaned:
            data = self.cleaned_data
        else:
            data = self.raw_data
        return data['author'].nunique()

    def post_count(self):
        cleaned_data = self.cleaned_data
        return cleaned_data.count()[0]
    
    def update_log(self, sub_id):
        init_count = self.init_count()
        post_process_count = self.post_count()
        unique_authors = self.n_unique_authors()
        update_entry = (sub_id, init_count, post_process_count, unique_authors)
        self.log.append(update_entry)
        

# cleaning methods
class CleanRedditCSVComments(RedditCSVComments):
    def __init__(self, raw_data, comment_exclusion = ['[deleted]','[removed]']):
        super().__init__(raw_data, comment_exclusion)
        self.comment_exclusion = comment_exclusion
        self.qa_log = []
        
    def remove_uninformative_comments(self):
        raw_data = self.raw_data
        cleaned_data = raw_data[~raw_data['body'].isin(self.comment_exclusion)]
        self.cleaned_data = cleaned_data
        return cleaned_data
        

class RedditAuthorTable:
    """ Created after master table?
    Should keep score of deleted and removed? can't
    {'zachariassss': {'num_comments':,
                  'thread_appearances': [],
                  'init_comment_timestamp':,
                  'last_comment_timestamp':,
                  'average_acore':}}
    """
    
    def collect_author_data(self, data):
        """ collecting information by dataframe
        """
        default_aggregations = self.default_author_agg 
        data.groupby(by='author').agg(default_aggregations)
    
    def update_author_data():
        """ updating master table after collecting author name
        """
        
class RedditCommentsTable:
    def update_comments_data():
        pass


In [3]:
sub_id = existing_sub_ids[0]
gme_raw_data_path = f"{gme_raw_data_directory}comments_gme_{sub_id}.csv"
gme_raw_data = pd.read_csv(gme_raw_data_path, index_col=0)

gme_raw_data

NameError: name 'existing_sub_ids' is not defined

In [None]:
clean_log = []

for sub_id in existing_sub_ids:
    
    gme_raw_data_path = f"{gme_raw_data_directory}comments_gme_{sub_id}.csv"
    gme_raw_data = pd.read_csv(gme_raw_data_path, index_col=0)
    
    init_count = gme_raw_data.count()[0]
    
    cleaned_gme_data = gme_raw_data[~gme_raw_data['body'].isin(['[deleted]','[removed]'])]
    post_process_count = cleaned_gme_data.count()[0]

    unique_authors = gme_raw_data['author'].nunique()

    new_entry = (sub_id, init_count, post_process_count, unique_authors)
    cleaned_gme_data_path =  f"{gme_raw_data_directory}cleaned_comments_gme_{sub_id}.csv"
    cleaned_gme_data.to_csv(cleaned_gme_data_path, index=True)
    clean_log.append(new_entry)
    print(new_entry)

Combine all submission comments into a single file 

In [None]:
gme_master_data_list = []

for sub_id in existing_sub_ids:
     cleaned_gme_data_path =  f"{gme_raw_data_directory}cleaned_comments_gme_{sub_id}.csv"
     gme_cleaned_data = pd.read_csv( cleaned_gme_data_path , index_col=0)
     gme_master_data_list.append(gme_cleaned_data)

gme_master_data = pd.concat(gme_master_data_list).reset_index(drop=True)

In [None]:
start_time = int(gme_master_data['created_utc'].min())
end_time = int(gme_master_data['created_utc'].max())

gme_master_data_path = f"{gme_raw_data_directory}gme_master_data_{start_time}_{end_time}.csv"
gme_master_data.to_csv(gme_master_data_path, index=True)
print(gme_master_data_path)

In [None]:
sub_id = 'm16emz'
gme_raw_data_path = f"{gme_raw_data_directory}comments_gme_limited_m16emz.csv"
gme_raw_data = pd.read_csv(gme_raw_data_path, index_col=0)

init_count = gme_raw_data.count()[0]

cleaned_gme_data = gme_raw_data[~gme_raw_data['body'].isin(['[deleted]','[removed]'])]
post_process_count = cleaned_gme_data.count()[0]

unique_authors = gme_raw_data['author'].nunique()

new_entry = (sub_id, init_count, post_process_count, unique_authors)
cleaned_gme_data_path =  f"{gme_raw_data_directory}cleaned_comments_gme_{sub_id}.csv"
cleaned_gme_data.to_csv(cleaned_gme_data_path, index=True)
clean_log.append(new_entry)

In [None]:
cleaned_gme_data_path =  f"{gme_raw_data_directory}cleaned_comments_gme_{sub_id}.csv"
gme_cleaned_data = pd.read_csv( cleaned_gme_data_path , index_col=0)
gme_master_data_list.append(gme_cleaned_data)

In [None]:
gme_master_data = pd.concat(gme_master_data_list).reset_index(drop=True)

In [None]:
gme_master_data.count()

In [None]:
gme_master_data.head(5)