In [19]:
import requests
import pandas as pd
import datetime as dt
import time
import praw
import numpy as np
import json

In [3]:
base_url = 'https://api.pushshift.io/reddit/submission/search'

In [4]:
pd.set_option('display.max_columns', None)

In [5]:
current_time = round(time.time())

In [6]:
params = {
    'subreddit' : 'LiverpoolFC',
    'limit' : 1000,
    'until' : current_time
    
}

In [7]:
res = requests.get(base_url, params)

In [8]:
df = pd.DataFrame(res.json()['data'])
df.shape

(1000, 95)

In [9]:
# def get_submission_ids(subreddit):
#     submissions = []
#     url = f"https://api.pushshift.io/reddit/search/submission?subreddit={subreddit}&size=200"
#     start = dt.datetime(2011, 1, 1).timestamp()  
#     num_posts = 0
#     all_posts = []
#     while True:
#         start_time = time.time()
#         res = requests.get(url)
#         print(res.status_code)
#         try:
#             posts = pd.DataFrame(res.json()['data'])
#             all_posts.append(posts)
#             num_posts += len(posts)
#             end_time = time.time()
#             time_to = end_time - start_time
#             total_time += time_to
#         except:
#             pass
#         print(f"time to scrape: {time_to}; num posts: {num_posts}")
#     df = pd.DataFrame(all_posts)
#     return df[['id']]


In [27]:
def pull_reddit_posts(subreddit):
    """
    This function is designed to take in a set of subreddits, connect to the pushshift api,
    and pull all submissions by setting a lowerbound of choice as start_utc. Utilizes a while
    loop to get posts using requests and has checks inside for:
    1. If data is not in the response json
    2. If the last post of the most recent pull is less than the lower bounds
    Once all posts have been collected and/or the loop breaks, save all posts as a dataframe
    and save to csv in project data folder.

    """
    posts = []

    
    print("this is subreddit", subreddit)
    # save URL based on current subreddit
    url = f"https://api.pushshift.io/reddit/search/submission/?subreddit={subreddit}&size=500"
    # set date as lower bounds, see if we can limit calls by only pulling since 2015
    start_utc = dt.datetime(2011, 1, 1).timestamp()
    print("start_utc", start_utc)
    while True:
        
        start_time = time.time()
        # call requests, save response
        response = requests.get(url)
        # save json as data
        try:
            data = response.json()
            print("Equipment data has been successfully retrieved.")
        except json.decoder.JSONDecodeError:
            print("There was a problem accessing the equipment data.")
        # if data key not in response, break loop
        if len(data['data']) == 0:
            break
        # else, add data to posts list
        posts += data['data']
        print("posts length", len(posts))
        # pull last post in most recent data pull
        last_post = data['data'][-1]
        # save created_utc for gate and next url
        created_utc = last_post['created_utc']
        print("created_utc", created_utc)
        # current last post utc less than lower bounds, break loop
        if created_utc < start_utc:
            break
        # if loop hasn't broken, set url for next pull passing created_utc as new 'before' parameter
        url = f"https://api.pushshift.io/reddit/search/submission/?subreddit={subreddit}&size=500&before={created_utc}"
        end_time = time.time()
        print("total time taken this loop: ", end_time-start_time)
    df = pd.DataFrame(posts)
    return df

In [28]:
df_liv = pull_reddit_posts('LiverpoolFC')

this is subreddit LiverpoolFC
start_utc 1293840000.0
Equipment data has been successfully retrieved.
posts length 500
created_utc 1678652954
total time taken this loop:  1.7834289073944092
Equipment data has been successfully retrieved.
posts length 1000
created_utc 1678099691
total time taken this loop:  4.2652177810668945
Equipment data has been successfully retrieved.
posts length 1500
created_utc 1678040760
total time taken this loop:  2.7655510902404785
Equipment data has been successfully retrieved.
posts length 2000
created_utc 1677277857
total time taken this loop:  2.9237940311431885
Equipment data has been successfully retrieved.
posts length 2499
created_utc 1676606496
total time taken this loop:  3.0365421772003174
Equipment data has been successfully retrieved.
posts length 2998
created_utc 1675637682
total time taken this loop:  3.547813892364502
Equipment data has been successfully retrieved.
posts length 3498
created_utc 1674699259
total time taken this loop:  2.4297289

Equipment data has been successfully retrieved.
posts length 30485
created_utc 1603835490
total time taken this loop:  3.668667793273926
Equipment data has been successfully retrieved.
posts length 30985
created_utc 1603315492
total time taken this loop:  2.7753779888153076
Equipment data has been successfully retrieved.
posts length 31485
created_utc 1602960535
total time taken this loop:  3.6074581146240234
Equipment data has been successfully retrieved.
posts length 31985
created_utc 1602620696
total time taken this loop:  4.780536890029907
Equipment data has been successfully retrieved.
posts length 32485
created_utc 1601881700
total time taken this loop:  4.181637763977051
Equipment data has been successfully retrieved.
posts length 32985
created_utc 1601569175
total time taken this loop:  4.551461935043335
Equipment data has been successfully retrieved.
posts length 33483
created_utc 1601034033
total time taken this loop:  4.358750820159912
Equipment data has been successfully re

Equipment data has been successfully retrieved.
posts length 60468
created_utc 1577098740
total time taken this loop:  3.0931930541992188
Equipment data has been successfully retrieved.
posts length 60968
created_utc 1576946187
total time taken this loop:  3.3893790245056152
Equipment data has been successfully retrieved.
posts length 61468
created_utc 1576696982
total time taken this loop:  4.235860824584961
Equipment data has been successfully retrieved.
posts length 61968
created_utc 1576401374
total time taken this loop:  2.814892053604126
Equipment data has been successfully retrieved.
posts length 62468
created_utc 1576197399
total time taken this loop:  2.985703945159912
Equipment data has been successfully retrieved.
posts length 62968
created_utc 1575990635
total time taken this loop:  2.662400007247925
Equipment data has been successfully retrieved.
posts length 63468
created_utc 1575591551
total time taken this loop:  4.216073989868164
Equipment data has been successfully re

Equipment data has been successfully retrieved.
posts length 90456
created_utc 1557571419
total time taken this loop:  3.349550724029541
Equipment data has been successfully retrieved.
posts length 90956
created_utc 1557412748
total time taken this loop:  3.6586451530456543
Equipment data has been successfully retrieved.
posts length 91456
created_utc 1557332359
total time taken this loop:  3.4398300647735596
Equipment data has been successfully retrieved.
posts length 91956
created_utc 1557299237
total time taken this loop:  2.9155290126800537
Equipment data has been successfully retrieved.
posts length 92455
created_utc 1557269795
total time taken this loop:  2.357083797454834
Equipment data has been successfully retrieved.
posts length 92955
created_utc 1557264776
total time taken this loop:  2.850724935531616
Equipment data has been successfully retrieved.
posts length 93454
created_utc 1557262553
total time taken this loop:  2.2185497283935547
Equipment data has been successfully 

Equipment data has been successfully retrieved.
posts length 120439
created_utc 1537019026
total time taken this loop:  3.909160852432251
Equipment data has been successfully retrieved.
posts length 120939
created_utc 1536423804
total time taken this loop:  2.85721492767334
Equipment data has been successfully retrieved.
posts length 121439
created_utc 1535816617
total time taken this loop:  3.674727201461792
Equipment data has been successfully retrieved.
posts length 121939
created_utc 1535555150
total time taken this loop:  3.11529278755188
Equipment data has been successfully retrieved.
posts length 122439
created_utc 1535211042
total time taken this loop:  2.9363760948181152
Equipment data has been successfully retrieved.
posts length 122939
created_utc 1534829612
total time taken this loop:  3.22885799407959
Equipment data has been successfully retrieved.
posts length 123439
created_utc 1534506255
total time taken this loop:  3.7183029651641846
Equipment data has been successfull

Equipment data has been successfully retrieved.
posts length 150417
created_utc 1519883731
total time taken this loop:  2.9026999473571777
Equipment data has been successfully retrieved.
posts length 150917
created_utc 1519452335
total time taken this loop:  2.7794039249420166
Equipment data has been successfully retrieved.
posts length 151417
created_utc 1518830940
total time taken this loop:  2.8354060649871826
Equipment data has been successfully retrieved.
posts length 151917
created_utc 1518605395
total time taken this loop:  2.4936470985412598
Equipment data has been successfully retrieved.
posts length 152415
created_utc 1518125280
total time taken this loop:  2.4288129806518555
Equipment data has been successfully retrieved.
posts length 152915
created_utc 1517769006
total time taken this loop:  3.172996997833252
Equipment data has been successfully retrieved.
posts length 153414
created_utc 1517324531
total time taken this loop:  1.900965690612793
Equipment data has been succe

Equipment data has been successfully retrieved.
posts length 180404
created_utc 1498512761
total time taken this loop:  2.743785858154297
Equipment data has been successfully retrieved.
posts length 180904
created_utc 1498136692
total time taken this loop:  2.6300110816955566
Equipment data has been successfully retrieved.
posts length 181403
created_utc 1497483812
total time taken this loop:  2.5378758907318115
Equipment data has been successfully retrieved.
posts length 181903
created_utc 1496862534
total time taken this loop:  3.1992571353912354
Equipment data has been successfully retrieved.
posts length 182403
created_utc 1496358158
total time taken this loop:  5.273115873336792
Equipment data has been successfully retrieved.
posts length 182903
created_utc 1495642014
total time taken this loop:  3.042240858078003
Equipment data has been successfully retrieved.
posts length 183403
created_utc 1495332655
total time taken this loop:  3.967146158218384
Equipment data has been success

Equipment data has been successfully retrieved.
posts length 210399
created_utc 1457648883
total time taken this loop:  3.1983883380889893
Equipment data has been successfully retrieved.
posts length 210899
created_utc 1457060378
total time taken this loop:  3.3943207263946533
Equipment data has been successfully retrieved.
posts length 211399
created_utc 1456546302
total time taken this loop:  3.5821728706359863
Equipment data has been successfully retrieved.
posts length 211899
created_utc 1455748298
total time taken this loop:  3.0484418869018555
Equipment data has been successfully retrieved.
posts length 212399
created_utc 1455130884
total time taken this loop:  2.7853739261627197
Equipment data has been successfully retrieved.
posts length 212898
created_utc 1454609605
total time taken this loop:  2.018834114074707
Equipment data has been successfully retrieved.
posts length 213398
created_utc 1454093520
total time taken this loop:  3.0210869312286377
Equipment data has been succ

Equipment data has been successfully retrieved.
posts length 240379
created_utc 1419659265
total time taken this loop:  2.8836050033569336
Equipment data has been successfully retrieved.
posts length 240878
created_utc 1418671227
total time taken this loop:  3.789658784866333
Equipment data has been successfully retrieved.
posts length 241378
created_utc 1417714999
total time taken this loop:  2.4457437992095947
Equipment data has been successfully retrieved.
posts length 241877
created_utc 1416758095
total time taken this loop:  3.1443140506744385
Equipment data has been successfully retrieved.
posts length 242377
created_utc 1415445188
total time taken this loop:  2.432682991027832
Equipment data has been successfully retrieved.
posts length 242876
created_utc 1414536273
total time taken this loop:  2.7027509212493896
Equipment data has been successfully retrieved.
posts length 243376
created_utc 1413781687
total time taken this loop:  2.2298998832702637
Equipment data has been succe

Equipment data has been successfully retrieved.
posts length 270344
created_utc 1376342011
total time taken this loop:  2.9519259929656982
Equipment data has been successfully retrieved.
posts length 270843
created_utc 1375779237
total time taken this loop:  3.2489967346191406
Equipment data has been successfully retrieved.
posts length 271343
created_utc 1374817268
total time taken this loop:  3.383661985397339
Equipment data has been successfully retrieved.
posts length 271843
created_utc 1373911645
total time taken this loop:  3.174738883972168
Equipment data has been successfully retrieved.
posts length 272342
created_utc 1372695738
total time taken this loop:  3.2016539573669434
Equipment data has been successfully retrieved.
posts length 272842
created_utc 1371281932
total time taken this loop:  2.6387898921966553
Equipment data has been successfully retrieved.
posts length 273341
created_utc 1369867157
total time taken this loop:  2.255164384841919
There was a problem accessing 

In [30]:
df_liv.shape

(289828, 126)

In [31]:
df_liv.selftext.replace('', np.nan, inplace = True)

In [33]:
df_liv.selftext.fillna(df_liv.title, inplace = True)

In [34]:
df_liv.head(30)

Unnamed: 0,subreddit,selftext,author_fullname,gilded,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,thumbnail_height,top_awarded_type,hide_score,quarantine,link_flair_text_color,upvote_ratio,author_flair_background_color,subreddit_type,total_awards_received,media_embed,thumbnail_width,author_flair_template_id,is_original_content,secure_media,is_reddit_media_domain,is_meta,category,secure_media_embed,link_flair_text,score,is_created_from_ads_ui,author_premium,thumbnail,edited,author_flair_css_class,author_flair_richtext,gildings,post_hint,content_categories,is_self,link_flair_type,wls,removed_by_category,author_flair_type,domain,allow_live_comments,suggested_sort,url_overridden_by_dest,view_count,archived,no_follow,is_crosspostable,pinned,over_18,preview,all_awardings,awarders,media_only,can_gild,spoiler,locked,author_flair_text,treatment_tags,removed_by,distinguished,subreddit_id,link_flair_background_color,id,is_robot_indexable,author,discussion_type,num_comments,send_replies,whitelist_status,contest_mode,author_patreon_flair,author_flair_text_color,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,retrieved_utc,updated_utc,utc_datetime_str,link_flair_template_id,media_metadata,is_gallery,gallery_data,poll_data,edited_on,author_cakeday,tournament_data,author_created_utc,retrieved_on,event_end,event_is_live,event_start,crosspost_parent,crosspost_parent_list,removal_reason,collections,previous_visits,approved_by,created,downs,likes,mod_reports,num_reports,post_categories,report_reasons,rte_mode,saved,selftext_html,ups,user_reports,brand_safe,approved_at_utc,banned_at_utc,from_id,from,from_kind
0,LiverpoolFC,Brooklyn license plate...,t2_s6t8f,0.0,Brooklyn license plate...,[],r/LiverpoolFC,False,6.0,,140.0,,True,False,dark,1.0,,public,0.0,{},140.0,,False,,True,False,,{},,1,False,False,https://a.thumbs.redditmedia.com/A7UwIUr12Oyjv...,False,,[],{},image,,False,text,6.0,,text,i.redd.it,False,,https://i.redd.it/z76gy3ro9dqa1.jpg,,False,True,True,False,False,{'images': [{'source': {'url': 'https://previe...,[],[],False,True,False,False,,[],,,t5_2qn0o,,124502d,True,bkscooter,,0,True,all_ads,False,False,,/r/LiverpoolFC/comments/124502d/brooklyn_licen...,all_ads,False,https://i.redd.it/z76gy3ro9dqa1.jpg,436444.0,1679959868,0.0,,False,1679959887,1679959887,2023-03-27 23:31:08,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,LiverpoolFC,Maybe the England lads were a diversion?,t2_7rwhuhvi,0.0,Maybe the England lads were a diversion?,"[{'e': 'text', 't': 'International Football'}]",r/LiverpoolFC,False,6.0,,140.0,,True,False,dark,1.0,,public,0.0,{},140.0,,False,,True,False,,{},International Football,1,False,False,https://b.thumbs.redditmedia.com/QPNVQj0l57cml...,False,,[],{},image,,False,richtext,6.0,,text,i.redd.it,False,,https://i.redd.it/oivjl5padeqa1.jpg,,False,False,True,False,False,{'images': [{'source': {'url': 'https://previe...,[],[],False,True,False,False,,[],,,t5_2qn0o,#5ae0e2,1242m5s,True,Controversial_lemon,,0,True,all_ads,False,False,,/r/LiverpoolFC/comments/1242m5s/maybe_the_engl...,all_ads,False,https://i.redd.it/oivjl5padeqa1.jpg,436437.0,1679955190,0.0,,False,1679955206,1679955206,2023-03-27 22:13:10,9897faca-3e83-11ec-a851-927b3aee6417,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,LiverpoolFC,Jude Bellingham would be unbelievable for Live...,t2_5o1ma6rj,0.0,Jude Bellingham would be unbelievable for Live...,"[{'e': 'text', 't': 'Tier 4 (Paywall) unless J...",r/LiverpoolFC,False,6.0,tier4,78.0,,True,False,dark,1.0,#dadada,public,0.0,{},140.0,ed7df05a-4944-11ec-939a-4aba47597414,False,,False,False,,{},Tier 4 (Paywall) unless Joyce,1,False,False,https://a.thumbs.redditmedia.com/S6hNp4SMopiBS...,False,,"[{'e': 'text', 't': 'Aly Cissokho'}]",{},link,,False,richtext,6.0,,richtext,thetimes.co.uk,False,,https://www.thetimes.co.uk/article/jude-bellin...,,False,False,True,False,False,{'images': [{'source': {'url': 'https://extern...,[],[],False,True,False,False,Aly Cissokho,[],,,t5_2qn0o,,12422ji,True,TheNotoriousJN,,0,True,all_ads,False,False,dark,/r/LiverpoolFC/comments/12422ji/jude_bellingha...,all_ads,False,https://www.thetimes.co.uk/article/jude-bellin...,436436.0,1679954168,0.0,,False,1679954187,1679954187,2023-03-27 21:56:08,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,LiverpoolFC,Henderson: ‘He (Jude Bellingham) can go as hig...,t2_3yj8lxpo,0.0,Henderson: ‘He (Jude Bellingham) can go as hig...,"[{'e': 'text', 't': 'Interviews'}]",r/LiverpoolFC,False,6.0,,84.0,,True,False,light,1.0,#dadada,public,0.0,{},140.0,1698d2c0-bcd0-11ed-9d0c-da55ef28e827,False,,False,False,,{},Interviews,1,False,False,https://b.thumbs.redditmedia.com/jUnE0ckrzsX3Y...,False,,"[{'e': 'text', 't': '⚽️ Liverpool 7-0 Man Unit...",{},link,,False,richtext,6.0,moderator,richtext,dailymail.co.uk,False,,https://www.dailymail.co.uk/sport/football/art...,,False,True,False,False,False,{'images': [{'source': {'url': 'https://extern...,[],[],False,True,False,False,"⚽️ Liverpool 7-0 Man United, 22/23 ⚽️",[],,,t5_2qn0o,#cc5289,1241x6w,False,doubleoeck1234,,0,True,all_ads,False,False,dark,/r/LiverpoolFC/comments/1241x6w/henderson_he_j...,all_ads,False,https://www.dailymail.co.uk/sport/football/art...,436435.0,1679953881,0.0,,False,1679953896,1679953897,2023-03-27 21:51:21,8f7a3316-805c-11ec-93bb-460d48c021b8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,LiverpoolFC,Naby Keita assist for Guinea,t2_y1tb7,0.0,Naby Keita assist for Guinea,[],r/LiverpoolFC,False,6.0,,78.0,,True,False,dark,1.0,,public,0.0,{},140.0,,False,,False,False,,{},,1,False,False,https://b.thumbs.redditmedia.com/uHtiqb4NO9Sai...,False,,[],{},link,,False,text,6.0,,text,streamin.me,False,,https://streamin.me/v/3908fcae,,False,False,True,False,False,{'images': [{'source': {'url': 'https://extern...,[],[],False,True,False,False,,[],,,t5_2qn0o,,123y8ma,True,gmp24,,0,True,all_ads,False,False,,/r/LiverpoolFC/comments/123y8ma/naby_keita_ass...,all_ads,False,https://streamin.me/v/3908fcae,436418.0,1679946733,0.0,,False,1679946752,1679946752,2023-03-27 19:52:13,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,LiverpoolFC,Naby Keita goal for Guinea,t2_y1tb7,0.0,Naby Keita goal for Guinea,[],r/LiverpoolFC,False,6.0,,78.0,,True,False,dark,1.0,,public,0.0,{},140.0,,False,,False,False,,{},,1,False,False,https://b.thumbs.redditmedia.com/iKGfVAO8AcfLk...,False,,[],{},link,,False,text,6.0,,text,streamin.me,False,,https://streamin.me/v/0ea5154e,,False,True,True,False,False,{'images': [{'source': {'url': 'https://extern...,[],[],False,True,False,False,,[],,,t5_2qn0o,,123y6ui,True,gmp24,,0,False,all_ads,False,False,,/r/LiverpoolFC/comments/123y6ui/naby_keita_goa...,all_ads,False,https://streamin.me/v/0ea5154e,436419.0,1679946631,0.0,,False,1679946643,1679946644,2023-03-27 19:50:31,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,LiverpoolFC,[David Ornstein] Liverpool yet to open contrac...,t2_39i1zb05,0.0,[David Ornstein] Liverpool yet to open contrac...,"[{'e': 'text', 't': 'Tier 1'}]",r/LiverpoolFC,False,6.0,tier1,,,True,False,,1.0,#dadada,public,0.0,{},,66c08f00-4192-11ec-a8c0-ce635f0ecd91,False,,False,False,,{},Tier 1,1,False,False,default,False,,"[{'e': 'text', 't': 'Trent Alexander-Arnold'}]",{},,,False,richtext,6.0,,richtext,twitter.com,False,,https://twitter.com/David_Ornstein/status/1640...,,False,False,True,False,False,,[],[],False,True,False,False,Trent Alexander-Arnold,[],,,t5_2qn0o,,123v550,True,bllshrfv,,0,True,all_ads,False,False,dark,/r/LiverpoolFC/comments/123v550/david_ornstein...,all_ads,False,https://twitter.com/David_Ornstein/status/1640...,436411.0,1679940447,0.0,,False,1679940463,1679940464,2023-03-27 18:07:27,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,LiverpoolFC,[Giannis Chorianopoulos] Tsimikas is out after...,t2_5o0plj0f,0.0,[Giannis Chorianopoulos] Tsimikas is out after...,"[{'e': 'text', 't': 'Injury'}]",r/LiverpoolFC,False,6.0,rumour,140.0,,True,False,dark,1.0,,public,0.0,"{'content': '&lt;blockquote class=""twitter-vid...",140.0,,False,"{'type': 'twitter.com', 'oembed': {'provider_u...",False,False,,"{'content': '&lt;blockquote class=""twitter-vid...",Injury,1,False,False,https://b.thumbs.redditmedia.com/qM_B07xby3NMO...,False,,[],{},link,,False,richtext,6.0,,text,twitter.com,False,,https://twitter.com/choria80/status/1640389639...,,False,False,True,False,False,{'images': [{'source': {'url': 'https://extern...,[],[],False,True,False,False,,[],,,t5_2qn0o,#00a398,123s7pk,True,MathaiosPalaio,,0,True,all_ads,False,False,,/r/LiverpoolFC/comments/123s7pk/giannis_choria...,all_ads,False,https://twitter.com/choria80/status/1640389639...,436411.0,1679934460,0.0,"{'type': 'twitter.com', 'oembed': {'provider_u...",False,1679934475,1679934475,2023-03-27 16:27:40,a9a11dc6-f67f-11e5-b3ba-0ec1d138f2d7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,LiverpoolFC,Bobby Clark with Jude's younger brother Jobe B...,t2_4e66bt01,0.0,Bobby Clark with Jude's younger brother Jobe B...,"[{'e': 'text', 't': 'International Football'}]",r/LiverpoolFC,False,6.0,,140.0,,True,False,dark,1.0,,public,0.0,{},140.0,,False,,True,False,,{},International Football,1,False,False,https://b.thumbs.redditmedia.com/-jKGkZeCw057d...,False,,[],{},image,,False,richtext,6.0,,text,i.redd.it,False,,https://i.redd.it/dvztwfhgmcqa1.jpg,,False,True,True,False,False,{'images': [{'source': {'url': 'https://previe...,[],[],False,True,False,False,,[],,,t5_2qn0o,#5ae0e2,123s0im,True,mirzazulhilmi,,0,True,all_ads,False,False,,/r/LiverpoolFC/comments/123s0im/bobby_clark_wi...,all_ads,False,https://i.redd.it/dvztwfhgmcqa1.jpg,436410.0,1679934057,0.0,,False,1679934071,1679934072,2023-03-27 16:20:57,9897faca-3e83-11ec-a851-927b3aee6417,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,LiverpoolFC,[removed],t2_bgbw1,0.0,Find someone that looks at you like Gerrard lo...,"[{'e': 'text', 't': 'Banned Source'}]",r/LiverpoolFC,False,6.0,banned,,,True,False,dark,1.0,,public,0.0,{},,,False,,False,False,,{},Banned Source,1,False,False,self,False,,[],{},,,True,richtext,6.0,moderator,text,self.LiverpoolFC,False,,,,False,True,False,False,False,,[],[],False,True,False,False,,[],,,t5_2qn0o,,123oest,False,KoedKevin,,0,True,all_ads,False,False,,/r/LiverpoolFC/comments/123oest/find_someone_t...,all_ads,False,https://www.reddit.com/r/LiverpoolFC/comments/...,436385.0,1679927137,0.0,,False,1679927149,1679927149,2023-03-27 14:25:37,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [35]:
df_liv.selftext.replace('[removed]', np.nan, inplace = True)

In [36]:
df_liv.selftext.replace('[deleted]', np.nan, inplace = True)

In [37]:
df_liv.selftext.fillna(df_liv.title, inplace = True)

In [42]:
df_liv['length'] = df_liv.selftext.str.split().apply(lambda x: len(x))
long = df_liv['length'] > 0
df_liv = df_liv[long]

In [43]:
df_liv.length.min()

1

In [44]:
df_liv.shape

(289817, 127)

In [46]:
df_liv.selftext.value_counts(ascending = False)

[deleted by user]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [70]:
df_liv.selftext.replace('[deleted by user]', np.nan, inplace = True)

In [66]:
df_liv.selftext.fillna(df_liv.title, inplace = True)

In [68]:
df_liv.selftext.value_counts(ascending = False)

[deleted by user]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [59]:
to_keep = df_liv[[
    'subreddit',
    'selftext',
    'score',
    'length',
    'num_comments',
    'utc_datetime_str',
    'id'
]]

In [60]:
df_liv_posts = to_keep

In [None]:
# df_posts.utc_datetime_str = df_posts.utc_datetime_str.str[:10]

In [61]:
df_liv_posts.head()

Unnamed: 0,subreddit,selftext,score,length,num_comments,utc_datetime_str,id
0,LiverpoolFC,Brooklyn license plate...,1,3,0,2023-03-27 23:31:08,124502d
1,LiverpoolFC,Maybe the England lads were a diversion?,1,7,0,2023-03-27 22:13:10,1242m5s
2,LiverpoolFC,Jude Bellingham would be unbelievable for Live...,1,10,0,2023-03-27 21:56:08,12422ji
3,LiverpoolFC,Henderson: ‘He (Jude Bellingham) can go as hig...,1,50,0,2023-03-27 21:51:21,1241x6w
4,LiverpoolFC,Naby Keita assist for Guinea,1,5,0,2023-03-27 19:52:13,123y8ma


In [62]:
df_liv_posts.num_comments.mean()

26.817833322406898

In [63]:
df_liv_posts.to_csv('liv_posts.csv', index = False)