In [1]:
from pathlib import Path

import string

import pandas as pd

import emoji

In [2]:
# set parameters
keep_comments = 5
comment_word_min = 5

# read in files

# since this is just to test, only read in the small "extra" files
post_p    = Path('./data/sd_post_10_2022_to_1115_2022.csv')
# comment_p = Path('./data/sd_comment_10_2022_to_11_2022_data_extra.csv')
# comment_p = Path('./data/sd_comment_9_2022_to_10_2022.csv')
comment_p = Path('./sd_comment_9_2022_to_10_2022.csv')

In [3]:
def translate_emojis(df, col):
    # source: https://stackoverflow.com/a/69423881
    df[col] =  df[col].apply(lambda x: ''.join((' '+c+' ') if c in emoji.UNICODE_EMOJI['en'] else c for c in x))
    df[col] =  df[col].apply(lambda x: emoji.demojize(x))
    return df

def remove_puncuations(df, col, *, keep=None):
    # remove special char combinations
    df[col] = df[col].replace(r'&amp;',' ', regex=True)
    df[col] = df[col].replace(r'\n',' ', regex=True)
    if keep is None:
        remove = string.punctuation
    else:
        remove = ''.join(list(set(string.punctuation).difference(set(keep))))
    df[col] = df[col].apply(lambda x: x.translate(str.maketrans('', '', remove)))
    return df
    
def add_word_count(df, col):
    new_col = f"{col}_count"
    df[new_col] = df[col].apply(lambda x: len(x.split(' ')))
    return df
    
def clean_str_col(df, col):
    df = translate_emojis(df, col)
    # remove urls and embedded gifs/giphy
    # source https://stackoverflow.com/a/51994366
    df[col] = df[col].replace(r'gif', '', regex=True).replace(r'giphy\S+', '', regex=True)
    df[col] = df[col].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)

    
    df = remove_puncuations(df, col, keep=['.', ' '])
    df[col] = df[col].str.lower()
    return df

In [4]:
# read in post csv
post_df = pd.read_csv(post_p)
mask = post_df.selftext.str.contains('\[removed\]').fillna(False)
tot_removed = post_df[mask].shape[0]
print(f"Posts removed: {tot_removed}")
post_df = post_df[~mask]
post_df.selftext = post_df.selftext.fillna('')
post_df.rename(columns={"id": "post_id", "author": "post_author", "created_utc": "post_utc"}, inplace=True)
post_df

Posts removed: 233


Unnamed: 0,subreddit,title,post_id,post_author,post_utc,selftext,full_link
0,UCSD,Lost my small black coach wallet at 11 am near...,ym7lyz,Recent-Obligation417,2022-11-04 19:14:07+00:00,,https://www.reddit.com/r/UCSD/comments/ym7lyz/...
1,sandiego,Shark Attack in Del Mar,ym7jn8,Xcapegoat,2022-11-04 19:11:36+00:00,,https://www.reddit.com/r/sandiego/comments/ym7...
2,UCSD,what 11 dollars gets you at Spice,ym7hqi,17kimv,2022-11-04 19:09:33+00:00,,https://www.reddit.com/r/UCSD/comments/ym7hqi/...
3,SDSU,Has anyone’s middle class scholarship not show...,ym7hf1,Exciting_Opposite_37,2022-11-04 19:09:11+00:00,,https://www.reddit.com/r/SDSU/comments/ym7hf1/...
4,UCSD,PSA don't lock your bike to this bike rack. it...,ym7ele,jneprz,2022-11-04 19:06:06+00:00,,https://www.reddit.com/r/UCSD/comments/ym7ele/...
...,...,...,...,...,...,...,...
2560,UCSD,Does having a low gpa matter?,yudhby,OkDoughnut994,2022-11-13 20:41:06+00:00,I’m a first year and am not doing very well in...,https://www.reddit.com/r/UCSD/comments/yudhby/...
2561,sandiego,SDGE bill generation vs distribution,yudh2m,solidavocadorock,2022-11-13 20:40:51+00:00,,https://www.reddit.com/r/sandiego/comments/yud...
2562,sandiego,Looking for venues,yucph0,sockherlu,2022-11-13 20:15:01+00:00,I am looking for different venue options for m...,https://www.reddit.com/r/sandiego/comments/yuc...
2563,sandiego,Looking for venues,yucmsj,sockherlu,2022-11-13 20:12:25+00:00,,https://www.reddit.com/r/sandiego/comments/yuc...


In [10]:
# read in comment csv
# comment_df = pd.read_csv(comment_p, lineterminator='\n')
comment_df = pd.read_csv(comment_p)
tot_removed = comment_df[comment_df.body.str.contains('\[removed\]')].shape[0]
print(f"Comments removed: {tot_removed}")
tot_removed = comment_df[comment_df.body.str.contains('\[deleted\]')].shape[0]
print(f"Comments deleted: {tot_removed}")
comment_df = comment_df[~comment_df.body.str.contains('\[removed\]')]
comment_df = comment_df[~comment_df.body.str.contains('\[deleted\]')]
comment_df.rename(columns={"id": "comment_id", "author": "comment_author"}, inplace=True)
comment_df.drop(columns=["subreddit", "link_id", "title", "permalink", "created_utc"], inplace=True)
comment_df

ParserError: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.


In [6]:
# clean post columns
post_df = clean_str_col(post_df, 'title')
post_df = clean_str_col(post_df, 'selftext')
post_df['combined_text'] = post_df['title'] + " " + post_df['selftext']
# post_df['combined_text'] = post_df['combined_text'].replace('. .','.')
# post_df = add_word_count(post_df, 'combined_text')


# clean comment columns
comment_df = clean_str_col(comment_df, 'body')
comment_df = add_word_count(comment_df, 'body')

In [7]:
comment_df = comment_df.sort_values(by='body_count', ascending=False)
comment_df = comment_df.groupby('post_id').head(keep_comments).reset_index(drop=True)
comment_df

Unnamed: 0,post_id,comment_author,body,score,comment_id,body_count
0,ykp50l,fullofdust,ok im bored and like to cook so ill try to hel...,6,iuxb4bq,459
1,ytm876,Icy-Ad2082,the 25 is just how it would appear in the merc...,1,iw9jcbq,394
2,ykng41,SnooLemons5080,let’s make a few things abundantly clear. tur...,3,iux4lnn,368
3,yismqt,cmajalis,if you know someone that goes to miramar commu...,1,iul6eqy,344
4,ykng41,SnooLemons5080,sorry as someone who has spoken to people who ...,3,iux6gmi,316
...,...,...,...,...,...,...
2282,yv8khm,Dry_Perspective_1537,ew,1,iwcymjt,1
2283,yoz6eu,randumbguy2,lol,1,ivh3v0s,1
2284,ymklkn,Lt-shorts,scam,1,iv4gah5,1
2285,yqzv2f,kailron,ok,9,ivrffdi,1


In [8]:
# filter out comments without enough words
comment_df = comment_df.loc[comment_df.body_count > comment_word_min]
comment_df

Unnamed: 0,post_id,comment_author,body,score,comment_id,body_count
0,ykp50l,fullofdust,ok im bored and like to cook so ill try to hel...,6,iuxb4bq,459
1,ytm876,Icy-Ad2082,the 25 is just how it would appear in the merc...,1,iw9jcbq,394
2,ykng41,SnooLemons5080,let’s make a few things abundantly clear. tur...,3,iux4lnn,368
3,yismqt,cmajalis,if you know someone that goes to miramar commu...,1,iul6eqy,344
4,ykng41,SnooLemons5080,sorry as someone who has spoken to people who ...,3,iux6gmi,316
...,...,...,...,...,...,...
1936,yr1579,HealthOnWheels,probably the garage by sixth college.,1,ivrc4vz,6
1937,ykfaap,teddyblanket,how do you think you did,1,iuvce2s,6
1938,yiucgc,Professional-Car7655,bro just go to floor 8,1,iul1pbu,6
1939,yqquuo,SelvaSauce,dodge duck dip dive and duck,11,ivrc9iz,6


In [9]:
comment_df['comments'] = comment_df.groupby('post_id', as_index=False)['body'].transform(lambda x: '. '.join(x))
comment_df.drop_duplicates(subset=['post_id'], inplace=True)
comment_df.drop(columns=["body_count", "body"], inplace=True)
# comment_df = add_word_count(comment_df, 'comments')
comment_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comment_df['comments'] = comment_df.groupby('post_id', as_index=False)['body'].transform(lambda x: '. '.join(x))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comment_df.drop_duplicates(subset=['post_id'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comment_df.drop(columns=["body_count", "body"], inplace=True)


Unnamed: 0,post_id,comment_author,score,comment_id,comments
0,ykp50l,fullofdust,6,iuxb4bq,ok im bored and like to cook so ill try to hel...
1,ytm876,Icy-Ad2082,1,iw9jcbq,the 25 is just how it would appear in the merc...
2,ykng41,SnooLemons5080,3,iux4lnn,let’s make a few things abundantly clear. tur...
3,yismqt,cmajalis,1,iul6eqy,if you know someone that goes to miramar commu...
5,ytrogg,nowlistenhereboy,1,iw8y368,what im saying is that from the average person...
...,...,...,...,...,...
1929,yll0o6,SleepLessThan3,1,iv2je7k,im dumb where is the star
1930,ykhkr2,a7xbarbie,1,iuv91gf,33f but like to have fun…
1932,yimcuq,impuissant_iguana,1,iul1ll9,people too lazy to do stuff
1934,ym4b2q,yltneyugn,1,iv4b2jj,thank you thank you thank you


In [10]:
merged_df = post_df.merge(comment_df, on='post_id', how='left')
merged_df.comments = merged_df.comments.fillna('')
merged_df['post_text'] = merged_df['combined_text'] + " " + merged_df['comments']
# merged_df['post_text'] = merged_df['post_text'].replace('. .','.')
merged_df = add_word_count(merged_df, 'post_text')
merged_df.drop(columns=["combined_text", "comments", "selftext", "comment_author", "comment_id", "score"], inplace=True)
merged_df.drop_duplicates(subset='title', inplace=True)
merged_df

Unnamed: 0,subreddit,title,post_id,post_author,post_utc,full_link,post_text,post_text_count
0,UCSD,lost my small black coach wallet at 11 am near...,ym7lyz,Recent-Obligation417,2022-11-04 19:14:07+00:00,https://www.reddit.com/r/UCSD/comments/ym7lyz/...,lost my small black coach wallet at 11 am near...,24
1,sandiego,shark attack in del mar,ym7jn8,Xcapegoat,2022-11-04 19:11:36+00:00,https://www.reddit.com/r/sandiego/comments/ym7...,shark attack in del mar glad the person survi...,156
2,UCSD,what 11 dollars gets you at spice,ym7hqi,17kimv,2022-11-04 19:09:33+00:00,https://www.reddit.com/r/UCSD/comments/ym7hqi/...,what 11 dollars gets you at spice yeah the po...,223
3,SDSU,has anyone’s middle class scholarship not show...,ym7hf1,Exciting_Opposite_37,2022-11-04 19:09:11+00:00,https://www.reddit.com/r/SDSU/comments/ym7hf1/...,has anyone’s middle class scholarship not show...,66
4,UCSD,psa dont lock your bike to this bike rack. its...,ym7ele,jneprz,2022-11-04 19:06:06+00:00,https://www.reddit.com/r/UCSD/comments/ym7ele/...,psa dont lock your bike to this bike rack. its...,13
...,...,...,...,...,...,...,...,...
2326,sandiego,sdge energy generation vs distribution,yudppk,solidavocadorock,2022-11-13 20:49:02+00:00,https://www.reddit.com/r/sandiego/comments/yud...,sdge energy generation vs distribution i just ...,225
2327,UCSD,does having a low gpa matter,yudhby,OkDoughnut994,2022-11-13 20:41:06+00:00,https://www.reddit.com/r/UCSD/comments/yudhby/...,does having a low gpa matter i’m a first year ...,458
2328,sandiego,sdge bill generation vs distribution,yudh2m,solidavocadorock,2022-11-13 20:40:51+00:00,https://www.reddit.com/r/sandiego/comments/yud...,sdge bill generation vs distribution,7
2329,sandiego,looking for venues,yucph0,sockherlu,2022-11-13 20:15:01+00:00,https://www.reddit.com/r/sandiego/comments/yuc...,looking for venues i am looking for different ...,209


In [11]:
merged_df.to_csv('merged_reddit_data.csv', index=False)

In [12]:
merged_df

Unnamed: 0,subreddit,title,post_id,post_author,post_utc,full_link,post_text,post_text_count
0,UCSD,lost my small black coach wallet at 11 am near...,ym7lyz,Recent-Obligation417,2022-11-04 19:14:07+00:00,https://www.reddit.com/r/UCSD/comments/ym7lyz/...,lost my small black coach wallet at 11 am near...,24
1,sandiego,shark attack in del mar,ym7jn8,Xcapegoat,2022-11-04 19:11:36+00:00,https://www.reddit.com/r/sandiego/comments/ym7...,shark attack in del mar. . glad the person sur...,156
2,UCSD,what 11 dollars gets you at spice,ym7hqi,17kimv,2022-11-04 19:09:33+00:00,https://www.reddit.com/r/UCSD/comments/ym7hqi/...,what 11 dollars gets you at spice. . yeah the ...,223
3,SDSU,has anyone’s middle class scholarship not show...,ym7hf1,Exciting_Opposite_37,2022-11-04 19:09:11+00:00,https://www.reddit.com/r/SDSU/comments/ym7hf1/...,has anyone’s middle class scholarship not show...,66
4,UCSD,psa dont lock your bike to this bike rack. its...,ym7ele,jneprz,2022-11-04 19:06:06+00:00,https://www.reddit.com/r/UCSD/comments/ym7ele/...,psa dont lock your bike to this bike rack. its...,13
...,...,...,...,...,...,...,...,...
2327,UCSD,does having a low gpa matter,yudhby,OkDoughnut994,2022-11-13 20:41:06+00:00,https://www.reddit.com/r/UCSD/comments/yudhby/...,does having a low gpa matter. i’m a first year...,458
2328,sandiego,sdge bill generation vs distribution,yudh2m,solidavocadorock,2022-11-13 20:40:51+00:00,https://www.reddit.com/r/sandiego/comments/yud...,sdge bill generation vs distribution. .,7
2329,sandiego,looking for venues,yucph0,sockherlu,2022-11-13 20:15:01+00:00,https://www.reddit.com/r/sandiego/comments/yuc...,looking for venues. i am looking for different...,209
2330,sandiego,looking for venues,yucmsj,sockherlu,2022-11-13 20:12:25+00:00,https://www.reddit.com/r/sandiego/comments/yuc...,looking for venues. .,5


In [19]:
comment_df = comment_df.groupby('post_id').head(5).reset_index(drop=True)
comment_df.value_counts(subset=['post_id'])

post_id
yp1fuc     5
ym5c1a     5
ynhh5r     5
ykjci2     5
ynjngu     5
          ..
yogndj     1
yod2h9     1
yobuzl     1
yoae3x     1
yvmxnt     1
Length: 899, dtype: int64

In [20]:
comment_df

Unnamed: 0,subreddit,post_id,link_id,author,body,score,id,created_utc,title,permalink,body_count
0,UCSD,yizile,t3_yizile,iamunknowntoo,ah i see yeah im confused as to why op called ...,2,iunzprk,2022-11-01 19:39:38+00:00,中国朋友come_to_chemao道_cafe,/r/UCSD/comments/yizile/中国朋友come_to_chemao道_ca...,13
1,sandiego,yismqt,t3_yismqt,Lula121,i have a fully renovated small trailer i’m sel...,1,iuooc1i,2022-11-01 22:18:03+00:00,me_my_husband_cat_are_going_to_be_homeless_in,/r/sandiego/comments/yismqt/me_my_husband_cat_...,28
2,SanDiegan,yjeg94,t3_yjeg94,KupoMcMog,also how much of his shtick can people take h...,1,iuop4rw,2022-11-01 22:23:32+00:00,1015_kgb_dsc_radio_show_hosts_dave_rickards_and,/r/SanDiegan/comments/yjeg94/1015_kgb_dsc_radi...,130
3,sandiego,yjg6ir,t3_yjg6ir,TheElusiveHolograph,we bought about a year and 3 months ago so tha...,1,iuooz57,2022-11-01 22:22:29+00:00,sewer_service_fee_on_water_bill,/r/sandiego/comments/yjg6ir/sewer_service_fee_...,27
4,sandiego,yjn0iz,t3_yjn0iz,jeremr,false widow,1,iuoscg3,2022-11-01 22:46:02+00:00,what_kind_of_spider_is_this,/r/sandiego/comments/yjn0iz/what_kind_of_spide...,2
...,...,...,...,...,...,...,...,...,...,...,...
2282,UCSD,yipzrj,t3_yipzrj,unimpressive_op,you’re partially right cal grant is only for 4...,1,iukcuqi,2022-11-01 00:06:13+00:00,who_to_contact_if_i_want_to_take_5_years,/r/UCSD/comments/yipzrj/who_to_contact_if_i_wa...,47
2283,UCSD,yipho1,t3_yipho1,safetystegosaurus1,“clem” and linguistics… hmm,1,iuke0x3,2022-11-01 00:15:28+00:00,to_everyone_wearing_halloween_costumes_today,/r/UCSD/comments/yipho1/to_everyone_wearing_ha...,4
2284,UCSD,ysyrbu,t3_ysyrbu,Funny-Exam2081,cringe,1,iw40xbq,2022-11-12 20:08:57+00:00,you_got_a_hold_bc_of_student_health_services_i,/r/UCSD/comments/ysyrbu/you_got_a_hold_bc_of_s...,1
2285,sandiego,yst30w,t3_yst30w,chillinwithmynwords,is that a jetson scooter,1,iw41j8k,2022-11-12 20:13:08+00:00,our_city_is_beautiful,/r/sandiego/comments/yst30w/our_city_is_beauti...,5


In [7]:
post_df

Unnamed: 0,subreddit,title,id,author,created_utc,selftext,full_link,combined_text,combined_text_count
0,UCSD,lost my small black coach wallet at 11 am near...,ym7lyz,Recent-Obligation417,2022-11-04 19:14:07+00:00,,https://www.reddit.com/r/UCSD/comments/ym7lyz/...,lost my small black coach wallet at 11 am near...,22
1,sandiego,shark attack in del mar,ym7jn8,Xcapegoat,2022-11-04 19:11:36+00:00,,https://www.reddit.com/r/sandiego/comments/ym7...,shark attack in del mar,5
2,UCSD,what 11 dollars gets you at spice,ym7hqi,17kimv,2022-11-04 19:09:33+00:00,,https://www.reddit.com/r/UCSD/comments/ym7hqi/...,what 11 dollars gets you at spice,7
3,SDSU,has anyone’s middle class scholarship not show...,ym7hf1,Exciting_Opposite_37,2022-11-04 19:09:11+00:00,,https://www.reddit.com/r/SDSU/comments/ym7hf1/...,has anyone’s middle class scholarship not show...,12
4,UCSD,psa dont lock your bike to this bike rack its ...,ym7ele,jneprz,2022-11-04 19:06:06+00:00,,https://www.reddit.com/r/UCSD/comments/ym7ele/...,psa dont lock your bike to this bike rack its ...,11
...,...,...,...,...,...,...,...,...,...
2560,UCSD,does having a low gpa matter,yudhby,OkDoughnut994,2022-11-13 20:41:06+00:00,i’m a first year and am not doing very well in...,https://www.reddit.com/r/UCSD/comments/yudhby/...,does having a low gpa matteri’m a first year a...,89
2561,sandiego,sdge bill generation vs distribution,yudh2m,solidavocadorock,2022-11-13 20:40:51+00:00,,https://www.reddit.com/r/sandiego/comments/yud...,sdge bill generation vs distribution,5
2562,sandiego,looking for venues,yucph0,sockherlu,2022-11-13 20:15:01+00:00,i am looking for different venue options for m...,https://www.reddit.com/r/sandiego/comments/yuc...,looking for venuesi am looking for different v...,44
2563,sandiego,looking for venues,yucmsj,sockherlu,2022-11-13 20:12:25+00:00,,https://www.reddit.com/r/sandiego/comments/yuc...,looking for venues,3


In [8]:
comment_df

Unnamed: 0,subreddit,post_id,link_id,author,body,score,id,created_utc,title,permalink,body_count
0,UCSD,yizile,t3_yizile,iamunknowntoo,ah i see yeah im confused as to why op called ...,2,iunzprk,2022-11-01 19:39:38+00:00,中国朋友come_to_chemao道_cafe,/r/UCSD/comments/yizile/中国朋友come_to_chemao道_ca...,13
1,sandiego,yismqt,t3_yismqt,Lula121,i have a fully renovated small trailer i’m sel...,1,iuooc1i,2022-11-01 22:18:03+00:00,me_my_husband_cat_are_going_to_be_homeless_in,/r/sandiego/comments/yismqt/me_my_husband_cat_...,28
2,SanDiegan,yjeg94,t3_yjeg94,KupoMcMog,also how much of his shtick can people take h...,1,iuop4rw,2022-11-01 22:23:32+00:00,1015_kgb_dsc_radio_show_hosts_dave_rickards_and,/r/SanDiegan/comments/yjeg94/1015_kgb_dsc_radi...,130
3,sandiego,yjg6ir,t3_yjg6ir,TheElusiveHolograph,we bought about a year and 3 months ago so tha...,1,iuooz57,2022-11-01 22:22:29+00:00,sewer_service_fee_on_water_bill,/r/sandiego/comments/yjg6ir/sewer_service_fee_...,27
4,sandiego,yjn0iz,t3_yjn0iz,jeremr,false widow,1,iuoscg3,2022-11-01 22:46:02+00:00,what_kind_of_spider_is_this,/r/sandiego/comments/yjn0iz/what_kind_of_spide...,2
...,...,...,...,...,...,...,...,...,...,...,...
4561,sandiego,yr0wf2,t3_yr0wf2,dezld,how he was the first choice on the ballot and ...,1,ivux2ab,2022-11-10 20:05:31+00:00,holy_shit_mike_schaefer_won_60_of_the_vote_for,/r/sandiego/comments/yr0wf2/holy_shit_mike_sch...,57
4562,UCSD,yqf7oc,t3_yqf7oc,Kooky_Concentrate553,winner winner,1,ivuxqi2,2022-11-10 20:10:00+00:00,youll_never_guess_what_happens_in_rita,/r/UCSD/comments/yqf7oc/youll_never_guess_what...,2
4563,SanDiegan,yrmlqa,t3_yrmlqa,leesfer,a massive waste of space in places that could ...,1,ivuv715,2022-11-10 19:53:07+00:00,topgolf_is_finally_coming,/r/SanDiegan/comments/yrmlqa/topgolf_is_finall...,12
4564,SanDiegan,yrpjjr,t3_yrpjjr,love_sun_shine,you can likely change your filter yourself whi...,1,ivuuh7y,2022-11-10 19:48:21+00:00,how_much_does_an_oil_change_cost_around_here,/r/SanDiegan/comments/yrpjjr/how_much_does_an_...,29


In [23]:
comment_df = comment_df.sort_values(by='body_count', ascending=False)
comment_df

Unnamed: 0,subreddit,post_id,link_id,author,body,score,id,created_utc,title,permalink,body_count
2017,sandiego,ykp50l,t3_ykp50l,fullofdust,ok im bored and like to cook so ill try to hel...,6,iuxb4bq,2022-11-03 18:14:41+00:00,spicy_spaghetti_aka_srirachaghetti_anywhere,/r/sandiego/comments/ykp50l/spicy_spaghetti_ak...,459
253,sandiego,yrvc8z,t3_yrvc8z,WormSlayers,thats not really how it works in the real worl...,1,ivw6t3w,2022-11-11 01:34:42+00:00,this_lady_walking_around_sunrise_buffet_with_her,/r/sandiego/comments/yrvc8z/this_lady_walking_...,302
438,SanDiegan,yv42lp,t3_yv42lp,coldjesusbeer,yeah the whole esa concept has to be open for ...,1,iwekix5,2022-11-15 01:33:38+00:00,pet_screening_demand_from_landlord_fight_back,/r/SanDiegan/comments/yv42lp/pet_screening_dem...,300
1088,UCSD,ykcso4,t3_ykcso4,Impossible-Ad-3073,there’s no way i could respond to this in a pc...,1,iuw1k0j,2022-11-03 13:10:18+00:00,how_does_a_stem_heavy_school_like_ucsd_even,/r/UCSD/comments/ykcso4/how_does_a_stem_heavy_...,297
1819,UCSD,ythp5q,t3_ythp5q,Kuryaka,people who say they can tell by some vague und...,1,iwbhbys,2022-11-14 12:15:49+00:00,showing_off_signs_of_depression_to_other_people,/r/UCSD/comments/ythp5q/showing_off_signs_of_d...,294
...,...,...,...,...,...,...,...,...,...,...,...
600,sandiego,yj0y50,t3_yj0y50,Hollowpoint808,lovely,1,iuvba5n,2022-11-03 08:01:17+00:00,adding_to_the_other_shots_of_the_sunset_tonight,/r/sandiego/comments/yj0y50/adding_to_the_othe...,1
328,UCSD,ytq7s8,t3_ytq7s8,gloomyfairy10,mychart,1,iw5yksc,2022-11-13 05:17:43+00:00,flu_shot,/r/UCSD/comments/ytq7s8/flu_shot/iw5yksc/,1
566,UCSD,yksnnc,t3_yksnnc,AcanthocephalaSad541,link,1,iuwpbzz,2022-11-03 15:55:55+00:00,are_any_ucsd_students_holding_a_lol_world_finals,/r/UCSD/comments/yksnnc/are_any_ucsd_students_...,1
1482,UCSD,yksv8g,t3_yksv8g,Kirboolin,down,1,iuv931n,2022-11-03 07:28:17+00:00,one_piece_red,/r/UCSD/comments/yksv8g/one_piece_red/iuv931n/,1


In [12]:
post_df.selftext = post_df.selftext.apply(lambda x: emoji.demojize(x))
post_df.title = post_df.title.apply(lambda x: emoji.demojize(x))

In [13]:
post_df.title.loc[post_df.id.str.contains('ym6jd1')]

11    Could today get any worse? :upside-down_face:
Name: title, dtype: object

In [14]:
post_df['combined_text'] = post_df['title'] + post_df['selftext']
post_df['text_len'] = post_df['combined_text'].apply(lambda x: len(x.split(' ')))
post_df

Unnamed: 0,subreddit,title,id,author,created_utc,selftext,full_link,combined_text,text_len
0,UCSD,Lost my small black coach wallet at 11 am near...,ym7lyz,Recent-Obligation417,2022-11-04 19:14:07+00:00,,https://www.reddit.com/r/UCSD/comments/ym7lyz/...,Lost my small black coach wallet at 11 am near...,22
1,sandiego,Shark Attack in Del Mar,ym7jn8,Xcapegoat,2022-11-04 19:11:36+00:00,,https://www.reddit.com/r/sandiego/comments/ym7...,Shark Attack in Del Mar,5
2,UCSD,what 11 dollars gets you at Spice,ym7hqi,17kimv,2022-11-04 19:09:33+00:00,,https://www.reddit.com/r/UCSD/comments/ym7hqi/...,what 11 dollars gets you at Spice,7
3,SDSU,Has anyone’s middle class scholarship not show...,ym7hf1,Exciting_Opposite_37,2022-11-04 19:09:11+00:00,,https://www.reddit.com/r/SDSU/comments/ym7hf1/...,Has anyone’s middle class scholarship not show...,12
4,UCSD,PSA don't lock your bike to this bike rack. it...,ym7ele,jneprz,2022-11-04 19:06:06+00:00,,https://www.reddit.com/r/UCSD/comments/ym7ele/...,PSA don't lock your bike to this bike rack. it...,11
...,...,...,...,...,...,...,...,...,...
2560,UCSD,Does having a low gpa matter?,yudhby,OkDoughnut994,2022-11-13 20:41:06+00:00,I’m a first year and am not doing very well in...,https://www.reddit.com/r/UCSD/comments/yudhby/...,Does having a low gpa matter?I’m a first year ...,87
2561,sandiego,SDGE bill generation vs distribution,yudh2m,solidavocadorock,2022-11-13 20:40:51+00:00,,https://www.reddit.com/r/sandiego/comments/yud...,SDGE bill generation vs distribution,5
2562,sandiego,Looking for venues,yucph0,sockherlu,2022-11-13 20:15:01+00:00,I am looking for different venue options for m...,https://www.reddit.com/r/sandiego/comments/yuc...,Looking for venuesI am looking for different v...,44
2563,sandiego,Looking for venues,yucmsj,sockherlu,2022-11-13 20:12:25+00:00,,https://www.reddit.com/r/sandiego/comments/yuc...,Looking for venues,3


In [15]:
comment_df.sort_values(by="score")

Unnamed: 0,subreddit,post_id,link_id,author,body,score,id,created_utc,title,permalink
4345,sandiego,ykq06g,t3_ykq06g,daringtransgression,No.,-9,iuxcqu3,2022-11-03 18:25:10+00:00,confronting_a_man_who_is_taking_pictures_of_them,/r/sandiego/comments/ykq06g/confronting_a_man_...
3396,sandiego,yqo23z,t3_yqo23z,Wizardof1000Kings,I'm against all gambling as it serves to worse...,-5,ivrdmel,2022-11-10 01:20:14+00:00,prop_30_175_tax_on_multimillion_dollar_incomes,/r/sandiego/comments/yqo23z/prop_30_175_tax_on...
3395,sandiego,yqqumo,t3_yqqumo,itsamezario,Such a naive POV. Rules exist for a reason. Yo...,-2,ivre85y,2022-11-10 01:24:49+00:00,is_this_legal_homeless_guy_living_on_pb_for_the,/r/sandiego/comments/yqqumo/is_this_legal_home...
3554,sandiego,yqqumo,t3_yqqumo,Uglyduckling75,Balboa Park is pretty much the homeless capita...,-2,ivr9oqs,2022-11-10 00:50:17+00:00,is_this_legal_homeless_guy_living_on_pb_for_the,/r/sandiego/comments/yqqumo/is_this_legal_home...
3553,sandiego,yqqumo,t3_yqqumo,failfast2etna,Ok. At this point I can definitely tell you're...,-2,ivr9iqk,2022-11-10 00:49:03+00:00,is_this_legal_homeless_guy_living_on_pb_for_the,/r/sandiego/comments/yqqumo/is_this_legal_home...
...,...,...,...,...,...,...,...,...,...,...
555,sandiego,yqzf7p,t3_yqzf7p,NoToNope,I wonder what kind of garbage parents Ashli an...,188,ivradd0,2022-11-10 00:55:25+00:00,domestic_terrorist_ashli_babbitts_brother_has,/r/sandiego/comments/yqzf7p/domestic_terrorist...
554,sandiego,yqzf7p,t3_yqzf7p,oicnitall,"Too bad his name isn't Roger Babbitt, at least...",192,ivrbgj0,2022-11-10 01:03:39+00:00,domestic_terrorist_ashli_babbitts_brother_has,/r/sandiego/comments/yqzf7p/domestic_terrorist...
553,sandiego,yr26hd,t3_yr26hd,CplUseless,Whale mating season.,221,ivrjf29,2022-11-10 02:03:57+00:00,anyone_knows_whats_up_with_the_foam_at_torrey,/r/sandiego/comments/yr26hd/anyone_knows_whats...
90,sandiego,ylaxg7,t3_ylaxg7,Payorfixyourself,Zoo,354,iuxg0wc,2022-11-03 18:46:00+00:00,somewhere_in_san_diego_take_a_guess,/r/sandiego/comments/ylaxg7/somewhere_in_san_d...


In [None]:
import regex as re    
re.sub(ur"\p{P}+", "", txt)

In [29]:
remove = string.punctuation
# remove.replace(".", "")
remove = ''.join(list(set(string.punctuation).difference(set(['.', ' ']))))

In [21]:
a = None
if a:
    print('Hello')

In [27]:
a = ['.']
''.join(list(set(string.punctuation).difference(set(a))))

'!",<&*|[(_+#~=/`@\\$%-{>^?\'];:})'

In [48]:
post_df['a'] = comment_df['body'].apply(lambda x: x.translate(str.maketrans('', '', remove)))
# post_df['a'].apply(lambda x: x.replace('\n', ' '))
# post_df
post_df['a'].replace(r'\n',' ', regex=True) 

0       Ah I see yeah Im confused as to why OP called ...
1       I have a fully renovated small trailer I’m sel...
2       also how much of his shtick can people take.  ...
3       We bought about a year and 3 months ago so tha...
4                                             False Widow
                              ...                        
2560                                                  NaN
2561    Air and space museum. Just had a vendor show w...
2562    As of 111 am on 1109 yes is leading by 1409 votes
2563    If you are from another UC it’s gonna be for f...
2564    That’s helpful I’m currently taking cogs 107a ...
Name: a, Length: 2332, dtype: object

In [50]:
post_df.loc[post_df.index == 2560, :]

Unnamed: 0,subreddit,title,id,author,created_utc,selftext,full_link,combined_text,text_len,a
2560,UCSD,Does having a low gpa matter?,yudhby,OkDoughnut994,2022-11-13 20:41:06+00:00,I’m a first year and am not doing very well in...,https://www.reddit.com/r/UCSD/comments/yudhby/...,Does having a low gpa matter?I’m a first year ...,87,
