In [2]:
import pandas as pd
import matplotlib.pyplot as plt

Read dataframes from the files.

In [10]:
df_comments = pd.read_csv('main_comments.csv')

In [14]:
df_submissions = pd.read_csv('main_submissions.csv')

In [11]:
df_comments

Unnamed: 0,id,score,link_id,author,subreddit,created_utc
0,t1_gl0hhsq,1,t3_l6a44e,Io99IHkg-4QzX6xbKwbte0cuzp4=,wallstreetbets,1611788340
1,t1_gmd0xrl,1,t3_lehibh,1UBdU9GQvCnnXQHAcYaG1uL9V_U=,RedditSessions,1612683157
2,t1_gggg1ed,1,t3_kgocvo,Io99IHkg-4QzX6xbKwbte0cuzp4=,memes,1608454223
3,t1_g7zggfh,1,t3_j6n57d,EA1r-K5p_lVBLesLhCFRrKOPN-I=,videos,1602058834
4,t1_fn060jg,26,t3_fyheuv,_aeNuqWD_AT5JIfooWYpKiZR8qg=,nfl,1586536065
...,...,...,...,...,...,...
27997318,t1_f8sotb4,1,t3_e20bxb,EA1r-K5p_lVBLesLhCFRrKOPN-I=,Showerthoughts,1574785778
27997319,t1_eczq4gz,1,t3_abbin1,Pkz1m3vsliYpbnltUnWaQkPFLEo=,AskReddit,1546315274
27997320,t1_f240cz6,0,t3_dbujdv,NfwullBPKgqUPvj_Qr6RPnH1hrI=,DestinyTheGame,1569942007
27997321,t1_em778f3,2,t3_bj2bnd,kj12hcxGWPd3LxjKCNpoPFTDNBQ=,funny,1556683776


In [12]:
df_submissions

Unnamed: 0,id,author,created_utc,domain,is_self,score,subreddit
0,t3_kby7v8,ZkakZ8xfXNiL6wUXhJhRU3Ysa7c=,1607811261,self.AskReddit,True,3,AskReddit
1,t3_nwccst,pQ8j9hcuty1RBfCkUutL0cy9Zco=,1623290593,i.redd.it,False,11,pics
2,t3_ik55fe,GH3YzjQCtBT4DkTjcxWb2-vPoa0=,1598903369,self.Market76,True,0,Market76
3,t3_hkl0qs,57zAuZ7GS2EvQeCSXqU19wYZgjs=,1593790032,v.redd.it,False,1,funny
4,t3_jdrjpp,HY8UtHbd9PNUDFmaAkm4-10hgmA=,1603066751,i.redd.it,False,578,PrequelMemes
...,...,...,...,...,...,...,...
1857436,t3_cqojdd,Io99IHkg-4QzX6xbKwbte0cuzp4=,1565868449,i.redd.it,False,1,FIFA
1857437,t3_amszup,Io99IHkg-4QzX6xbKwbte0cuzp4=,1549222271,i.redd.it,False,1,dankmemes
1857438,t3_dpfmog,cCp3C69w-4gcIYjKtvJpPmsjqdM=,1572481507,i.redd.it,False,15,teenagers
1857439,t3_bwulmr,2xibwTUA50T87ThesoIxnimOjKw=,1559685003,i.redd.it,False,12,aww


## Data collection and cleaning

Filter out the lines with incorrectly hashed usernames.

In [35]:
wrong_username = ['Io99IHkg-4QzX6xbKwbte0cuzp4=', 'EA1r-K5p_lVBLesLhCFRrKOPN-I=']
filtered_df_comments = df_comments[~df_comments['author'].isin(wrong_username)]
filtered_df_submissions = df_submissions[~df_submissions['author'].isin(wrong_username)]

Try to figure out which subreddits are politically related.

In [36]:
subreddits = filtered_df_comments['subreddit'].unique()
print(len(subreddits))
subreddits

92


array(['RedditSessions', 'nfl', 'soccer', 'ukpolitics', 'AskReddit',
       'unpopularopinion', 'news', 'memes', 'EscapefromTarkov',
       'teenagers', 'politics', 'worldnews', 'PrequelMemes',
       'distantsocializing', 'dankmemes', 'aww', 'Coronavirus',
       'Showerthoughts', 'TheYouShow', 'CFB', 'Animemes', 'neoliberal',
       'CryptoCurrency', 'europe', 'apexlegends', 'DestinyTheGame',
       'gaming', 'HistoryMemes', 'SquaredCircle', 'videos',
       'pcmasterrace', 'ACTrade', 'AmItheAsshole', 'NoStupidQuestions',
       'todayilearned', 'Genshin_Impact', 'funny', 'FortNiteBR',
       'PewdiepieSubmissions', 'Superstonk', 'barstoolsports',
       'wallstreetbets', 'nba', 'Whatcouldgowrong', 'PublicFreakout',
       'facepalm', 'Market76', 'relationship_advice', 'pics', 'formula1',
       'relationships', 'WTF', 'FreeKarma4U', 'RocketLeagueExchange',
       'gifs', 'wow', 'modernwarfare', 'mildlyinteresting',
       'PoliticalCompassMemes', 'personalfinance', 'AskMen', 'PS5',


It turns out that there are only 92 different subreddits occurring, which is not a lot. For the sake of accuracy we will manually identify the
(overtly) political subreddits. Then we filter the two dataframes with the array of political subreddits.

In [29]:
political_subreddits = ['ukpolitics', 'news', 'worldnews', 'neoliberal', 'Conservative', 'PoliticalCompassMemes', 'PoliticalHumor']
political_subreddits

['ukpolitics',
 'news',
 'worldnews',
 'neoliberal',
 'Conservative',
 'PoliticalCompassMemes',
 'PoliticalHumor']

In [42]:
filtered_df_comments = filtered_df_comments[filtered_df_comments['subreddit'].isin(political_subreddits)]
filtered_df_submissions = filtered_df_submissions[filtered_df_submissions['subreddit'].isin(political_subreddits)]

## Distribution of GS-scores

## Longevity in communities

## Detecting political echo chambers