### **Imports**

In [123]:
import numpy as np
import requests
import pandas as pd
import matplotlib.pyplot as plt
import time
import sys

from collections import Counter

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import re

#### **Read in data saved in ./datasets folder**

In [86]:
# Load the askscience posts export into a DataFrame.
# The path is relative, so the CSV is looked up in the current working directory.
asksci=pd.read_csv('askscience_posts.csv')

In [87]:
# Remove the 'Unnamed: 0' column (presumably the row index that was saved
# along with the CSV -- confirm against the export step).
# errors='ignore' keeps this cell safe to re-run: once the column is gone a
# second run is a no-op instead of raising KeyError.
asksci = asksci.drop(columns='Unnamed: 0', errors='ignore')

In [88]:
# (rows, columns) of the askscience posts frame.
asksci.shape

(35300, 6)

In [89]:
# Column names available in the askscience posts frame.
asksci.columns

Index(['title', 'author', 'created_utc', 'selftext', 'score', 'subreddit'], dtype='object')

In [90]:
# Preview the first rows of the askscience posts.
asksci.head()

Unnamed: 0,title,author,created_utc,selftext,score,subreddit
0,Why do we vomit when we are too hot?,childloser,1627143368,[removed],1,askscience
1,How much CO2 is released by manufacturing a ki...,banmeyoucoward,1627143292,[removed],1,askscience
2,Is the level of AI portrayed in movies (fully ...,SchoolThrow123,1627143240,[removed],1,askscience
3,Why do some animals have moustache and whiskers,notowork,1627143114,[removed],1,askscience
4,What is the maximal theoretical size for an an...,Vantaie,1627142862,[removed],1,askscience


In [94]:
# Per-column dtype and non-null count; used here to spot columns with missing values.
asksci.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35300 entries, 0 to 35299
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        35300 non-null  object
 1   author       35300 non-null  object
 2   created_utc  35300 non-null  int64 
 3   selftext     35028 non-null  object
 4   score        35300 non-null  int64 
 5   subreddit    35300 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.6+ MB


In [98]:
# Rows whose selftext is missing (NaN).
asksci.loc[asksci['selftext'].isna()]

Unnamed: 0,title,author,created_utc,selftext,score,subreddit
187,Since mosquitoes and flies find their prey par...,whatisnuclear,1627091013,,1,askscience
1721,"Between foam, liquid, or bar, what is the best...",PHealthy,1626696841,,1,askscience
3203,What's the chance of getting Alzheimer?,[deleted],1626259248,,1,askscience
3204,What to do with a closed Oxidizing bottle?,[deleted],1626259038,,1,askscience
3205,What's the chance of getting Alzheimer,[deleted],1626258896,,1,askscience
...,...,...,...,...,...,...
33197,Lower Cortisol in Depression,[deleted],1617549755,,1,askscience
33347,Why does our immune system can fend off some p...,[deleted],1617502609,,1,askscience
33587,The murder mystery of Spain's 'Pit of Bones',[deleted],1617430573,,1,askscience
34069,what is that inner monologue in your head?,[deleted],1617298989,,1,askscience


In [92]:
# Number of posts whose selftext is anything other than "[removed]".
# Missing values are included in this count, because NaN compares as not-equal.
asksci['selftext'].ne("[removed]").sum()

377

In [93]:
# Number of posts whose selftext is anything other than "[deleted]"
# (missing values are counted too, because NaN compares as not-equal).
asksci['selftext'].ne("[deleted]").sum()

35300

In [28]:
# Re-check the dimensions of the askscience posts frame.
asksci.shape

(35300, 6)

In [118]:
# Load the longevity posts export into a DataFrame.
# The path is relative, so the CSV is looked up in the current working directory.
long = pd.read_csv('longevity_posts.csv')

In [119]:
# Remove the 'Unnamed: 0' column (presumably the row index that was saved
# along with the CSV -- confirm against the export step).
# errors='ignore' keeps this cell safe to re-run: once the column is gone a
# second run is a no-op instead of raising KeyError.
long = long.drop(columns='Unnamed: 0', errors='ignore')

In [120]:
# (rows, columns) of the longevity posts frame.
long.shape

(18519, 6)

In [121]:
# Preview the first rows of the longevity posts.
long.head()

Unnamed: 0,title,author,created_utc,selftext,score,subreddit
0,Scientists discover gene therapy provides neur...,festlap,1627142424,,1,longevity
1,Does Young Blood Really Have The Power To Reju...,JoeDerivative,1627141616,,1,longevity
2,Excess coffee: A bitter brew for brain health,stankmanly,1627134191,,1,longevity
3,Extending Human Lifespans: Using Artificial In...,sim04ful,1627123037,,1,longevity
4,YAP Upregulation as a Potentially Broad Basis ...,chromosomalcrossover,1627115449,,1,longevity


In [35]:
# Number of longevity posts whose selftext is missing (NaN).
long['selftext'].isna().sum()

13002

In [40]:
# Keep only the first occurrence of each fully identical row.
long = long.drop_duplicates()

In [42]:
# Dimensions of the longevity posts frame after de-duplication.
long.shape

(18518, 6)

In [43]:
# Column names of the longevity posts frame.
long.columns

Index(['title', 'author', 'created_utc', 'selftext', 'score', 'subreddit'], dtype='object')

In [44]:
# Column names of the askscience posts frame, for comparison with the longevity frame.
asksci.columns

Index(['title', 'author', 'created_utc', 'selftext', 'score', 'subreddit'], dtype='object')

In [58]:
# Stack the askscience and longevity posts into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# input keeps its own row labels, so labels repeat and label-based lookups
# such as df.loc[0] match more than one row.
df = pd.concat([asksci, long], ignore_index=True)

In [59]:
# Preview the first rows of the combined posts frame.
df.head()

Unnamed: 0,title,author,created_utc,selftext,score,subreddit
0,Why do we vomit when we are too hot?,childloser,1627143368,[removed],1,askscience
1,How much CO2 is released by manufacturing a ki...,banmeyoucoward,1627143292,[removed],1,askscience
2,Is the level of AI portrayed in movies (fully ...,SchoolThrow123,1627143240,[removed],1,askscience
3,Why do some animals have moustache and whiskers,notowork,1627143114,[removed],1,askscience
4,What is the maximal theoretical size for an an...,Vantaie,1627142862,[removed],1,askscience


In [60]:
# (rows, columns) of the combined posts frame.
df.shape

(53818, 6)

In [84]:
# Null counts per column among posts whose selftext is neither "[removed]"
# nor "[deleted]" (rows with a missing selftext are kept by this filter).
df.loc[~df['selftext'].isin(["[removed]", "[deleted]"])].isnull().sum()

title              0
author             0
created_utc        0
selftext       13274
score              0
subreddit          0
dtype: int64

In [70]:
# Number of posts whose selftext is exactly "[removed]".
df['selftext'].eq("[removed]").sum()

37198

In [71]:
# Number of posts whose selftext is exactly "[deleted]".
df['selftext'].eq("[deleted]").sum()

410

In [48]:
# Load the longevity comments export and discard its 'Unnamed: 0' column.
longevity_comments = (
    pd.read_csv('longevityinlongevity_comments.csv')
    .drop(columns='Unnamed: 0')
)

In [49]:
# Preview the first rows of the longevity comments.
longevity_comments.head()

Unnamed: 0,author,body,score,created_utc,subreddit
0,nickengerer,Shameless plug: [longevity blog](http://nicken...,1,1626988425,longevity
1,rosts,"Not directly about longevity, but I'm expectin...",10,1626978437,longevity
2,newbooke,"Sure, hopefully. By definition it kind of can'...",1,1626977878,longevity
3,snormie,Better fix climate change or all the longevity...,-9,1626963790,longevity
4,mamaBiskothu,"Have you seen big hero 6, there's this stoner ...",-8,1626953498,longevity


In [50]:
# Load the askscience comments export and discard its 'Unnamed: 0' column.
askscience_comments = (
    pd.read_csv('longevityinaskscience_comments.csv')
    .drop(columns='Unnamed: 0')
)

In [53]:
# Stack the comments from both subreddits into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# input keeps its own row labels, so labels repeat and label-based lookups
# such as df_comments.loc[0] match more than one row.
df_comments = pd.concat([longevity_comments, askscience_comments], ignore_index=True)

0

**Create labels for subreddits**

In [63]:
# Integer class label assigned to each subreddit name.
subreddit_dict = dict(longevity=0, askscience=1)

In [64]:
# Replace subreddit names with their integer labels from subreddit_dict.
# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time on already-encoded columns would wipe the
# labels. Only map while the column still contains subreddit names, which
# makes the cell safe to re-run.
for frame in (df_comments, df):
    if frame['subreddit'].isin(list(subreddit_dict)).any():
        frame['subreddit'] = frame['subreddit'].map(subreddit_dict)

In [65]:
# Preview the combined comments after the subreddit column has been encoded.
df_comments.head()

Unnamed: 0,author,body,score,created_utc,subreddit
0,nickengerer,Shameless plug: [longevity blog](http://nicken...,1,1626988425,0
1,rosts,"Not directly about longevity, but I'm expectin...",10,1626978437,0
2,newbooke,"Sure, hopefully. By definition it kind of can'...",1,1626977878,0
3,snormie,Better fix climate change or all the longevity...,-9,1626963790,0
4,mamaBiskothu,"Have you seen big hero 6, there's this stoner ...",-8,1626953498,0
