In [55]:
# Imports
import pandas as pd

In [56]:
# Read in the data csv's with only the 6-Reddit Params identified as useful for NLP
suicide_OG = pd.read_csv('../data/suicide_watch_6.csv')
depression_OG = pd.read_csv('../data/depression_6.csv')

In [57]:
# Column names associated with 6 Reddit Parameters identified as useful for NLP
reddit_params = ['subreddit','author','selftext', 'title', 'num_comments', 'created_utc']

## EDA

### A.) r/suicide_watch Data

In [58]:
# Keep an unclean copy for future reference
suicide_df = suicide_OG.copy()

In [59]:
# Every single entry has a properly labeled tag for the subreddit: 'SuicideWatch'
suicide_df.subreddit.value_counts()

SuicideWatch    100100
Name: subreddit, dtype: int64

In [60]:
# Reassign for simplicity
suicide_df['subreddit'] = 1

In [106]:
# Check for proper data types, all looks correct
suicide_df.dtypes

subreddit        int64
author          object
selftext        object
title           object
num_comments     int64
created_utc      int64
dtype: object

In [62]:
# Notice 2-titles are null and 6,249-'selftext' entries are null
suicide_df.isnull().sum()

subreddit          0
author             0
selftext        6249
title              2
num_comments       0
created_utc        0
dtype: int64

In [63]:
# After a closer look at null 'title' entries, the 'selftext' data are worth keeping
suicide_df[suicide_df['title'].isnull()==True].selftext

63974    Nobody really cares unless you have a reason o...
75035    I’ve been feeling pretty empty these past few ...
Name: selftext, dtype: object

In [64]:
# 6,093 [removed] 'selftext' entries
# 2,628 [deleted] 'selftext' entries

# 83,446 + 619 = 84,065 unique 'selftext' entries
suicide_df.selftext.value_counts()[618:].sum()

83446

In [65]:
# There are 86,389 unique titles 
# Consider how to ID duplicates to drop
suicide_df.title.value_counts()[:10]

.                        310
Help                     272
I want to die            194
I need help              153
Goodbye                  105
I'm done                  98
I’m done                  91
help                      88
Please help me            70
I want to kill myself     66
Name: title, dtype: int64

In [66]:
# There are 54,682 unique authors contributing
# There are 3,077 author names [deleted]
suicide_df.author.value_counts()

[deleted]             3077
NightAir_              133
cornrain               121
darkpols368            102
Throwawayiguess457     102
                      ... 
peacoatbuttons           1
Ril_Ruq                  1
GaetanoCim               1
anononder                1
Tomahawk598              1
Name: author, Length: 54682, dtype: int64

In [67]:
# 24.89% have zero comments
# ~60% have 2 comments or less
round(suicide_df.num_comments.value_counts(normalize=True)[:5]*100,2)

0    24.89
1    20.68
2    14.40
3     9.18
4     6.57
Name: num_comments, dtype: float64

***SUMMARY OF EDA FINDINGS***

1. selftext
    * 6,249 NaN
    * 6,093 '[removed]'
    * 2,628 '[deleted]'
    * 619 w/ at least 1 duplicate
2. title
    * 2 NaN
    * 86,389 unique
3. author
    * 3,077 '[deleted]'
    * 54,682 unique

## Cleaning Data

### A.) r/suicide_watch

In [68]:
# Drop all 6,249 submissions with null 'selftext'
suicide_df.dropna(inplace=True)
suicide_df.isnull().sum()

subreddit       0
author          0
selftext        0
title           0
num_comments    0
created_utc     0
dtype: int64

In [69]:
# ID indexes where selftext entries equal to either [deleted] or [removed]
drop_rows = suicide_df[(suicide_df.selftext == '[deleted]') | 
                              (suicide_df.selftext == '[removed]')
                             ].index.tolist()

# Drop all 6,093-'[removed]' & 2,628-'[deleted]' selftext data
suicide_df.drop(drop_rows,inplace=True)

In [70]:
# Comments are not included in this analysis so,
# posts where selftext AND title are duplicates should be dropped
suicide_df.drop_duplicates(subset=['title','selftext'], 
                           keep='last', 
                           inplace=True, 
                           ignore_index=True)

In [71]:
# The indices have no meaning so lets clean it up
suicide_df.reset_index(drop=True, inplace=True)

In [72]:
# Notice only 4 selftext/title duplicates were removed in cleaning
suicide_df.selftext.value_counts()[615:];

In [73]:
# Notice that we still have 73,325 unique title entries after cleaning
suicide_df.selftext.value_counts()[:10]

.                                                                                                                                                                                                                                      26
Please                                                                                                                                                                                                                                 18
Title                                                                                                                                                                                                                                  17
please                                                                                                                                                                                                                                 12
...                                                             

In [74]:
# NOTE: I did not directly drop [deleted] authors, but now there are over 3,000 fewer
len(suicide_df[suicide_df['author']=='[deleted]'])

71

In [75]:
# 20.64% have zero comments
# 57.43% have 2 comments or less
round(suicide_df.num_comments.value_counts(normalize=True)[:5]*100,2)

1    21.72
0    20.64
2    15.07
3     9.80
4     6.98
Name: num_comments, dtype: float64

## EDA

### B.) r/depression Data

In [76]:
# Keep an unclean copy for future reference
depression_df = depression_OG.copy()

In [77]:
# Every single entry has a properly labeled tag for the subreddit: 'depression'
depression_df.subreddit.value_counts()

depression    100100
Name: subreddit, dtype: int64

In [78]:
# Reassign for simplicity
depression_df['subreddit'] = 0

In [79]:
# Check for proper data types, all looks correct
depression_df.dtypes

subreddit        int64
author          object
selftext        object
title           object
num_comments     int64
created_utc      int64
dtype: object

In [80]:
# Notice 1,309 entries in 'selftext' are null
depression_df.isnull().sum()

subreddit          0
author             0
selftext        1309
title              0
num_comments       0
created_utc        0
dtype: int64

In [81]:
# 14,189 [removed] selftext
# 2,163 [deleted] selftext

# 81,542 + 410 = 81,951 unique 'selftext' entries
depression_df.selftext.value_counts()[409:].sum()

81542

In [82]:
# There are 89,999 unique titles 
# Consider how to ID duplicates to drop
depression_df.title.value_counts()[:10]

Help                                                                      160
I need help                                                               109
.                                                                         106
AIs from AI Dungeon 2 to sexy to funny and one based wholly on Reddit!     91
I hate myself                                                              80
I want to die                                                              77
...                                                                        69
I don't know what to do                                                    63
Depression                                                                 61
Am I depressed?                                                            61
Name: title, dtype: int64

In [83]:
# There are 59,130 unique authors contributing
# There are 3,004 author names [deleted]
depression_df.author.value_counts()

[deleted]           3004
HypeToHype           122
h3xadecimal2          91
lifeishard99          78
_ChilledVibez         77
                    ... 
gothiccheesepuff       1
anand-damani           1
God_Is_Pizza           1
feonixrizen            1
vhr6190                1
Name: author, Length: 59130, dtype: int64

In [84]:
# 43.52% have zero comments
# ~72% have 2 comments or less
round(depression_df.num_comments.value_counts(normalize=True)[:5]*100,2)

0    43.52
1    16.68
2    11.91
3     7.11
4     5.12
Name: num_comments, dtype: float64

## Cleaning Data

### B.) r/depression

In [85]:
# Drop all 1,309 submissions with null 'selftext'
depression_df.dropna(inplace=True)
depression_df.isnull().sum()

subreddit       0
author          0
selftext        0
title           0
num_comments    0
created_utc     0
dtype: int64

In [86]:
# ID indexes where selftext entries equal to either [deleted] or [removed]
drop_rows = depression_df[(depression_df.selftext == '[deleted]') | 
                              (depression_df.selftext == '[removed]')
                             ].index.tolist()

# Drop all 14,189-'[removed]' & 2,163-'[deleted]' selftext data
depression_df.drop(drop_rows,inplace=True)

In [87]:
# Comments are not included in this analysis so,
# posts with duplicate selftext AND title submissions should be dropped
depression_df.drop_duplicates(subset=['title','selftext'], 
                           keep='last', 
                           inplace=True, 
                           ignore_index=True)

In [88]:
# The indices have no meaning so lets clean it up
depression_df.reset_index(drop=True, inplace=True)

In [89]:
# Notice 194 selftext/title duplicates were removed in cleaning
# We still have 81,949 unique selftext entries
depression_df.selftext.value_counts()[215:];

In [90]:
# 74,111 unique title submissions remain after cleaning
depression_df.title.value_counts()

Help                                                                                               145
I need help                                                                                         98
.                                                                                                   93
I hate myself                                                                                       68
...                                                                                                 67
                                                                                                  ... 
I’m just done                                                                                        1
I've forgotten what it's like to be a functioning person. I'm watching myself from the outside.      1
How do I deal with those damn voices?                                                                1
Recently diagnosed; kind of makes sense                                  

In [91]:
# NOTE: I did not directly drop [deleted] authors, but now there are nearly 3,000 fewer
len(depression_df[depression_df['author']=='[deleted]'])

26

In [92]:
depression_df.author.value_counts()

HypeToHype              117
lifeishard99             64
SniperNoSwiper           61
TehDarkLorde             59
Random_Doggo_            58
                       ... 
thebigsadlol              1
IsAnyoneThereTonight      1
Ambrus6421                1
invertedme                1
etherious14               1
Name: author, Length: 52522, dtype: int64

In [93]:
# 35.5% have zero comments
# 67.61% have 2 comments or less
round(depression_df.num_comments.value_counts(normalize=True)[:5]*100,2)

0    35.50
1    18.61
2    13.50
3     8.21
4     5.92
Name: num_comments, dtype: float64

***OVERVIEW OF EDA AND CLEANING***

1. suicide_watch: selftext
    * 6,249 NaN         --> 0 (after cleaning)
    * 6,093 '[removed]' --> 0 (after cleaning)
    * 2,628 '[deleted]' --> 0 (after cleaning)
    * 619 with more than 1 duplicate --> 615 (after cleaning)
    * 84,064 --> 84, 060 (after cleaning)
2. suicide_watch: title
    * 2 NaN --> 0 (after cleaning)
    * 86,389 unique --> 73,325 (after cleaning)
3. suicide_watch: author
    * 3,077 '[deleted]' --> 71(after cleaning)
    * 54,682 unique --> 50,469(after cleaning)
4. suicide_watch: comments
    * ~25% have 0 comments --> ~21%(after cleaning)
    * ~60% have 2 comments or less --> ~57% (after cleaning)
---

1. depression: selftext
    * 1,309 NaN          --> 0 (after cleaning)
    * 14,189 '[removed]' --> 0 (after cleaning)
    * 2,163 '[deleted]'  --> 0 (after cleaning)
    * 410 w/ at least 1 duplicate --> 216 (after cleaning)
    * 81,951 unique submissions --> 81,949 (after cleaning)
2. depression: title
    * 2 NaN   --> 0 (after cleaning)
    * 89,999 unique --> 74,111 (after cleaning)
3. depression: author
    * 3,004 '[deleted]' --> 26(after cleaning)
    * 59,130 unique     --> 52,522(after cleaning)
4. depression: comments
    * ~43.5% have 0 comments --> ~35.5%(after cleaning)
    * ~72% have 2 comments or less --> ~67% (after cleaning)

***Check for outliers, short submissions that are meaningless***

In [94]:
# Look for issues with short submissions
short_subs = []
for i in range(len(suicide_df)):
    if (len(suicide_df.selftext[i])<10) & (len(suicide_df.title[i])<10):
        short_subs.append(i)
        
# Display for manual inspection
print(short_subs)

# Double check that the correct rows were identified as jibberish before dropping below
suicide_df.iloc[[652, 20022, 25039, 26251, 31309, 37223, 38500, 54171, 55021, 58548, 58854, 58856, 61708, 64563, 65187, 71086, 80854, 83595]]

[32, 568, 652, 3374, 3699, 4597, 5384, 7833, 9296, 9510, 9656, 10837, 10849, 11776, 12108, 13138, 13614, 15401, 15920, 16533, 17962, 19559, 20022, 20378, 24406, 25039, 25080, 26016, 26251, 30317, 31309, 34433, 37223, 38500, 41713, 42074, 46489, 47512, 48446, 54171, 55021, 55131, 55182, 58548, 58854, 58856, 59703, 60097, 61708, 62631, 63020, 63677, 64563, 65187, 66745, 67908, 69734, 71086, 71266, 72629, 73454, 73545, 78584, 78956, 79357, 79951, 80854, 81272, 81941, 83595]


Unnamed: 0,subreddit,author,selftext,title,num_comments,created_utc
652,1,TensionOdd,他妈的,有点儿想死,0,1602750659
20022,1,Daddydick-nuts,...,...,1,1598510950
25039,1,DuppyChubbySuppyBup,ttt,ttttttt,0,1597407155
26251,1,DontCallDaCops,Testing.,Testing.,1,1597146561
31309,1,hi2020xx,.,Hi,2,1596055753
37223,1,you-should-kys,ok,ok,5,1594653313
38500,1,jnat7715,Bust,Nut,0,1594388487
54171,1,p_1000selfterminate,Xkd,Ehajdj,0,1590870419
55021,1,ZestycloseChest9,Aas,Hh,0,1590676597
58548,1,willnebrickhead,dcdcdcd,dcdc,0,1589889415


In [95]:
# These are the rows identified as jibberish submissions by manual inspection above
suicide_df.drop([652, 20022, 25039, 26251, 31309, 37223, 38500, 54171, 55021, 58548, 58854, 58856, 61708, 64563, 65187, 71086, 80854, 83595],inplace=True)
# Drop the jibberish submissions and reset the index since it has no meaning
suicide_df.reset_index(drop=True, inplace=True)

In [96]:
# Look for issues with short submissions
short_subs = []
for i in range(len(depression_df)):
    if (len(depression_df.selftext[i])<10):
        short_subs.append(i)
        
# Display for manual inspection
print(short_subs)

# Double check that the correct rows were identified as jibberish before dropping below
depression_df.iloc[short_subs]

[17140, 44494, 45340, 45472, 49930, 53777, 54039, 54791, 55010, 57157, 57469, 60014, 61470, 61786, 62253, 62838, 63570, 63648, 66005, 66347, 66792, 67040, 69093, 69820, 70202, 71916, 75623, 76258, 77601, 78597, 82090]


Unnamed: 0,subreddit,author,selftext,title,num_comments,created_utc
17140,0,guisardwizard,thats it.,I cant explain... why cant I explain how much ...,2,1599999138
44494,0,porcodio25,Ty,Yo how many floors do I have to jump from to d...,4,1595388451
45340,0,AHawk1987,Why,Life is a chore,0,1595243681
45472,0,zaynpt666,pain,pain,0,1595221307
49930,0,ComedyTragedy01,.,Why'd I do that?,1,1594495881
53777,0,HazelSkyCat,Title.,Lonely,0,1593928872
54039,0,thethrowawayguy82,please,when does it end,2,1593901716
54791,0,brothersinchrist2020,Wtf,My agoraphobia has gone from being too afraid ...,1,1593791457
55010,0,stonerchik25,☹,It's happening,0,1593755225
57157,0,porraSV,The title,I’m so tired that I want cease to exist.,2,1593442368


In [97]:
# Only one submission identified as jibberish by inspection of shortest data above
depression_df.drop([75623],inplace=True)
# Drop the jibberish submissions and reset the index since it has no meaning
depression_df.reset_index(drop=True, inplace=True)

In [98]:
# Saving my cleaned up Data for NLP
suicide_clean = suicide_df.copy()
depression_clean = depression_df.copy()

In [99]:
# Concatenate the Subreddit DataFrames
subreddits = pd.concat([suicide_clean,depression_clean], ignore_index=True)

In [100]:
# For NLP I want to have one Series so 'title' & 'selftext' should be joined
subreddits['submission_text'] = None
for i in range(len(subreddits)):
    subreddits['submission_text'][i] = subreddits.title[i]+ '. ' + subreddits.selftext[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subreddits['submission_text'][i] = subreddits.title[i]+ '. ' + subreddits.selftext[i]


In [105]:
# Check that all is copacetic
subreddits.head()

Unnamed: 0,subreddit,author,selftext,title,num_comments,created_utc,submission_text
0,1,Kurt-without-Nirvana,"Yeah, I moved as a last chance. My friend has ...",I'm going to hang myself in a few hours - I ho...,0,1602897591,I'm going to hang myself in a few hours - I ho...
1,1,exhaustedwith_life,"He's dead, stopped taking his dialysis and die...",my dad had a twitter account i didnt know about,0,1602897402,my dad had a twitter account i didnt know abou...
2,1,dentistsitned,I feel so anxious every moment I’m still here ...,Worthless,0,1602897149,Worthless. I feel so anxious every moment I’m ...
3,1,thatsmileonyourface,i remember the first time grabbing a knife out...,birthday hooray. happy 10 years of suicidal th...,0,1602896928,birthday hooray. happy 10 years of suicidal th...
4,1,Plecofish,Just tried to slit my wrists again but I knew ...,What counts as a suicide attempt?,0,1602896898,What counts as a suicide attempt?. Just tried ...


In [102]:
# Check for NaNs from concat
subreddits.isnull().sum()

subreddit          0
author             0
selftext           0
title              0
num_comments       0
created_utc        0
submission_text    0
dtype: int64

In [103]:
subreddits_clean = subreddits[['subreddit','submission_text']]

In [104]:
# Save only columns with useful data for NLP classification modeling and analysis
suicide_clean.to_csv('../data/suicide_clean.csv',index=False)
depression_clean.to_csv('../data/depression_clean.csv',index=False)
subreddits_clean.to_csv('../data/subreddits_clean.csv',index=False)