In [462]:
# Imports
import pandas as pd
import requests

In [463]:
# Load the raw pulls for each subreddit. Each CSV holds only the 6 Reddit
# params identified as useful for NLP (listed in `reddit_params` below).
# The *_OG frames are kept as-is; all cleaning is done on copies of them.
suicide_OG = pd.read_csv('./data/suicide_watch_6.csv')
depression_OG = pd.read_csv('./data/depression_6.csv')

In [464]:
# The 6 Reddit parameters (column names) identified as useful for NLP
reddit_params = [
    'subreddit',
    'author',
    'selftext',
    'title',
    'num_comments',
    'created_utc',
]

## EDA

### A.) r/suicide_watch Data

In [465]:
# Work on a copy so the unclean frame (suicide_OG) stays available
# for future reference while suicide_df is explored and cleaned
suicide_df = suicide_OG.copy()

In [466]:
# Sanity check on the label column: every row should carry the
# single subreddit tag 'SuicideWatch'
suicide_df['subreddit'].value_counts()

SuicideWatch    100100
Name: subreddit, dtype: int64

In [467]:
# Reassign for simplicity: replace the subreddit name with the integer 1
# on every row (the r/depression rows are set to 0 further down)
suicide_df['subreddit'] = 1

In [468]:
# Check for proper data types, all looks correct
# ('subreddit' is int64 now that it was reassigned to 1 above)
suicide_df.dtypes

subreddit        int64
author          object
selftext        object
title           object
num_comments     int64
created_utc      int64
dtype: object

In [469]:
# Null count per column: 2 'title' entries and 6,249 'selftext' entries are null
suicide_df.isna().sum()

subreddit          0
author             0
selftext        6249
title              2
num_comments       0
created_utc        0
dtype: int64

In [470]:
# Look at the 'selftext' of the rows whose 'title' is null:
# the text itself is real content and looks worth keeping
suicide_df.loc[suicide_df['title'].isna(), 'selftext']

63974    Nobody really cares unless you have a reason o...
75035    I’ve been feeling pretty empty these past few ...
Name: selftext, dtype: object

In [471]:
# 6,093 [removed] 'selftext' entries
# 2,628 [deleted] 'selftext' entries

# The slice skips the first 618 rows of value_counts() (the most repeated
# values) and sums the counts of every value after them.
# NOTE(review): assuming every value from position 618 on occurs exactly once,
# this gives 83,446 + 618 = 84,064 unique 'selftext' entries. The earlier note
# of "83,446 + 619 = 84,065" does not match the 618 offset used here --
# confirm with suicide_df.selftext.nunique()
suicide_df.selftext.value_counts()[618:].sum()

83446

In [472]:
# 86,389 unique titles in total; show the 10 most repeated ones
# Consider how to ID duplicates to drop
suicide_df['title'].value_counts().head(10)

.                        310
Help                     272
I want to die            194
I need help              153
Goodbye                  105
I'm done                  98
I’m done                  91
help                      88
Please help me            70
I want to kill myself     66
Name: title, dtype: int64

In [473]:
# Posts per author: 54,682 unique author names contribute,
# and 3,077 posts carry the placeholder author name '[deleted]'
suicide_df['author'].value_counts()

[deleted]             3077
NightAir_              133
cornrain               121
darkpols368            102
Throwawayiguess457     102
                      ... 
RastGG                   1
Emilister05              1
shindjkkb                1
justadumbhoe             1
exhoesbrokemyheart       1
Name: author, Length: 54682, dtype: int64

In [474]:
# Share of posts (in %) for the 5 most common comment counts:
# 24.89% have zero comments, ~60% have 2 comments or less
suicide_df['num_comments'].value_counts(normalize=True).head(5).mul(100).round(2)

0    24.89
1    20.68
2    14.40
3     9.18
4     6.57
Name: num_comments, dtype: float64

***SUMMARY OF EDA FINDINGS***

1. selftext
    * 6,249 NaN
    * 6,093 '[removed]'
    * 2,628 '[deleted]'
    * 619 w/ at least 1 duplicate
2. title
    * 2 NaN
    * 86,389 unique
3. author
    * 3,077 '[deleted]'
    * 54,682 unique

## Cleaning Data

### A.) r/suicide_watch

In [475]:
# Remove every row that has a null in any column. Per the null counts above,
# that is the 6,249 rows with null 'selftext' plus the 2 rows with a null 'title'.
suicide_df = suicide_df.dropna(axis=0, how='any')
# Confirm that no nulls remain in any column
suicide_df.isna().sum()

subreddit       0
author          0
selftext        0
title           0
num_comments    0
created_utc     0
dtype: int64

In [476]:
# Collect the index labels of posts whose 'selftext' is only a
# '[deleted]' or '[removed]' placeholder
drop_rows = suicide_df.index[
    suicide_df['selftext'].isin(['[deleted]', '[removed]'])
].tolist()

# Remove those rows: 6,093 '[removed]' and 2,628 '[deleted]' selftext entries
suicide_df.drop(index=drop_rows, inplace=True)

In [477]:
# Comments are not part of this analysis, so a post that repeats both the
# title AND the selftext of another post adds nothing: keep only the last
# occurrence of each (title, selftext) pair and relabel the rows 0..n-1
suicide_df = suicide_df.drop_duplicates(
    subset=['title', 'selftext'],
    keep='last',
    ignore_index=True,
)

In [478]:
# The old index labels carry no meaning, so discard them and renumber from 0
# (drop_duplicates was already called with ignore_index=True, so this is a safeguard)
suicide_df = suicide_df.reset_index(drop=True)

In [479]:
# Notice only 4 selftext/title duplicates were removed in cleaning
# (value_counts rows from position 615 on; the trailing ';' suppresses the cell output)
suicide_df.selftext.value_counts()[615:];

In [480]:
# The 10 most repeated 'selftext' values that remain after cleaning.
# NOTE(review): the 73,325 unique-title figure quoted in the summary refers to
# the 'title' column, which this cell does not display -- confirm it separately
suicide_df['selftext'].value_counts().head(10)

.                                                                                                                                                                                                                                      26
Please                                                                                                                                                                                                                                 18
Title                                                                                                                                                                                                                                  17
please                                                                                                                                                                                                                                 12
...                                                             

In [481]:
# NOTE: '[deleted]' authors were never dropped directly, yet after the
# selftext cleaning there are over 3,000 fewer of them
int((suicide_df['author'] == '[deleted]').sum())

71

In [482]:
# Share of posts (in %) for the 5 most common comment counts after cleaning:
# 20.64% have zero comments, 57.43% have 2 comments or less
suicide_df['num_comments'].value_counts(normalize=True).head(5).mul(100).round(2)

1    21.72
0    20.64
2    15.07
3     9.80
4     6.98
Name: num_comments, dtype: float64

## EDA

### B.) r/depression Data

In [483]:
# Work on a copy so the unclean frame (depression_OG) stays available
# for future reference while depression_df is explored and cleaned
depression_df = depression_OG.copy()

In [484]:
# Sanity check on the label column: every row should carry the
# single subreddit tag 'depression'
depression_df['subreddit'].value_counts()

depression    100100
Name: subreddit, dtype: int64

In [485]:
# Reassign for simplicity: replace the subreddit name with the integer 0
# on every row (the r/SuicideWatch rows were set to 1 earlier)
depression_df['subreddit'] = 0

In [486]:
# Check for proper data types, all looks correct
# ('subreddit' is int64 now that it was reassigned to 0 above)
depression_df.dtypes

subreddit        int64
author          object
selftext        object
title           object
num_comments     int64
created_utc      int64
dtype: object

In [487]:
# Null count per column: 1,309 'selftext' entries are null, nothing else is
depression_df.isna().sum()

subreddit          0
author             0
selftext        1309
title              0
num_comments       0
created_utc        0
dtype: int64

In [488]:
# 14,189 [removed] selftext
# 2,163 [deleted] selftext

# The slice skips the first 409 rows of value_counts() (the most repeated
# values) and sums the counts of every value after them.
# NOTE(review): assuming every value from position 409 on occurs exactly once,
# this gives 81,542 + 409 = 81,951 unique 'selftext' entries (the earlier note
# wrote "+ 410", which would add up to 81,952) --
# confirm with depression_df.selftext.nunique()
depression_df.selftext.value_counts()[409:].sum()

81542

In [489]:
# 89,999 unique titles in total; show the 10 most repeated ones
# Consider how to ID duplicates to drop
depression_df['title'].value_counts().head(10)

Help                                                                      160
I need help                                                               109
.                                                                         106
AIs from AI Dungeon 2 to sexy to funny and one based wholly on Reddit!     91
I hate myself                                                              80
I want to die                                                              77
...                                                                        69
I don't know what to do                                                    63
Am I depressed?                                                            61
Depression                                                                 61
Name: title, dtype: int64

In [490]:
# Posts per author: 59,130 unique author names contribute,
# and 3,004 posts carry the placeholder author name '[deleted]'
depression_df['author'].value_counts()

[deleted]         3004
HypeToHype         122
h3xadecimal2        91
lifeishard99        78
_ChilledVibez       77
                  ... 
rosesandlillys       1
thebigsadlol         1
Meeboi009            1
cheap_dates          1
jacob_juno_69        1
Name: author, Length: 59130, dtype: int64

In [491]:
# Share of posts (in %) for the 5 most common comment counts:
# 43.52% have zero comments, ~72% have 2 comments or less
depression_df['num_comments'].value_counts(normalize=True).head(5).mul(100).round(2)

0    43.52
1    16.68
2    11.91
3     7.11
4     5.12
Name: num_comments, dtype: float64

## Cleaning Data

### B.) r/depression

In [492]:
# Remove every row that has a null in any column. Per the null counts above,
# only 'selftext' has nulls, so this drops the 1,309 submissions without a body.
depression_df = depression_df.dropna(axis=0, how='any')
# Confirm that no nulls remain in any column
depression_df.isna().sum()

subreddit       0
author          0
selftext        0
title           0
num_comments    0
created_utc     0
dtype: int64

In [493]:
# Collect the index labels of posts whose 'selftext' is only a
# '[deleted]' or '[removed]' placeholder
drop_rows = depression_df.index[
    depression_df['selftext'].isin(['[deleted]', '[removed]'])
].tolist()

# Remove those rows: 14,189 '[removed]' and 2,163 '[deleted]' selftext entries
depression_df.drop(index=drop_rows, inplace=True)

In [494]:
# Comments are not part of this analysis, so a post that repeats both the
# title AND the selftext of another post adds nothing: keep only the last
# occurrence of each (title, selftext) pair and relabel the rows 0..n-1
depression_df = depression_df.drop_duplicates(
    subset=['title', 'selftext'],
    keep='last',
    ignore_index=True,
)

In [495]:
# The old index labels carry no meaning, so discard them and renumber from 0
# (drop_duplicates was already called with ignore_index=True, so this is a safeguard)
depression_df = depression_df.reset_index(drop=True)

In [496]:
# Notice 194 selftext/title duplicates were removed in cleaning
# We still have 81,949 unique selftext entries
# (value_counts rows from position 215 on; the trailing ';' suppresses the cell output)
depression_df.selftext.value_counts()[215:];

In [497]:
# Title frequencies after cleaning: 74,111 unique title submissions remain
depression_df['title'].value_counts()

Help                                                                                 145
I need help                                                                           98
.                                                                                     93
I hate myself                                                                         68
...                                                                                   67
                                                                                    ... 
I Can't Even... Why Me!?! Fucking Hell Why Me!!!                                       1
I have trouble making close, meaningful connections with others                        1
Forgiving                                                                              1
"I'm worried about you" How do I respond?                                              1
Why is guilt-tripping such a common method for ppl to try to make you stay alive.      1
Name: title, Length: 

In [498]:
# NOTE: '[deleted]' authors were never dropped directly, yet after the
# selftext cleaning there are nearly 3,000 fewer of them
int((depression_df['author'] == '[deleted]').sum())

26

In [499]:
# Posts per author after cleaning: 52,522 unique author names remain
depression_df.author.value_counts()

HypeToHype              117
lifeishard99             64
SniperNoSwiper           61
TehDarkLorde             59
Random_Doggo_            58
                       ... 
sponsoredbyrazer          1
maru_chou                 1
Explanation-Upstairs      1
Someselfhelpcrap          1
jacob_juno_69             1
Name: author, Length: 52522, dtype: int64

In [500]:
# Share of posts (in %) for the 5 most common comment counts after cleaning:
# 35.5% have zero comments, 67.61% have 2 comments or less
depression_df['num_comments'].value_counts(normalize=True).head(5).mul(100).round(2)

0    35.50
1    18.61
2    13.50
3     8.21
4     5.92
Name: num_comments, dtype: float64

***OVERVIEW OF EDA AND CLEANING***

1. suicide_watch: selftext
    * 6,249 NaN         --> 0 (after cleaning)
    * 6,093 '[removed]' --> 0 (after cleaning)
    * 2,628 '[deleted]' --> 0 (after cleaning)
    * 619 w/ at least 1 duplicate --> 615 (after cleaning)
    * 84,064 unique submissions --> 84,060 (after cleaning)
2. suicide_watch: title
    * 2 NaN --> 0 (after cleaning)
    * 86,389 unique --> 73,325 (after cleaning)
3. suicide_watch: author
    * 3,077 '[deleted]' --> 71(after cleaning)
    * 54,682 unique --> 50,469(after cleaning)
4. suicide_watch: comments
    * ~25% have 0 comments --> ~21%(after cleaning)
    * ~60% have 2 comments or less --> ~57% (after cleaning)
---

1. depression: selftext
    * 1,309 NaN          --> 0 (after cleaning)
    * 14,189 '[removed]' --> 0 (after cleaning)
    * 2,163 '[deleted]'  --> 0 (after cleaning)
    * 410 w/ at least 1 duplicate --> 216 (after cleaning)
    * 81,951 unique submissions --> 81,949 (after cleaning)
2. depression: title
    * 0 NaN   --> 0 (after cleaning)
    * 89,999 unique --> 74,111 (after cleaning)
3. depression: author
    * 3,004 '[deleted]' --> 26(after cleaning)
    * 59,130 unique     --> 52,522(after cleaning)
4. depression: comments
    * ~43.5% have 0 comments --> ~35.5%(after cleaning)
    * ~72% have 2 comments or less --> ~67% (after cleaning)

In [502]:
# Save the cleaned frames for NLP classification modeling and analysis.
# All columns of each frame are written; index=False leaves the row index out of the files.
suicide_df.to_csv('./data/suicide_clean.csv',index=False)
depression_df.to_csv('./data/depression_clean.csv',index=False)