## Data Cleaning, Pre-Processing, and Analysis

# Data Cleaning
This is the code for data cleaning, which means finding missing values and using the meaningful data for the classifier. 

In [2]:
# data analysis imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# NLP Imports
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

from sklearn.feature_extraction.text import CountVectorizer

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import wordninja

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ayaanhaque/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ayaanhaque/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# display and plotting configuration
pd.options.display.max_columns = 100  # render up to 100 columns of a wide frame
sns.set_style(style="darkgrid")

In [4]:
# loading the two subreddit CSV files into dataframes
depression, suicide_watch = map(
    pd.read_csv,
    ('../data/depression.csv', '../data/suicide_watch.csv'),
)

In [5]:
# previewing the first rows of the depression dataset with all columns visible
pd.options.display.max_columns = 500
depression.head(n=5)

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,downs,hide_score,name,quarantine,link_flair_text_color,author_flair_background_color,subreddit_type,ups,total_awards_received,media_embed,author_flair_template_id,is_original_content,user_reports,secure_media,is_reddit_media_domain,is_meta,category,secure_media_embed,link_flair_text,can_mod_post,score,approved_by,author_premium,thumbnail,edited,author_flair_css_class,author_flair_richtext,gildings,content_categories,is_self,mod_note,created,link_flair_type,wls,removed_by_category,banned_by,author_flair_type,domain,allow_live_comments,selftext_html,likes,suggested_sort,banned_at_utc,view_count,archived,no_follow,is_crosspostable,pinned,over_18,all_awardings,awarders,media_only,can_gild,spoiler,locked,author_flair_text,visited,removed_by,num_reports,distinguished,subreddit_id,mod_reason_by,removal_reason,link_flair_background_color,id,is_robot_indexable,report_reasons,author,discussion_type,num_comments,send_replies,whitelist_status,contest_mode,mod_reports,author_patreon_flair,author_flair_text_color,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,is_suicide
0,,depression,We understand that most people who reply immed...,t2_1t70,False,,0,False,Our most-broken and least-understood rules is ...,[],r/depression,False,0.0,,0,False,t3_doqwow,False,dark,,public,1818,0,{},,False,[],,False,False,,{},,False,1818,,True,,False,,[],{},,True,,1572390000.0,text,0.0,,,text,self.depression,False,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,confidence,,,False,False,False,False,False,[],[],False,False,False,False,,False,,,moderator,t5_2qqqf,,,,doqwow,True,,SQLwitch,,133,True,no_ads,False,[],False,,/r/depression/comments/doqwow/our_mostbroken_a...,no_ads,True,https://www.reddit.com/r/depression/comments/d...,611580,1572361000.0,0,,False,0
1,,depression,Welcome to /r/depression's check-in post - a p...,t2_64qjj,False,,0,False,Regular Check-In Post,[],r/depression,False,0.0,,0,False,t3_exo6f1,False,dark,,public,310,0,{},,False,[],,False,False,,{},,False,310,,False,,False,,[],{},,True,,1580678000.0,text,0.0,,,text,self.depression,True,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,new,,,False,False,False,False,False,[],[],False,False,False,False,,False,,,moderator,t5_2qqqf,,,,exo6f1,True,,circinia,,1644,False,no_ads,False,[],False,,/r/depression/comments/exo6f1/regular_checkin_...,no_ads,True,https://www.reddit.com/r/depression/comments/e...,611580,1580649000.0,0,,False,0
2,,depression,I've been feeling really depressed and lonely ...,t2_17aooz,False,,0,False,I hate it so much when you try and express you...,[],r/depression,False,0.0,,0,False,t3_fedwbi,False,dark,,public,89,0,{},,False,[],,False,False,,{},,False,89,,False,,False,,[],{},,True,,1583532000.0,text,0.0,,,text,self.depression,False,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,confidence,,,False,False,False,False,False,[],[],False,False,False,False,,False,,,,t5_2qqqf,,,,fedwbi,True,,TheNewKiller69,,8,True,no_ads,False,[],False,,/r/depression/comments/fedwbi/i_hate_it_so_muc...,no_ads,False,https://www.reddit.com/r/depression/comments/f...,611580,1583503000.0,0,,False,0
3,,depression,I literally broke down crying and asked to go ...,t2_5v2j4itq,False,,0,False,I went to the hospital because I was having re...,[],r/depression,False,0.0,,0,False,t3_feel0k,False,dark,,public,39,0,{},,False,[],,False,False,,{},,False,39,,False,,False,,[],{},,True,,1583535000.0,text,0.0,,,text,self.depression,False,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,confidence,,,False,False,False,False,False,[],[],False,False,False,False,,False,,,,t5_2qqqf,,,,feel0k,True,,Jazzlecrab,,15,True,no_ads,False,[],False,,/r/depression/comments/feel0k/i_went_to_the_ho...,no_ads,False,https://www.reddit.com/r/depression/comments/f...,611580,1583507000.0,0,,False,0
4,,depression,Any kind soul want to give a depressed person ...,t2_15xfmv,False,,0,False,Cake day for me,[],r/depression,False,0.0,,0,False,t3_fe6ua3,False,dark,,public,315,2,{},,False,[],,False,False,,{},,False,315,,False,,1583471814.0,,[],{'gid_1': 2},,True,,1583491000.0,text,0.0,,,text,self.depression,False,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,confidence,,,False,False,False,False,False,"[{'count': 2, 'is_enabled': True, 'subreddit_i...",[],False,False,False,False,,False,,,,t5_2qqqf,,,,fe6ua3,True,,Depressed_Kid786,,37,True,no_ads,False,[],False,,/r/depression/comments/fe6ua3/cake_day_for_me/,no_ads,False,https://www.reddit.com/r/depression/comments/f...,611580,1583463000.0,0,,False,0


### Relevant Data

After viewing the data, there are 100 columns, but barely any of them are really needed for our classifier. We will choose the proper columns and go from there. 

To start, we will look at the title, text body, author username, number of comments, the `is_suicide` label, and lastly the URL of the post. 

In [6]:
# first five depression posts, restricted to the columns of interest
depression.loc[:, ["title", "selftext", "author", "num_comments", "is_suicide", "url"]].head(n=5)

Unnamed: 0,title,selftext,author,num_comments,is_suicide,url
0,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,SQLwitch,133,0,https://www.reddit.com/r/depression/comments/d...
1,Regular Check-In Post,Welcome to /r/depression's check-in post - a p...,circinia,1644,0,https://www.reddit.com/r/depression/comments/e...
2,I hate it so much when you try and express you...,I've been feeling really depressed and lonely ...,TheNewKiller69,8,0,https://www.reddit.com/r/depression/comments/f...
3,I went to the hospital because I was having re...,I literally broke down crying and asked to go ...,Jazzlecrab,15,0,https://www.reddit.com/r/depression/comments/f...
4,Cake day for me,Any kind soul want to give a depressed person ...,Depressed_Kid786,37,0,https://www.reddit.com/r/depression/comments/f...


From these rows, we can see a few of the posts that people wrote. The second post looks like it is from a moderator, as it is a check-in post with a lot of comments. Posts like these may have to be removed. 

In [7]:
# first five SuicideWatch posts, restricted to the columns of interest
suicide_watch.loc[:, ["title", "selftext", "author", "num_comments", "is_suicide", "url"]].head(n=5)

Unnamed: 0,title,selftext,author,num_comments,is_suicide,url
0,New wiki on how to avoid accidentally encourag...,We've been seeing a worrying increase in pro-s...,SQLwitch,260,1,https://www.reddit.com/r/SuicideWatch/comments...
1,Reminder: Absolutely no activism of any kind i...,"If you want to recognise an occasion, please d...",SQLwitch,124,1,https://www.reddit.com/r/SuicideWatch/comments...
2,To every single poster here i wanne say one thing,I really fucking feel you,NussNougatCroissant,46,1,https://www.reddit.com/r/SuicideWatch/comments...
3,I just want it all to stop,Everyone ends up hating me eventually. \nMy ps...,hda-SVN-njhdsx,5,1,https://www.reddit.com/r/SuicideWatch/comments...
4,"Nobody gives a fuck until you die, and even th...",,lil_peemis,3,1,https://www.reddit.com/r/SuicideWatch/comments...


This is the suicide dataset. Just from the preview, the titles and posts are clearly different, but it is very hard to distinguish which is which and how to classify that. Post 5 has no body, which also could be problematic as it is a missing value.

In [8]:
# viewing shapes of datasets (depression first, then suicide_watch)
for subreddit_frame in (depression, suicide_watch):
    print(subreddit_frame[["title", "selftext", "author",  "num_comments", "is_suicide","url"]].shape)

(917, 6)
(980, 6)


They are a bit different in size, but after cleaning and processing this shouldn't be an issue. 

In [None]:
# reading post 80 from the depression dataset
print(depression["selftext"][80])
# the cell's displayed value is the length of that post in characters
len(depression["selftext"][80])

In [10]:
# reading post 118 from the suicide dataset, then showing its length in characters
suicide_post_118 = suicide_watch["selftext"][118]
print(suicide_post_118)
len(suicide_post_118)

I'm an 18 year old with severe depression, anxiety, ADHD, borderline personality disorder and DID at a point in my life. But I'm slowly recovering.

I have always felt like I never belonged to this world, because I am "too imaginative" and "too kind". I behave "not like how people should". I'm mostly well-liked, but I know they think I'm a weirdo. Everyone does.

It's final exam time for us, and I have to secure good marks, otherwise I won't be able to enter any good streams in a good college. It would be the end of my future. My family is in a financial crunch, but they give up everything for me. I am dead-set on helping my friends get through their depression and suicidal tendencies, but I'm failing at that. I have had three attempts, but I was saved by a person, who himself was alexithymic (I think) and suicidal. He still is, and he says if he doesn't do well, he will commit suicide.

I don't know what I'm gonna do if I don't perform the way I should. But I certainly know that I wil

1172

Just from reading them, they look pretty similar. However, the stories of both people are completely different, and one of them is suicidal. The second person explicitly says they will die in 3 months, but the first post makes no such remarks. A classifier could do a good job of distinguishing between these two. Let's read two more. 

In [11]:
# body text of the depression post with index label 6
print(depression.loc[6, "selftext"])

why does it hurt so much? Why can’t I be happy without it? There’s this empty void in my heart that gets bigger everyday. I’m just waiting until it eats me up, since I’ll never have 2 sided love.


In [12]:
# body text of the SuicideWatch post with index label 100
print(suicide_watch.loc[100, "selftext"])

I wanted to die starting in Jan 2018, but things have only gotten worse.

In summer 2018, those fucks on the Suicide Prevention chatroom called the police when I expressed suicidal ideations, so I was kidnapped and sent to a series of hospital-prisons with junk medical staff. I lost my job due to the hospital stay - and my apartment, car, and dog shortly followed.

Can't get a decent job because my resume is now all fucked up and I have no connections, and I refuse to go back to miserable jobs that pay horribly. I'd rather die than do that for life.

I am about to be sued on $4K debt, and then yesterday I was handed a $6.5K medical bill for treatment that would have been 100% unnecessary had I still had insurance and was able to go to regular check ups.

You fucks on the Suicide Prevention line made my life demonstrably worse. You destroyed the mechanisms that kept me going as an independent and self-sufficient human. Now I have nothing and am in a far worse position (logistically spea

Between these two posts, there is a clear distinction, but it would be hard for a regular person reading them to be sure. Only a classifier generalized on thousands of people's suffering could do this. But the first post talks about how their life is getting better, while the second explains that the person thinks that it is time to die. 

In [13]:
# the six columns we chose seem good, so let's shorten both datasets to them
selected_columns = ["title", "selftext", "author",  "num_comments", "is_suicide","url"]
dep_columns = depression[selected_columns]
sui_columns = suicide_watch[selected_columns]

# stacking the two subsets row-wise into one dataset with a fresh 0..n-1 index
combined_data = pd.concat(
    [dep_columns, sui_columns],
    axis=0,
    ignore_index=True,
)
combined_data

Unnamed: 0,title,selftext,author,num_comments,is_suicide,url
0,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,SQLwitch,133,0,https://www.reddit.com/r/depression/comments/d...
1,Regular Check-In Post,Welcome to /r/depression's check-in post - a p...,circinia,1644,0,https://www.reddit.com/r/depression/comments/e...
2,I hate it so much when you try and express you...,I've been feeling really depressed and lonely ...,TheNewKiller69,8,0,https://www.reddit.com/r/depression/comments/f...
3,I went to the hospital because I was having re...,I literally broke down crying and asked to go ...,Jazzlecrab,15,0,https://www.reddit.com/r/depression/comments/f...
4,Cake day for me,Any kind soul want to give a depressed person ...,Depressed_Kid786,37,0,https://www.reddit.com/r/depression/comments/f...
...,...,...,...,...,...,...
1892,I'm in so much pain,It's so hard just to get out of bed every morn...,BigPete543,6,1,https://www.reddit.com/r/SuicideWatch/comments...
1893,I'm too stupid to achieve anything in life. I'...,I have a low IQ. Twice I've been tested and bo...,Elisbt,3,1,https://www.reddit.com/r/SuicideWatch/comments...
1894,Useless,"I don't feel useless, im useless.",Gayhova,5,1,https://www.reddit.com/r/SuicideWatch/comments...
1895,I wish it was as easy as disappearing into thi...,I wish there was some way I could will myself ...,evofe,1,1,https://www.reddit.com/r/SuicideWatch/comments...


In [15]:
# saving the combined data in our datasets folder
# (relative path, the same '../data/' folder the source CSVs are read from,
# so the notebook is not tied to one user's home directory)
combined_data.to_csv('../data/combined_data.csv', index=False)

In [16]:
# checking for missing values: info() reports the non-null count of every column
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1897 entries, 0 to 1896
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         1897 non-null   object
 1   selftext      1832 non-null   object
 2   author        1897 non-null   object
 3   num_comments  1897 non-null   int64 
 4   is_suicide    1897 non-null   int64 
 5   url           1897 non-null   object
dtypes: int64(2), object(4)
memory usage: 89.0+ KB


It looks like the only missing values are in the text body, which makes sense. 

In [17]:
# looking at the first ten posts whose text body is missing
missing_text_mask = combined_data["selftext"].isna()
combined_data.loc[missing_text_mask].head(n=10)

Unnamed: 0,title,selftext,author,num_comments,is_suicide,url
184,Can someone recommend good qualities in therap...,,eito_8,2,0,https://www.reddit.com/r/depression/comments/f...
921,"Nobody gives a fuck until you die, and even th...",,lil_peemis,3,1,https://www.reddit.com/r/SuicideWatch/comments...
923,I have two brothers who have killed themselves...,,ArsenalOwl,1,1,https://www.reddit.com/r/SuicideWatch/comments...
925,I want to die I want to die I want to die,,alynde,4,1,https://www.reddit.com/r/SuicideWatch/comments...
934,"I am so sorry, but it has gotten worse",,SmushyKidK,4,1,https://www.reddit.com/r/SuicideWatch/comments...
937,I want to douse my body in gasoline and set my...,,SalehRobbins,3,1,https://www.reddit.com/r/SuicideWatch/comments...
941,I can't do this anymore,,sappy_banana,4,1,https://www.reddit.com/r/SuicideWatch/comments...
957,"If I had a gun, I’d blow my fucking brains out...",,CGM2004,1,1,https://www.reddit.com/r/SuicideWatch/comments...
960,This world is a joke.,,crybaby1577,11,1,https://www.reddit.com/r/SuicideWatch/comments...
964,I think I’m ready,,___horse___,4,1,https://www.reddit.com/r/SuicideWatch/comments...


The posts with missing values are either very concise in the title and to the point, or the main text is basically in the title. Luckily, there aren't that many posts with missing values. However, most of the null values are in the suicide dataset, which makes sense but also could be troublesome for our classifier. Maybe using the titles as the text would be a good approach. 

In [18]:
# how many posts with a missing body come from each subreddit label
combined_data.loc[combined_data["selftext"].isna(), "is_suicide"].value_counts()

1    64
0     1
Name: is_suicide, dtype: int64

In [19]:
# the best approach for the null values is to just fill them with "emptypost"
# (the result is assigned back to the column: calling fillna(inplace=True) on a
# selected column can operate on a temporary copy and leave combined_data unchanged)
combined_data["selftext"] = combined_data["selftext"].fillna("emptypost")

In [20]:
# checking if filling missing values worked: rows whose body is now the placeholder
placeholder_mask = combined_data["selftext"] == "emptypost"
combined_data.loc[placeholder_mask].head(n=5)

Unnamed: 0,title,selftext,author,num_comments,is_suicide,url
184,Can someone recommend good qualities in therap...,emptypost,eito_8,2,0,https://www.reddit.com/r/depression/comments/f...
921,"Nobody gives a fuck until you die, and even th...",emptypost,lil_peemis,3,1,https://www.reddit.com/r/SuicideWatch/comments...
923,I have two brothers who have killed themselves...,emptypost,ArsenalOwl,1,1,https://www.reddit.com/r/SuicideWatch/comments...
925,I want to die I want to die I want to die,emptypost,alynde,4,1,https://www.reddit.com/r/SuicideWatch/comments...
934,"I am so sorry, but it has gotten worse",emptypost,SmushyKidK,4,1,https://www.reddit.com/r/SuicideWatch/comments...


In [21]:
# checking entire dataset for missing values (non-null counts per column after the fill)
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1897 entries, 0 to 1896
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         1897 non-null   object
 1   selftext      1897 non-null   object
 2   author        1897 non-null   object
 3   num_comments  1897 non-null   int64 
 4   is_suicide    1897 non-null   int64 
 5   url           1897 non-null   object
dtypes: int64(2), object(4)
memory usage: 89.0+ KB


# Data Preprocessing
The posts are all written in different punctuation and capitalizations, so standardizing the data is an important first step. 

### Preprocessing Functions
Let's begin by removing capitalizations, reducing sentences to base words, and removing punctuation. We will add this as a new column to our data.

In [22]:
def processing_text(series_to_process):
    """Lowercase, tokenize, lemmatize, and remove stop words from each text.

    Parameters
    ----------
    series_to_process : iterable of str
        The raw texts, e.g. a DataFrame column or a list of strings.

    Returns
    -------
    list of str
        One cleaned string per input text, in input order, with the
        remaining words joined by single spaces.
    """
    tokenizer = RegexpTokenizer(r'(\w+)')
    lemmatizer = WordNetLemmatizer()
    # build the stop-word set once instead of calling stopwords.words() for every word
    english_stop_words = set(stopwords.words("english"))

    new_list = []
    # iterate over the values themselves so input whose index is not 0..n-1 also works
    for raw_text in series_to_process:
        # words_only is a list of only the words, no punctuation
        words_only = tokenizer.tokenize(raw_text.lower())
        # lemmatize first, then drop any lemma that is an English stop word
        lemmas = (lemmatizer.lemmatize(word) for word in words_only)
        words_without_stop = [lemma for lemma in lemmas if lemma not in english_stop_words]
        # join the surviving words back into one string
        new_list.append(" ".join(words_without_stop))
    return new_list

In [None]:
# adding cleaned versions of the post body and title, then checking the new columns
clean_column_sources = {"selftext_clean": "selftext", "title_clean": "title"}
for clean_column, raw_column in clean_column_sources.items():
    combined_data[clean_column] = processing_text(combined_data[raw_column])
pd.options.display.max_colwidth = 100
combined_data.head(n=8)

Cleaning the titles and text worked, and that is important for our classifier to simplify the process and create a clearer distinction between the two datasets. 

In [None]:
# checking selftext_clean next to the raw text (wide columns so the text is not cut off)
pd.options.display.max_colwidth = 1000
combined_data.loc[:, ["selftext", "selftext_clean"]].head(n=2)

In [None]:
# testing wordninja on the first ten author names
author_test = [
    " ".join(wordninja.split(combined_data["author"][position]))
    for position in range(10)
]
# mapping each original author name to its split version
test_dict = dict(
    zip((combined_data["author"][position] for position in range(10)), author_test)
)
print(test_dict)

In [None]:
# let's also clean the author names
def processing_author_names(series_to_process):
    """Split each author name into words, then clean it like the post text.

    Parameters
    ----------
    series_to_process : iterable of str
        The raw author names.

    Returns
    -------
    list of str
        One cleaned string per author name, in input order.
    """
    # wordninja.split breaks a run-together name into a list of pieces, which are
    # re-joined with spaces so they can be tokenized as separate words
    author_split = [
        " ".join(wordninja.split(author_name)) for author_name in series_to_process
    ]
    # lowercasing, tokenizing, lemmatizing and stop-word removal are the same steps
    # processing_text applies, so reuse it instead of repeating that loop here
    return processing_text(author_split)

In [None]:
# adding the cleaned author names as a new column
combined_data["author_clean"] = processing_author_names(series_to_process=combined_data["author"])

# checking author_clean on the last ten rows
pd.options.display.max_colwidth = 100
combined_data.loc[:, ["author", "author_clean"]].tail(n=10)

So it doesn't work that well, but that isn't too big a deal because the author names don't matter as much; as long as the names are simplified, it is working well enough.

In [None]:
# making sure the new columns did not introduce missing values (null count per column)
null_counts = combined_data.isna().sum()
null_counts

### Data Preprocessing Complete
This was a relatively simple process because we only have a few attributes to adjust. We have 3 attributes to train our model on now.

# EDA
For EDA, let's just go through a few things to better understand the dataset so we can properly build our classifier

In [None]:
# splitting the cleaned columns by subreddit label for visualization
suicide_rows = combined_data[combined_data["is_suicide"] == 1]
depression_rows = combined_data[combined_data["is_suicide"] == 0]

suicide_posts, suicide_titles, suicide_authors = (
    suicide_rows[column] for column in ("selftext_clean", "title_clean", "author_clean")
)

depression_posts, depression_titles, depression_authors = (
    depression_rows[column] for column in ("selftext_clean", "title_clean", "author_clean")
)

Let's look at the top words used

In [None]:
# function for visualization
def plot_most_used_words(category_string, data_series, palette, image_mask):
    """Draw a horizontal bar chart of the 20 most frequent words in a set of texts.

    Parameters
    ----------
    category_string : str
        Label inserted into the y-axis title ("Common Words in ...").
    data_series : iterable of str
        The texts whose words are counted.
    palette : str
        Palette passed to seaborn for the bars.
    image_mask : str
        Not used by this function; kept so existing calls keep working.
    """
    # counting word occurrences; the vectorizer drops its built-in English stop words
    cvec = CountVectorizer(stop_words='english')
    word_counts = cvec.fit_transform(data_series)
    # newer scikit-learn releases replaced get_feature_names() with
    # get_feature_names_out(); fall back to the old name on older installs
    if hasattr(cvec, "get_feature_names_out"):
        vocabulary = cvec.get_feature_names_out()
    else:
        vocabulary = cvec.get_feature_names()
    # summing the sparse matrix column-wise gives the per-word totals without
    # building a dense documents-by-words table first
    total_words = pd.Series(np.asarray(word_counts.sum(axis=0)).ravel(), index=vocabulary)

    # keeping the 20 most frequent words as a one-column ("count") dataframe
    top_20_words = total_words.sort_values(ascending=False).head(20)
    top_20_words_df = top_20_words.to_frame(name="count")

    # plotting the count of the top 20 words
    sns.set_style("white")
    plt.figure(figsize=(15, 8), dpi=300)
    sns.barplot(y=top_20_words_df.index, x="count", data=top_20_words_df, palette=palette)

    plt.xlabel("Count", fontsize=9)
    plt.ylabel('Common Words in {}'.format(category_string), fontsize=9)
    plt.yticks(rotation=-5)

In [None]:
# most used words in the cleaned r/depression posts
plot_most_used_words(
    category_string="r/depression Posts",
    data_series=depression_posts,
    palette="ocean_r",
    image_mask="../assets/depression_mask.png",
)

In [None]:
# most used words in the cleaned r/SuicideWatch posts
plot_most_used_words(
    category_string="r/SuicideWatch Posts",
    data_series=suicide_posts,
    palette="magma",
    image_mask="../assets/suicide_mask.png",
)

This shows a clear problem for classification. The top words in post datasets are very similar, which could make the job of the classifier much harder, especially considering that it is not just binary classification.

Maybe using titles will be a better approach.

Let's see which users post a lot, and which post in both subreddits, to help our classifier.

In [None]:
# looking at the 20 authors with the most posts
posts_per_author = combined_data["author"].value_counts()
posts_per_author.head(n=20)

In [None]:
# isolating authors who posted more than once
author_post_counts = combined_data["author"].value_counts()
# the column names are set explicitly ("index" = author name, "author" = number of posts)
# because the labels pandas attaches to a value_counts() result differ between versions
df_author_counts = author_post_counts.rename_axis("index").reset_index(name="author")
authors_posting_more_than_once = list(df_author_counts[df_author_counts["author"] > 1]["index"])
authors_posting_more_than_once

In [None]:
# fish out authors who posted in both subreddits
pd.set_option("display.max_colwidth", 100)
# creating dataframe of mean values of is_suicide (and num_comments) per repeat author;
# the numeric columns are selected explicitly because newer pandas raises when asked
# for the mean of text columns
repeat_author_rows = combined_data[combined_data["author"].isin(authors_posting_more_than_once)]
more_than_once_mean_df = (
    repeat_author_rows.groupby("author")[["num_comments", "is_suicide"]].mean().reset_index()
)
# isolating authors of both reddits: a mean that is neither 0 nor 1 means the author
# has posts with both labels
double_posters_mask_0 = ((more_than_once_mean_df["is_suicide"]) !=0)
double_posters_mask_1 = ((more_than_once_mean_df["is_suicide"]) !=1.0)
# totalling the double posters (both conditions combined into a single mask)
double_posters = more_than_once_mean_df[double_posters_mask_0 & double_posters_mask_1].sort_values("num_comments", ascending=False)
print(len(double_posters))
# creating list of the seven double posters with the highest mean comment count
top_double_posters_list= list(double_posters["author"].head(7))
top_double_posters_list

Now we will plot length of posts

In [None]:
# length of each post body in characters (vectorized instead of a per-row lookup loop)
combined_data["selftext_length"] = combined_data["selftext"].str.len()

In [None]:
# length of each title in characters (vectorized instead of a per-row lookup loop)
combined_data["title_length"] = combined_data["title"].str.len()

In [None]:
# average title and post lengths for each subreddit label
depression_mask = combined_data["is_suicide"] == 0
suicide_mask = combined_data["is_suicide"] == 1

ave_length_dep_title = combined_data.loc[depression_mask, "title_length"].mean()
ave_length_sui_title = combined_data.loc[suicide_mask, "title_length"].mean()
ave_length_dep_post = combined_data.loc[depression_mask, "selftext_length"].mean()
ave_length_sui_post = combined_data.loc[suicide_mask, "selftext_length"].mean()

# one report line per average, in the order: titles first, then posts
length_report = (
    ("Average length of a r/depression title: {}", ave_length_dep_title),
    ("Average length of a r/SuicideWatch title: {}", ave_length_sui_title),
    ("Average length of a r/depression post: {}", ave_length_dep_post),
    ("Average length of a r/SuicideWatch post: {}", ave_length_sui_post),
)
for message_template, average_length in length_report:
    print(message_template.format(average_length))

In [None]:
# visualizing post lengths per author with a scatter plot, coloured by subreddit label
sns.set_style("white")
plt.figure(figsize = (18, 12))
sns.scatterplot(data =combined_data,
               y = "selftext_length", 
               x = "author",
               hue = 'is_suicide', 
               palette = "magma_r",
               size = 'selftext_length',
               sizes=(20, 150));
plt.title("Length of Posts");
plt.xlabel("Authors");
# selftext_length is the len() of the post string, so the axis measures characters, not words
plt.ylabel("Number of characters");
plt.xticks(rotation=65);

We can see that the lengths of posts cluster around the bottom, which is close to 1000 characters. This shows the grief of the users because most reddit posts are much shorter. Some are even 20k+ characters, which could be good for the classifier.

In [None]:
#SAVING combined_data
#combined_data.to_csv('../data/data_for_model.csv', index = False)