In [27]:
import pandas as pd
from tqdm import tqdm 
import sys
import os
import preprocessor as p

project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from helpers.text_preprocessor import clean_tweets
from helpers.data_validation import clean_and_validate_dataset

# Data collection and preprocessing

We have collected data using Twitter API, from older research and projects that discussed depression and anxiety detection, and from the Reddit site. The data collection process was iterable with the process cycle of the project.

### Version 1 of the data


In the first cycle, we gathered data that have 20000 rows and then trained the first version of the models using it.

In [3]:
data_v1 = pd.read_csv('../data/raw/Mental_Health_Twitter.csv', usecols=['post_text', 'label'])

cleaned_tweets = []

#loop on every tweet
for i in tqdm(range(len(data_v1['post_text']))):
    tweet = data_v1['post_text'][i]

    #Use tweet-preprocessor module
    clean_text = p.clean(tweet) 
    
    #Use clean_tweets function
    filtered_tweet = clean_tweets(clean_text) 
    cleaned_tweets.append(filtered_tweet)

data_v1['cleaned_text'] = cleaned_tweets

#Select the columns we interested in for classification
data_v1 = data_v1[['cleaned_text','label']] 

#Rename the columns for clarification
data_v1.rename(columns = {'cleaned_text':'filtered_tweet',
                          'label':'is_depression'}, 
                          inplace = True) 

data_v1.to_csv('../data/processed/Mental_Health_Twitter_Processed_v1.csv')

100%|██████████| 20007/20007 [00:28<00:00, 699.22it/s] 


### Version 2 of the data

We collected more data for serving the models and increasing the performance. In the second cycle, we gathered data that have 47185 rows and then trained the second version of the models using it and the first version of data.

In [4]:
data_v2 = pd.read_csv('../data/raw/Mental_Health_Dataset.csv')

cleaned_tweets = []
for i in tqdm(range(len(data_v2['filtered_tweet']))):
    tweet = data_v2['filtered_tweet'][i]
    clean_text = p.clean(tweet)
    filtered_tweet = clean_tweets(clean_text)
    cleaned_tweets.append(filtered_tweet)
    
data_v2['filtered_tweet'] = cleaned_tweets
data_v2.to_csv('../data/processed/Mental_Health_Dataset_Processed_v2.csv')

100%|██████████| 47185/47185 [01:35<00:00, 495.38it/s] 


### Version 3 of the data

Repeated the data collection process to get higher performance for our models. In the last cycle, we gathered data that have 232074 rows and then trained the last version of the models using it and the other versions of data.

In [14]:
data_v3 = pd.read_csv('../data/raw/Suicide_Detection.csv', encoding = "ISO-8859-1", usecols=['text', 'class'])
data_v3.rename(columns = {'text':'filtered_tweet',
                          'class':'is_depression'},
                          inplace = True )

data_v3.replace({"is_depression": {"suicide": 1, "non-suicide": 0}}, inplace=True)

# Replace NaN or non-string values with empty string
data_v3['filtered_tweet'] = data_v3['filtered_tweet'].astype(str)
cleaned_tweets = []

for i in tqdm(range(len(data_v3['filtered_tweet']))):
    tweet = data_v3['filtered_tweet'][i]
    clean_text = p.clean(tweet)
    filtered_tweet = clean_tweets(clean_text)
    cleaned_tweets.append(filtered_tweet)
    
data_v3['filtered_tweet'] = cleaned_tweets

data_v3 = data_v3[['filtered_tweet','is_depression']]
data_v3.to_csv('../data/processed/Suicide_Detection_Processed_v3.csv')

100%|██████████| 233403/233403 [13:37<00:00, 285.37it/s] 


## Version 4

In [17]:
data_v4 = pd.read_csv('../data/raw/Mental_Health_Corpus.csv')
data_v4.rename(columns = {'text':'filtered_tweet',
                          'label':'is_depression'},
                          inplace = True )
cleaned_tweets = []

for i in tqdm(range(len(data_v4['filtered_tweet']))):
    tweet = data_v4['filtered_tweet'][i]
    clean_text = p.clean(tweet)
    filtered_tweet = clean_tweets(clean_text)
    cleaned_tweets.append(filtered_tweet)
    
data_v4['filtered_tweet'] = cleaned_tweets
data_v4.to_csv('../data/processed/Mental_Health_Corpus_Processed_v4.csv')

100%|██████████| 27977/27977 [01:36<00:00, 291.00it/s]


## Version 5

In [18]:
data_v5 = pd.read_csv('../data/raw/Sentiment_Tweets.csv', usecols=['message to examine', 'label (depression result)'])
data_v5.rename(columns = {"message to examine":'filtered_tweet',
                          'label (depression result)':'is_depression'},
                          inplace = True)
cleaned_tweets = []

for i in tqdm(range(len(data_v5['filtered_tweet']))):
    tweet = data_v5['filtered_tweet'][i]
    clean_text = p.clean(tweet)
    filtered_tweet = clean_tweets(clean_text)
    cleaned_tweets.append(filtered_tweet)
    
data_v5['filtered_tweet'] = cleaned_tweets
data_v5.to_csv('../data/processed/Sentiment_Tweets_Processed_v5.csv')

100%|██████████| 10314/10314 [00:12<00:00, 801.86it/s]


## Version 6

In [19]:
data_v6 = pd.read_csv('../data/raw/Depression_Dataset_Reddit.csv')
data_v6.rename(columns = {"clean_text":'filtered_tweet',
                          'is_depression':'is_depression'},
                          inplace = True )
cleaned_tweets = []
for i in tqdm(range(len(data_v6['filtered_tweet']))):
    tweet = data_v6['filtered_tweet'][i]
    clean_text = p.clean(tweet)
    filtered_tweet = clean_tweets(clean_text)
    cleaned_tweets.append(filtered_tweet)
    
data_v6['filtered_tweet'] = cleaned_tweets
data_v6.to_csv('../data/processed/Depression_Dataset_Reddit_Processed_v6.csv')

100%|██████████| 7731/7731 [00:17<00:00, 441.84it/s] 


## Final Dataset

In [20]:
data_final = pd.concat([data_v1,
                        data_v2,
                        data_v3,
                        data_v4,
                        data_v5,
                        data_v6])
data_final.shape

(346617, 2)

In [None]:
data_final = pd.concat([data_v1,
                        data_v2,
                        data_v3,
                        data_v4,
                        data_v5,
                        data_v6])
data_final.shape

(329775, 3)

In [33]:
data_final = clean_and_validate_dataset(df = data_final, 
                           text_col = 'filtered_tweet', 
                          label_col = 'is_depression')

Class distribution:
is_depression
0    168930
1    160291
Name: count, dtype: int64

Final dataset shape: (329221, 2)


In [34]:
#Save the final data
data_final.to_csv('../data/processed/data_final.csv')

Now we can use this data for modeling and get confident about the performance and good classifications models