In [1]:
import string
import re
import pandas as pd
from tqdm import tqdm #adding progress bars to show the processing
import preprocessor as p
from  nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Data preprocessing steps

**Preprocessing here is done by two methods:**

### Method1: 
Using tweet-preprocessor. Preprocessor is a preprocessing library for tweet data written in Python. When building Machine Learning systems based on tweet data, preprocessing is required. This library makes it easy to clean, parse or tokenize the tweets.

### Method2:
 We’ve manually defined a function to double-check our tweet preprocessing and it’s always better to be sure that our data is cleaned 100%. The method called clean_tweets(tweet) will clean some remains of the Twitter data left undone by the tweet-preprocessor and double-check emoticons and emojis because some older version of mobile’s emoticons is not supported in the tweet preprocessor’s clean method (Method1).

#### Preprocessing techniques
* Handle Emoticons and Emojis
* Removing mentions
* Remove consecutive non-ASCII characters
* Remove punctuation and numbers
* Remove the word if it less than 4 character
* Remove stop words
* Applay lemmatization and stemming on the words

 Let’s declare a series of emoticons (Happy & Sad) because we don’t need the old school emoticons in the middle of a sentence blocking us against our sentiment analysis.

In [2]:
#HappyEmoticons
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
   ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D','=-3', '=3', ':-))',
    ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P','x-p', 'xp', 'XP', ':-p', ':p', '=p',
    ':-b', ':b', '>:)', '>;)', '>:-)','<3'
    ])

# Sad Emoticons
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])

Because that’s a must, nowadays people don’t tweet without emojis, as in a matter of fact it became another language, so have to come up with a plan to do so.

In [3]:
#Emoji patterns
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)


And then we combine both happy and sad emoticon array-lists first:

In [4]:
#combine sad and happy emoticons
emoticons = emoticons_happy.union(emoticons_sad)

Define the lemmatization

In [5]:
#stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

In [5]:
# words difficult to detect by the preprocessing modules
difficult_to_detect = ["'re","'s","'m","'ve","n't","...","``","'","im",
    "ca","itv","-","a.","dont","us","could","can","'d","__",'aaron', 'ab', 
    'zurab', 'zwart', 'zyl',"ll","u",'__', '___', '____','a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from', 
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're',
             's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

difficult_to_detect = set(difficult_to_detect)

Declare the clean_tweets function

In [6]:
def clean_tweets(tweet):

    tweet = tweet.lower() #lower the words to be in the same format for the modules

    tweet = re.sub (r':', '', tweet) #after tweepy preprocessing the colon symbol left remain after

    tweet = re.sub (r',ÄI', '', tweet) #removing mentions
    
    tweet = re.sub (r'[^\x00-\x7F]+','', tweet) #replace consecutive non-ASCII characters with a space
    
    tweet = emoji_pattern.sub (r'', tweet)  #remove emojis from tweet
    
    tweet = re.sub('[0-9]+', '', tweet) #remove numbers

    tweet = re.sub(f'[{string.punctuation}]','',tweet) #remove punctuation 
    
    stop_words = set(stopwords.words('english')) #get the stop words

    word_tokens = word_tokenize(tweet) #extract the tokens from string of characters
    

    filtered_tweet = [] 

    #looping through conditions to filter the words
    for w in word_tokens:
        
        #check tokens against stop words, emoticons and words difficult to detect 
        if w not in stop_words and w not in emoticons and w not in difficult_to_detect:

            if len(w)>1: #remove the word if it less than 2 character

                w = lemmatizer.lemmatize(w) # Applay lemmatization on the word 

                filtered_tweet.append(w) #Append the pure word to the list after cleaning

    return ' '.join(filtered_tweet) #Reconstruct the tweet after cleaning

# Data collection and preprocessing

We have collected data using Twitter API, from older research and projects that discussed depression and anxiety detection, and from the Reddit site. The data collection process was iterable with the process cycle of the project.

### Version 1 of the data


In the first cycle, we gathered data that have 20000 rows and then trained the first version of the models using it.

In [8]:
data_v1 = pd.read_csv('E:/Data/Gradution project/Term1 final Gradution project/Memtal health data/Raw data in the model/Mental-Health-Twitter.csv')
data_v1

Unnamed: 0.1,Unnamed: 0,post_id,post_created,post_text,user_id,followers,friends,favourites,statuses,retweets,label
0,0.0,6.378950e+17,Sun Aug 30 07:48:37 +0000 2015,It's just over 2 years since I was diagnosed w...,1.013187e+09,84.0,211.0,251.0,837.0,0.0,1
1,1.0,6.378900e+17,Sun Aug 30 07:31:33 +0000 2015,"It's Sunday, I need a break, so I'm planning t...",1.013187e+09,84.0,211.0,251.0,837.0,1.0,1
2,2.0,6.377490e+17,Sat Aug 29 22:11:07 +0000 2015,Awake but tired. I need to sleep but my brain ...,1.013187e+09,84.0,211.0,251.0,837.0,0.0,1
3,3.0,6.376960e+17,Sat Aug 29 18:40:49 +0000 2015,RT @SewHQ: #Retro bears make perfect gifts and...,1.013187e+09,84.0,211.0,251.0,837.0,2.0,1
4,4.0,6.376960e+17,Sat Aug 29 18:40:26 +0000 2015,It’s hard to say whether packing lists are mak...,1.013187e+09,84.0,211.0,251.0,837.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
20002,,,,I am so sad and not happy,,,,,,,1
20003,,,,I am disapointed,,,,,,,1
20004,,,,the product is horrible,,,,,,,1
20005,,,,I don't feel so good,,,,,,,1


Here we apply the two preprocessing functions to clean the tweets.

In [9]:
cleaned_tweets = []
for i in tqdm(range(len(data_v1['post_text']))):
    tweet = data_v1['post_text'][i] #loop on every tweet
    clean_text = p.clean(tweet) #Use tweet-preprocessor module
    filtered_tweet = clean_tweets(clean_text) #Use clean_tweets function
    cleaned_tweets.append(filtered_tweet)

100%|██████████| 20007/20007 [00:21<00:00, 911.40it/s] 


In [10]:
data_v1['cleaned_text'] = cleaned_tweets
data_v1 = data_v1[['cleaned_text','label']] #Select the columns we interested in for classification
data_v1.rename(columns = {'cleaned_text':'filtered_tweet','label':'is_depression'}, inplace = True ) #Rename the columns for clarification
data_v1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_v1.rename(columns = {'cleaned_text':'filtered_tweet','label':'is_depression'}, inplace = True ) #Rename the columns for clarification


Unnamed: 0,filtered_tweet,is_depression
0,year since diagnosed today taking moment refle...,1
1,sunday need break planning spend little time p...,1
2,awake tired need sleep brain idea,1
3,bear make perfect gift great beginner get stit...,1
4,hard say whether packing list making life easi...,1
...,...,...
20002,sad happy,1
20003,disapointed,1
20004,product horrible,1
20005,feel good,1


In [11]:
# data_v1.to_csv('E:/Data/Gradution project/Term1 final Gradution project/Memtal health data/Raw data in the model/data_v1.csv')

### Version 2 of the data

We collected more data for serving the models and increasing the performance. In the second cycle, we gathered data that have 47185 rows and then trained the second version of the models using it and the first version of data.

In [12]:
data_v2 = pd.read_csv('E:/Data/Gradution project/Term1 final Gradution project/Memtal health data/Raw data in the model/Data_07092022_v1.csv')
cleaned_tweets = []
for i in tqdm(range(len(data_v2['filtered_tweet']))):
    tweet = data_v2['filtered_tweet'][i]
    clean_text = p.clean(tweet)
    filtered_tweet = clean_tweets(clean_text)
    cleaned_tweets.append(filtered_tweet)
    
data_v2['filtered_tweet'] = cleaned_tweets
data_v2

100%|██████████| 47185/47185 [00:59<00:00, 791.15it/s] 


Unnamed: 0,is_depression,filtered_tweet
0,1,recently went breakup said still want friend s...
1,0,know navigate feeling new feeling stretch unde...
2,0,month already told depressed week hing particu...
3,1,exhausted think finally rest think maybe thing...
4,1,severly bullied since till resulted depressed ...
...,...,...
47180,0,health wealth planning health conscious people
47181,1,gatwick noise sewer full flow plan add two run...
47182,0,instead money bank amp cloud tree based agricu...
47183,0,happy birthday bos money account year surface ...


### Version 3 of the data

Repeated the data collection process to get higher performance for our models. In the last cycle, we gathered data that have 232074 rows and then trained the last version of the models using it and the other versions of data.

In [16]:
data_v3 = pd.read_csv('E:/Data/Gradution project/Term1 final Gradution project/Memtal health data/Raw data in the model/Suicide_Detection.csv', encoding = "ISO-8859-1")
data_v3 = data_v3[['text','class']]
data_v3.rename(columns = {'text':'filtered_tweet','class':'is_depression'},inplace = True )
data_v3['is_depression'].replace({"suicide":1,"non-suicide":0},inplace = True )
data_v3.head()

  data_v3 = pd.read_csv('E:/Data/Gradution project/Term1 final Gradution project/Memtal health data/Raw data in the model/Suicide_Detection.csv', encoding = "ISO-8859-1")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_v3.rename(columns = {'text':'filtered_tweet','class':'is_depression'},inplace = True )


Unnamed: 0,filtered_tweet,is_depression
0,Ex Wife Threatening SuicideRecently I left my ...,1
1,Am I weird I don't get affected by compliments...,0
2,Finally 2020 is almost over... So I can never ...,0
3,i need helpjust help me im crying so hard,1
4,"Iâm so lostHello, my name is Adam (16) and I...",1


In [17]:
#Check the number of rows for each class
data_v3 = data_v3[(data_v3['is_depression'] == 1) | (data_v3['is_depression'] == 0)]

In [18]:
data_v3.reset_index(inplace = True)

In [20]:
data_v3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232045 entries, 0 to 232044
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   index           232045 non-null  int64 
 1   filtered_tweet  232045 non-null  object
 2   is_depression   232045 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.3+ MB


In [22]:
#Check the number of rows for each class
data_v3['is_depression'].value_counts()

1    116033
0    116012
Name: is_depression, dtype: int64

In [23]:
cleaned_tweets = []
for i in tqdm(range(len(data_v3['filtered_tweet']))):
    tweet = data_v3['filtered_tweet'][i]
    clean_text = p.clean(tweet)
    filtered_tweet = clean_tweets(clean_text)
    cleaned_tweets.append(filtered_tweet)
    
data_v3['filtered_tweet'] = cleaned_tweets
data_v3

100%|██████████| 232045/232045 [07:10<00:00, 538.44it/s]


Unnamed: 0,index,filtered_tweet,is_depression
0,0,ex wife threatening suiciderecently left wife ...,1
1,1,weird get affected compliment coming someone k...,0
2,2,finally almost never hear bad year ever swear ...,0
3,3,need helpjust help cry hard,1
4,4,losthello name adam ive struggling year afraid...,1
...,...,...,...
232040,233398,like rock going get anything go,0
232041,233399,tell many friend lonely everything deprived pr...,0
232042,233400,pee probably taste like salty tea someone dran...,0
232043,233401,usual stuff find hereim posting sympathy pity ...,1


In [24]:
data_v3 = data_v3[['filtered_tweet','is_depression']]
data_v3

Unnamed: 0,filtered_tweet,is_depression
0,ex wife threatening suiciderecently left wife ...,1
1,weird get affected compliment coming someone k...,0
2,finally almost never hear bad year ever swear ...,0
3,need helpjust help cry hard,1
4,losthello name adam ive struggling year afraid...,1
...,...,...
232040,like rock going get anything go,0
232041,tell many friend lonely everything deprived pr...,0
232042,pee probably taste like salty tea someone dran...,0
232043,usual stuff find hereim posting sympathy pity ...,1


Integrate the three versions of data for the training

In [27]:
final_data = pd.concat([data_v1,data_v2,data_v3])
final_data.shape

(299237, 2)

In [28]:
final_data

Unnamed: 0,filtered_tweet,is_depression
0,year since diagnosed today taking moment refle...,1
1,sunday need break planning spend little time p...,1
2,awake tired need sleep brain idea,1
3,bear make perfect gift great beginner get stit...,1
4,hard say whether packing list making life easi...,1
...,...,...
232040,like rock going get anything go,0
232041,tell many friend lonely everything deprived pr...,0
232042,pee probably taste like salty tea someone dran...,0
232043,usual stuff find hereim posting sympathy pity ...,1


In [29]:
#Check for duplicates before saving
final_data.drop_duplicates(inplace= True)
final_data.shape

(296712, 2)

In [30]:
#Check the number of rows for each class
final_data['is_depression'].value_counts()

0    150953
1    145759
Name: is_depression, dtype: int64

In [31]:
final_data[final_data['filtered_tweet']=='']

Unnamed: 0,filtered_tweet,is_depression
81,,1
10684,,0


In [32]:
#Check for empty tweets
final_data = final_data[final_data['filtered_tweet']!='']
final_data[final_data['filtered_tweet']=='']

Unnamed: 0,filtered_tweet,is_depression


In [33]:
# #Save the final data
# final_data.to_csv('C:/Users/ZIAD/Desktop/Gradution/final_data.csv')

## Version 4

In [34]:
data_v4 = pd.read_csv('E:/Data/Gradution project/Term1 final Gradution project/Memtal health data/Raw data in the model/Mental_Health_Corpus.csv')
data_v4.rename(columns = {'text':'filtered_tweet','label':'is_depression'},inplace = True )
data_v4.head()

Unnamed: 0,filtered_tweet,is_depression
0,dear american teens question dutch person hear...,0
1,nothing look forward lifei dont many reasons k...,1
2,music recommendations im looking expand playli...,0
3,im done trying feel betterthe reason im still ...,1
4,worried year old girl subject domestic physic...,1


In [35]:
cleaned_tweets = []
for i in tqdm(range(len(data_v4['filtered_tweet']))):
    tweet = data_v4['filtered_tweet'][i]
    clean_text = p.clean(tweet)
    filtered_tweet = clean_tweets(clean_text)
    cleaned_tweets.append(filtered_tweet)
    
data_v4['filtered_tweet'] = cleaned_tweets
data_v4

100%|██████████| 27977/27977 [00:46<00:00, 600.32it/s]


Unnamed: 0,filtered_tweet,is_depression
0,dear american teen question dutch person heard...,0
1,nothing look forward lifei many reason keep go...,1
2,music recommendation looking expand playlist u...,0
3,done trying feel betterthe reason still alive ...,1
4,worried year old girl subject domestic physica...,1
...,...,...
27972,posting everyday people stop caring religion m...,0
27973,okay definetly need hear guy opinion ive prett...,0
27974,cant get dog think ill kill myselfthe last thi...,1
27975,whats point princess bridei really think like ...,1


In [37]:
data_v4_final = pd.concat([final_data,data_v4])
data_v4_final.shape

(324687, 2)

In [38]:
#Check for duplicates before saving
data_v4_final.drop_duplicates(inplace= True)
data_v4_final.shape

(311732, 2)

In [39]:
data_v4_final =  data_v4_final[data_v4_final['filtered_tweet'] != '']

In [40]:
# #Save the final data
# data_v4_final.to_csv('E:/Data/Gradution project/Term1 final Gradution project/data_v4_final.csv')

## Version 5

In [41]:
data_v5 = pd.read_csv('E:/Data/Gradution project/Term1 final Gradution project/Memtal health data/Raw data in the model/sentiment_tweets3.csv')
data_v5.head()

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [42]:
list(data_v5)

['Index', 'message to examine', 'label (depression result)']

In [43]:
data_v5.rename(columns = {"message to examine":'filtered_tweet','label (depression result)':'is_depression'},inplace = True )
data_v5.head()

Unnamed: 0,Index,filtered_tweet,is_depression
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [45]:
cleaned_tweets = []
for i in tqdm(range(len(data_v5['filtered_tweet']))):
    tweet = data_v5['filtered_tweet'][i]
    clean_text = p.clean(tweet)
    filtered_tweet = clean_tweets(clean_text)
    cleaned_tweets.append(filtered_tweet)
    
data_v5['filtered_tweet'] = cleaned_tweets
data_v5

100%|██████████| 10314/10314 [00:07<00:00, 1437.38it/s]


Unnamed: 0,Index,filtered_tweet,is_depression
0,106,real good moment miss much,0
1,217,reading manga,0
2,220,,0
3,288,need send em accountant tomorrow oddly wasnt e...,0
4,540,add myspace,0
...,...,...,...
10309,802309,depression herbo mood done stressing people de...,1
10310,802310,depression succumbs brain make feel like never...,1
10311,802311,ketamine nasal spray show promise depression s...,1
10312,802312,mistake bad day depression everyone em,1


In [46]:
# #Save the the version 5 data
# data_v5.to_csv('E:/Data/Gradution project/Term1 final Gradution project/data_v5.csv')

In [48]:
data_v5_final = pd.concat([data_v4_final,data_v5])
data_v5_final.shape

(322044, 3)

## Version 6

In [49]:
data_v6 = pd.read_csv('E:/Data/Gradution project/Term1 final Gradution project/Memtal health data/Raw data in the model/depression_dataset_reddit_cleaned.csv')
data_v6.rename(columns = {"clean_text":'filtered_tweet','is_depression':'is_depression'},inplace = True )
data_v6.head()

Unnamed: 0,filtered_tweet,is_depression
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1


In [50]:
cleaned_tweets = []
for i in tqdm(range(len(data_v6['filtered_tweet']))):
    tweet = data_v6['filtered_tweet'][i]
    clean_text = p.clean(tweet)
    filtered_tweet = clean_tweets(clean_text)
    cleaned_tweets.append(filtered_tweet)
    
data_v6['filtered_tweet'] = cleaned_tweets
data_v6

100%|██████████| 7731/7731 [00:08<00:00, 913.19it/s] 


Unnamed: 0,filtered_tweet,is_depression
0,understand people reply immediately op invitat...,1
1,welcome depression check post place take momen...,1
2,anyone else instead sleeping depressed stay ni...,1
3,kind stuffed around lot life delaying inevitab...,1
4,sleep greatest comforting escape whenever wake...,1
...,...,...
7726,snow,0
7727,moulin rouge mad cry,0
7728,trying shout find people list,0
7729,ughh find red sox hat got ta wear creepy nick ...,0


In [51]:
data_final = pd.concat([data_v5_final,data_v6])
data_final.shape

(329775, 3)

In [52]:
data_final['is_depression'].value_counts()

0    169261
1    160514
Name: is_depression, dtype: int64

In [53]:
data_final.shape

(329775, 3)

In [54]:
#Check for duplicates before saving
data_final.drop_duplicates(inplace= True)
data_final.shape

(329642, 3)

In [55]:
data_final = data_final[data_final['filtered_tweet'] != '']

In [56]:
data_final

Unnamed: 0,filtered_tweet,is_depression,Index
0,year since diagnosed today taking moment refle...,1,
1,sunday need break planning spend little time p...,1,
2,awake tired need sleep brain idea,1,
3,bear make perfect gift great beginner get stit...,1,
4,hard say whether packing list making life easi...,1,
...,...,...,...
7726,snow,0,
7727,moulin rouge mad cry,0,
7728,trying shout find people list,0,
7729,ughh find red sox hat got ta wear creepy nick ...,0,


In [57]:
#Save the final data
data_final.to_csv('E:/Data/Gradution project/Term1 final Gradution project/Memtal health data/Raw data in the model/data_final.csv')

Now we can use this data for modeling and get confident about the performance and good classifications models