# Depression Detection Model for Twitter Accounts
Santiago Paiz

Ekaterina Podruzhko

Abdullah Al Saidi

Hisham Al Hashmi



# Data Discovery

In [1]:
import pandas as pd

In [2]:
# Load the four CSV exports used by this notebook.
# The first three files are each named after a first-person depression phrase
# (presumably the search query used to collect the tweets -- TODO confirm).
fighting_dep = pd.read_csv('data/depression/I am fighting depression.csv')
suffering_dep = pd.read_csv('data/depression/I suffer from depression.csv')
diagnosed_dep = pd.read_csv('data/depression/I am diagnosed with depression.csv')
# Timeline tweets; these rows are labelled further down by matching on user_id.
users = pd.read_csv('data/depression/users_timelines.csv')


In [3]:
# Preview two random rows; the fixed seed keeps the preview identical across
# Restart & Run All instead of showing different rows on every execution.
fighting_dep.sample(2, random_state=42)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
57,1430537144121151496,1430537144121151496,2021-08-25 14:26:47 UTC,2021-08-25,14:26:47,0,1168963714491568128,dbubblegumface,Donkeyboy Bubblegumface,,...,,,,,,[],,,,
25,1433196732188332035,1433196106385530880,2021-09-01 22:35:03 UTC,2021-09-01,22:35:03,0,1217528909383839746,agsmaiinthing,soph,,...,,,,,,"[{'screen_name': 'madisonbeer', 'name': 'madis...",,,,


In [4]:
# Preview two random rows; the fixed seed keeps the preview identical across
# Restart & Run All instead of showing different rows on every execution.
suffering_dep.sample(2, random_state=42)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
25,1434874823537213443,1434870459401744389,2021-09-06 13:43:11 UTC,2021-09-06,13:43:11,0,1252233854401671168,assyjack,Assy Jack,,...,,,,,,"[{'screen_name': 'KendraWrites', 'name': 'Kend...",,,,
527,1432123792109735937,1432123792109735937,2021-08-29 23:31:34 UTC,2021-08-29,23:31:34,0,484986884,stoveraymond,Steve Raymond,,...,,,,,,[],,,,


In [5]:
# Preview two random rows; the fixed seed keeps the preview identical across
# Restart & Run All instead of showing different rows on every execution.
diagnosed_dep.sample(2, random_state=42)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
52,1430947902017478660,1430947902017478660,2021-08-26 17:39:00 UTC,2021-08-26,17:39:00,0,26999982,eyeammichael,Eye am RA ☀️,,...,,,,,,[],,,,
44,1431707154692526080,1431707129568587777,2021-08-28 19:56:00 UTC,2021-08-28,19:56:00,0,1365908852596563970,bl4rh31mr,Dormant Blauheim,,...,,,,,,[],,,,


In [6]:
# Summarise the timelines frame: per-column non-null counts and dtypes.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 36 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               48895 non-null  int64  
 1   conversation_id  48895 non-null  int64  
 2   created_at       48895 non-null  object 
 3   date             48895 non-null  object 
 4   time             48895 non-null  object 
 5   timezone         48895 non-null  int64  
 6   user_id          48895 non-null  int64  
 7   username         48895 non-null  object 
 8   name             48895 non-null  object 
 9   place            41 non-null     object 
 10  tweet            48895 non-null  object 
 11  language         48895 non-null  object 
 12  mentions         48895 non-null  object 
 13  urls             48895 non-null  object 
 14  photos           48895 non-null  object 
 15  replies_count    48895 non-null  int64  
 16  retweets_count   48895 non-null  int64  
 17  likes_count 

# Data Cleaning

1. Clean nan columns
2. Join the tables of depressed Twitter users (`depressed`)
3. Add a column that represents whether the user is depressed (to the `depressed` dataframe).
4. Intersect the depressed dataframe with the users data frame to categorize the depressed with the non depressed.
5. Get the final data frame by adding both

First off, we want to remove all the columns that are completely empty.

## 1. Clean Nan Columns

First we drop all the columns that contain only null values.

In [7]:
# One boolean per column: True when every value in that column is missing.
users.isna().all(axis=0)

id                 False
conversation_id    False
created_at         False
date               False
time               False
timezone           False
user_id            False
username           False
name               False
place              False
tweet              False
language           False
mentions           False
urls               False
photos             False
replies_count      False
retweets_count     False
likes_count        False
hashtags           False
cashtags           False
link               False
retweet            False
quote_url          False
video              False
thumbnail          False
near                True
geo                 True
source              True
user_rt_id          True
user_rt             True
retweet_id          True
reply_to           False
retweet_date        True
translate           True
trans_src           True
trans_dest          True
dtype: bool

In [8]:
# Remove every column of `users` whose values are all missing.
# This mutates `users` in place.
users.dropna(axis='columns', how='all', inplace=True)

In [9]:
# Apply the same clean-up to each depression frame: drop, in place, the
# columns that contain nothing but missing values.
for frame in (fighting_dep, suffering_dep, diagnosed_dep):
    frame.dropna(axis='columns', how='all', inplace=True)

In [11]:
# After the column drop the three depression frames must still expose exactly
# the same set of column names (order is not checked).
assert set(fighting_dep.columns) == set(suffering_dep.columns)
assert set(suffering_dep.columns) == set(diagnosed_dep.columns)

The three depression dataframes share the same schema; only the `users` dataframe differs. So we check which columns differ between `users` and one of the depression dataframes:

In [12]:
# Column names that appear in exactly one of the two frames.
print(set(diagnosed_dep.columns) ^ set(users.columns))

{'place'}


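The only difference is that the `users` dataframe contains a `place` column, i.e. information about the place a tweet was posted from. It is populated for only 41 of the 48,895 rows; a few of them are shown below: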

In [15]:
# Show three of the rows that actually carry a `place` value.
# `notna()` replaces the `isnull() == False` comparison, and the fixed seed
# keeps the preview identical across re-runs.
users[users['place'].notna()].sample(3, random_state=42)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,reply_to
29497,1435217862138941447,1435217862138941447,2021-09-07 12:26:18 UTC,2021-09-07,12:26:18,0,65581806,scott1984fp,Scott1984FP,"{'type': 'Point', 'coordinates': [52.13921, -0...",...,1,1,[],[],https://twitter.com/Scott1984FP/status/1435217...,False,,0,,[]
29425,1435284078245203968,1435284078245203968,2021-09-07 16:49:25 UTC,2021-09-07,16:49:25,0,65581806,scott1984fp,Scott1984FP,"{'type': 'Point', 'coordinates': [52.13921, -0...",...,0,0,[],[],https://twitter.com/Scott1984FP/status/1435284...,False,,0,,[]
29417,1435313579582967812,1435313579582967812,2021-09-07 18:46:38 UTC,2021-09-07,18:46:38,0,65581806,scott1984fp,Scott1984FP,"{'type': 'Point', 'coordinates': [52.13921, -0...",...,0,0,[],[],https://twitter.com/Scott1984FP/status/1435313...,False,,0,,[]


## 2. Joining tables

In [22]:
# Stack the three depression frames on top of each other with a fresh
# 0..n-1 index.
# NOTE(review): a tweet matching more than one search phrase would appear
# more than once here -- confirm whether de-duplication on `id` is needed.
self_report_frames = [fighting_dep, suffering_dep, diagnosed_dep]
depressed = pd.concat(self_report_frames, ignore_index=True)

## 3. Add column that represents if the user is depressed

In [37]:
# Every row coming from the depression searches gets the positive label.
depressed = depressed.assign(Depressed=1)

## 4. Intersect the depressed dataframe with the users data frame to categorize the depressed with the non depressed.

In [38]:
# user_ids that occur both in the depression frame and in the timelines frame.
depressed_user_ids = set(depressed['user_id'].values)
timeline_user_ids = set(users['user_id'].values)
intersection = depressed_user_ids & timeline_user_ids

In [39]:
def add_depression(row):
    """Return the depression label for one row.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'user_id'`` (e.g. a DataFrame row).

    Returns
    -------
    int
        1 when the row's ``user_id`` is a member of the module-level
        ``intersection`` collection, otherwise 0.
    """
    return 1 if row['user_id'] in intersection else 0

In [40]:
# Label every timeline row: 1 when its user_id is in `intersection`, else 0.
# A vectorised membership test replaces the row-by-row `apply(..., axis=1)`,
# which calls a Python function once per row; the resulting 0/1 integer
# column is the same.
users['Depressed'] = users['user_id'].isin(intersection).astype('int64')

## 5. Get the final data frame by adding both

In [42]:
# Final dataset: the labelled depression rows followed by the labelled
# timeline rows, re-indexed from 0. Columns present in only one of the two
# frames are filled with NaN for the rows of the other frame.
complete_df = pd.concat(objs=[depressed, users], axis=0, ignore_index=True)

In [48]:
# Class balance: number of rows with a non-null `id` for each label value.
complete_df.groupby('Depressed')[['id']].count()

Unnamed: 0_level_0,id
Depressed,Unnamed: 1_level_1
0,8694
1,40893
