In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Opt in to the future pandas behaviour in which results are no longer silently downcast.
pd.set_option('future.no_silent_downcasting', True)
import warnings
# Ignore warnings whose message begins with these fragments (the second argument is a
# regular expression matched against the start of the warning text).
# NOTE(review): presumably these are the pandas deprecation notices raised through
# seaborn — confirm before removing.
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

# Data Preparation

In [2]:
data_path = '/kaggle/input/the-depression-dataset/data/'


def load_activity(data_dir):
    """Combine every per-participant activity CSV found under ``data_dir``.

    The directory tree is walked recursively. ``scores.csv`` (participant
    metadata, loaded separately) and any non-CSV file are skipped. For each
    remaining file the ``timestamp`` column is parsed to datetimes, a
    ``number`` column holding the file name without its extension is added,
    and the ``date`` column is dropped.

    Parameters
    ----------
    data_dir : str or path-like
        Root directory that contains the activity CSV files.

    Returns
    -------
    pandas.DataFrame
        All files stacked row-wise, each keeping its own row index (the index
        is therefore not unique across participants). An empty frame is
        returned when no activity file is found.
    """
    frames = []
    for dirname, dirnames, filenames in os.walk(data_dir):
        # os.walk order is filesystem dependent; sorting makes the row order
        # of the combined frame reproducible across machines.
        dirnames.sort()
        for filename in sorted(filenames):
            stem, ext = os.path.splitext(filename)
            if filename == 'scores.csv' or ext.lower() != '.csv':
                continue
            frame = pd.read_csv(os.path.join(dirname, filename))
            frame['timestamp'] = pd.to_datetime(frame['timestamp'])
            frame['number'] = stem
            frames.append(frame.drop(columns=['date']))
    if not frames:
        # pd.concat raises on an empty list; keep the "nothing found" result
        # an empty frame as before.
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(frames)


activity = load_activity(data_path)

Each activity file contains the following columns:

* timestamp (one-minute intervals)
* date (date of measurement)
* activity (activity measurement from the actigraph watch)

In [3]:
# Show five randomly drawn rows of the combined activity table; no random_state is
# passed, so a different set of rows is shown on every run.
activity.sample(5)

Unnamed: 0,timestamp,activity,number
12342,2003-03-27 04:42:00,0,control_32
17849,2004-01-21 18:29:00,204,control_18
3830,2003-06-15 02:20:00,59,condition_5
1412,2004-09-01 08:32:00,499,condition_10
21464,2004-03-10 06:44:00,0,control_26


In [4]:
# Location of the scores file that sits next to the activity recordings.
scores_path = '/kaggle/input/the-depression-dataset/data/scores.csv'
# Read the scores table with pandas' default parsing (no dtype or NA overrides).
df = pd.read_csv(scores_path)

The scores file contains the following columns:

* number (patient identifier)
* days (number of days of measurements)
* gender (1 or 2 for female or male)
* age (age in age groups)
* afftype (1: bipolar II, 2: unipolar depressive, 3: bipolar I)
* melanch (1: melancholia, 2: no melancholia)
* inpatient (1: inpatient, 2: outpatient)
* edu (education grouped in years)
* marriage (1: married or cohabiting, 2: single)
* work (1: working or studying, 2: unemployed/sick leave/pension)
* madrs1 (MADRS score when measurement started)
* madrs2 (MADRS score when measurement stopped)

In [5]:
# Show five randomly drawn rows of the scores table; the draw is unseeded, so the
# rows change between runs.
df.sample(5)

Unnamed: 0,number,days,gender,age,afftype,melanch,inpatient,edu,marriage,work,madrs1,madrs2
0,condition_1,11,2,35-39,2.0,2.0,2.0,6-10,1.0,2.0,19.0,19.0
3,condition_4,13,2,25-29,2.0,2.0,2.0,11-15,1.0,1.0,20.0,16.0
24,control_2,20,1,30-34,,,,,,,,
11,condition_12,12,2,40-44,1.0,2.0,2.0,6-10,2.0,2.0,25.0,21.0
39,control_17,9,1,45-49,,,,,,,,


In [6]:
# Print column dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   number     55 non-null     object 
 1   days       55 non-null     int64  
 2   gender     55 non-null     int64  
 3   age        55 non-null     object 
 4   afftype    23 non-null     float64
 5   melanch    20 non-null     float64
 6   inpatient  23 non-null     float64
 7   edu        53 non-null     object 
 8   marriage   23 non-null     float64
 9   work       23 non-null     float64
 10  madrs1     23 non-null     float64
 11  madrs2     23 non-null     float64
dtypes: float64(7), int64(2), object(3)
memory usage: 5.3+ KB


In [7]:
# List the distinct values of the grouped age and education columns.
# The second label previously read "edo"; it refers to the `edu` column.
print(f'age unique: {df.age.unique()}\nedu unique: {df.edu.unique()}')

age unique: ['35-39' '40-44' '45-49' '25-29' '50-54' '20-24' '60-64' '55-59' '30-34'
 '65-69']
edo unique: ['6-10' '11-15' '16-20' ' ' nan]
