In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import joypy
import altair as alt

from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder

import random

import pytz

import warnings
warnings.filterwarnings("ignore")

# DASS Analysis

### Read in the data

In [None]:
# The raw DASS export is read as tab-separated text even though the file is named .csv.
data = pd.read_csv('../data/DASS_data_21.02.19/data.csv', sep='\t')

# Show the frame so the available columns can be inspected.
display(data)

## Overview of the data

## Separate the Data into the 3 different diseases

For the first part we need the columns beginning with Q

In [None]:
# Every column whose name starts with the letter 'Q'.
Q_cols = [name for name in data.columns if name[0] == 'Q']

# Restrict the raw data to those columns.
data_self = data.loc[:, Q_cols]

display(data_self)

The type of each question is recorded in the [DASS template](http://www2.psy.unsw.edu.au/dass//Download%20files/Dass_template.pdf), so I reproduce that mapping here for safekeeping.

In [None]:
# Subscale of each of the 42 questions, in question order:
# 'S' = Stress, 'A' = Anxiety, 'D' = Depression.
# The same 21-letter pattern covers questions 1-21 and again questions 22-42.
Q_Type = list(
    'SADADSASADSSDSADDSAAD'
    'SADADSASADSSDSADDSAAD'
)

# Question labels 'Q1' .. 'Q42'.
Q = [f'Q{number}' for number in range(1, 43)]

# One-row frame: column = question label, value = subscale letter.
Q_Type_df = pd.DataFrame([Q_Type], columns=Q)

display(Q_Type_df)

### Depression 

In [None]:
# Questions on the Depression subscale (e.g. 'Q3').
# The comparison is done element-wise *before* reducing with .all():
# `Q_Type_df[col].all() == 'D'` depends on how pandas reduces an object column
# and is always False when .all() returns a bool.
depression = [col for col in Q_Type_df if (Q_Type_df[col] == 'D').all()]

# Keep the columns of exactly those questions. The one-letter suffix is removed and
# the whole name compared, because a startswith() prefix match lets 'Q3' also pick
# up 'Q30A'..'Q39A', which belong to other subscales and inflate the score.
# assumes Q columns are named '<question><one letter>' (e.g. 'Q3A') — TODO confirm
depression_questions = {name.strip() for name in depression}
data_depression = [col for col in data_self if col.strip()[:-1] in depression_questions]

# Only the columns ending in 'A' are summed into the score.
depression_answer_cols = [col.strip() for col in data_depression if col.strip().endswith('A')]
data_depression_answers = data[depression_answer_cols].copy()

# NOTE(review): the severity bands used in this notebook start at 0; if the raw
# answers are coded 1-4 rather than 0-3, subtract 1 per item before summing —
# confirm against the dataset's documentation.
data_depression_answers['scores'] = data_depression_answers.sum(axis=1)

display(data_depression_answers['scores'].describe())

In [None]:
# Persist the depression answers together with their summed score.
data_depression_answers.to_csv(path_or_buf='../data/depression.csv')

Let's rank the depression scores as:   
'Normal' => 0-9,   
'Mild' => 10- 13,   
'Moderate' => 14 - 20,   
'Severe' => 21 - 27,   
'Extremely Severe' => 28++

In [None]:
## Percentages for Depression
# Share of respondents in each severity band, printed one per line in the order
# Normal (0-9), Mild (10-13), Moderate (14-20), Severe (21-27), Extremely Severe (28+).
depression_scores = data_depression_answers['scores']
depression_band_masks = [
    depression_scores < 10,
    depression_scores.between(10, 13),  # was `< 14`, which also counted the Normal band
    depression_scores.between(14, 20),
    depression_scores.between(21, 27),
    depression_scores > 27,
]
print(
    *((int(mask.sum()) * 100) / len(depression_scores) for mask in depression_band_masks),
    sep='\n')

'Normal' => 0,  
'Mild' => 0,  
'Moderate' => 373,   
'Severe' => 3697,   
'Extremely Severe' => 35705

### Anxiety

In [None]:
# Questions on the Anxiety subscale (e.g. 'Q2').
# The comparison is done element-wise *before* reducing with .all():
# `Q_Type_df[col].all() == 'A'` depends on how pandas reduces an object column
# and is always False when .all() returns a bool.
anxiety = [col for col in Q_Type_df if (Q_Type_df[col] == 'A').all()]

# Keep the columns of exactly those questions. The one-letter suffix is removed and
# the whole name compared, because a startswith() prefix match lets 'Q2' also pick
# up 'Q20A'..'Q29A' (and 'Q4' pick up 'Q40A'..'Q42A'), inflating the score with
# items from other subscales.
# assumes Q columns are named '<question><one letter>' (e.g. 'Q2A') — TODO confirm
anxiety_questions = {name.strip() for name in anxiety}
data_anxiety = [col for col in data_self if col.strip()[:-1] in anxiety_questions]

# Only the columns ending in 'A' are summed into the score.
anxiety_answer_cols = [col.strip() for col in data_anxiety if col.strip().endswith('A')]
data_anxiety_answers = data[anxiety_answer_cols].copy()

# NOTE(review): the severity bands used in this notebook start at 0; if the raw
# answers are coded 1-4 rather than 0-3, subtract 1 per item before summing —
# confirm against the dataset's documentation.
data_anxiety_answers['scores'] = data_anxiety_answers.sum(axis=1)

display(data_anxiety_answers['scores'].describe())
data_anxiety_answers.to_csv('../data/anxiety.csv')

In [None]:
## Percentages for Anxiety
# Share of respondents in each severity band, printed one per line in the order
# Normal (0-7), Mild (8-9), Moderate (10-14), Severe (15-19), Extremely Severe (20+).
# The previous cut-offs (`< 7`, `< 9`, `> 20`) dropped scores of exactly 7 and 20
# from their bands and counted the Normal band again under Mild.
anxiety_scores = data_anxiety_answers['scores']
anxiety_band_masks = [
    anxiety_scores <= 7,
    anxiety_scores.between(8, 9),
    anxiety_scores.between(10, 14),
    anxiety_scores.between(15, 19),
    anxiety_scores >= 20,
]
print(
    *((int(mask.sum()) * 100) / len(anxiety_scores) for mask in anxiety_band_masks),
    sep='\n')

Let's rank the anxiety scores according to the [DASS Webpage](http://www2.psy.unsw.edu.au/groups/dass/):   
'Normal' => 0-7,   
'Mild' => 8- 9,   
'Moderate' => 10 - 14,   
'Severe' => 15 - 19,   
'Extremely Severe' => 20++  

We can already see from the description of the scores that the minimum score is 21, so according to the DASS website the whole population is _Extremely Severely Anxious_.

### Stress

In [None]:
# Questions on the Stress subscale (e.g. 'Q1').
# The comparison is done element-wise *before* reducing with .all():
# `Q_Type_df[col].all() == 'S'` depends on how pandas reduces an object column
# and is always False when .all() returns a bool.
stress = [col for col in Q_Type_df if (Q_Type_df[col] == 'S').all()]

# Keep the columns of exactly those questions. The one-letter suffix is removed and
# the whole name compared, because a startswith() prefix match lets 'Q1' also pick
# up 'Q10A'..'Q19A', inflating the score with items from other subscales.
# assumes Q columns are named '<question><one letter>' (e.g. 'Q1A') — TODO confirm
stress_questions = {name.strip() for name in stress}
data_stress = [col for col in data_self if col.strip()[:-1] in stress_questions]

# Only the columns ending in 'A' are summed into the score.
stress_answer_cols = [col.strip() for col in data_stress if col.strip().endswith('A')]
data_stress_answers = data[stress_answer_cols].copy()

# NOTE(review): the severity bands used in this notebook start at 0; if the raw
# answers are coded 1-4 rather than 0-3, subtract 1 per item before summing —
# confirm against the dataset's documentation.
data_stress_answers['scores'] = data_stress_answers.sum(axis=1)

display(data_stress_answers['scores'].describe())
data_stress_answers.to_csv('../data/stress.csv')

Let's rank the stress scores according to the DASS Webpage:  
'Normal' => 0-14,  
'Mild' => 15-18,  
'Moderate' => 19 - 25,  
'Severe' => 26 - 33,   
'Extremely Severe' => 34++


In [None]:
## Percentages for Stress
print(
    (len(data_stress_answers[data_stress_answers['scores'] < 10])*100)/len(data_stress_answers),
    (len(data_stress_answers[data_stress_answers['scores'] < 14])*100)/len(data_stress_answers),
    (len(data_stress_answers[data_stress_answers['scores'].between(14,20)])*100)/len(data_stress_answers), 
    (len(data_stress_answers[data_stress_answers['scores'].between(21,27)])*100)/len(data_stress_answers), 
    (len(data_stress_answers[data_stress_answers['scores'] > 27])*100)/len(data_stress_answers)
    , sep= '\n')

In [None]:
# Number of respondents in each stress severity band, printed one per line in the
# order Normal (0-14), Mild (15-18), Moderate (19-25), Severe (26-33),
# Extremely Severe (34+). The previous version reused the depression cut-offs.
stress_scores_for_counts = data_stress_answers['scores']
print(
    int((stress_scores_for_counts <= 14).sum()),
    int(stress_scores_for_counts.between(15, 18).sum()),
    int(stress_scores_for_counts.between(19, 25).sum()),
    int(stress_scores_for_counts.between(26, 33).sum()),
    int((stress_scores_for_counts >= 34).sum()),
    sep='\n')

## Look for Relationships

In [None]:
# List every column of the raw data to see which non-question fields are available.
print(data.columns)

Included with the answers to these questions are demographics such as _hand, religion, orientation, race, voted, married, familysize and major_  

__How do these demographics influence the scores?__  

I'll start by making a dataframe which contains these demographics together with the scores for depression, anxiety and stress

In [None]:
# Output column name -> source series, in the column order of the resulting frame.
score_sources = {
    'Depression': data_depression_answers['scores'],
    'Anxiety': data_anxiety_answers['scores'],
    'Stress': data_stress_answers['scores'],
    'hand': data.hand,
    'religion': data.religion,
    'orientation': data.orientation,
    'race': data.race,
    'voted': data.voted,
    'married': data.married,
    'familysize': data.familysize,
    'major': data.major,
    'education': data.education,
    'country': data.country,
    'urban': data.urban,
    'age': data.age,
}

# Rows are assembled positionally with zip, so the n-th value of every source
# ends up in the n-th row.
score_rows = list(zip(*score_sources.values()))
data_scores = pd.DataFrame(score_rows, columns=list(score_sources))

data_scores.info()

In [None]:
# Allow up to 1000 rows to be rendered when a frame is displayed.
pd.set_option('display.max_row', 1000)

# Distinct majors after upper-casing, so spellings that differ only in case collapse.
upper_majors = data_scores['major'].str.upper()
majors = upper_majors.unique().tolist()

In [None]:
# Peek at the first ten rows of the combined scores/demographics frame.
display(data_scores.head(n=10))

'major' still has string values so let's transform it.

In [None]:
l_enc = LabelEncoder()

# Drop rows with any missing value, then rows whose country code is 'NONE' or 'XK'.
# presumably these two codes are excluded because they cannot be mapped to a
# country name later on — verify.
data_scores_no_null = data_scores.dropna().reset_index(drop=True)

# .copy() makes the filtered result an independent frame, so the column assignments
# below modify it directly rather than a slice of the dropna() result (the source of
# a SettingWithCopyWarning that the notebook's global warnings filter hides).
data_scores_no_null = data_scores_no_null[
    ~data_scores_no_null['country'].isin(['NONE', 'XK'])
].copy()

# Upper-case the majors so spellings that differ only in case get the same code.
data_scores_no_null['major'] = data_scores_no_null['major'].str.upper()
display(data_scores_no_null.head())

# Integer label for each distinct major string.
data_scores_no_null['major_coded'] = l_enc.fit_transform(data_scores_no_null['major'])

In [None]:
# Predictors shared by every fit: the three DASS subscale scores.
X = data_scores_no_null[['Depression', 'Anxiety', 'Stress']]

# Demographic columns, each fitted in turn as the response of a linear regression
# on the three scores (so a coefficient describes how the demographic value moves
# with a score, not the other way round).
# 'country' is not fitted, as before: it holds string codes at this point.
regression_targets = [
    'hand', 'religion', 'orientation', 'race', 'voted', 'married',
    'familysize', 'major_coded', 'education', 'urban', 'age',
]

# One fitted model per demographic, keyed by column name. The previous version
# reused three estimator objects, which silently discarded the 'voted',
# 'familysize' and 'major_coded' fits.
demographic_models = {}
for target in regression_targets:
    model = linear_model.LinearRegression()
    model.fit(X.values, data_scores_no_null[[target]].values)
    # coef_ holds one row of three coefficients: Depression, Anxiety, Stress.
    print(*model.coef_, target)
    demographic_models[target] = model

The result array represents the coefficient values of Depression, Anxiety and Stress.   

In the case of 'hand':   
Depression: -4.95419093e-04     
Anxiety: 1.06536105e-03    
Stress: -1.56561544e-05  

In the case of 'urban':   
Depression: 9.06183139e-04       
Anxiety: 1.77405256e-04     
Stress: -1.59427085e-05  
The values are too small for the hand used, or the neighbourhood a person grew up in, to have a significant influence.  
However, if we think about it, where one grew up should play a big enough role in whether they get one of these diseases.  

In the case of 'religion':   
Depression: -0.05492764     
Anxiety: 0.06825119   
Stress: -0.00303897    
The values are big enough to be significant.  

For 'orientation':   
Depression: -0.00129916        
Anxiety: 0.01209818    
Stress: -0.00434024   
Also significant enough to be considered.  

'race':   
Depression: 0.20711314        
Anxiety: -0.43028709         
Stress: 0.15951941   

'voted':   
Depression: -0.00238202            
Anxiety: 0.00788635         
Stress: -0.00188073   

'married':   
Depression: -0.00119288            
Anxiety: -0.00875352            
Stress: 0.0058205   

'familysize':   
Depression: -0.01307079                
Anxiety: 0.02121744            
Stress: -0.00851036   

'major':   
Depression: -0.64239818               
Anxiety: -1.9763147              
Stress: 1.48369737   
Which are all significant enough to be considered.


In [None]:
# Pairwise scatter plots of every numeric column; only the lower triangle is drawn.
pairplot_options = {
    'corner': True,
    'diag_kws': {'fill': False},
    'plot_kws': {'marker': "+", 'linewidth': 1},
}
sns_plot = sns.pairplot(data_scores_no_null, **pairplot_options)

# Keep a copy of the figure on disk.
sns_plot.savefig("output.png")

### Visualize, Visualize, Visualize

In [None]:
def _country_name(code):
    """Return the pytz country name for ``code``, or ``code`` itself if pytz has no entry."""
    try:
        return pytz.country_names[code]
    except KeyError:
        # Already a full name (the cell was run before) or an unknown code: keep as is.
        return code


# Replace country codes by country names. Falling back to the input value makes the
# cell safe to re-run; the plain lookup raised KeyError on the second execution
# because the column then already held names.
data_scores_no_null['country'] = data_scores_no_null['country'].map(_country_name)


In [None]:
# Disabled joypy ridge-plot experiment: the whole call is wrapped in a string literal,
# so nothing is plotted when this cell runs.
""" fig1,axes1 = joypy.joyplot(data_scores_no_null,
                           alpha=1,
                           background=None,
                           bins=2,
                           by="country",
                           color=['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a'],
                           column=['hand','religion','orientation','race','voted','married','familysize','education','urban','age'],
                           fade=True,
                           figsize=(19,28),
                           fill=True,
                           kind="normalized_counts",
                           legend=True,
                           linecolor='w',
                           linewidth=.25,
                           overlap=4,
                           range_style='own',
                           xlabels=False,
                           xlabelsize=10,
                           x_range=(-10,90),
                           xrot=60,
                           ylabels=True,
                           ylabelsize=10,
                           ylim='max',
                           yrot=None) """

In [None]:
# Load the pre-built long-format table used by the first interactive chart.
a_plot = pd.read_csv('../data/Categorical.csv')

# Swap each country code for its pytz country name.
a_plot['country'] = a_plot['country'].map(lambda code: pytz.country_names[code])

display(a_plot.head(n=3))



In [None]:
# Percentage of rows in a_plot that come from Malaysia.
malaysia_row_count = int((a_plot['country'] == 'Malaysia').sum())
(malaysia_row_count / len(a_plot)) * 100

In [None]:
# Columns that each become one 'Series' in the long-format table, in stacking order.
# 'major' and 'major_coded' are not included, matching the previous selection.
long_format_columns = [
    'Depression', 'Anxiety', 'Stress', 'hand', 'religion', 'orientation', 'race',
    'voted', 'married', 'familysize', 'education', 'urban', 'age',
]

# Long format: one row per (respondent, column) with the column name in 'Series',
# its value in 'Value' and the respondent's country in 'Country'. Built in one pass
# instead of thirteen copy-pasted frames; each piece keeps the index of
# data_scores_no_null, as before.
res = pd.concat([
    pd.DataFrame({
        'Series': column,
        'Value': data_scores_no_null[column],
        'Country': data_scores_no_null['country'],
    })
    for column in long_format_columns
])
res.head(10)


In [None]:
# First three rows of the chart input, as a reminder of its columns.
a_plot.head(n=3)

In [None]:
# Turn off Altair's maximum-rows check so the full table can be charted.
alt.data_transformers.disable_max_rows()

tooltip = alt.Tooltip(['disease:N', 'score:Q','country:N'])

# Multi-selection keyed on 'disease'; it drives both the colouring and the filter.
click = alt.selection_multi(fields=['disease'])

# One row per distinct disease, used as the data of the clickable panel.
series = pd.DataFrame(data=a_plot.disease.unique(), columns=['disease'])

color_s = alt.Color('disease:N')
color1 = alt.condition(click, color_s, alt.value('gray'))

# Left panel: one rectangle per disease that carries the selection.
rect = alt.Chart(series).mark_rect()
rect = rect.encode(
    alt.Y('disease:N', axis=alt.Axis(tickSize=0.05)),
    color=color1,
)
rect = rect.add_selection(click).interactive()

# Right panel: score by country, filtered to the selected diseases.
x_score = alt.X(
    'score:Q',
    axis=alt.Axis(grid=True),
    title='Disease Density',
    scale=alt.Scale(domain=[15, 85]),
)
y_country = alt.Y('country:N', axis=alt.Axis(tickSize=0.25, grid=False), title=None)
shape_disease = alt.Shape('disease:N')
size_count = alt.Size('count(score):Q', scale=alt.Scale(range=[50,500]))

chart = alt.Chart(a_plot).mark_point(filled=True, fillOpacity=0.8)
chart = chart.encode(
    x_score,
    y_country,
    shape_disease,
    size_count,
    color=alt.condition(click, color_s, alt.value('grey')),
    tooltip=tooltip,
)
chart = chart.transform_filter(click).add_selection(click)
chart = chart.properties(width=alt.Step(10), height=alt.Step(15)).interactive()

# Place the two panels side by side and render them.
c = rect | chart

c

In [None]:
# Some respondents entered implausible values. Anything of 200 or more has no
# sensible interpretation here, so those rows are dropped.
plausible_value = res['Value'] < 200
res = res[plausible_value]

In [None]:
scale = alt.Scale(scheme='category20b')
tooltip1 = alt.Tooltip(['Country:N','Series:N','Value:Q','count(Value):Q'])

# --- Interactive selectors ---
# Interval brush along the x encoding.
brush = alt.selection_interval(encodings=['x'])
# Multi-selection keyed on 'Series', driven by the rectangle panel.
click = alt.selection_multi(fields=['Series'])
# One row per distinct series, used as the data of the rectangle panel.
series = pd.DataFrame(data=res['Series'].unique(), columns=['Series'])

# --- Colors ---
# Only the encodings the charts below actually use are defined; the unused
# colour definitions of the earlier version were removed.
color_s = alt.Color('Series:N', scale=scale)
color3 = alt.condition(click, color_s, alt.value('gray'))

# --- Graphs ---
# Rectangle panel: one rectangle per series that carries the click selection.
rect = (
    alt.Chart(series)
    .mark_rect()
    .encode(alt.Y('Series:N'), color=color3)
    .add_selection(click)
    .interactive()
)

# Area chart: number of distinct values per country for the clicked series;
# this chart owns the brush.
area = (
    alt.Chart(res)
    .mark_area(interpolate='step', line=True, fillOpacity=0.2, cornerRadius=0.8)
    .encode(
        alt.X('Country:N', axis=alt.Axis(tickSize=0.25, grid=False), title=None),
        alt.Y('distinct(Value):Q', stack='center', title=None),
        color=alt.condition(brush, 'Series', alt.value('grey')),
        tooltip=tooltip1,
    )
    .transform_filter(click)
    .add_selection(brush)
    .properties(width=alt.Step(15), height=alt.Step(20))
    .interactive(bind_y=True)
)

# Point chart: average value per country, restricted to the clicked series and
# the brushed countries; point size encodes the number of rows.
pnts = (
    alt.Chart(res)
    .mark_point(filled=True, size=1000, stroke='black', strokeWidth=0.15, fillOpacity=0.4)
    .encode(
        alt.Y('average(Value):Q', axis=alt.Axis(grid=False), title=None),
        alt.X('Country:N', axis=alt.Axis(tickSize=0.25)),
        alt.Shape('Series:N'),
        alt.Size('count(Value):Q', scale=alt.Scale(range=[50,300])),
        color=alt.condition(brush, color_s, alt.value('grey')),
        tooltip=tooltip1,
    )
    .transform_filter(click)
    .transform_filter(brush)
    .properties(width=alt.Step(15), height=alt.Step(20))
    .interactive()
)

# --- Chart elements ---
# Rectangle panel on the left; area chart stacked above the point chart on the right.
chart = rect | (area & pnts)

chart
#chart.save('DASS.html', scale_factor=2.0)

So what does this data tell me??
> Very many people from Malaysia took the quiz, for one.  
> There are various arrangements of Anxiety, Depression, Stress and other demographics we can look into, and each has an explicable outcome.

However, we have to remember to refer to the codebook to explain variables like race, religion, orientation ... lest the findings seem stupid.

It was very interesting to work on this dataset. In the future I hope to do some more work on it to include more xenographic and unusual graphs.

In [None]:
# Shell escape: list the files in the notebook's working directory with human-readable sizes.
!ls -lh 