In [1]:
# Importing necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing

In [2]:
# Load the raw survey export into `df`; the path is relative to the
# notebook's working directory.
df=pd.read_csv("../Datasets/data.csv")

In [3]:
# size of dataset(rows,columns)
df.shape

(39775, 172)

In [4]:
# columns
df.columns

Index(['Q1A', 'Q1I', 'Q1E', 'Q2A', 'Q2I', 'Q2E', 'Q3A', 'Q3I', 'Q3E', 'Q4A',
       ...
       'screensize', 'uniquenetworklocation', 'hand', 'religion',
       'orientation', 'race', 'voted', 'married', 'familysize', 'major'],
      dtype='object', length=172)

In [5]:
# datatypes of column
df.dtypes

Q1A            int64
Q1I            int64
Q1E            int64
Q2A            int64
Q2I            int64
               ...  
race           int64
voted          int64
married        int64
familysize     int64
major         object
Length: 172, dtype: object

In [6]:
df.isnull().sum()

Q1A               0
Q1I               0
Q1E               0
Q2A               0
Q2I               0
              ...  
race              0
voted             0
married           0
familysize        0
major         11403
Length: 172, dtype: int64

In [7]:
# Keep only the first 172 columns, discarding any trailing unnamed columns.
# NOTE(review): df.shape above already reports 172 columns, so on the current
# file this slice removes nothing — confirm whether it is still needed.
df=df.iloc[:,:172]

In [8]:
df.head()

Unnamed: 0,Q1A,Q1I,Q1E,Q2A,Q2I,Q2E,Q3A,Q3I,Q3E,Q4A,...,screensize,uniquenetworklocation,hand,religion,orientation,race,voted,married,familysize,major
0,4,28,3890,4,25,2122,2,16,1944,4,...,1,1,1,12,1,10,2,1,2,
1,4,2,8118,1,36,2890,2,35,4777,3,...,2,1,2,7,0,70,2,1,4,
2,3,7,5784,1,33,4373,4,41,3242,1,...,2,1,1,4,3,60,1,1,3,
3,2,23,5081,3,11,6837,2,37,5521,1,...,2,1,2,4,5,70,2,1,5,biology
4,2,36,3215,2,13,7731,3,5,4156,4,...,2,2,3,10,1,10,2,1,4,Psychology


In [9]:
# This response is stored in variable A (e.g. Q1A). 
# Also recorded was the time taken in milliseconds to answer that question (E) and 
# that question's position in the survey (I).
# we don't need the position of question in survey and the time taken
# hence we will drop these columns

# Build the names of every position (I) and elapsed-time (E) column for
# questions 1..42, then remove them all with a single drop call.
timing_cols = [
    f"Q{number}{suffix}"
    for number in range(1, 43)
    for suffix in ("I", "E")
]
df.drop(columns=timing_cols, inplace=True)

In [10]:
df.head()

Unnamed: 0,Q1A,Q2A,Q3A,Q4A,Q5A,Q6A,Q7A,Q8A,Q9A,Q10A,...,screensize,uniquenetworklocation,hand,religion,orientation,race,voted,married,familysize,major
0,4,4,2,4,4,4,4,4,2,1,...,1,1,1,12,1,10,2,1,2,
1,4,1,2,3,4,4,3,4,3,2,...,2,1,2,7,0,70,2,1,4,
2,3,1,4,1,4,3,1,3,2,4,...,2,1,1,4,3,60,1,1,3,
3,2,3,2,1,3,3,4,2,3,3,...,2,1,2,4,5,70,2,1,5,biology
4,2,2,3,4,4,2,4,4,4,3,...,2,2,3,10,1,10,2,1,4,Psychology


In [11]:
#some columns we don't need because they don't help us in our problem statement
# engnat				"Is English your native language?", 1=Yes, 2=No
# hand				"What hand do you use to write with?", 1=Right, 2=Left, 3=Both
# religion			"What is your religion?", 1=Agnostic, 2=Atheist, 3=Buddhist, 4=Christian (Catholic), 5=Christian (Mormon), 6=Christian (Protestant), 7=Christian (Other), 8=Hindu, 9=Jewish, 10=Muslim, 11=Sikh, 12=Other
# orientation			"What is your sexual orientation?", 1=Heterosexual, 2=Bisexual, 3=Homosexual, 4=Asexual, 5=Other
# race				"What is your race?", 10=Asian, 20=Arab, 30=Black, 40=Indigenous Australian, 50=Native American, 60=White, 70=Other
# voted				"Have you voted in a national election in the past year?", 1=Yes, 2=No
# married				"What is your marital status?", 1=Never married, 2=Currently married, 3=Previously married
# major				"If you attended a university, what was your major (e.g. "psychology", "English", "civil engineering")?"
# country				ISO country code of where the user connected from
# screensize			1=device with small screen (phone, etc), 2=device with big screen (laptop, desktop, etc)
# uniquenetworklocation		1=only one survey from user's specific network in dataset, 2=multiple surveys submitted from the network of this user  (2 does not necessarily imply duplicate records for an individual, as it could be different students at a single school or different members of the same household; and even if 1 there still could be duplicate records from a single individual e.g. if they took it once on their wifi and once on their phone)
# source			how the user found the test, 1=from the front page of the site hosting the survey, 2=from google, 0=other or unknown
# introelapse		The time spent on the introduction/landing page (in seconds)
# testelapse		The time spent on all the DASS questions (should be equivalent to the time elapsed on all the individual questions combined)
# surveyelapse	The time spent answering the rest of the demographic and survey questions
# Demographic / technical columns that are not used for the problem statement.
drop = [
    'engnat', 'hand', 'religion', 'orientation', 'race', 'voted', 'married',
    'major', 'country', 'screensize', 'uniquenetworklocation', 'source',
    'introelapse', 'testelapse', 'surveyelapse',
]
df.drop(columns=drop, inplace=True)
df.head()

Unnamed: 0,Q1A,Q2A,Q3A,Q4A,Q5A,Q6A,Q7A,Q8A,Q9A,Q10A,...,VCL12,VCL13,VCL14,VCL15,VCL16,education,urban,gender,age,familysize
0,4,4,2,4,4,4,4,4,2,1,...,0,0,1,1,1,2,3,2,16,2
1,4,1,2,3,4,4,3,4,3,2,...,0,0,1,1,1,2,3,2,16,4
2,3,1,4,1,4,3,1,3,2,4,...,0,0,1,1,1,2,3,2,17,3
3,2,3,2,1,3,3,4,2,3,3,...,0,0,1,1,1,1,3,2,13,5
4,2,2,3,4,4,2,4,4,4,3,...,0,1,1,1,1,3,2,2,19,4


In [12]:
# since we are following DASS21 scale we are going to drop the remaining questions 
# and  reorder them according to our own survey questions
# Q1    (s)   Q29 Do you find it hard to calm down after something upset you?
# Q2    (a)   Q2  Were you aware of the dryness in your mouth?
# Q3    (d)   Q3  Were you able to experience any positive feelings?
# Q4    (a)   Q4  Do you experience breathing difficulty (eg, excessively rapid breathing, breathlessness in the absence of physical exertion).
# Q5    (d)   Q42 Do you find it difficult to work up the initiative to do things?
# Q6    (s)   Q6  Do you tend to over-react in situations?
# Q7    (a)   Q41 Do you experience trembling (eg, in your hands).
# Q8    (s)   Q33 Do you find yourself in a state of nervous tension?
# Q9    (a)   Q40 Are you worried about situations in which you might panic and make a fool of yourself?
# Q10   (d)   Q10 Do you feel that you have nothing to look forward to?
# Q11   (s)   Q39 Do you find yourself getting agitated?
# Q12   (s)   Q8  Do you find it difficult to relax?
# Q13   (d)   Q13 Do you feel sad and depressed?
# Q14   (s)   Q32 Do you find it difficult to tolerate interruptions while doing something?
# Q15   (a)   Q28 Do you find yourself on the verge of panicking?
# Q16   (d)   Q31 Do you find it hard to get enthusiastic about things?
# Q17   (d)   Q17 Do you feel you weren't worth much as a person?
# Q18   (s)   Q14 Do you find yourself getting impatient when you are delayed in any way (eg, elevators, traffic lights, being kept waiting).
# Q19   (a)   Q25 How aware are you of the action of your heart in the absence of any physical exertion (eg, sense of heart rate increase, heart missing a beat).
# Q20   (a)   Q20 Do you feel scared without any good reason?
# Q21   (d)   Q38 Do you feel that life has become meaningless?

In [13]:
# Question numbers that are not part of the 21-item survey listed above.
Q_to_drop = "1 5 7 9 11 12 15 16 18 19 21 22 23 24 26 27 30 34 35 36 37".split()
# Answer columns are named Q<number>A; drop them all in one call.
df.drop(columns=[f"Q{number}A" for number in Q_to_drop], inplace=True)
del Q_to_drop
df.head()

Unnamed: 0,Q2A,Q3A,Q4A,Q6A,Q8A,Q10A,Q13A,Q14A,Q17A,Q20A,...,VCL12,VCL13,VCL14,VCL15,VCL16,education,urban,gender,age,familysize
0,4,2,4,4,4,1,4,4,3,3,...,0,0,1,1,1,2,3,2,16,2
1,1,2,3,4,4,2,4,4,4,1,...,0,0,1,1,1,2,3,2,16,4
2,1,4,1,3,3,4,4,1,4,1,...,0,0,1,1,1,2,3,2,17,3
3,3,2,1,3,2,3,1,4,3,2,...,0,0,1,1,1,1,3,2,13,5
4,2,3,4,2,4,3,4,4,4,4,...,0,1,1,1,1,3,2,2,19,4


In [14]:
# Question mapping: original questionnaire item number -> position (1..21)
# in our own survey. Keys and values are strings because they are spliced
# directly into column names later on.
# q1 lists the original item numbers in the order of our survey questions.
q1=[29,2,3,4,42,6,41,33,40,10,39,8,13,32,28,31,17,14,25,20,38]
q_map = {str(original): str(position) for position, original in enumerate(q1, start=1)}
del q1
q_map

{'29': '1',
 '2': '2',
 '3': '3',
 '4': '4',
 '42': '5',
 '6': '6',
 '41': '7',
 '33': '8',
 '40': '9',
 '10': '10',
 '39': '11',
 '8': '12',
 '13': '13',
 '32': '14',
 '28': '15',
 '31': '16',
 '17': '17',
 '14': '18',
 '25': '19',
 '20': '20',
 '38': '21'}

In [15]:
# Subscale tag of each survey question, in q_map order (S/A/D as in the
# question list above; these tags are what the score cell later matches on).
dass=['S','A','D','A','D','S','A','S','A','D','S','S','D','S','A','D','D','S','A','A','D']
n=[]
rename_map={}
for ((key,value),d) in zip(q_map.items(),dass):
    new='Q'+value+"("+d+")"          # e.g. Q29A -> Q1(S)
    rename_map['Q'+key+"A"]=new
    n.append(new)

# Rename all question columns in one pass.
df=df.rename(columns=rename_map)

# Shift the 1-4 answers to 0-3. The result is assigned back explicitly:
# calling replace(..., inplace=True) on df[col] mutates a temporary Series,
# which is deprecated and has no effect on df under pandas Copy-on-Write.
df[n]=df[n].replace([1, 2, 3, 4], [0, 1, 2, 3])

# Append the remaining (non-question) columns so `n` holds the full target
# column order: renamed questions first, everything else after.
for i in df.columns:
    if(i not in n):
        n.append(i)

In [16]:
df.head()

Unnamed: 0,Q2(A),Q3(D),Q4(A),Q6(S),Q12(S),Q10(D),Q13(D),Q18(S),Q17(D),Q20(A),...,VCL12,VCL13,VCL14,VCL15,VCL16,education,urban,gender,age,familysize
0,3,1,3,3,3,0,3,3,2,2,...,0,0,1,1,1,2,3,2,16,2
1,0,1,2,3,3,1,3,3,3,0,...,0,0,1,1,1,2,3,2,16,4
2,0,3,0,2,2,3,3,0,3,0,...,0,0,1,1,1,2,3,2,17,3
3,2,1,0,2,1,2,0,3,2,1,...,0,0,1,1,1,1,3,2,13,5
4,1,2,3,1,3,2,3,3,3,3,...,0,1,1,1,1,3,2,2,19,4


In [17]:
# Put the columns in the order collected in `n` (questions Q1..Q21 first).
df = df.reindex(n, axis="columns")
df.head()

Unnamed: 0,Q1(S),Q2(A),Q3(D),Q4(A),Q5(D),Q6(S),Q7(A),Q8(S),Q9(A),Q10(D),...,VCL12,VCL13,VCL14,VCL15,VCL16,education,urban,gender,age,familysize
0,3,3,1,3,3,3,3,1,2,0,...,0,0,1,1,1,2,3,2,16,2
1,2,0,1,2,1,3,1,2,0,1,...,0,0,1,1,1,2,3,2,16,4
2,1,0,3,0,3,2,0,0,1,3,...,0,0,1,1,1,2,3,2,17,3
3,2,2,1,0,1,2,3,1,3,2,...,0,0,1,1,1,1,3,2,13,5
4,1,1,2,3,2,1,3,3,3,2,...,0,1,1,1,1,3,2,2,19,4


In [18]:
# Calculating Scores of Each Participant
# Placeholder lists; nothing in this cell populates them.
s=[]
d=[]
a=[]
# For each subscale, add up every column whose name carries that subscale's
# tag and store the row-wise total in the matching score column.
for score_col, tag in (('Str', "(S)"), ('Anx', "(A)"), ('Dep', "(D)")):
    tagged_cols = [name for name in df.columns if tag in name]
    df[score_col] = df[tagged_cols].sum(axis=1, skipna=False)

In [19]:
df.head()

Unnamed: 0,Q1(S),Q2(A),Q3(D),Q4(A),Q5(D),Q6(S),Q7(A),Q8(S),Q9(A),Q10(D),...,VCL15,VCL16,education,urban,gender,age,familysize,Str,Anx,Dep
0,3,3,1,3,3,3,3,1,2,0,...,1,1,2,3,2,16,2,19,18,13
1,2,0,1,2,1,3,1,2,0,1,...,1,1,2,3,2,16,4,16,7,11
2,1,0,3,0,3,2,0,0,1,3,...,1,1,2,3,2,17,3,8,2,21
3,2,2,1,0,1,2,3,1,3,2,...,1,1,1,3,2,13,5,11,9,8
4,1,1,2,3,2,1,3,3,3,2,...,1,1,3,2,2,19,4,16,19,16


In [20]:
# Double every subscale total in one vectorised step.
# NOTE(review): presumably this puts the 21-item sums on the scale the
# severity cut-offs below expect — confirm. Re-running this cell doubles again.
score_cols = ['Str', 'Dep', 'Anx']
df[score_cols] = df[score_cols] * 2

In [21]:
df.head()

Unnamed: 0,Q1(S),Q2(A),Q3(D),Q4(A),Q5(D),Q6(S),Q7(A),Q8(S),Q9(A),Q10(D),...,VCL15,VCL16,education,urban,gender,age,familysize,Str,Anx,Dep
0,3,3,1,3,3,3,3,1,2,0,...,1,1,2,3,2,16,2,38,36,26
1,2,0,1,2,1,3,1,2,0,1,...,1,1,2,3,2,16,4,32,14,22
2,1,0,3,0,3,2,0,0,1,3,...,1,1,2,3,2,17,3,16,4,42
3,2,2,1,0,1,2,3,1,3,2,...,1,1,1,3,2,13,5,22,18,16
4,1,1,2,3,2,1,3,3,3,2,...,1,1,3,2,2,19,4,32,38,32


### Filtering out rows using VCL6, VCL9 and VCL12, because we use these columns for verification

In [22]:
# Keep only the rows whose VCL6 value is below 1.
df_upd = df.loc[df['VCL6'].lt(1)]

In [23]:
# Of the remaining rows, keep only those whose VCL9 value is below 1.
df_upd = df_upd.loc[df_upd['VCL9'].lt(1)]

In [24]:
# Of the remaining rows, keep only those whose VCL12 value is below 1.
# Take an explicit copy: later cells add new columns to df_upd, and doing that
# on a filtered slice of `df` raises SettingWithCopyWarning.
df_upd = df_upd[df_upd['VCL12'] < 1].copy()

In [25]:
df_upd.shape

(34583, 55)

In [26]:
# Sanity check: count the rows that still have VCL9 == 1 after filtering.
d = df_upd['VCL9'].eq(1)
d.sum()

0

In [27]:
# Severity labels in ascending order; passed as `labels` to pd.cut in the
# following cell, so position i names the i-th score interval.
classes=['Normal','Mild','Moderate','Severe','Extremely Severe']

In [28]:
# Bin edges per subscale: label column -> (score column, edges).
# Intervals are right-closed (pd.cut default), so -1 makes a score of 0 fall
# into the first bin and 42 is the inclusive top edge.
severity_bins = {
    'Depression': ('Dep', [-1, 9, 13, 20, 27, 42]),
    'Anxiety': ('Anx', [-1, 7, 9, 14, 19, 42]),
    'Stress': ('Str', [-1, 14, 18, 25, 33, 42]),
}
for label_col, (score_col, edges) in severity_bins.items():
    df_upd[label_col] = pd.cut(df_upd[score_col], bins=edges, labels=classes)

In [29]:
df_upd.head()

Unnamed: 0,Q1(S),Q2(A),Q3(D),Q4(A),Q5(D),Q6(S),Q7(A),Q8(S),Q9(A),Q10(D),...,urban,gender,age,familysize,Str,Anx,Dep,Depression,Anxiety,Stress
0,3,3,1,3,3,3,3,1,2,0,...,3,2,16,2,38,36,26,Severe,Extremely Severe,Extremely Severe
1,2,0,1,2,1,3,1,2,0,1,...,3,2,16,4,32,14,22,Severe,Moderate,Severe
2,1,0,3,0,3,2,0,0,1,3,...,3,2,17,3,16,4,42,Extremely Severe,Normal,Mild
3,2,2,1,0,1,2,3,1,3,2,...,3,2,13,5,22,18,16,Moderate,Severe,Moderate
4,1,1,2,3,2,1,3,3,3,2,...,2,2,19,4,32,38,32,Extremely Severe,Extremely Severe,Severe


In [30]:
df_upd['Depression'].unique()

['Severe', 'Extremely Severe', 'Moderate', 'Mild', 'Normal']
Categories (5, object): ['Normal' < 'Mild' < 'Moderate' < 'Severe' < 'Extremely Severe']

In [31]:
# Map each severity label to its ordinal code (0 = Normal ... 4 = Extremely Severe).
scale_mapper = {'Normal':0 ,'Mild':1, 'Moderate':2, 'Severe':3 ,'Extremely Severe':4}
col=['Depression','Anxiety','Stress']
for i in col:
    # These columns are categorical (produced by pd.cut). Relabel the
    # categories directly: using Series.replace to change the categories of a
    # categorical column is deprecated in pandas. Labels missing from the
    # mapping are left unchanged, so re-running the cell is harmless.
    df_upd[i]= df_upd[i].cat.rename_categories(scale_mapper)

In [32]:
df_upd.head()

Unnamed: 0,Q1(S),Q2(A),Q3(D),Q4(A),Q5(D),Q6(S),Q7(A),Q8(S),Q9(A),Q10(D),...,urban,gender,age,familysize,Str,Anx,Dep,Depression,Anxiety,Stress
0,3,3,1,3,3,3,3,1,2,0,...,3,2,16,2,38,36,26,3,4,4
1,2,0,1,2,1,3,1,2,0,1,...,3,2,16,4,32,14,22,3,2,3
2,1,0,3,0,3,2,0,0,1,3,...,3,2,17,3,16,4,42,4,0,1
3,2,2,1,0,1,2,3,1,3,2,...,3,2,13,5,22,18,16,2,3,2
4,1,1,2,3,2,1,3,3,3,2,...,2,2,19,4,32,38,32,4,4,3


In [33]:
# Save the filtered, scored and labelled frame to CleanData.csv so the next
# stage can reload it; index=False leaves the row index out of the file.
df_upd.to_csv('../Datasets/CleanData.csv',index=False)

### Dropping VCL1-VCL16 Columns because they were used for validation

In [34]:
# Reload the CleanData.csv written by the save cell above; `df` now refers to
# the cleaned data rather than the raw survey export.
df=pd.read_csv("../Datasets/CleanData.csv")

In [35]:
df.shape

(34583, 58)

In [36]:
# Remove all sixteen VCL columns (VCL1 .. VCL16) in a single drop call.
vcl_cols = [f"VCL{number}" for number in range(1, 17)]
df.drop(columns=vcl_cols, inplace=True)

df.head()

Unnamed: 0,Q1(S),Q2(A),Q3(D),Q4(A),Q5(D),Q6(S),Q7(A),Q8(S),Q9(A),Q10(D),...,urban,gender,age,familysize,Str,Anx,Dep,Depression,Anxiety,Stress
0,3,3,1,3,3,3,3,1,2,0,...,3,2,16,2,38,36,26,3,4,4
1,2,0,1,2,1,3,1,2,0,1,...,3,2,16,4,32,14,22,3,2,3
2,1,0,3,0,3,2,0,0,1,3,...,3,2,17,3,16,4,42,4,0,1
3,2,2,1,0,1,2,3,1,3,2,...,3,2,13,5,22,18,16,2,3,2
4,1,1,2,3,2,1,3,3,3,2,...,2,2,19,4,32,38,32,4,4,3


### Filtering Age

In [37]:
df.age.min()

13

In [38]:
# Keep respondents aged 18 or older and younger than 85.
age_in_range = (df['age'] >= 18) & (df['age'] < 85)
df_upd = df[age_in_range]

In [39]:
df_upd.shape

(28472, 42)

### Filtering familysize column

In [40]:
df_upd.familysize.max()

99

In [41]:
# Keep rows whose reported family size is at most 15.
df_upd = df_upd.loc[df_upd['familysize'].le(15)]

In [42]:
df_upd.shape

(28465, 42)

### Separating Depression, Anxiety and Stress Dataset

In [43]:
# The first 21 columns are the question columns; select them by label.
df_q = df_upd.loc[:, df_upd.columns[:21]]

In [44]:
# Three independent empty frames, one per subscale, filled in the next cell.
df_dep, df_str, df_anx = (pd.DataFrame() for _ in range(3))

In [45]:
# Route each question column to the frame of the subscale named in its tag,
# then attach that subscale's severity label as the last column.
subscale_targets = (
    ("(D)", df_dep, "Depression"),
    ("(A)", df_anx, "Anxiety"),
    ("(S)", df_str, "Stress"),
)
for tag, frame, label_col in subscale_targets:
    for question in df_q.columns:
        if tag in question:
            frame[question] = df_upd[question]
    frame[label_col] = df_upd[label_col]

In [46]:
# Preview all three subscale frames. A notebook cell only renders its last
# bare expression, so the first two .head() calls were silently discarded;
# display() (provided by the IPython/Jupyter kernel) renders each of them.
display(df_dep.head(), df_str.head(), df_anx.head())

Unnamed: 0,Q2(A),Q4(A),Q7(A),Q9(A),Q15(A),Q19(A),Q20(A),Anxiety
4,1,3,3,3,3,3,3,4
5,0,0,0,0,0,0,1,0
7,1,0,2,3,3,2,3,4
9,2,1,0,2,1,2,2,4
10,0,0,2,1,0,0,1,1


In [47]:
df_dep.shape

(28465, 8)

In [48]:
# Write each subscale frame to its own CSV, without the row index.
subscale_outputs = (
    (df_dep, '../Datasets/Depression.csv'),
    (df_anx, '../Datasets/Anxiety.csv'),
    (df_str, '../Datasets/Stress.csv'),
)
for frame, csv_path in subscale_outputs:
    frame.to_csv(csv_path, index=False)

In [49]:
# Overwrite CleanData.csv with the age- and familysize-filtered rows
# (the VCL columns were already dropped from this frame); index not written.
df_upd.to_csv('../Datasets/CleanData.csv',index=False)

In [50]:
# Reload the CleanData.csv just written; because the file has no index
# column, `df` comes back with a fresh 0..n-1 row index.
df=pd.read_csv("../Datasets/CleanData.csv")

In [51]:
# Bucket familysize into three right-closed ranges: (-1, 5], (5, 10], (10, 15].
classes = ['Nuclear', 'Joint', 'Extended']
family_bins = [-1, 5, 10, 15]
df['family'] = pd.cut(df['familysize'], bins=family_bins, labels=classes)

In [52]:
# Integer-encode the family category (`preprocessing` is imported in the
# notebook's import cell rather than re-imported here).
# The code of each label is its position in classes_, shown as the cell
# output; note this is not the Nuclear/Joint/Extended order used for the bins.
label_enc = preprocessing.LabelEncoder()
df["family_enc"]=label_enc.fit_transform(df["family"])
list(label_enc.classes_) 

['Extended', 'Joint', 'Nuclear']

In [53]:
df.head()

Unnamed: 0,Q1(S),Q2(A),Q3(D),Q4(A),Q5(D),Q6(S),Q7(A),Q8(S),Q9(A),Q10(D),...,age,familysize,Str,Anx,Dep,Depression,Anxiety,Stress,family,family_enc
0,1,1,2,3,2,1,3,3,3,2,...,19,4,32,38,32,4,4,3,Nuclear,2
1,2,0,1,0,1,0,0,3,0,1,...,20,4,18,2,12,1,0,1,Nuclear,2
2,3,1,3,0,3,3,2,3,3,3,...,18,3,38,28,42,4,4,4,Nuclear,2
3,2,2,1,1,1,2,0,3,2,2,...,18,2,30,20,20,2,4,3,Nuclear,2
4,2,0,0,0,1,1,2,0,1,2,...,20,2,22,8,16,2,1,2,Nuclear,2


In [54]:
# Overwrite CleanData.csv again, now including the 'family' and 'family_enc'
# columns added above; index not written.
df.to_csv('../Datasets/CleanData.csv',index=False)

In [55]:
df['age'].max()

82

In [56]:
# Bucket age into right-closed ranges: (17, 25], (25, 35], (35, 45], (45, 60], (60, inf).
# The lower edge is 17 so that 18 is the youngest age in the first class.
# The top edge is open-ended instead of the observed maximum (82): the age
# filter earlier in the notebook admits ages up to 84, and any value above a
# hard-coded 82 would silently become NaN.
classes=['18-25','26-35','36-45','46-60','60+']
df['age class']=pd.cut(df['age'],bins=[17,25,35,45,60,np.inf],labels=classes)

In [57]:
# Integer-encode the age class with a fresh LabelEncoder (`preprocessing` is
# imported in the notebook's import cell rather than re-imported here).
# The code of each label is its position in classes_, shown as the cell output.
label_enc = preprocessing.LabelEncoder()
df["age_enc"]=label_enc.fit_transform(df["age class"])
list(label_enc.classes_) 

['18-25', '26-35', '36-45', '46-60', '60+']

In [58]:
# Final overwrite of CleanData.csv, now including the 'age class' and
# 'age_enc' columns added above; index not written.
df.to_csv('../Datasets/CleanData.csv',index=False)