In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [62]:
# Load the raw survey responses from the project-relative data folder.
df = pd.read_csv('data/Student_Mental_Health.csv')

In [63]:
# Peek at the first five raw rows to confirm the file loaded as expected.
df.head(n=5)

Unnamed: 0,Timestamp,Choose your gender,Age,What is your course?,Your current year of Study,What is your CGPA?,Marital status,Do you have Depression?,Do you have Anxiety?,Do you have Panic attack?,Did you seek any specialist for a treatment?
0,8/7/2020 12:02,Female,18.0,Engineering,year 1,3.00 - 3.49,No,Yes,No,Yes,No
1,8/7/2020 12:04,Male,21.0,Islamic education,year 2,3.00 - 3.49,No,No,Yes,No,No
2,8/7/2020 12:05,Male,19.0,BIT,Year 1,3.00 - 3.49,No,Yes,Yes,Yes,No
3,8/7/2020 12:06,Female,22.0,Laws,year 3,3.00 - 3.49,Yes,Yes,No,No,No
4,8/7/2020 12:13,Male,23.0,Mathemathics,year 4,3.00 - 3.49,No,No,No,No,No


##### Checking for NaN Values

In [28]:
# Count missing values per column.
df.isna().sum()

Timestamp                                       0
Choose your gender                              0
Age                                             1
What is your course?                            0
Your current year of Study                      0
What is your CGPA?                              0
Marital status                                  0
Do you have Depression?                         0
Do you have Anxiety?                            0
Do you have Panic attack?                       0
Did you seek any specialist for a treatment?    0
dtype: int64

In [64]:
# Show the row(s) whose Age is missing.
df.loc[df['Age'].isna()]

Unnamed: 0,Timestamp,Choose your gender,Age,What is your course?,Your current year of Study,What is your CGPA?,Marital status,Do you have Depression?,Do you have Anxiety?,Do you have Panic attack?,Did you seek any specialist for a treatment?
43,8/7/2020 15:07,Male,,BIT,year 1,0 - 1.99,No,No,No,No,No


In [65]:
# Only one row lacks an Age value, so drop it rather than impute.
df = df.dropna(axis='index', subset=['Age'])

In [66]:
# Re-check: every column should now report zero missing values.
df.isna().sum()

Timestamp                                       0
Choose your gender                              0
Age                                             0
What is your course?                            0
Your current year of Study                      0
What is your CGPA?                              0
Marital status                                  0
Do you have Depression?                         0
Do you have Anxiety?                            0
Do you have Panic attack?                       0
Did you seek any specialist for a treatment?    0
dtype: int64

##### Renaming Columns

In [67]:
# List the current column labels (the full survey questions) before renaming them.
df.columns

Index(['Timestamp', 'Choose your gender', 'Age', 'What is your course?',
       'Your current year of Study', 'What is your CGPA?', 'Marital status',
       'Do you have Depression?', 'Do you have Anxiety?',
       'Do you have Panic attack?',
       'Did you seek any specialist for a treatment?'],
      dtype='object')

In [69]:
# Replace the long survey-question headers with short snake_case names.
df = df.rename(
    {
        'Timestamp': 'timestamp',
        'Choose your gender': 'gender',
        'Age': 'age',
        'What is your course?': 'course',
        'Your current year of Study': 'study_year',
        'What is your CGPA?': 'cgpa',
        'Marital status': "marital_status",
        'Do you have Depression?': 'depression',
        'Do you have Anxiety?': 'anxiety',
        'Do you have Panic attack?': 'panic_attack',
        'Did you seek any specialist for a treatment?': 'specialist_consultation_status',
    },
    axis='columns',
)

In [70]:
# Parse day-first timestamps such as "8/7/2020 12:02" (-> 2020-07-08 12:02).
# format='mixed' lets pandas infer the layout of each value individually.
raw_timestamps = df['timestamp']
parsed_timestamps = pd.to_datetime(
    raw_timestamps, format='mixed', dayfirst=True, errors='coerce'
)

# errors='coerce' silently turns unparseable values into NaT, and the null
# check earlier in the notebook ran before this conversion. Report any value
# that was present before parsing but is missing afterwards, so the loss is
# visible instead of slipping into the cleaned data.
newly_missing = parsed_timestamps.isna() & raw_timestamps.notna()
if newly_missing.any():
    print(
        f"WARNING: {int(newly_missing.sum())} timestamp value(s) "
        "could not be parsed and were set to NaT"
    )

df['timestamp'] = parsed_timestamps

In [71]:
# Preview the first five rows after renaming and timestamp parsing.
df.head()

Unnamed: 0,timestamp,gender,age,course,study_year,cgpa,marital_status,depression,anxiety,panic_attack,specialist_consultation_status
0,2020-07-08 12:02:00,Female,18.0,Engineering,year 1,3.00 - 3.49,No,Yes,No,Yes,No
1,2020-07-08 12:04:00,Male,21.0,Islamic education,year 2,3.00 - 3.49,No,No,Yes,No,No
2,2020-07-08 12:05:00,Male,19.0,BIT,Year 1,3.00 - 3.49,No,Yes,Yes,Yes,No
3,2020-07-08 12:06:00,Female,22.0,Laws,year 3,3.00 - 3.49,Yes,Yes,No,No,No
4,2020-07-08 12:13:00,Male,23.0,Mathemathics,year 4,3.00 - 3.49,No,No,No,No,No


##### Converting Data Types & Checking Outliers

In [74]:
# Inspect the column dtypes before converting the Yes/No columns.
df.dtypes

timestamp                         datetime64[ns]
gender                                    object
age                                      float64
course                                    object
study_year                                object
cgpa                                      object
marital_status                            object
depression                                object
anxiety                                   object
panic_attack                              object
specialist_consultation_status            object
dtype: object

In [75]:
# Print the distinct values of each Yes/No column, preceded by a blank line
# and a header, to confirm there are no stray spellings.
for col in (
    'marital_status',
    'depression',
    'anxiety',
    'panic_attack',
    'specialist_consultation_status',
):
    print(f"\n{col} unique values:", df[col].unique(), sep="\n")


marital_status unique values:
['No' 'Yes']

depression unique values:
['Yes' 'No']

anxiety unique values:
['No' 'Yes']

panic_attack unique values:
['Yes' 'No']

specialist_consultation_status unique values:
['No' 'Yes']


In [76]:
# Yes/No answer columns, stored as pandas categoricals instead of plain objects.
categorical_cols = [
    'marital_status', 'depression', 'anxiety',
    'panic_attack', 'specialist_consultation_status',
]

# A per-column dtype mapping converts all of them in one astype call.
df = df.astype({name: 'category' for name in categorical_cols})

In [77]:
# Confirm the Yes/No columns now have the category dtype.
df.dtypes

timestamp                         datetime64[ns]
gender                                    object
age                                      float64
course                                    object
study_year                                object
cgpa                                      object
marital_status                          category
depression                              category
anxiety                                 category
panic_attack                            category
specialist_consultation_status          category
dtype: object

In [78]:
# Distinct gender labels present in the data.
df['gender'].unique()

array(['Female', 'Male'], dtype=object)

In [80]:
# Frequency of each age, as a quick look for implausible values.
df['age'].value_counts()

age
18.0    32
24.0    23
19.0    21
23.0    13
20.0     6
21.0     3
22.0     2
Name: count, dtype: int64

##### Removing Duplicates, Saving the Cleaned Data & Normalizing Study Year

In [87]:
# Remove fully duplicated rows; re-assign rather than mutate in place.
df = df.drop_duplicates()

In [88]:
# saving cleaned data
# The raw study_year labels mix spellings such as 'Year 1' and 'year 1'.
# Normalise them in the frame that is written, so the CSV on disk does not
# keep the inconsistent labels. assign() returns a new frame, so the
# in-memory `df` is left untouched by this cell.
df.assign(
    study_year=df['study_year'].str.strip().str.lower()
).to_csv("data/processed/cleaned_data.csv", index=False)

In [89]:
# Fold variants such as 'Year 1' / 'year 1' into one lowercase, trimmed label.
df['study_year'] = df['study_year'].str.lower().str.strip()

In [92]:
# Count respondents per normalised study year.
df['study_year'].value_counts()

study_year
year 1    42
year 2    26
year 3    24
year 4     8
Name: count, dtype: int64