In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# read data
df = pd.read_csv('train.csv')

In [3]:
# head of data
df.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [4]:
# info 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 20 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   id                                     140700 non-null  int64  
 1   Name                                   140700 non-null  object 
 2   Gender                                 140700 non-null  object 
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  object 
 5   Working Professional or Student        140700 non-null  object 
 6   Profession                             104070 non-null  object 
 7   Academic Pressure                      27897 non-null   float64
 8   Work Pressure                          112782 non-null  float64
 9   CGPA                                   27898 non-null   float64
 10  Study Satisfaction                     27897 non-null   

In [5]:
# No. of columns
df.columns

Index(['id', 'Name', 'Gender', 'Age', 'City',
       'Working Professional or Student', 'Profession', 'Academic Pressure',
       'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
       'Sleep Duration', 'Dietary Habits', 'Degree',
       'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
       'Financial Stress', 'Family History of Mental Illness', 'Depression'],
      dtype='object')

# Things To Analyse

- [] Male or Female have more mental problem 
- [] Which age group have more mental problem (both men and for women)
- [] Which city have more mental problem (both men and for women)
- [] Student or Working proffesion have more mental problem (both men and for women)
- [] which proffesion have less mental problems
- [] Which city have less mental problem (both men and for women)
- [] we should anlayse student separatedly from working proffesionals

# messy data
- [] `Proffesion` contain some missing values most of them are for students in proffesion
- [] `academy` contain many missing values becuase it is only for students not for working proffessinals
- [] `work pressure` similar , for students , they have no values, same for study satisfaction and work satisfaction
- [] `woking proffesion or student` , `familey histroy`, `male or female`, and `Suicidal thought` should convert to binary


In [6]:
# drop unnessasry columns
df.drop(columns=['id','Name'],inplace=True)

In [7]:
# Missing values count
df.isnull().sum()

Gender                                        0
Age                                           0
City                                          0
Working Professional or Student               0
Profession                                36630
Academic Pressure                        112803
Work Pressure                             27918
CGPA                                     112802
Study Satisfaction                       112803
Job Satisfaction                          27910
Sleep Duration                                0
Dietary Habits                                4
Degree                                        2
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              0
Financial Stress                              4
Family History of Mental Illness              0
Depression                                    0
dtype: int64

In [8]:
# Percentage of missing values
df.isnull().mean()*100

Gender                                    0.000000
Age                                       0.000000
City                                      0.000000
Working Professional or Student           0.000000
Profession                               26.034115
Academic Pressure                        80.172708
Work Pressure                            19.842217
CGPA                                     80.171997
Study Satisfaction                       80.172708
Job Satisfaction                         19.836532
Sleep Duration                            0.000000
Dietary Habits                            0.002843
Degree                                    0.001421
Have you ever had suicidal thoughts ?     0.000000
Work/Study Hours                          0.000000
Financial Stress                          0.002843
Family History of Mental Illness          0.000000
Depression                                0.000000
dtype: float64

missing value are the confliction between students and working proffesionals,exept for proffesion


In [9]:
# Check for duplicates
df.duplicated().sum()

5

In [10]:
# Remove all duplicates in the data
df.drop_duplicates(inplace=True)

In [11]:
# Describe 
df.describe()

Unnamed: 0,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Financial Stress,Depression
count,140695.0,27897.0,112777.0,27898.0,27897.0,112785.0,140695.0,140691.0,140695.0
mean,40.389268,3.142273,2.998954,7.658636,2.94494,2.974465,6.252539,2.988926,0.181691
std,12.383755,1.380457,1.405761,1.464466,1.360197,1.416076,3.853579,1.413617,0.385591
min,18.0,1.0,1.0,5.03,1.0,1.0,0.0,1.0,0.0
25%,29.0,2.0,2.0,6.29,2.0,2.0,3.0,2.0,0.0
50%,42.0,3.0,3.0,7.77,3.0,3.0,6.0,3.0,0.0
75%,51.0,4.0,4.0,8.92,4.0,4.0,10.0,4.0,0.0
max,60.0,5.0,5.0,10.0,5.0,5.0,12.0,5.0,1.0


In [12]:
# Convert binary Category to numbers
df['Gender'].unique()
df['Gender'] = df['Gender'].eq('Male').astype(int)

In [13]:
# Woking Professionals or Students -> 0,1
df['Working Professional or Student'] = df['Working Professional or Student'].eq('Student').astype(int)

In [14]:
# np.where(df['Profession'].isna(),np.where(df['Working Professional or Student']==1,'Student',np.NaN),df['Profession'])
df['Profession'] = df.apply(lambda row: ('Student' if row['Working Professional or Student']==1 else np.NaN) 
         if pd.isna(row['Profession'])  else row['Profession'],axis=1)

In [15]:
df.loc[df['Working Professional or Student']==1]['Profession'].unique()

array(['Student', 'Civil Engineer', 'Architect', 'UX/UI Designer',
       'Digital Marketer', 'Content Writer', 'Educational Consultant',
       'Teacher', 'Manager', 'Chef', 'Doctor', 'Lawyer', 'Entrepreneur',
       'Pharmacist'], dtype=object)

In [16]:
df[(df['Working Professional or Student']==1) & (df['Academic Pressure'].isna())]

Unnamed: 0,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
17549,0,20.0,Patna,1,Student,,,5.55,,,5-6 hours,Moderate,Class 12,No,0.0,3.0,Yes,0
21880,1,38.0,Chennai,1,Student,,5.0,,,4.0,5-6 hours,Healthy,Class 12,No,2.0,3.0,No,0
70453,0,20.0,Ahmedabad,1,Student,,,,,2.0,Less than 5 hours,Moderate,Class 12,Yes,12.0,3.0,Yes,1
75007,1,21.0,Lucknow,1,Student,,2.0,,,1.0,7-8 hours,Moderate,Class 12,Yes,3.0,3.0,Yes,0
105773,1,18.0,Ahmedabad,1,Student,,,,,1.0,Less than 5 hours,Moderate,Class 12,Yes,9.0,5.0,No,1
122983,0,30.0,Ghaziabad,1,Student,,,5.47,2.0,,Less than 5 hours,Unhealthy,B.Com,Yes,5.0,1.0,No,0
129756,1,18.0,Rajkot,1,Student,,5.0,,,4.0,7-8 hours,Moderate,Class 12,Yes,9.0,4.0,No,1
134830,0,24.0,Meerut,1,Student,,,,,2.0,More than 8 hours,Unhealthy,Class 12,No,0.0,5.0,No,0
137013,1,36.0,Varanasi,1,Student,,,8.54,3.0,,More than 8 hours,Moderate,Class 12,Yes,8.0,5.0,Yes,1


In [17]:
conf_features = ['Academic Pressure','Work Pressure','CGPA','CGPA','Study Satisfaction','Job Satisfaction']

for feature in conf_features:
    df[feature] = np.where(df[feature].isna(),
         np.where(df['Working Professional or Student']==1,0,df[feature]),df[feature])


for feature in conf_features:
    df[feature] = np.where(df[feature].isna(),
         np.where(df['Working Professional or Student']==0,0,df[feature]),df[feature])

In [18]:
df.loc[df['Academic Pressure'].isna()]

Unnamed: 0,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression


In [19]:
df['City'].unique()

array(['Ludhiana', 'Varanasi', 'Visakhapatnam', 'Mumbai', 'Kanpur',
       'Ahmedabad', 'Thane', 'Nashik', 'Bangalore', 'Patna', 'Rajkot',
       'Jaipur', 'Pune', 'Lucknow', 'Meerut', 'Agra', 'Surat',
       'Faridabad', 'Hyderabad', 'Srinagar', 'Ghaziabad', 'Kolkata',
       'Chennai', 'Kalyan', 'Nagpur', 'Vadodara', 'Vasai-Virar', 'Delhi',
       'Bhopal', 'Indore', 'Ishanabad', 'Vidhi', 'Ayush', 'Gurgaon',
       'Krishna', 'Aishwarya', 'Keshav', 'Harsha', 'Nalini', 'Aditya',
       'Malyansh', 'Raghavendra', 'Saanvi', 'M.Tech', 'Bhavna',
       'Less Delhi', 'Nandini', 'M.Com', 'Plata', 'Atharv', 'Pratyush',
       'City', '3.0', 'Less than 5 Kalyan', 'MCA', 'Mira', 'Moreadhyay',
       'Morena', 'Ishkarsh', 'Kashk', 'Mihir', 'Vidya', 'Tolkata', 'Anvi',
       'Krinda', 'Ayansh', 'Shrey', 'Ivaan', 'Vaanya', 'Gaurav', 'Harsh',
       'Reyansh', 'Kashish', 'Kibara', 'Vaishnavi', 'Chhavi', 'Parth',
       'Mahi', 'Tushar', 'MSc', 'No', 'Rashi', 'ME', 'Molkata',
       'Researcher', '

In [20]:
real_cities_and_misspellings = [
    'Ludhiana',  # Correct
    'Varanasi',  # Correct
    'Visakhapatnam',  # Correct
    'Mumbai',  # Correct
    'Kanpur',  # Correct
    'Ahmedabad',  # Correct
    'Thane',  # Correct
    'Nashik',  # Correct
    'Bangalore',  # Correct
    'Patna',  # Correct
    'Rajkot',  # Correct
    'Jaipur',  # Correct
    'Pune',  # Correct
    'Lucknow',  # Correct
    'Meerut',  # Correct
    'Agra',  # Correct
    'Surat',  # Correct
    'Faridabad',  # Correct
    'Hyderabad',  # Correct
    'Srinagar',  # Correct
    'Ghaziabad',  # Correct
    'Kolkata',  # Correct
    'Chennai',  # Correct
    'Kalyan',  # Correct
    'Nagpur',  # Correct
    'Vadodara',  # Correct
    'Vasai-Virar',  # Correct
    'Delhi',  # Correct
    'Bhopal',  # Correct
    'Indore',  # Correct
    'Gurgaon',  # Correct
    'Morena',  # Correct
    'Tolkata',  # Misspelled for "Kolkata"
    'Molkata',  # Misspelled for "Kolkata"
    'Less Delhi',  # Likely refers to "Delhi"
    'Khaziabad',  # Misspelled for "Ghaziabad"
    'Nalyan',  # Misspelled for "Kalyan"
]


In [21]:
df['City'] = np.where(df['City'].isin(real_cities_and_misspellings),df['City'],np.NaN)

In [22]:
# Misspelled city example
df.query('City=="Nalyan"')

Unnamed: 0,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
115761,0,31.0,Nalyan,1,Student,4.0,0.0,8.08,2.0,0.0,Less than 5 hours,Moderate,BA,Yes,6.0,2.0,Yes,0


In [23]:

misspelled_correct = [
    ['Tolkata', 'Kolkata'],
    ['Molkata', 'Kolkata'],
    ['Less Delhi', 'Delhi'],
    ['Khaziabad', 'Ghaziabad'],
    ['Nalyan', 'Kalyan']
]

In [24]:
for to_replace,replace in misspelled_correct:
    df['City'] = df['City'].str.replace(to_replace,replace)

In [25]:
# Check if all the city now are consistant and does not have any other problems
df['City'].unique()

array(['Ludhiana', 'Varanasi', 'Visakhapatnam', 'Mumbai', 'Kanpur',
       'Ahmedabad', 'Thane', 'Nashik', 'Bangalore', 'Patna', 'Rajkot',
       'Jaipur', 'Pune', 'Lucknow', 'Meerut', 'Agra', 'Surat',
       'Faridabad', 'Hyderabad', 'Srinagar', 'Ghaziabad', 'Kolkata',
       'Chennai', 'Kalyan', 'Nagpur', 'Vadodara', 'Vasai-Virar', 'Delhi',
       'Bhopal', 'Indore', nan, 'Gurgaon', 'Morena'], dtype=object)

In [26]:
# checking value count
df['City'].value_counts()

City
Kalyan           6592
Patna            5924
Vasai-Virar      5764
Kolkata          5692
Ahmedabad        5613
Meerut           5527
Ludhiana         5225
Pune             5210
Rajkot           5207
Visakhapatnam    5176
Srinagar         5074
Mumbai           4966
Indore           4872
Agra             4684
Surat            4636
Varanasi         4606
Vadodara         4568
Hyderabad        4496
Kanpur           4398
Jaipur           4328
Thane            4289
Lucknow          4279
Nagpur           4209
Bangalore        4122
Chennai          4044
Ghaziabad        3621
Delhi            3594
Bhopal           3475
Faridabad        3268
Nashik           3144
Gurgaon             1
Morena              1
Name: count, dtype: int64

In [27]:
# Dropping the less city data which is Guraon and Morena
df = df[~(df['City'].isin(['Gurgaon','Morena']))]

In [28]:
df['City']

0              Ludhiana
1              Varanasi
2         Visakhapatnam
3                Mumbai
4                Kanpur
              ...      
140695        Ahmedabad
140696        Hyderabad
140697          Kolkata
140698         Srinagar
140699            Patna
Name: City, Length: 140693, dtype: object

In [None]:
# Rename suicidal thought
df.rename('')

Unnamed: 0,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,49.0,Ludhiana,0,Chef,0.0,5.0,0.00,0.0,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,26.0,Varanasi,0,Teacher,0.0,4.0,0.00,0.0,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,1,33.0,Visakhapatnam,1,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,1,22.0,Mumbai,0,Teacher,0.0,5.0,0.00,0.0,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,0,30.0,Kanpur,0,Business Analyst,0.0,1.0,0.00,0.0,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,0,18.0,Ahmedabad,0,,0.0,5.0,0.00,0.0,4.0,5-6 hours,Unhealthy,Class 12,No,2.0,4.0,Yes,1
140696,0,41.0,Hyderabad,0,Content Writer,0.0,5.0,0.00,0.0,4.0,7-8 hours,Moderate,B.Tech,Yes,6.0,5.0,Yes,0
140697,0,24.0,Kolkata,0,Marketing Manager,0.0,3.0,0.00,0.0,1.0,More than 8 hours,Moderate,B.Com,No,4.0,4.0,No,0
140698,0,49.0,Srinagar,0,Plumber,0.0,5.0,0.00,0.0,2.0,5-6 hours,Moderate,ME,Yes,10.0,1.0,No,0
