In [1]:
# Load Libraries
import psycopg2
import pandas as pd
import numpy as np

In [2]:
from config import password

#  Connect to PostgreSQL

con = psycopg2.connect(
        host= '127.0.0.1',
        database= 'mental_health_db',
        user= 'postgres',
        password=password
)

In [3]:
#Retreive data from PostgreSQL Server
cur = con.cursor()

cur.execute("select * from table_1")

rows= cur.fetchall()

rows

[(datetime.datetime(2014, 8, 27, 11, 29),
  37,
  'Female',
  'United States',
  'IL',
  'NA',
  'No',
  'Yes',
  'Often',
  '25-Jun',
  'No',
  'Yes',
  'Yes',
  'Not sure',
  'No',
  'Yes',
  'Yes',
  'Somewhat easy',
  'No',
  'No',
  'Some of them',
  'Yes',
  'No',
  'Maybe',
  'Yes',
  'No'),
 (datetime.datetime(2014, 8, 27, 11, 29),
  44,
  'M',
  'United States',
  'IN',
  'NA',
  'No',
  'No',
  'Rarely',
  'More than 1000',
  'No',
  'No',
  "Don't know",
  'No',
  "Don't know",
  "Don't know",
  "Don't know",
  "Don't know",
  'Maybe',
  'No',
  'No',
  'No',
  'No',
  'No',
  "Don't know",
  'No'),
 (datetime.datetime(2014, 8, 27, 11, 29),
  32,
  'Male',
  'Canada',
  'NA',
  'NA',
  'No',
  'No',
  'Rarely',
  '25-Jun',
  'No',
  'Yes',
  'No',
  'No',
  'No',
  'No',
  "Don't know",
  'Somewhat difficult',
  'No',
  'No',
  'Yes',
  'Yes',
  'Yes',
  'Yes',
  'No',
  'No'),
 (datetime.datetime(2014, 8, 27, 11, 29),
  31,
  'Male',
  'United Kingdom',
  'NA',
  'NA',
  'Y

### Questions for Exploration

1. What is the most common attributes towards mental health in the workplace?
2. Does Geographic location matter in terms of mental health awareness?
3. Gauge inside the mind of the employee, with the last columns

In [4]:
mhealth_df= pd.DataFrame(rows, columns = ["timestamp","age","gender","country","state","self_employed","family_history",
                                          "treatment","work_interfere","no_employees","remote_work","tech_company",
                                          "benefits","care_options","wellness_program","seek_help","anonymity",
                                         "leave","mental_health_consequence","phys_health_consequence","coworkers",
                                         "supervisor","mental_health_interview","phys_health_interview","mental_vs_physical",
                                         "obs_consequence"])

mhealth_df.head(20)

Unnamed: 0,timestamp,age,gender,country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,2014-08-27 11:29:00,37,Female,United States,IL,,No,Yes,Often,25-Jun,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,2014-08-27 11:29:00,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,2014-08-27 11:29:00,32,Male,Canada,,,No,No,Rarely,25-Jun,...,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,2014-08-27 11:29:00,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,2014-08-27 11:30:00,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No
5,2014-08-27 11:31:00,33,Male,United States,TN,,Yes,No,Sometimes,25-Jun,...,Don't know,Don't know,No,No,Yes,Yes,No,Maybe,Don't know,No
6,2014-08-27 11:31:00,35,Female,United States,MI,,Yes,Yes,Sometimes,5-Jan,...,No,Somewhat difficult,Maybe,Maybe,Some of them,No,No,No,Don't know,No
7,2014-08-27 11:32:00,39,M,Canada,,,No,No,Never,5-Jan,...,Yes,Don't know,No,No,No,No,No,No,No,No
8,2014-08-27 11:32:00,42,Female,United States,IL,,Yes,Yes,Sometimes,100-500,...,No,Very difficult,Maybe,No,Yes,Yes,No,Maybe,No,No
9,2014-08-27 11:32:00,23,Male,Canada,,,No,No,Never,26-100,...,Don't know,Don't know,No,No,Yes,Yes,Maybe,Maybe,Yes,No


# Data Cleaning

##### Primary Dataframe Glance
Consists of 26 columns, no primary keys. Need to check data types

In [5]:
# Check Data Types
mhealth_df.dtypes

timestamp                    datetime64[ns]
age                                   int64
gender                               object
country                              object
state                                object
self_employed                        object
family_history                       object
treatment                            object
work_interfere                       object
no_employees                         object
remote_work                          object
tech_company                         object
benefits                             object
care_options                         object
wellness_program                     object
seek_help                            object
anonymity                            object
leave                                object
mental_health_consequence            object
phys_health_consequence              object
coworkers                            object
supervisor                           object
mental_health_interview         

In [6]:
# Calculate Sum of NA Values

# Our Self-Eployed/work-interfere Columns have NA vlaues but is not being pulled in as a null value 
mhealth_df.isnull().sum().sort_values(ascending=False)

obs_consequence              0
mental_vs_physical           0
age                          0
gender                       0
country                      0
state                        0
self_employed                0
family_history               0
treatment                    0
work_interfere               0
no_employees                 0
remote_work                  0
tech_company                 0
benefits                     0
care_options                 0
wellness_program             0
seek_help                    0
anonymity                    0
leave                        0
mental_health_consequence    0
phys_health_consequence      0
coworkers                    0
supervisor                   0
mental_health_interview      0
phys_health_interview        0
timestamp                    0
dtype: int64

## Drop Irrelevant Columns

Let's review columns and determine if they need to be dropped
- Going to drop column, due to irrelevance in end product of this project
- Removing column, due to irrelevance in end product. Only relevant to people in USA

In [7]:
mhealth_df.drop('timestamp', axis=1, inplace=True)
mhealth_df.drop('state', axis=1, inplace=True)

In [8]:
mhealth_df.head(10)

Unnamed: 0,age,gender,country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,Female,United States,,No,Yes,Often,25-Jun,No,Yes,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44,M,United States,,No,No,Rarely,More than 1000,No,No,...,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32,Male,Canada,,No,No,Rarely,25-Jun,No,Yes,...,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31,Male,United Kingdom,,Yes,Yes,Often,26-100,No,Yes,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31,Male,United States,,No,No,Never,100-500,Yes,Yes,...,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No
5,33,Male,United States,,Yes,No,Sometimes,25-Jun,No,Yes,...,Don't know,Don't know,No,No,Yes,Yes,No,Maybe,Don't know,No
6,35,Female,United States,,Yes,Yes,Sometimes,5-Jan,Yes,Yes,...,No,Somewhat difficult,Maybe,Maybe,Some of them,No,No,No,Don't know,No
7,39,M,Canada,,No,No,Never,5-Jan,Yes,Yes,...,Yes,Don't know,No,No,No,No,No,No,No,No
8,42,Female,United States,,Yes,Yes,Sometimes,100-500,No,Yes,...,No,Very difficult,Maybe,No,Yes,Yes,No,Maybe,No,No
9,23,Male,Canada,,No,No,Never,26-100,No,Yes,...,Don't know,Don't know,No,No,Yes,Yes,Maybe,Maybe,Yes,No


### DataFrame Column Check

We notice how there are indexes, but with this data we notice how there is not a primary key involved. 
Lets Transpose this dataframe for better visuals 

In [9]:
mhealth_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1247,1248,1249,1250,1251,1252,1253,1254,1255,1256
age,37,44,32,31,31,33,35,39,42,23,...,30,30,36,29,36,26,32,34,46,25
gender,Female,M,Male,Male,Male,Male,Female,M,Female,Male,...,M,Male,Male,male,Male,male,Male,male,f,Male
country,United States,United States,Canada,United Kingdom,United States,United States,United States,Canada,United States,Canada,...,United States,United States,South Africa,United States,United States,United Kingdom,United States,United States,United States,United States
self_employed,,,,,,,,,,,...,No,Yes,No,No,No,No,No,No,No,No
family_history,No,No,No,Yes,No,Yes,Yes,No,Yes,No,...,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,No,Yes
treatment,Yes,No,No,Yes,No,No,Yes,No,Yes,No,...,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,No,Yes
work_interfere,Often,Rarely,Rarely,Often,Never,Sometimes,Sometimes,Never,Sometimes,Never,...,Sometimes,Often,Often,Sometimes,Rarely,,Often,Sometimes,,Sometimes
no_employees,25-Jun,More than 1000,25-Jun,26-100,100-500,25-Jun,5-Jan,5-Jan,100-500,26-100,...,26-100,26-100,100-500,100-500,More than 1000,26-100,26-100,More than 1000,100-500,26-100
remote_work,No,No,No,No,Yes,No,Yes,Yes,No,No,...,No,No,No,Yes,No,No,Yes,No,Yes,No
tech_company,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,...,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Yes,No


#### Work_Interfere Column

In [10]:
print(mhealth_df['work_interfere'].unique())

['Often' 'Rarely' 'Never' 'Sometimes' 'NA']


#### People that answer NA, do not have a diagnosed medical condition

According to data description 'work_interfere' asks the question, 

-'If you have a mental health condition, do you feel that it interferes with your work?'.

In [11]:
# This will not change my NA Values, due to datatype from Postgresql,

# In theory, this code will change NA values to NA
mhealth_df['work_interfere'] = mhealth_df['work_interfere'].fillna('No Answer')
print("\n",mhealth_df['work_interfere'].unique(),"\n")


 ['Often' 'Rarely' 'Never' 'Sometimes' 'NA'] 



In [12]:
# Replacing all NA Values with No Answer

# Alternative Way to changing NA values
# Could have changed a single column, decided to change whole dataframe, for Visualizations
data=mhealth_df.replace('NA','No Answer')
data.head()

Unnamed: 0,age,gender,country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,Female,United States,No Answer,No,Yes,Often,25-Jun,No,Yes,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44,M,United States,No Answer,No,No,Rarely,More than 1000,No,No,...,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32,Male,Canada,No Answer,No,No,Rarely,25-Jun,No,Yes,...,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31,Male,United Kingdom,No Answer,Yes,Yes,Often,26-100,No,Yes,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31,Male,United States,No Answer,No,No,Never,100-500,Yes,Yes,...,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No


In [13]:
n = len(pd.unique(data["work_interfere"]))

print("No. of unique values:", n)

print(data["work_interfere"].unique())

No. of unique values: 5
['Often' 'Rarely' 'Never' 'Sometimes' 'No Answer']


## Optimized DataFrame Column Check with Customized For Loop 
Instead of Checking each unique values per line of code, lets create a for loop

In [14]:
unique_column_values= {}

for column in data.columns:
    unique_column_values[column] = data[column].unique()
    

##### Age Column
Dropping all non-relevant ages 

In [15]:
# Call Age Column

unique_column_values["age"]

array([   37,    44,    32,    31,    33,    35,    39,    42,    23,
          29,    36,    27,    46,    41,    34,    30,    40,    38,
          50,    24,    18,    28,    26,    22,    19,    25,    45,
          21,   -29,    43,    56,    60,    54,   329,    55,    48,
          20,    57,    58,    47,    62,    51,    65,    49, -1726,
           5,    53,    61,     8,    11,    -1,    72])

In [16]:
max_age= unique_column_values['age'].max()
print(f'The max age is {max_age}')

The max age is 329


In [17]:
min_age= unique_column_values['age'].min()
print(f'The min age is {min_age}')

The min age is -1726


In [18]:
# Delete rows from a pandas DataFrame based on a conditional expression

data = data.drop(data[(data.age <=0) | (data.age > 120)].index)

data["age"].unique()

array([37, 44, 32, 31, 33, 35, 39, 42, 23, 29, 36, 27, 46, 41, 34, 30, 40,
       38, 50, 24, 18, 28, 26, 22, 19, 25, 45, 21, 43, 56, 60, 54, 55, 48,
       20, 57, 58, 47, 62, 51, 65, 49,  5, 53, 61,  8, 11, 72])

#### Gender Column
For the purpose of this project, making 3 categiories for Gender

In [19]:
# Call Gender Column

unique_column_values["gender"]

array(['Female', 'M', 'Male', 'male', 'female', 'm', 'Male-ish', 'maile',
       'Trans-female', 'Cis Female', 'F', 'Cis Male', 'Woman', 'f', 'Mal',
       'Male (CIS)', 'queer/she/they', 'non-binary', 'Femake', 'woman',
       'Make', 'Nah', 'Enby', 'fluid', 'Genderqueer', 'Female ',
       'Androgyne', 'Agender', 'male leaning androgynous', 'Male ', 'Man',
       'Trans woman', 'msle', 'Neuter', 'Female (trans)', 'queer',
       'Female (cis)', 'Mail', 'cis male', 'A little about you', 'Malr',
       'p', 'femail', 'Cis Man'], dtype=object)

 Lots of unique values here, 

 Make a list of 3 groups and make all uniform 

In [20]:
male = ['male', 'Male','M', 'm', 'Male-ish', 'maile','Cis Male','Mal', 'Male (CIS)','Make','Male ', 'Man',
        'msle','cis male', 'Cis Man','Malr','Mail']
female = ['Female', 'female','Cis Female', 'F','f','Femake', 'woman','Female ','cis-female/femme','Female (cis)','femail','Woman','female']
other = ['Trans-female','something kinda male?','queer/she/they','non-binary','All','fluid', 'Genderqueer','Androgyne', 'Agender','Guy (-ish) ^_^',
        'male leaning androgynous','Trans woman','Neuter', 'Female (trans)','queer','ostensibly male, unsure what that really means','trans','Nah','Enby','A little about you']

data['gender'].replace(to_replace= male , value = 'male', inplace=True)
data['gender'].replace(to_replace= female , value = 'female', inplace=True)
data['gender'].replace(to_replace= other , value = 'other', inplace=True)

data['gender'].unique()

array(['female', 'male', 'other'], dtype=object)

#### Country Column
Make a continent column for easier Visuals

In [104]:
#### Country Column

unique_column_values['country']

array(['United States', 'Canada', 'United Kingdom', 'Bulgaria', 'France',
       'Portugal', 'Netherlands', 'Switzerland', 'Poland', 'Australia',
       'Germany', 'Mexico', 'Brazil', 'Slovenia', 'Costa Rica', 'Austria',
       'Ireland', 'India', 'South Africa', 'Russia', 'Italy', 'Sweden',
       'Colombia', 'Latvia', 'Romania', 'Belgium', 'New Zealand', 'Spain',
       'Finland', 'Uruguay', 'Israel', 'Bosnia and Herzegovina',
       'Hungary', 'Singapore', 'Japan', 'Nigeria', 'Croatia', 'Norway',
       'Thailand', 'Denmark', 'Bahamas, The', 'Greece', 'Moldova',
       'Georgia', 'China', 'Czech Republic', 'Philippines'], dtype=object)

In [106]:
data['country'].replace('Bahamas, The', value = 'The Bahamas', inplace=True)

data['country'].unique()

array(['United States', 'Canada', 'United Kingdom', 'Bulgaria', 'France',
       'Portugal', 'Netherlands', 'Switzerland', 'Poland', 'Australia',
       'Germany', 'Mexico', 'Brazil', 'Slovenia', 'Costa Rica', 'Austria',
       'Ireland', 'India', 'South Africa', 'Russia', 'Italy', 'Sweden',
       'Colombia', 'Latvia', 'Romania', 'Belgium', 'New Zealand', 'Spain',
       'Finland', 'Uruguay', 'Israel', 'Bosnia and Herzegovina',
       'Hungary', 'Singapore', 'Japan', 'Nigeria', 'Croatia', 'Norway',
       'Thailand', 'Denmark', 'The Bahamas', 'Greece', 'Moldova',
       'Georgia', 'China', 'Czech Republic', 'Philippines'], dtype=object)

#### Add Another column, based on the conditions of [Country] Column


In [22]:
continents = {'United States':"North America","Canada":"North America",'United Kingdom':"Europe",'Bulgaria':"Europe",'France':"Europe",
            'Portugal':"Europe",'Netherlands':"Europe",'Switzerland':"Europe",'Poland':"Europe", 'Australia':'Oceania',
              'Germany':"Europe",'Mexico':"North America",'Brazil':"South America",'Slovenia':"Europe",'Costa Rica':"North America",
              'Austria':"Europe",'Ireland':"Europe",'India':"Asia",'South Africa':"Africa",'Russia':"Europe",'Italy':"Europe",
              'Sweden':"Europe",'Colombia':"South America",'Latvia':"Europe",'Romania':"Europe",'Belgium':"Europe",'New Zealand':"Oceania",
              'Spain':"Europe",'Finland':"Europe",'Uruguay':"South America",'Israel':"Europe",'Bosnia and Herzegovina':"Europe",
              'Hungary':"Europe",'Singapore':"Asia",'Japan':"Asia",'Nigeria':"Africa",'Croatia':"Europe",'Norway':"Europe",
              'Thailand':"Asia", 'Denmark':"Europe",'The Bahamas':"North America",'Greece':"Europe",'Moldova':"Europe",'Georgia':"Europe",
              'China':"Asia",'Czech Republic':"Europe",'Philippines':"Asia"
             }

data["continent"] = data["country"].map(continents)
data.head(10)

Unnamed: 0,age,gender,country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,continent
0,37,female,United States,No Answer,No,Yes,Often,25-Jun,No,Yes,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,North America
1,44,male,United States,No Answer,No,No,Rarely,More than 1000,No,No,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,North America
2,32,male,Canada,No Answer,No,No,Rarely,25-Jun,No,Yes,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,North America
3,31,male,United Kingdom,No Answer,Yes,Yes,Often,26-100,No,Yes,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,Europe
4,31,male,United States,No Answer,No,No,Never,100-500,Yes,Yes,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,North America
5,33,male,United States,No Answer,Yes,No,Sometimes,25-Jun,No,Yes,...,Don't know,No,No,Yes,Yes,No,Maybe,Don't know,No,North America
6,35,female,United States,No Answer,Yes,Yes,Sometimes,5-Jan,Yes,Yes,...,Somewhat difficult,Maybe,Maybe,Some of them,No,No,No,Don't know,No,North America
7,39,male,Canada,No Answer,No,No,Never,5-Jan,Yes,Yes,...,Don't know,No,No,No,No,No,No,No,No,North America
8,42,female,United States,No Answer,Yes,Yes,Sometimes,100-500,No,Yes,...,Very difficult,Maybe,No,Yes,Yes,No,Maybe,No,No,North America
9,23,male,Canada,No Answer,No,No,Never,26-100,No,Yes,...,Don't know,No,No,Yes,Yes,Maybe,Maybe,Yes,No,North America


#### Rest of Columns
Checking unique values

In [23]:
print(f'Self_Employed columns unique values are, {unique_column_values["self_employed"]}')
print(f'Family_history columns unique values are, {unique_column_values["family_history"]}')
print(f'Treatment columns unique values are, {unique_column_values["treatment"]}')
print(f'No_employees columns unique values are, {unique_column_values["no_employees"]}')
print(f'Remote_work columns unique values are, {unique_column_values["remote_work"]}')
print(f'Tech_company columns unique values are, {unique_column_values["tech_company"]}')
print(f'Benefits columns unique values are, {unique_column_values["benefits"]}')
print(f'Care_Options columns unique values are, {unique_column_values["care_options"]}')
print(f'Wellness_program columns unique values are, {unique_column_values["wellness_program"]}')
print(f'Seek_help columns unique values are, {unique_column_values["seek_help"]}')
print(f'Anonymity columns unique values are, {unique_column_values["anonymity"]}')
print(f'Leave columns unique values are, {unique_column_values["leave"]}')
print(f'Mental_Health_conseq columns unique values are, {unique_column_values["mental_health_consequence"]}')
print(f'Phys_health_conseq columns unique values are, {unique_column_values["phys_health_consequence"]}')
print(f'Coworkers columns unique values are, {unique_column_values["coworkers"]}')
print(f'Supervisor columns unique values are, {unique_column_values["supervisor"]}')
print(f'Mental_health_interview columns unique values are, {unique_column_values["mental_health_interview"]}')
print(f'Phys_health_interview columns unique values are, {unique_column_values["phys_health_interview"]}')
print(f'Mental_vs_Phys columns unique values are, {unique_column_values["mental_vs_physical"]}')

Self_Employed columns unique values are, ['No Answer' 'Yes' 'No']
Family_history columns unique values are, ['No' 'Yes']
Treatment columns unique values are, ['Yes' 'No']
No_employees columns unique values are, ['25-Jun' 'More than 1000' '26-100' '100-500' '5-Jan' '500-1000']
Remote_work columns unique values are, ['No' 'Yes']
Tech_company columns unique values are, ['Yes' 'No']
Benefits columns unique values are, ['Yes' "Don't know" 'No']
Care_Options columns unique values are, ['Not sure' 'No' 'Yes']
Wellness_program columns unique values are, ['No' "Don't know" 'Yes']
Seek_help columns unique values are, ['Yes' "Don't know" 'No']
Anonymity columns unique values are, ['Yes' "Don't know" 'No']
Leave columns unique values are, ['Somewhat easy' "Don't know" 'Somewhat difficult' 'Very difficult'
 'Very easy']
Mental_Health_conseq columns unique values are, ['No' 'Maybe' 'Yes']
Phys_health_conseq columns unique values are, ['No' 'Yes' 'Maybe']
Coworkers columns unique values are, ['Some o

#### No_Employees Column
No_employees columns unique values are, ['25-Jun' 'More than 1000' '26-100' '100-500' '5-Jan' '500-1000']

Categorize in 5 groups

In [24]:
less_than_25 = ['5-Jan','25-Jun']
twentysix_to_100= ['26-100']
onehundred_to_500= ['100-500']
fivehundred_to_1K= ['500-1000']
greater_than_1K= ['More than 1000']

data['no_employees'].replace(to_replace= less_than_25 , value = '0-25', inplace=True)
data['no_employees'].replace(to_replace= twentysix_to_100 , value = '26-100', inplace=True)
data['no_employees'].replace(to_replace= onehundred_to_500 , value = '100-500', inplace=True)
data["no_employees"].replace(to_replace= fivehundred_to_1K, value= '500-1000', inplace=True)
data["no_employees"].replace(to_replace= greater_than_1K, value= '1000+', inplace=True)

data["no_employees"].unique()

array(['0-25', '1000+', '26-100', '100-500', '500-1000'], dtype=object)

### Export Cleaned CSV File to Folder Structure

In [27]:
data.to_csv('Resources/mental_health_clean_data.csv', index=False)