#### Importing libraries

In [1]:
import pandas as pd
import numpy as np

#### Importing our merged dataset

In [2]:
# Read the merged CSV from the current working directory.
# low_memory=False asks pandas to parse the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
merged_dataset = pd.read_csv('merged_dataset.csv', low_memory=False)
# Preview the first rows to check the columns that were loaded.
merged_dataset.head()

Unnamed: 0,index,Entity,Code,Year,Schizophrenia (%),Bipolar disorder (%),Eating disorders (%),Anxiety disorders (%),Drug use disorders (%),Depression (%),Alcohol use disorders (%),Prevalence in males (%),Prevalence in females (%),Population_x,"Suicide rate (deaths per 100,000 individuals)","Depressive disorder rates (number suffering per 100,000)",Population_y,Prevalence - Depressive disorders - Sex: Both - Age: All Ages (Number) (people suffering from depression)
0,0,Afghanistan,AFG,1990,0.16056,0.697779,0.101855,4.82883,1.677082,4.071831,0.672404,3.499982,4.647815,12412000.0,10.318504,4039.755763,12412000.0,318435.81367
1,1,Afghanistan,AFG,1991,0.160312,0.697961,0.099313,4.82974,1.684746,4.079531,0.671768,3.503947,4.655772,13299000.0,10.32701,4046.256034,13299000.0,329044.773956
2,2,Afghanistan,AFG,1992,0.160135,0.698107,0.096692,4.831108,1.694334,4.088358,0.670644,3.508912,4.662066,14486000.0,10.271411,4053.709902,14486000.0,382544.572895
3,3,Afghanistan,AFG,1993,0.160037,0.698257,0.094336,4.830864,1.70532,4.09619,0.669738,3.513429,4.669012,15817000.0,10.376123,4060.203474,15817000.0,440381.507393
4,4,Afghanistan,AFG,1994,0.160022,0.698469,0.092439,4.829423,1.716069,4.099582,0.66926,3.515578,4.67305,17076000.0,10.575915,4062.290365,17076000.0,456916.645489


#### Data Cleaning

In [3]:
# Remove the two columns that are not needed ('index' and 'Population_y'),
# then give every remaining column a short snake_case name. Both steps are
# chained so the cell produces the tidied frame in a single expression.
merged_dataset = (
    merged_dataset
    .drop(columns=['index', 'Population_y'])
    .rename(
        columns={
            # identifiers
            'Entity': 'country',
            'Code': 'code',
            'Year': 'year',
            # disorder columns (source names carry a "(%)" suffix)
            'Schizophrenia (%)': 'schizophrenia',
            'Bipolar disorder (%)': 'bipolar_disorder',
            'Eating disorders (%)': 'eating_disorders',
            'Anxiety disorders (%)': 'anxiety_disorders',
            'Drug use disorders (%)': 'drug_use_disorders',
            'Depression (%)': 'depression',
            'Alcohol use disorders (%)': 'alcohol_use_disorders',
            # prevalence split by sex
            'Prevalence in males (%)': 'prevalence_male',
            'Prevalence in females (%)': 'prevalence_female',
            # population and per-100k rates
            'Population_x': 'population',
            'Suicide rate (deaths per 100,000 individuals)': 'suicide_rate_per_100k',
            'Depressive disorder rates (number suffering per 100,000)': 'depressive_disorder_rate_per_100k',
            # count of people with depressive disorders
            'Prevalence - Depressive disorders - Sex: Both - Age: All Ages (Number) (people suffering from depression)': 'depressive_disorders',
        }
    )
)

In [4]:
# change datatypes
columns_to_convert = [
    'schizophrenia', 
    'bipolar_disorder', 
    'eating_disorders', 
    'anxiety_disorders', 
    'prevalence_male', 
    'prevalence_female', 
    'population', 
    'suicide_rate_per_100k', 
    'depressive_disorder_rate_per_100k',
    'depressive_disorders'
]

for column in columns_to_convert:
    merged_dataset[column] = merged_dataset[column].astype(float)


In [5]:
# Show the dtype of every column to confirm the result of the conversion.
merged_dataset.dtypes

country                               object
code                                  object
year                                   int64
schizophrenia                        float64
bipolar_disorder                     float64
eating_disorders                     float64
anxiety_disorders                    float64
drug_use_disorders                   float64
depression                           float64
alcohol_use_disorders                float64
prevalence_male                      float64
prevalence_female                    float64
population                           float64
suicide_rate_per_100k                float64
depressive_disorder_rate_per_100k    float64
depressive_disorders                 float64
dtype: object

In [6]:
# List the distinct values in 'country'; the output mixes individual countries
# with aggregate entries such as 'Central Asia' and 'Caribbean'.
merged_dataset['country'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'American Samoa',
       'Andean Latin America', 'Andorra', 'Angola', 'Antigua and Barbuda',
       'Argentina', 'Armenia', 'Australasia', 'Australia', 'Austria',
       'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados',
       'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan',
       'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia',
       'Cameroon', 'Canada', 'Cape Verde', 'Caribbean',
       'Central African Republic', 'Central Asia', 'Central Europe',
       'Central Europe, Eastern Europe, and Central Asia',
       'Central Latin America', 'Central Sub-Saharan Africa', 'Chad',
       'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Costa Rica',
       "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic',
       'Democratic Republic of Congo', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'East Asia', 'Eastern 

In [7]:
# Values of 'country' that are removed from the dataset: regional groupings,
# SDI / income groupings and the 'World' total.
# NOTE(review): 'Micronesia (country)' looks like a country entry rather than
# an aggregate - confirm that it is meant to be removed.
regions_to_delete = [
    'Australasia',
    'Caribbean',
    'North America',
    'Micronesia (country)',
    'South Asia',
    'Oceania',
    'World',
    'High SDI',
    'High-income',
    'High-middle SDI',
    'Low SDI',
    'Low-middle SDI',
    'Middle SDI',
    'Western Europe',
    'Tropical Latin America',
    'Sub-Saharan Africa',
    'Southern Sub-Saharan Africa',
    'Southeast Asia, East Asia, and Oceania',
    'Southern Latin America',
    'Southeast Asia',
    'Latin America and Caribbean',
    'Eastern Sub-Saharan Africa',
    'Eastern Europe',
    'East Asia',
    'Central Latin America',
    'Central Sub-Saharan Africa',
    'Central Europe, Eastern Europe, and Central Asia',
    'Central Asia',
    'Central Europe',
    'High-income Asia Pacific',
    'Andean Latin America',
    'North Africa and Middle East',
    'Western Sub-Saharan Africa',
]

# Flag the rows whose 'country' is one of the listed entries, then keep the rest.
is_listed_region = merged_dataset['country'].isin(regions_to_delete)
merged_dataset = merged_dataset.loc[~is_listed_region]

In [8]:
# Re-list the distinct 'country' values to check the result of the filter.
merged_dataset['country'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra',
       'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bermuda', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde',
       'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia',
       'Comoros', 'Congo', 'Costa Rica', "Cote d'Ivoire", 'Croatia',
       'Cuba', 'Cyprus', 'Czech Republic', 'Democratic Republic of Congo',
       'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador',
       'Egypt', 'El Salvador', 'England', 'Equatorial Guinea', 'Eritrea',
       'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon',
       'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Greenland',
       'Grenada', '

In [9]:
# Display the frame after the filter (pandas shows a truncated view).
merged_dataset

Unnamed: 0,country,code,year,schizophrenia,bipolar_disorder,eating_disorders,anxiety_disorders,drug_use_disorders,depression,alcohol_use_disorders,prevalence_male,prevalence_female,population,suicide_rate_per_100k,depressive_disorder_rate_per_100k,depressive_disorders
0,Afghanistan,AFG,1990,0.160560,0.697779,0.101855,4.828830,1.677082,4.071831,0.672404,3.499982,4.647815,12412000.0,10.318504,4039.755763,318435.813670
1,Afghanistan,AFG,1991,0.160312,0.697961,0.099313,4.829740,1.684746,4.079531,0.671768,3.503947,4.655772,13299000.0,10.327010,4046.256034,329044.773956
2,Afghanistan,AFG,1992,0.160135,0.698107,0.096692,4.831108,1.694334,4.088358,0.670644,3.508912,4.662066,14486000.0,10.271411,4053.709902,382544.572895
3,Afghanistan,AFG,1993,0.160037,0.698257,0.094336,4.830864,1.705320,4.096190,0.669738,3.513429,4.669012,15817000.0,10.376123,4060.203474,440381.507393
4,Afghanistan,AFG,1994,0.160022,0.698469,0.092439,4.829423,1.716069,4.099582,0.669260,3.515578,4.673050,17076000.0,10.575915,4062.290365,456916.645489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6463,Zimbabwe,ZWE,2013,0.155670,0.607993,0.117248,3.090168,0.766280,3.128192,1.515641,2.769193,3.424106,13350000.0,28.361200,3048.264249,303564.603590
6464,Zimbabwe,ZWE,2014,0.155993,0.608610,0.118073,3.093964,0.768914,3.140290,1.515470,2.778101,3.437674,13587000.0,27.605547,3056.996704,311665.769283
6465,Zimbabwe,ZWE,2015,0.156465,0.609363,0.119470,3.098687,0.771802,3.155710,1.514751,2.789152,3.455323,13815000.0,27.197061,3068.250731,320638.507158
6466,Zimbabwe,ZWE,2016,0.157111,0.610234,0.121456,3.104294,0.772275,3.174134,1.513269,2.799308,3.479071,14030000.0,26.839591,3081.782858,330437.353798


In [10]:
# Count the missing (NaN) entries in each column and display the totals.
missing_values = merged_dataset.isna().sum()
missing_values

country                                0
code                                 112
year                                   0
schizophrenia                          0
bipolar_disorder                       0
eating_disorders                       0
anxiety_disorders                      0
drug_use_disorders                     0
depression                             0
alcohol_use_disorders                  0
prevalence_male                        0
prevalence_female                      0
population                           112
suicide_rate_per_100k                  0
depressive_disorder_rate_per_100k      0
depressive_disorders                   0
dtype: int64

In [11]:
# Drop the rows that have no country code or no population figure.
# DataFrame.dropna returns a new frame and leaves the original untouched, so
# the result must be bound back to merged_dataset; otherwise the rows with
# missing values are still present when the data is copied and exported below.
merged_dataset = merged_dataset.dropna(subset=['code', 'population'])
# Display the result, as the cell did before.
merged_dataset

Unnamed: 0,country,code,year,schizophrenia,bipolar_disorder,eating_disorders,anxiety_disorders,drug_use_disorders,depression,alcohol_use_disorders,prevalence_male,prevalence_female,population,suicide_rate_per_100k,depressive_disorder_rate_per_100k,depressive_disorders
0,Afghanistan,AFG,1990,0.160560,0.697779,0.101855,4.828830,1.677082,4.071831,0.672404,3.499982,4.647815,12412000.0,10.318504,4039.755763,318435.813670
1,Afghanistan,AFG,1991,0.160312,0.697961,0.099313,4.829740,1.684746,4.079531,0.671768,3.503947,4.655772,13299000.0,10.327010,4046.256034,329044.773956
2,Afghanistan,AFG,1992,0.160135,0.698107,0.096692,4.831108,1.694334,4.088358,0.670644,3.508912,4.662066,14486000.0,10.271411,4053.709902,382544.572895
3,Afghanistan,AFG,1993,0.160037,0.698257,0.094336,4.830864,1.705320,4.096190,0.669738,3.513429,4.669012,15817000.0,10.376123,4060.203474,440381.507393
4,Afghanistan,AFG,1994,0.160022,0.698469,0.092439,4.829423,1.716069,4.099582,0.669260,3.515578,4.673050,17076000.0,10.575915,4062.290365,456916.645489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6463,Zimbabwe,ZWE,2013,0.155670,0.607993,0.117248,3.090168,0.766280,3.128192,1.515641,2.769193,3.424106,13350000.0,28.361200,3048.264249,303564.603590
6464,Zimbabwe,ZWE,2014,0.155993,0.608610,0.118073,3.093964,0.768914,3.140290,1.515470,2.778101,3.437674,13587000.0,27.605547,3056.996704,311665.769283
6465,Zimbabwe,ZWE,2015,0.156465,0.609363,0.119470,3.098687,0.771802,3.155710,1.514751,2.789152,3.455323,13815000.0,27.197061,3068.250731,320638.507158
6466,Zimbabwe,ZWE,2016,0.157111,0.610234,0.121456,3.104294,0.772275,3.174134,1.513269,2.799308,3.479071,14030000.0,26.839591,3081.782858,330437.353798


#### Creating a new CSV file from the cleaned data

In [12]:
# Take an independent copy of the working frame under the name used for export.
clean_data = merged_dataset.copy()

In [13]:
# Write the cleaned data to the working directory, mirroring the relative path
# used to read 'merged_dataset.csv'. The previous absolute path pointed into one
# user's home directory and so only worked on that machine.
# index=False keeps the row index out of the file.
clean_data.to_csv('clean_data.csv', index=False)