# In this fourth cleaning notebook, I proceed with the cleaning of the Spanish 2020 population database.

## 1. Importing libraries that are used.

In [1]:
import pandas as pd
import numpy as np

## 2. Importing database and data exploration

In [2]:
# Load the raw 2020 population table (semicolon-separated, decoded as ISO-8859-1).
# `error_bad_lines=False, warn_bad_lines=False` was deprecated in pandas 1.3 and
# removed in pandas 2.0; `on_bad_lines="skip"` is the documented replacement and
# likewise drops malformed rows without emitting a warning.
population2020 = pd.read_csv(
    "../data/population.csv",
    encoding="ISO-8859-1",
    sep=";",
    engine="python",
    on_bad_lines="skip",
)

In [3]:
# Dimensions of the loaded table as (rows, columns).
population2020.shape

(53, 4)

In [4]:
# Preview the first rows of the loaded table.
population2020.head()

Unnamed: 0,Provincias,Sexo,Periodo,Total
0,Nacional,Total,2020,47.450.795
1,Albacete,Total,2020,388.270
2,Alicante,Total,2020,1.879.888
3,Almeria,Total,2020,727.945
4,Alava,Total,2020,333.940


In [5]:
# List the column labels of the loaded table.
population2020.columns

Index(['Provincias', 'Sexo', 'Periodo', 'Total'], dtype='object')

## 3. Cleaning data

### 3.1. Replacing values

In [6]:
# Normalise province labels: each key is a label looked up in `Provincias`
# (exact match), each value is the name written in its place. Labels that are
# not listed are left untouched.
# NOTE(review): "Guipuzcua" looks like a misspelling of "Guipuzcoa"; it is kept
# byte-for-byte because other files may rely on this spelling - confirm before
# changing it.
province_name_fixes = {
    "Alicante/Alacant": "Alicante",
    "Coruña, A": "La Coruna",
    "Rioja, La": "La Rioja",
    "Bizkaia": "Vizcaya",
    "Araba/Alava": "Alava",
    "Castellon/Castello": "Castellon",
    "Lleida": "Lerida",
    "Gipuzkoa": "Guipuzcua",
    "Girona": "Gerona",
    "Palmas, Las": "Las Palmas",
}

# One exact-match replace instead of ten copy-pasted `.loc` assignments.
# No replacement value is also a key, so the result matches the sequential form.
population2020["Provincias"] = population2020["Provincias"].replace(province_name_fixes)

In [7]:
# Distinct values of the 'Provincias' column, to eyeball the labels after the replacements.
population2020.Provincias.unique()

array(['Nacional', 'Albacete', 'Alicante', 'Almeria', 'Alava', 'Asturias',
       'Avila', 'Badajoz', 'Islas Baleares', 'Barcelona', 'Vizcaya',
       'Burgos', 'Caceres', 'Cadiz', 'Cantabria', 'Castellon',
       'Ciudad Real', 'Cordoba', 'La Coruna', 'Cuenca', 'Guipuzcua',
       'Gerona', 'Granada', 'Guadalajara', 'Huelva', 'Huesca', 'Jaen',
       'Leon', 'Lerida', 'Lugo', 'Madrid', 'Malaga', 'Murcia', 'Navarra',
       'Ourense', 'Palencia', 'Las Palmas', 'Pontevedra', 'La Rioja',
       'Salamanca', 'Santa Cruz de Tenerife', 'Segovia', 'Sevilla',
       'Soria', 'Tarragona', 'Teruel', 'Toledo', 'Valencia', 'Valladolid',
       'Zamora', 'Zaragoza', 'Ceuta', 'Melilla'], dtype=object)

### 3.2. Renaming column

In [8]:
# Give the 'Total' column the more descriptive label 'Population'.
population2020 = population2020.rename(
    {'Total': 'Population'},
    axis="columns",
)

## 4. Joining databases

### 4.1. Loading the databases we want to join

In [9]:
# Read the employment and unemployment tables from the output folder,
# both decoded as ISO-8859-1.
# NOTE(review): the rates previewed later in this notebook (e.g. 4881 next to
# 496) look as if a decimal separator was lost upstream - confirm.
employment = pd.read_csv(
    "../output/employment.csv",
    encoding="ISO-8859-1",
)
unemployment = pd.read_csv(
    "../output/unemployment.csv",
    encoding="ISO-8859-1",
)

### 4.2. Concatenating dataframes

In [10]:
# `pd.concat(..., axis=1)` lines rows up by index, not by province name, so a
# reordered or shorter input would silently attach rates to the wrong province.
# Verify that every table lists the same provinces in the same rows before
# gluing the columns together.
reference_provinces = population2020["Provincias"]
for table_label, table in (("employment", employment), ("unemployment", unemployment)):
    if not table["Provincias"].equals(reference_provinces):
        raise ValueError(
            "'Provincias' in the " + table_label + " table does not match "
            "population2020 row for row; concatenating by position would "
            "pair values with the wrong province."
        )

# Side-by-side concatenation; the repeated 'Provincias' columns are still present here.
province_data = pd.concat([population2020, employment, unemployment], axis=1)
province_data[:7]

Unnamed: 0,Provincias,Sexo,Periodo,Population,Provincias.1,Employment rate,Provincias.2,Unemployment rate
0,Nacional,Total,2020,47.450.795,Nacional,4881,Nacional,1613
1,Albacete,Total,2020,388.270,Albacete,4863,Albacete,1744
2,Alicante,Total,2020,1.879.888,Alicante,4592,Alicante,1941
3,Almeria,Total,2020,727.945,Almeria,4709,Almeria,2115
4,Alava,Total,2020,333.940,Alava,496,Alava,999
5,Asturias,Total,2020,1.018.784,Asturias,4341,Asturias,135
6,Avila,Total,2020,157.664,Avila,4669,Avila,1606


### 4.3. Deleting duplicated columns

In [15]:
# Flag every column whose label already appeared further to the left, then keep
# only the unflagged ones (i.e. the first occurrence of each label).
is_repeated_label = province_data.columns.duplicated(keep='first')
province_data = province_data.loc[:, ~is_repeated_label]

In [17]:
# Preview the first rows of province_data.
province_data.head()

Unnamed: 0,Provincias,Sexo,Periodo,Population,Employment rate,Unemployment rate
0,Nacional,Total,2020,47.450.795,4881,1613
1,Albacete,Total,2020,388.270,4863,1744
2,Alicante,Total,2020,1.879.888,4592,1941
3,Almeria,Total,2020,727.945,4709,2115
4,Alava,Total,2020,333.940,496,999


## 5. Exporting joined data.

In [18]:
# Write the joined table to the output folder, leaving the row index out of the file.
province_data.to_csv(
    "../output/province_data.csv",
    index=False,
)