# In this second cleaning notebook, I clean the database of Spanish unemployment and employment rates for Q4 2020.

## 1. Importing libraries that are used.

In [1]:
import pandas as pd
import numpy as np

## 2. Importing database and data exploration

In [2]:
# Load the semicolon-separated, ISO-8859-1 encoded source file.
# `on_bad_lines="skip"` (pandas >= 1.3) silently drops malformed rows; it
# replaces the `error_bad_lines=False, warn_bad_lines=False` pair, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
# NOTE(review): `Total` is loaded as object dtype (see the `dtypes` output
# below) - presumably because of decimal commas; consider `decimal=","` if a
# numeric column is wanted downstream. Confirm against the raw file.
unemployment = pd.read_csv(
    "../data/paro.csv",
    encoding="ISO-8859-1",
    sep=";",
    engine="python",
    on_bad_lines="skip",
)

In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
unemployment.shape

(53, 5)

In [4]:
# Preview the first five rows of the raw data.
unemployment.head()

Unnamed: 0,Sexo,Provincias,Tasas,Periodo,Total
0,Ambos sexos,Total Nacional,Tasa de paro de la población,2020T4,1613
1,Ambos sexos,02 Albacete,Tasa de paro de la población,2020T4,1744
2,Ambos sexos,03 Alicante/Alacant,Tasa de paro de la población,2020T4,1941
3,Ambos sexos,04 Almería,Tasa de paro de la población,2020T4,2115
4,Ambos sexos,01 Araba/Álava,Tasa de paro de la población,2020T4,999


In [5]:
# List the column labels.
unemployment.columns

Index(['Sexo', 'Provincias', 'Tasas', 'Periodo', 'Total'], dtype='object')

In [6]:
# Inspect the dtype pandas inferred for each column.
unemployment.dtypes

Sexo          object
Provincias    object
Tasas         object
Periodo       object
Total         object
dtype: object

## 3. Cleaning database

### 3.1. Renaming a specific column.

In [7]:
# Give the generic "Total" column a descriptive name.
unemployment = unemployment.rename(columns={'Total': 'Unemployment rate'})

### 3.2. Splitting the content of the "Provincias" column.

In [8]:
# Split each value once, at the first space: column 0 receives the leading
# token (the numeric code in "02 Albacete", or "Total" in "Total Nacional")
# and column 1 receives the remainder of the string.
provincias = unemployment["Provincias"].str.split(" ", n = 1, expand = True)

In [9]:
# Keep only the part after the first space (the name without its leading token).
unemployment["Provincias"] = provincias[1]

In [10]:
# List the distinct names left after the split.
unemployment["Provincias"].unique()

array(['Nacional', 'Albacete', 'Alicante/Alacant', 'Almería',
       'Araba/Álava', 'Asturias', 'Ávila', 'Badajoz', 'Balears, Illes',
       'Barcelona', 'Bizkaia', 'Burgos', 'Cáceres', 'Cádiz', 'Cantabria',
       'Castellón/Castelló', 'Ciudad Real', 'Córdoba', 'Coruña, A',
       'Cuenca', 'Gipuzkoa', 'Girona', 'Granada', 'Guadalajara', 'Huelva',
       'Huesca', 'Jaén', 'León', 'Lleida', 'Lugo', 'Madrid', 'Málaga',
       'Murcia', 'Navarra', 'Ourense', 'Palencia', 'Palmas, Las',
       'Pontevedra', 'Rioja, La', 'Salamanca', 'Santa Cruz de Tenerife',
       'Segovia', 'Sevilla', 'Soria', 'Tarragona', 'Teruel', 'Toledo',
       'Valencia/València', 'Valladolid', 'Zamora', 'Zaragoza', 'Ceuta',
       'Melilla'], dtype=object)

### 3.3. Replacing values.

#### 3.3.1. Replacing vowels

TODO: move this function into a .py file.

In [11]:
def normalize(s):
    """Replace acute-accented vowels with the same unaccented vowels.

    Only "á", "é", "í", "ó", "ú" and their uppercase forms are replaced;
    every other character (for example "ñ" or "è") is left untouched.

    Parameters
    ----------
    s : str
        Text to clean. Non-string values (e.g. ``None`` or ``NaN`` found in
        a pandas column) are returned unchanged instead of raising.

    Returns
    -------
    str
        ``s`` with the accented vowels replaced, or ``s`` itself when it is
        not a string.
    """
    # Missing values have no string methods; pass them through untouched so
    # the function can be applied safely over a whole column.
    if not isinstance(s, str):
        return s

    # One translation table covers both lowercase and uppercase vowels.
    accent_table = str.maketrans("áéíóúÁÉÍÓÚ", "aeiouAEIOU")
    return s.translate(accent_table)

In [12]:
# Strip the accented vowels from every province name, element by element.
province_names = unemployment["Provincias"]
unemployment["Provincias"] = province_names.apply(normalize)

#### 3.3.2. Replacing the names of the provinces with the names that appear in the restaurants' database

In [13]:
# Source label (after accent stripping) -> spelling used in the restaurants'
# database. A single mapping replaces twelve copy-pasted mask assignments;
# labels that are not keys of the mapping are left unchanged.
PROVINCE_RENAMES = {
    "Alicante/Alacant": "Alicante",
    # "ñ" is not touched by normalize(), so it is still present in the key.
    "Coruña, A": "La Coruna",
    "Rioja, La": "La Rioja",
    "Bizkaia": "Vizcaya",
    "Araba/Alava": "Alava",
    "Castellon/Castello": "Castellon",
    "Lleida": "Lerida",
    # NOTE(review): "Guipuzcua" looks like a misspelling of "Guipuzcoa"; kept
    # as-is because it may match the restaurants' database - confirm.
    "Gipuzkoa": "Guipuzcua",
    "Girona": "Gerona",
    # "è" is not touched by normalize(), so it is still present in the key.
    "Valencia/València": "Valencia",
    "Balears, Illes": "Islas Baleares",
    "Palmas, Las": "Las Palmas",
}

# Exact-value replacement (no regex): only whole-cell matches are renamed.
unemployment["Provincias"] = unemployment["Provincias"].replace(PROVINCE_RENAMES)

In [14]:
# Check the distinct province names after the renaming step.
unemployment.Provincias.unique()

array(['Nacional', 'Albacete', 'Alicante', 'Almeria', 'Alava', 'Asturias',
       'Avila', 'Badajoz', 'Islas Baleares', 'Barcelona', 'Vizcaya',
       'Burgos', 'Caceres', 'Cadiz', 'Cantabria', 'Castellon',
       'Ciudad Real', 'Cordoba', 'La Coruna', 'Cuenca', 'Guipuzcua',
       'Gerona', 'Granada', 'Guadalajara', 'Huelva', 'Huesca', 'Jaen',
       'Leon', 'Lerida', 'Lugo', 'Madrid', 'Malaga', 'Murcia', 'Navarra',
       'Ourense', 'Palencia', 'Las Palmas', 'Pontevedra', 'La Rioja',
       'Salamanca', 'Santa Cruz de Tenerife', 'Segovia', 'Sevilla',
       'Soria', 'Tarragona', 'Teruel', 'Toledo', 'Valencia', 'Valladolid',
       'Zamora', 'Zaragoza', 'Ceuta', 'Melilla'], dtype=object)

In [15]:
# Re-check the frame's dimensions after the replacements.
unemployment.shape

(53, 5)

### 3.4. Dropping unnecessary columns

In [16]:
# Discard the columns that are not kept in the cleaned output.
unneeded_columns = ["Tasas", "Periodo", "Sexo"]
unemployment = unemployment.drop(columns=unneeded_columns)

In [17]:
# Preview the remaining columns after the drop.
unemployment.head()

Unnamed: 0,Provincias,Unemployment rate
0,Nacional,1613
1,Albacete,1744
2,Alicante,1941
3,Almeria,2115
4,Alava,999


## 4. Exporting cleaned data

In [18]:
# Renumber the rows from 0; drop=True discards the old index instead of
# inserting it as a new column.
unemployment = unemployment.reset_index(drop = True)

In [19]:
# Preview the frame after resetting the index.
unemployment.head()

Unnamed: 0,Provincias,Unemployment rate
0,Nacional,1613
1,Albacete,1744
2,Alicante,1941
3,Almeria,2115
4,Alava,999


In [20]:
# Write the cleaned table to CSV; index=False leaves the row index out of the file.
unemployment.to_csv("../output/unemployment.csv", index = False)