# In this notebook, I clean the 2019 Spanish tourism database.

The database was obtained from the [INE website](https://www.ine.es/).

## 1. Importing libraries

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '../src')
import unittest
from call import *

## 2. Importing database and data exploration

In [2]:
# Load the raw tourism CSV. The file is semicolon-delimited and is decoded as
# ISO-8859-1. Malformed rows are skipped without any warning:
# on_bad_lines="skip" is the replacement for the former
# error_bad_lines=False / warn_bad_lines=False pair, which was deprecated in
# pandas 1.3 and removed in pandas 2.0 (where it raises TypeError).
# NOTE(review): silently skipping rows can hide data loss -- compare the
# resulting shape against the expected row count.
tourists2019 = pd.read_csv(
    "../data/tourists2019.csv",
    encoding="ISO-8859-1",
    sep=";",
    engine="python",
    on_bad_lines="skip",
)

In [3]:
# Dimensions of the raw frame as loaded: (rows, columns).
tourists2019.shape

(96, 4)

In [4]:
# Peek at the first rows; note that Total arrives as text using "." as a
# thousands separator (e.g. "4.304.741").
tourists2019.head()

Unnamed: 0,Comunidades autónomas,Tipo de dato,Periodo,Total
0,00 Total,Dato base,2019M12,4.304.741
1,00 Total,Dato base,2019M11,4.655.021
2,00 Total,Dato base,2019M10,7.587.749
3,00 Total,Dato base,2019M09,8.845.152
4,00 Total,Dato base,2019M08,10.118.366


In [5]:
# List the original (Spanish) column names before any renaming.
tourists2019.columns

Index(['Comunidades autónomas', 'Tipo de dato', 'Periodo', 'Total'], dtype='object')

In [6]:
# Check column dtypes: every column, including Total, is read as object
# (string), so Total has to be converted before it can be summed.
tourists2019.dtypes

Comunidades autónomas    object
Tipo de dato             object
Periodo                  object
Total                    object
dtype: object

## 3. Cleaning database

### 3.1. Renaming columns

In [7]:
# Give the autonomous-community column a short English name; the other
# columns keep their original names.
tourists2019 = tourists2019.rename({'Comunidades autónomas': 'region'}, axis="columns")

In [8]:
# Display the frame to confirm the first column is now called "region".
tourists2019

Unnamed: 0,region,Tipo de dato,Periodo,Total
0,00 Total,Dato base,2019M12,4.304.741
1,00 Total,Dato base,2019M11,4.655.021
2,00 Total,Dato base,2019M10,7.587.749
3,00 Total,Dato base,2019M09,8.845.152
4,00 Total,Dato base,2019M08,10.118.366
...,...,...,...,...
91,18 Otras Comunidades Autónomas,Dato base,2019M05,673.347
92,18 Otras Comunidades Autónomas,Dato base,2019M04,689.245
93,18 Otras Comunidades Autónomas,Dato base,2019M03,487.515
94,18 Otras Comunidades Autónomas,Dato base,2019M02,400.651


### 3.2. Removing the numeric code prefix from region names

In [9]:
# Split each region label once, at its first space, into two columns:
# column 0 gets the leading code (e.g. "00") and column 1 the rest of the label.
region = tourists2019["region"].str.split(pat=" ", n=1, expand=True)

In [10]:
# Column 1 of the split result is the text after the first space, i.e. the
# region name without its leading numeric code (e.g. "00 Total" -> "Total").
tourists2019["region"] = region[1]

In [11]:
# Inspect the distinct region labels after removing the code prefix; this is
# where labels containing escaped bytes (\x92, \x96) become visible.
tourists2019["region"].unique()

array(['Total', 'Andaluc\x92a', 'Islas Baleares', 'Canarias',
       'Catalu\x96a', 'C.Valenciana', 'C.Madrid',
       'Otras Comunidades Autónomas'], dtype=object)

### 3.3. Replacing column values

In [12]:
# Normalise region labels in one pass. Two labels contain escaped bytes
# (\x92, \x96) where an accented letter was presumably expected, so they are
# replaced with plain-ASCII spellings; the catch-all bucket is translated to
# English. Labels not listed in the mapping are left unchanged.
tourists2019["region"] = tourists2019["region"].replace(
    {
        "Andaluc\x92a": "Andalucia",
        "Catalu\x96a": "Cataluna",
        "Otras Comunidades Autónomas": "Other regions",
    }
)

In [13]:
# Sanity check: relabelling regions must not change the frame's dimensions.
tourists2019.shape

(96, 4)

### 3.4. Converting Total values from object to float

In [14]:
# Convert each Total string (e.g. "4.304.741") to a number, element by element.
# convert_str is not defined in this notebook -- presumably it comes from the
# star import of the local `call` module; confirm its exact parsing rules there.
tourists2019['Total'] = tourists2019['Total'].apply(convert_str)

In [15]:
# Yearly tourist total per region. Total is selected *before* aggregating so
# that only the numeric column is summed; summing the whole frame would also
# try to "sum" the string columns (Tipo de dato, Periodo), which is wasted
# work and depends on pandas-version-specific handling of non-numeric columns.
# The result is the same Series as before: indexed by region, named "Total".
tourists2019.groupby(['region'])['Total'].sum()

region
Andalucia         12023153.0
C.Madrid           7640980.0
C.Valenciana       9535495.0
Canarias          13146862.0
Cataluna          19375153.0
Islas Baleares    13679781.0
Other regions      8107727.0
Total             83509151.0
Name: Total, dtype: float64

### 3.5. Grouping by region value

In [16]:
# Collapse the monthly rows into one row per region holding the summed Total.
# reset_index() turns the group key back into an ordinary "region" column, so
# the resulting frame has exactly two columns: region and Total.
region_totals = tourists2019.groupby(['region']).agg({"Total": "sum"})
tourists2019 = region_totals.reset_index()

In [17]:
# Inspect the aggregated frame: one row per region with its summed Total.
tourists2019

Unnamed: 0,region,Total
0,Andalucia,12023153.0
1,C.Madrid,7640980.0
2,C.Valenciana,9535495.0
3,Canarias,13146862.0
4,Cataluna,19375153.0
5,Islas Baleares,13679781.0
6,Other regions,8107727.0
7,Total,83509151.0


## 4. Exporting cleaned data

In [18]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
# NOTE(review): the aggregation cell in section 3.5 already ends with
# reset_index(), so this looks like a no-op when cells run in order.
tourists2019 = tourists2019.reset_index(drop = True)

In [19]:
# Final look at the cleaned frame before writing it to disk.
tourists2019.head()

Unnamed: 0,region,Total
0,Andalucia,12023153.0
1,C.Madrid,7640980.0
2,C.Valenciana,9535495.0
3,Canarias,13146862.0
4,Cataluna,19375153.0


In [20]:
# Write the cleaned per-region totals to CSV; index=False keeps the row index
# out of the file so it contains only the region and Total columns.
# NOTE(review): assumes the ../output directory already exists -- confirm.
tourists2019.to_csv("../output/tourists2019.csv", index = False)