## Limpieza de ficheros con pandas

Limpiar las columnas y convertir tipos de datos de los ficheros de huracanes. Realizar un estudio de correlación entre el viento y la presión.

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../ficheros_curso/csv/IRMA.csv", sep=";")
df.head()

Unnamed: 0,Date,Time,Lat,Lon,Wind,Pressure,Storm Type,Category
0,Aug 30,15:00 GMT,16.4°,-30.3°,50 mph,1004 mb,Tropical Storm,-
1,Aug 30,21:00 GMT,16.4°,-31.2°,60 mph,1001 mb,Tropical Storm,-
2,Aug 31,03:00 GMT,16.4°,-32.2°,65 mph,999 mb,Tropical Storm,-
3,Aug 31,09:00 GMT,16.5°,-32.9°,70 mph,997 mb,Tropical Storm,-
4,Aug 31,15:00 GMT,16.9°,-33.8°,100 mph,979 mb,Hurricane,2


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         130 non-null    object
 1   Time         130 non-null    object
 2   Lat          130 non-null    object
 3   Lon          130 non-null    object
 4   Wind         130 non-null    object
 5   Pressure     130 non-null    object
 6   Storm Type   130 non-null    object
 7   Category     130 non-null    object
dtypes: object(8)
memory usage: 8.2+ KB


In [4]:
df.columns = [col.strip() for col in df.columns]
df.columns

Index(['Date', 'Time', 'Lat', 'Lon', 'Wind', 'Pressure', 'Storm Type',
       'Category'],
      dtype='object')

In [5]:
df['Lat'] = pd.to_numeric(df.Lat.str[:-1], downcast='float')
df['Lon'] = pd.to_numeric(df.Lon.str[:-1], downcast='float')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        130 non-null    object 
 1   Time        130 non-null    object 
 2   Lat         130 non-null    float32
 3   Lon         130 non-null    float32
 4   Wind        130 non-null    object 
 5   Pressure    130 non-null    object 
 6   Storm Type  130 non-null    object 
 7   Category    130 non-null    object 
dtypes: float32(2), object(6)
memory usage: 7.2+ KB


In [7]:
df['Wind'] = pd.to_numeric(df.Wind.str.replace(" mph",""), downcast='integer')
df['Pressure'] = pd.to_numeric(df.Pressure.str.replace(" mb",""), downcast='integer')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        130 non-null    object 
 1   Time        130 non-null    object 
 2   Lat         130 non-null    float32
 3   Lon         130 non-null    float32
 4   Wind        130 non-null    int16  
 5   Pressure    130 non-null    int16  
 6   Storm Type  130 non-null    object 
 7   Category    130 non-null    object 
dtypes: float32(2), int16(2), object(4)
memory usage: 5.7+ KB


In [10]:
df['DateTime'] = pd.to_datetime("2005 " + df.Date + " " + df.Time.str.replace(" GMT",""), format="%Y %b %d %H:%M")

In [11]:
df.head()

Unnamed: 0,Date,Time,Lat,Lon,Wind,Pressure,Storm Type,Category,DateTime
0,Aug 30,15:00 GMT,16.4,-30.299999,50,1004,Tropical Storm,-,2005-08-30 15:00:00
1,Aug 30,21:00 GMT,16.4,-31.200001,60,1001,Tropical Storm,-,2005-08-30 21:00:00
2,Aug 31,03:00 GMT,16.4,-32.200001,65,999,Tropical Storm,-,2005-08-31 03:00:00
3,Aug 31,09:00 GMT,16.5,-32.900002,70,997,Tropical Storm,-,2005-08-31 09:00:00
4,Aug 31,15:00 GMT,16.9,-33.799999,100,979,Hurricane,2,2005-08-31 15:00:00


In [12]:
df.drop(columns=["Date","Time"], inplace=True)

In [13]:
df.head()

Unnamed: 0,Lat,Lon,Wind,Pressure,Storm Type,Category,DateTime
0,16.4,-30.299999,50,1004,Tropical Storm,-,2005-08-30 15:00:00
1,16.4,-31.200001,60,1001,Tropical Storm,-,2005-08-30 21:00:00
2,16.4,-32.200001,65,999,Tropical Storm,-,2005-08-31 03:00:00
3,16.5,-32.900002,70,997,Tropical Storm,-,2005-08-31 09:00:00
4,16.9,-33.799999,100,979,Hurricane,2,2005-08-31 15:00:00


In [15]:
t = np.corrcoef(df.Wind, df.Pressure)
t

array([[ 1.        , -0.89322419],
       [-0.89322419,  1.        ]])

In [17]:
resul = round(t[0][1] * 100, 2)
resul

-89.32

In [18]:
df = df[['DateTime','Lat','Lon','Wind','Pressure']]
df.head()

Unnamed: 0,DateTime,Lat,Lon,Wind,Pressure
0,2005-08-30 15:00:00,16.4,-30.299999,50,1004
1,2005-08-30 21:00:00,16.4,-31.200001,60,1001
2,2005-08-31 03:00:00,16.4,-32.200001,65,999
3,2005-08-31 09:00:00,16.5,-32.900002,70,997
4,2005-08-31 15:00:00,16.9,-33.799999,100,979


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   DateTime  130 non-null    datetime64[ns]
 1   Lat       130 non-null    float32       
 2   Lon       130 non-null    float32       
 3   Wind      130 non-null    int16         
 4   Pressure  130 non-null    int16         
dtypes: datetime64[ns](1), float32(2), int16(2)
memory usage: 2.7 KB
