In [2]:
import os
import re
import requests
import pandas as pd
import numpy as np
import zipfile
import io

#### Descarga y extracción del conjunto de datos

In [None]:
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00501/PRSA2017_Data_20130301-20170228.zip"

# Download the archive. Fail loudly on an HTTP error instead of saving an
# error page as 'beijing.zip', and never hang forever on a stalled connection.
with requests.get(URL, timeout=60) as response:
    response.raise_for_status()
    payload = response.content

# Keep a copy of the archive on disk; the context manager guarantees the
# file handle is flushed and closed.
with open(os.path.join(os.getcwd(), 'beijing.zip'), "wb") as zip_file:
    zip_file.write(payload)

# Open the downloaded bytes as a zip archive and extract every member
# into ./beijing (the archive is closed when the block ends).
with zipfile.ZipFile(io.BytesIO(payload)) as z:
    z.extractall('./beijing')

##### Al ser varios .csv, los concatenamos para crear nuestro dataframe

In [16]:
# Folder the archive was extracted into.
DATA_DIR = './beijing/'

# Collect the CSV files sitting directly in DATA_DIR, in a deterministic order.
csv_paths = sorted(
    os.path.join(DATA_DIR, name)
    for name in os.listdir(DATA_DIR)
    if name.endswith('.csv')
)

# The archive may unpack into a nested folder
# (e.g. beijing/PRSA_Data_20130301-20170228), so if nothing was found at the
# top level, search the subfolders as well.
if not csv_paths:
    csv_paths = sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(DATA_DIR)
        for name in names
        if name.endswith('.csv')
    )

# An empty DataFrame here would silently invalidate every later cell.
if not csv_paths:
    raise FileNotFoundError(f"No .csv files found under {DATA_DIR!r}")

# Read every file and concatenate once (concatenating inside the loop copies
# the accumulated frame on every iteration). The per-file row index is kept,
# exactly as in the incremental version.
df = pd.concat([pd.read_csv(path) for path in csv_paths])

In [17]:
# Display the concatenated DataFrame.
df

Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
0,1,2013,3,1,0,4.0,4.0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Aotizhongxin
1,2,2013,3,1,1,8.0,8.0,4.0,7.0,300.0,77.0,-1.1,1023.2,-18.2,0.0,N,4.7,Aotizhongxin
2,3,2013,3,1,2,7.0,7.0,5.0,10.0,300.0,73.0,-1.1,1023.5,-18.2,0.0,NNW,5.6,Aotizhongxin
3,4,2013,3,1,3,6.0,6.0,11.0,11.0,300.0,72.0,-1.4,1024.5,-19.4,0.0,NW,3.1,Aotizhongxin
4,5,2013,3,1,4,3.0,3.0,12.0,12.0,300.0,72.0,-2.0,1025.2,-19.5,0.0,N,2.0,Aotizhongxin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35059,35060,2017,2,28,19,11.0,32.0,3.0,24.0,400.0,72.0,12.5,1013.5,-16.2,0.0,NW,2.4,Wanshouxigong
35060,35061,2017,2,28,20,13.0,32.0,3.0,41.0,500.0,50.0,11.6,1013.6,-15.1,0.0,WNW,0.9,Wanshouxigong
35061,35062,2017,2,28,21,14.0,28.0,4.0,38.0,500.0,54.0,10.8,1014.2,-13.3,0.0,NW,1.1,Wanshouxigong
35062,35063,2017,2,28,22,12.0,23.0,4.0,30.0,400.0,59.0,10.5,1014.4,-12.9,0.0,NNW,1.2,Wanshouxigong


#### Vemos la información del dataframe y calculamos el porcentaje de valores nulos

In [18]:
# Print the column dtypes and non-null counts of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 420768 entries, 0 to 35063
Data columns (total 18 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   No       420768 non-null  int64  
 1   year     420768 non-null  int64  
 2   month    420768 non-null  int64  
 3   day      420768 non-null  int64  
 4   hour     420768 non-null  int64  
 5   PM2.5    412029 non-null  float64
 6   PM10     414319 non-null  float64
 7   SO2      411747 non-null  float64
 8   NO2      408652 non-null  float64
 9   CO       400067 non-null  float64
 10  O3       407491 non-null  float64
 11  TEMP     420370 non-null  float64
 12  PRES     420375 non-null  float64
 13  DEWP     420365 non-null  float64
 14  RAIN     420378 non-null  float64
 15  wd       418946 non-null  object 
 16  WSPM     420450 non-null  float64
 17  station  420768 non-null  object 
dtypes: float64(11), int64(5), object(2)
memory usage: 61.0+ MB


In [19]:
# Percentage of missing values per column.
df.isnull().sum().div(len(df)).mul(100)

No         0.000000
year       0.000000
month      0.000000
day        0.000000
hour       0.000000
PM2.5      2.076916
PM10       1.532674
SO2        2.143937
NO2        2.879497
CO         4.919813
O3         3.155421
TEMP       0.094589
PRES       0.093401
DEWP       0.095777
RAIN       0.092688
wd         0.433018
WSPM       0.075576
station    0.000000
dtype: float64

##### Arreglamos los valores nulos, ya que son un pequeño porcentaje

In [22]:
# Helper functions
def _fill_value_for(series):
    """Pick the imputation value for one column, or None to leave it untouched.

    Discrete columns (object, string, categorical, boolean) use the most
    frequent value, integer columns the median (rounded so it still fits an
    integer dtype), and every other numeric column the mean. Columns of any
    other dtype, or with no usable statistic (e.g. entirely null), yield None.
    """
    types = pd.api.types
    dtype = series.dtype
    is_discrete = (
        types.is_object_dtype(dtype)
        or types.is_string_dtype(dtype)
        or types.is_bool_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )
    if is_discrete:
        # mode() is empty when the column has no non-null value at all.
        modes = series.mode()
        return None if modes.empty else modes.iloc[0]
    if types.is_integer_dtype(dtype):
        median = series.median()
        return None if pd.isna(median) else int(round(median))
    if types.is_numeric_dtype(dtype):
        mean = series.mean()
        return None if pd.isna(mean) else mean
    return None


def replace_missing_data(g):
    """Return a copy of ``g`` with missing values imputed column by column.

    Parameters
    ----------
    g : pandas.DataFrame
        Frame to impute. It is not modified; the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame where each column that had nulls is filled with its mode
        (discrete columns), median (integer columns) or mean (other numeric
        columns). Columns with no usable statistic are left unchanged.
    """
    # Work on a copy so the caller's frame (and anything already displayed
    # from it) is not altered as a side effect.
    filled = g.copy()
    # Only visit the columns that actually contain nulls.
    has_nulls = filled.isna().any()
    for col in [name for name, flag in has_nulls.items() if flag]:
        value = _fill_value_for(filled[col])
        if value is not None:
            filled[col] = filled[col].fillna(value)
    return filled

# Impute the missing values and rebind df to the result.
df = replace_missing_data(df)

In [23]:
# Count of null values still present in each column.
df.isna().sum()

No         0
year       0
month      0
day        0
hour       0
PM2.5      0
PM10       0
SO2        0
NO2        0
CO         0
O3         0
TEMP       0
PRES       0
DEWP       0
RAIN       0
wd         0
WSPM       0
station    0
dtype: int64