In [1]:
import os
import re
import requests
import pandas as pd
import numpy as np
import zipfile
import io

#### Descargamos y leemos los distintos ficheros

In [2]:
# Descargamos los ficheros que contienen los datos a nuestro directorio activo
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/solar-flare/flare.data1"
with requests.get(URL) as response:
    open(os.path.join(os.getcwd(), 'flare.data1'), "wb").write(response.content)
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/solar-flare/flare.data2"
with requests.get(URL) as response:
    open(os.path.join(os.getcwd(), 'flare.data2'), "wb").write(response.content)
# Descargamos la metadata asociada al conjunto de datos
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/solar-flare/flare.names"
with requests.get(URL) as response:
    open(os.path.join(os.getcwd(), "flare.names"), "wb").write(response.content)
# Leemos datos
## Leemos primer fichero de datos
with open(os.path.join(os.getcwd(),'flare.data1'),'r') as f:
    data1 = f.read().splitlines() # Dividimos el texto por saltos de línea
    data1 = [elem.split(' ') for elem in data1 if elem!=''] # Dividimos cada línea por las comas y removemos líneas vacías
    data1 = data1[1:] # Quitamos la línea de metadata temporal
## Leemos segundo fichero de datos
with open(os.path.join(os.getcwd(),'flare.data2'),'r') as f:
    data2 = f.read().splitlines() # Dividimos el texto por saltos de línea
    data2 = [elem.split(' ') for elem in data2 if elem!=''] # Dividimos cada línea por las comas y removemos líneas vacías
    data2 = data2[1:] # Quitamos la línea de metadata temporal
## Combinamos ambas listas
data = data1+data2
# Leemos metadata
with open(os.path.join(os.getcwd(),'flare.names'),'r') as f:
    metadata = f.read().splitlines()
## Regex
regex_fn = lambda text: re.findall('^\s+[0-9]+\.{1}\s{1}[a-zA-Z- ]+', text)
reg_text_fn = lambda text : re.findall('[a-zA-Z-]+', text)
metadata_list = [regex_fn(elem)[0].strip() for elem in metadata if regex_fn(elem)]
col_names = [reg_text_fn(elem)[0] for elem in metadata_list if reg_text_fn(elem)]
# Construimos el objeto pd.DataFrame
df = pd.DataFrame(data=data, columns=col_names)

#### Mostramos la información de nuestro dataframe

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1389 entries, 0 to 1388
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Code                  1389 non-null   object
 1   Code                  1389 non-null   object
 2   Code                  1389 non-null   object
 3   Activity              1389 non-null   object
 4   Evolution             1389 non-null   object
 5   Previous              1389 non-null   object
 6   Historically-complex  1389 non-null   object
 7   Did                   1389 non-null   object
 8   Area                  1389 non-null   object
 9   Area                  1389 non-null   object
 10  C-class               1389 non-null   object
 11  M-class               1389 non-null   object
 12  X-class               1389 non-null   object
dtypes: object(13)
memory usage: 141.2+ KB


In [4]:
df.isna().sum()

Code                    0
Code                    0
Code                    0
Activity                0
Evolution               0
Previous                0
Historically-complex    0
Did                     0
Area                    0
Area                    0
C-class                 0
M-class                 0
X-class                 0
dtype: int64

In [5]:
df

Unnamed: 0,Code,Code.1,Code.2,Activity,Evolution,Previous,Historically-complex,Did,Area,Area.1,C-class,M-class,X-class
0,C,S,O,1,2,1,1,2,1,2,0,0,0
1,D,S,O,1,3,1,1,2,1,2,0,0,0
2,C,S,O,1,3,1,1,2,1,1,0,0,0
3,D,S,O,1,3,1,1,2,1,2,0,0,0
4,D,A,O,1,3,1,1,2,1,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1384,H,S,X,1,2,1,1,1,1,1,0,0,0
1385,H,S,X,2,2,1,1,2,1,1,0,0,0
1386,C,S,O,1,2,1,2,2,1,1,0,0,0
1387,H,R,X,1,2,1,1,2,1,1,0,0,0


#### El dataframe tiene tres columnas con el mismo nombre, procedemos a cambiarlas

In [6]:
#Cambiamos los nombres de las tres primeras columnas para poder tratarlas por separado
column_indices_to_change = {0: 'Class_Code', 1:"Spot_Size_Code", 2:"Spot_Distr_Code"}
df.columns = [column_indices_to_change.get(enum, col) for enum, col in enumerate(df.columns)]
df

Unnamed: 0,Class_Code,Spot_Size_Code,Spot_Distr_Code,Activity,Evolution,Previous,Historically-complex,Did,Area,Area.1,C-class,M-class,X-class
0,C,S,O,1,2,1,1,2,1,2,0,0,0
1,D,S,O,1,3,1,1,2,1,2,0,0,0
2,C,S,O,1,3,1,1,2,1,1,0,0,0
3,D,S,O,1,3,1,1,2,1,2,0,0,0
4,D,A,O,1,3,1,1,2,1,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1384,H,S,X,1,2,1,1,1,1,1,0,0,0
1385,H,S,X,2,2,1,1,2,1,1,0,0,0
1386,C,S,O,1,2,1,2,2,1,1,0,0,0
1387,H,R,X,1,2,1,1,2,1,1,0,0,0


#### Además, estas columnas son valores binarios, las tratamos para poder usarlas más adelante

In [7]:
# Importamos la funcionalidad concreta de sklearn
from sklearn.preprocessing import LabelEncoder
# Transformamos esta columna
encoder = LabelEncoder()
encoded_data = encoder.fit_transform(df[['Class_Code']])
# Unimos la información
df["Class_Code"] = pd.DataFrame(data=encoded_data, columns=['Class_Code'])
#0=B/1=C/2=D/3=E/4=F/5=H
df["Class_Code"]

  y = column_or_1d(y, warn=True)


0       1
1       2
2       1
3       2
4       2
       ..
1384    5
1385    5
1386    1
1387    5
1388    0
Name: Class_Code, Length: 1389, dtype: int32

In [8]:
encoded_data = encoder.fit_transform(df[["Spot_Size_Code"]])
df["Spot_Size_Code"] = pd.DataFrame(data=encoded_data, columns=['Spot_Size_Code'])
pd.set_option("display.max_rows", 10)
#0=A/1=H/2=K/3=R/4=S/5=X
df["Spot_Size_Code"]

  y = column_or_1d(y, warn=True)


0       4
1       4
2       4
3       4
4       0
       ..
1384    4
1385    4
1386    4
1387    3
1388    5
Name: Spot_Size_Code, Length: 1389, dtype: int32

In [9]:
encoded_data = encoder.fit_transform(df[["Spot_Distr_Code"]])
df["Spot_Distr_Code"] = pd.DataFrame(data=encoded_data, columns=['Spot_Distr_Code'])
#0=C/1=I/2=O/3=X
df["Spot_Distr_Code"]

  y = column_or_1d(y, warn=True)


0       2
1       2
2       2
3       2
4       2
       ..
1384    3
1385    3
1386    2
1387    3
1388    2
Name: Spot_Distr_Code, Length: 1389, dtype: int32

#### Así quedaría el dataframe listo para ser utilizado

In [10]:
df

Unnamed: 0,Class_Code,Spot_Size_Code,Spot_Distr_Code,Activity,Evolution,Previous,Historically-complex,Did,Area,Area.1,C-class,M-class,X-class
0,1,4,2,1,2,1,1,2,1,2,0,0,0
1,2,4,2,1,3,1,1,2,1,2,0,0,0
2,1,4,2,1,3,1,1,2,1,1,0,0,0
3,2,4,2,1,3,1,1,2,1,2,0,0,0
4,2,0,2,1,3,1,1,2,1,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1384,5,4,3,1,2,1,1,1,1,1,0,0,0
1385,5,4,3,2,2,1,1,2,1,1,0,0,0
1386,1,4,2,1,2,1,2,2,1,1,0,0,0
1387,5,3,3,1,2,1,1,2,1,1,0,0,0
