# S02T01: Pré processamento dos dados

## 📚 Importando as bibliotecas

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

## 🎲 Carregando os dados

In [7]:
DATASET_PATH = 'C://Users//alexw//Documents//UFPI//Sistemas_Inteligentes//21-04-01_visualization_pre_processing//'
DATASET_NAME = 'covid-piaui.csv'

In [8]:
def load_data(dataset_path=DATASET_PATH, dataset_name=DATASET_NAME):
    csv_path = os.path.join(dataset_path, dataset_name)
    return pd.read_csv(csv_path)

In [9]:
covid = load_data(DATASET_PATH,DATASET_NAME)
covid = covid.drop(["city_ibge_code", "is_last"], axis=1)
covid

Unnamed: 0,date,state,city,place_type,confirmed,deaths,order_for_place,estimated_population_2019,estimated_population,confirmed_per_100k_inhabitants,death_rate,longitude,latitude
0,21/03/2021,PI,ACAUA,city,151,1,292,7084,7102,212616164,66,-41.083638,-8.220787
1,21/03/2021,PI,AGRICOLANDIA,city,100,0,284,5139,5131,194893783,0,-42.659183,-5.795502
2,21/03/2021,PI,AGUA BRANCA,city,1165,50,333,17411,17470,666857470,429,-42.635131,-5.893266
3,21/03/2021,PI,ALAGOINHA DO PIAUI,city,238,9,298,7651,7665,310502283,378,-40.934635,-7.008175
4,21/03/2021,PI,ALEGRETE DO PIAUI,city,377,4,280,4915,4918,766571777,106,-40.857863,-7.245349
...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,21/03/2021,PI,VARZEA BRANCA,city,84,5,300,4947,4938,170109356,595,-42.969445,-9.236541
220,21/03/2021,PI,VARZEA GRANDE,city,290,0,331,4391,4386,661194710,0,-42.253102,-6.542948
221,21/03/2021,PI,VERA MENDES,city,133,2,315,3077,3080,431818182,150,-41.479737,-7.602849
222,21/03/2021,PI,VILA NOVA DO PIAUI,city,108,1,323,2971,2952,365853659,93,-40.937801,-7.141046


In [10]:
covid_target = covid["deaths"].copy()
covid = covid.drop("deaths", axis=1) 
covid

Unnamed: 0,date,state,city,place_type,confirmed,order_for_place,estimated_population_2019,estimated_population,confirmed_per_100k_inhabitants,death_rate,longitude,latitude
0,21/03/2021,PI,ACAUA,city,151,292,7084,7102,212616164,66,-41.083638,-8.220787
1,21/03/2021,PI,AGRICOLANDIA,city,100,284,5139,5131,194893783,0,-42.659183,-5.795502
2,21/03/2021,PI,AGUA BRANCA,city,1165,333,17411,17470,666857470,429,-42.635131,-5.893266
3,21/03/2021,PI,ALAGOINHA DO PIAUI,city,238,298,7651,7665,310502283,378,-40.934635,-7.008175
4,21/03/2021,PI,ALEGRETE DO PIAUI,city,377,280,4915,4918,766571777,106,-40.857863,-7.245349
...,...,...,...,...,...,...,...,...,...,...,...,...
219,21/03/2021,PI,VARZEA BRANCA,city,84,300,4947,4938,170109356,595,-42.969445,-9.236541
220,21/03/2021,PI,VARZEA GRANDE,city,290,331,4391,4386,661194710,0,-42.253102,-6.542948
221,21/03/2021,PI,VERA MENDES,city,133,315,3077,3080,431818182,150,-41.479737,-7.602849
222,21/03/2021,PI,VILA NOVA DO PIAUI,city,108,323,2971,2952,365853659,93,-40.937801,-7.141046


## Tratando os dados faltantes

In [11]:
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224 entries, 0 to 223
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   date                            224 non-null    object 
 1   state                           224 non-null    object 
 2   city                            224 non-null    object 
 3   place_type                      224 non-null    object 
 4   confirmed                       224 non-null    int64  
 5   order_for_place                 224 non-null    int64  
 6   estimated_population_2019       224 non-null    int64  
 7   estimated_population            224 non-null    int64  
 8   confirmed_per_100k_inhabitants  224 non-null    int64  
 9   death_rate                      224 non-null    int64  
 10  longitude                       224 non-null    float64
 11  latitude                        224 non-null    float64
dtypes: float64(2), int64(6), object(4)
m

In [12]:
# housing_t = housing.dropna(subset=["total_bedrooms"]) 

In [13]:
# housing_t.info()

In [14]:
covid_t = covid.drop(['date', 'state', 'city', 'place_type'], axis=1)
covid_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224 entries, 0 to 223
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   confirmed                       224 non-null    int64  
 1   order_for_place                 224 non-null    int64  
 2   estimated_population_2019       224 non-null    int64  
 3   estimated_population            224 non-null    int64  
 4   confirmed_per_100k_inhabitants  224 non-null    int64  
 5   death_rate                      224 non-null    int64  
 6   longitude                       224 non-null    float64
 7   latitude                        224 non-null    float64
dtypes: float64(2), int64(6)
memory usage: 14.1 KB


## Tratando os dados categóricos

In [20]:
from sklearn.preprocessing import LabelEncoder
#covid_category_date = covid['date']
#covid_category_state = covid['state']
#covid_category_place_type = covid['place_type']
covid_category_city = covid['city']

le = LabelEncoder()

# covid_category_date_le = le.fit_transform(covid_category_date)
# covid_category_state_le = le.fit_transform(covid_category_state)
# covid_category_place_type_le = le.fit_transform(covid_category_place_type)
covid_category_city_le = le.fit_transform(covid_category_city)

In [21]:
# date = pd.DataFrame(covid_category_date_le,columns=['date'])
# state = pd.DataFrame(covid_category_state_le,columns=['state'])
# place_type = pd.DataFrame(covid_category_place_type_le,columns=['place_type'])
city = pd.DataFrame(covid_category_city_le,columns=['city'])

covid_encoded1 = pd.concat([covid_t,city], axis=1)
covid_encoded1
# covid_encoded1.to_csv('NOVO_covid-piaui.csv')

Unnamed: 0,confirmed,order_for_place,estimated_population_2019,estimated_population,confirmed_per_100k_inhabitants,death_rate,longitude,latitude,city
0,151,292,7084,7102,212616164,66,-41.083638,-8.220787,0
1,100,284,5139,5131,194893783,0,-42.659183,-5.795502,1
2,1165,333,17411,17470,666857470,429,-42.635131,-5.893266,2
3,238,298,7651,7665,310502283,378,-40.934635,-7.008175,3
4,377,280,4915,4918,766571777,106,-40.857863,-7.245349,4
...,...,...,...,...,...,...,...,...,...
219,84,300,4947,4938,170109356,595,-42.969445,-9.236541,219
220,290,331,4391,4386,661194710,0,-42.253102,-6.542948,220
221,133,315,3077,3080,431818182,150,-41.479737,-7.602849,221
222,108,323,2971,2952,365853659,93,-40.937801,-7.141046,222


In [22]:
covid_encoded1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224 entries, 0 to 223
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   confirmed                       224 non-null    int64  
 1   order_for_place                 224 non-null    int64  
 2   estimated_population_2019       224 non-null    int64  
 3   estimated_population            224 non-null    int64  
 4   confirmed_per_100k_inhabitants  224 non-null    int64  
 5   death_rate                      224 non-null    int64  
 6   longitude                       224 non-null    float64
 7   latitude                        224 non-null    float64
 8   city                            224 non-null    int32  
dtypes: float64(2), int32(1), int64(6)
memory usage: 15.0 KB


## ➗ Dividindo os dados em treino e teste

In [23]:
X_train, X_test, y_train, y_test = train_test_split(covid_encoded1, covid_target, test_size = 0.2,random_state=1, shuffle=True)

In [24]:
print(X_test.shape)

(45, 9)


In [25]:
print(X_train.shape)

(179, 9)


In [26]:
print(y_test.shape)

(45,)


In [27]:
print(y_train.shape)

(179,)


## 🧮 Feature Scaling

In [28]:
#z = (x - u) / s
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [29]:
print(X_train)

[[ 0.3713979   1.29223314  0.31077078 ... -0.21686346 -1.24629537
   1.37241357]
 [-0.16347942 -1.72880664 -0.15831705 ... -0.49336498 -1.27271502
   1.0314961 ]
 [-0.1453741   0.30948526 -0.17956877 ...  0.87598036 -0.45567566
   1.62035355]
 ...
 [-0.13381752  0.12749491 -0.11858828 ...  0.01090322 -1.42531827
  -0.68858749]
 [-0.13709188 -0.12729157 -0.18090184 ...  0.46610579  0.84109346
   0.38065366]
 [-0.09452513 -0.01809736 -0.09947568 ...  1.216561    0.82653553
  -1.23095619]]


In [30]:
print(X_test)

[[-0.10049603 -2.27477768 -0.17791017 -0.17758065  2.64757171 -0.89961874
   1.49260828 -0.30176321 -0.48713626]
 [-0.10299995 -0.23648578 -0.09815811 -0.09809258 -0.01244601 -0.34682317
   0.59701384  1.48378756 -1.27744494]
 [-0.07064151  0.01830071 -0.0507409  -0.05054181  0.10959139 -0.7025351
  -1.66651363 -1.81493935 -0.76606874]
 [-0.14826325 -0.67326262 -0.15185318 -0.15171267 -0.67982785  0.87413402
   0.56946895  1.12817657 -0.22370004]
 [-0.12784661 -0.30928192 -0.15839455 -0.15813719  0.28072999 -0.94768792
  -0.34297579  0.65392374 -0.71957999]
 [-0.09645122 -1.00084524 -0.14289367 -0.14258551  1.03963937 -0.9140395
   0.02156805  1.84749498 -0.16171504]
 [-0.1492263  -1.58321436 -0.15281424 -0.15276283 -0.70557559  0.10983405
   0.77062357 -0.61231789 -0.84354998]
 [-0.16559813 -0.12729157 -0.17209734 -0.1718202  -1.28385215  0.55207051
   0.04951671 -0.0030445  -1.1999637 ]
 [-0.14017364  0.30948526 -0.12024687 -0.12009967 -0.72006599  1.23945979
   0.72037637 -0.0946817

## 💾 Salvando os conjuntos em pickle

In [31]:
pickle.dump(X_train, open('C://Users//alexw//Documents//UFPI//Sistemas_Inteligentes//PICKLES//X_train.pickle', 'wb'))
pickle.dump(X_test, open('C://Users//alexw//Documents//UFPI//Sistemas_Inteligentes//PICKLES//X_test.pickle', 'wb'))
pickle.dump(y_train, open('C://Users//alexw//Documents//UFPI//Sistemas_Inteligentes//PICKLES//y_train.pickle', 'wb'))
pickle.dump(y_test, open('C://Users//alexw//Documents//UFPI//Sistemas_Inteligentes//PICKLES//y_test.pickle', 'wb'))