<a href="https://colab.research.google.com/github/arielabade/dataAnalysis/blob/main/dataStorageOptimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dealing with huge datasets


In [None]:
import  pandas as pd

# Load the California housing training CSV from the local sample_data folder
# and preview the first rows.
housing_csv = 'sample_data/california_housing_train.csv'
house = pd.read_csv(housing_csv)
house.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [None]:
house.info()

# You do not need every column to be float64 or int64.
# Avoid allocating more memory than the values actually require.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           17000 non-null  float64
 1   latitude            17000 non-null  float64
 2   housing_median_age  17000 non-null  float64
 3   total_rooms         17000 non-null  float64
 4   total_bedrooms      17000 non-null  float64
 5   population          17000 non-null  float64
 6   households          17000 non-null  float64
 7   median_income       17000 non-null  float64
 8   median_house_value  17000 non-null  float64
dtypes: float64(9)
memory usage: 1.2 MB


In [None]:
# Cast the three columns in a single pass. int32 takes half the bytes of
# float64; int64 is the same width as float64, so total_rooms changes type
# but not size.
house = house.astype({
    'total_rooms': 'int64',
    'total_bedrooms': 'int32',
    'population': 'int32',
})

In [None]:
# diminuição considerável do armazenamento

In [None]:
# Re-inspect dtypes and memory usage after the casts.
# NOTE(review): the saved output lists a 10th column 'total___bedrooms' that no
# visible cell creates — presumably stale kernel state; re-run from a fresh kernel.
house.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           17000 non-null  float64
 1   latitude            17000 non-null  float64
 2   housing_median_age  17000 non-null  float64
 3   total_rooms         17000 non-null  int64  
 4   total_bedrooms      17000 non-null  int32  
 5   population          17000 non-null  int32  
 6   households          17000 non-null  float64
 7   median_income       17000 non-null  float64
 8   median_house_value  17000 non-null  float64
 9   total___bedrooms    17000 non-null  int32  
dtypes: float64(6), int32(3), int64(1)
memory usage: 1.1 MB


In [None]:
# também é possível escolher o tipo das colunas no momento de importação

### Importing with types

In [None]:
import numpy as np

# Declare the narrower dtypes up front so read_csv builds these columns with
# them directly, instead of loading float64 and casting afterwards.
column_dtypes = {
    'total_rooms': 'int64',
    'total_bedrooms': 'int32',
    'population': 'int32',
}
housePricing = pd.read_csv('sample_data/california_housing_train.csv', dtype=column_dtypes)

In [None]:
# Confirm the dtypes requested at import time and the resulting memory usage.
housePricing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           17000 non-null  float64
 1   latitude            17000 non-null  float64
 2   housing_median_age  17000 non-null  float64
 3   total_rooms         17000 non-null  int64  
 4   total_bedrooms      17000 non-null  int32  
 5   population          17000 non-null  int32  
 6   households          17000 non-null  float64
 7   median_income       17000 non-null  float64
 8   median_house_value  17000 non-null  float64
dtypes: float64(6), int32(2), int64(1)
memory usage: 1.0 MB


In [None]:
## O código acima define os tipos das colunas já no momento da importação (não se trata de importação por lotes)


### Importing only necessary stuff

In [None]:
# Most of the time you are not interested in importing the whole dataset; you only want to import some columns


In [None]:
# Restrict the load to the three columns of interest and preview the result.
columns_needed = ['housing_median_age', 'total_rooms', 'total_bedrooms']
house = pd.read_csv('sample_data/california_housing_train.csv', usecols=columns_needed)
house.head()


Unnamed: 0,housing_median_age,total_rooms,total_bedrooms
0,15.0,5612.0,1283.0
1,19.0,7650.0,1901.0
2,17.0,720.0,174.0
3,14.0,1501.0,337.0
4,20.0,1454.0,326.0


In [None]:
#import only necessary keys

## Chunk importing


In [None]:
## Chunk importing is used mainly with huge datasets. Basically you break the dataset into chunks so that only one piece has to be held in memory at a time; pandas reads the chunks one after another, not in parallel

In [None]:
# Read the CSV in pieces of 1,000 rows. The file has 17,000 rows, so a chunk
# size above that would hand everything back in a single chunk and nothing
# would actually be chunked.
chunk = pd.read_csv('sample_data/california_housing_train.csv', chunksize=1000)
# Stitch the chunks back into one frame with a continuous 0..n-1 index. For
# data that does not fit in memory, process or aggregate each chunk instead of
# concatenating them all.
df = pd.concat(chunk, ignore_index=True)

### Tips for the real world: work with samples

In [None]:
# It's easier to test with samples and later apply the changes to the full datasets
# You can develop dataset changes without testing your features on every full dataset

In [None]:
# Sample load: only the first 300 rows of the three selected columns.
house = pd.read_csv(
    'sample_data/california_housing_train.csv',
    nrows=300,
    usecols=['housing_median_age', 'total_rooms', 'total_bedrooms'],
)

In [None]:
# Inspect the dtypes and memory usage of the 300-row sample.
house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   housing_median_age  300 non-null    float64
 1   total_rooms         300 non-null    float64
 2   total_bedrooms      300 non-null    float64
dtypes: float64(3)
memory usage: 7.2 KB


### Otimize colunas automaticamente


In [None]:
# a biblioteca dtype_diet sugere, para cada coluna a ser otimizada, o melhor tipo que ela encontrar


In [None]:
pip install dtype_diet


Collecting dtype_diet
  Downloading dtype_diet-0.0.2-py3-none-any.whl (7.6 kB)
Installing collected packages: dtype_diet
Successfully installed dtype_diet-0.0.2


In [None]:
# Recommendations: per column, the current dtype, the proposed dtype and the memory saved
from dtype_diet import report_on_dataframe
report_on_dataframe(house)

Unnamed: 0_level_0,Current dtype,Proposed dtype,Current Memory (MB),Proposed Memory (MB),Ram Usage Improvement (MB),Ram Usage Improvement (%)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
housing_median_age,float64,float16,1.234375,0.355469,0.878906,71.202532
total_rooms,float64,float32,1.234375,0.648438,0.585938,47.468354
total_bedrooms,float64,float32,1.234375,0.648438,0.585938,47.468354


In [None]:
# After the report, apply the proposed dtypes by overwriting each existing column.
# The assignment key must match the source column name exactly: a misspelled key
# (e.g. 'house_median_age') adds a new column and leaves the float64 original
# untouched, which increases the column count instead of shrinking the frame.
house['housing_median_age'] = house['housing_median_age'].astype('float16')
house['total_rooms'] = house['total_rooms'].astype('float32')

In [None]:
# Check the dtypes and memory usage after applying the proposed casts.
house.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   housing_median_age  300 non-null    float64
 1   total_rooms         300 non-null    float32
 2   total_bedrooms      300 non-null    float64
 3   house_median_age    300 non-null    float16
dtypes: float16(1), float32(1), float64(2)
memory usage: 6.6 KB


In [None]:
## oh como melhorou esse treino

### Pandas Modin

O Modin é uma biblioteca separada, com a mesma API do pandas, que torna possível utilizar todos os núcleos do processador para importar e processar dados

In [None]:
!pip install modin

