## Pyarrow
Es una librería que implementa funciones analíticas en Spark pero con la ventaja de su escalabilidad. Siempre tiene la posibilidad de pasar la información a una estructura de pandas.


In [1]:
import os
from pathlib import Path

import pyarrow as pa
from pyarrow import csv

# Folder that holds the course data files. The default is the original
# author's local directory; set the VGSALES_DATA_DIR environment variable to
# run the notebook on another machine without editing the code.
DATA_DIR = Path(os.environ.get("VGSALES_DATA_DIR", "D:/DataAnalysis_EBAC/ebac/Python/Modulo19"))

# Read the CSV into a pyarrow Table (no explicit schema is passed, so the
# reader infers the column types shown in the output).
tab_vgsales = csv.read_csv(DATA_DIR / "vgsales.csv")
tab_vgsales

pyarrow.Table
Rank: int64
Name: string
Platform: string
Year: int64
Genre: string
Publisher: string
NA_Sales: double
EU_Sales: double
JP_Sales: double
Other_Sales: double
Global_Sales: double
----
Rank: [[1,2,3,4,5,...,12661,12662,12663,12664,12665],[12666,12667,12668,12669,12670,...,16596,16597,16598,16599,16600]]
Name: [["Wii Sports","Super Mario Bros.","Mario Kart Wii","Wii Sports Resort","Pokemon Red/Pokemon Blue",...,"Hajime no Ippo Portable: Victorious Spirits","Nanostray","CMT Presents: Karaoke Revolution Country","Katekyoo Hitman Reborn! DS: Flame Rumble Mukuro Kyoushuu","Cid to Chocobo no Fushigi na Dungeon: Toki Wasure no Meikyuu DS+"],["Vitamin X to Z","Fatal Frame","Flip's Twisted World","Tom Clancy's Ghost Recon: Advanced Warfighter","Urban Chaos: Riot Response",...,"Woody Woodpecker in Crazy Castle 5","Men in Black II: Alien Escape","SCORE International Baja 1000: The Official Game","Know How 2","Spirits & Spells"]]
Platform: [["Wii","NES","Wii","Wii","GB",...,"PSP","DS",

In [2]:
# Number of rows in the table
tab_vgsales.num_rows

16598

In [3]:
# List the column names, taken from the table's schema
tab_vgsales.schema.names

['Rank',
 'Name',
 'Platform',
 'Year',
 'Genre',
 'Publisher',
 'NA_Sales',
 'EU_Sales',
 'JP_Sales',
 'Other_Sales',
 'Global_Sales']

In [4]:
# Show the table schema: each column name together with its Arrow data type
tab_vgsales.schema

Rank: int64
Name: string
Platform: string
Year: int64
Genre: string
Publisher: string
NA_Sales: double
EU_Sales: double
JP_Sales: double
Other_Sales: double
Global_Sales: double

In [9]:
import pandas as pd

# Convert the Arrow table into a pandas DataFrame.
# NOTE(review): the rendered output shows Year as float (e.g. 2006.0) although
# the Arrow schema reports int64 — presumably the column contains nulls; confirm.
df = tab_vgsales.to_pandas()
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [11]:
# Sum of NA_Sales for every Genre; the result column is named NA_Sales_sum
tab_genre = (
    tab_vgsales
    .group_by(keys='Genre')
    .aggregate(aggregations=[('NA_Sales', 'sum')])
)
tab_genre

pyarrow.Table
Genre: string
NA_Sales_sum: double
----
Genre: [["Adventure","Action","Platform","Shooter","Role-Playing",...,"Fighting","Puzzle","Misc","Racing","Strategy"]]
NA_Sales_sum: [[105.79999999999981,877.8299999999975,447.0500000000001,582.5999999999983,327.2799999999996,...,223.59000000000017,123.77999999999987,410.2400000000001,359.4199999999999,68.70000000000009]]

In [14]:
# Add a constant string column named 'Test' with append_column, then convert
# the resulting table to a pandas DataFrame
tab2 = tab_vgsales.append_column(
    'Test',
    pa.array(['0'] * tab_vgsales.num_rows, type=pa.string()),
).to_pandas()
tab2

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Test
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74,0
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,0
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82,0
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00,0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37,0
...,...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01,0
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01,0
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01,0
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01,0


## Parquet
Parquet es un formato de guardado de información "columnar", esto quiere decir que en vez de guardarlo por filas, replicando la realidad del archivo, este formato lo hace transponiendo las columnas a las filas. Los archivos son más pequeños, más rápidos (faster scan), más baratos de guardar en la nube. Para visualizar un archivo parquet se usan las "parquet-tools" que vienen en el terminal de Mac. Si se quiere abrir el archivo con un editor de texto, no se puede ya que viene comprimido.

In [16]:
import pyarrow.parquet as pq
import pandas as pd
import pyarrow as pa

# Small example frame with an integer, a string and a boolean column,
# indexed by the labels 'a', 'b', 'c'
df = pd.DataFrame(
    data={
        'lin1': [-20, 100, 200],
        'lin2': ['este', 'es un', 'ejemplo'],
        'l3': [False, True, True],
    },
    index=['a', 'b', 'c'],
)

df

Unnamed: 0,lin1,lin2,l3
a,-20,este,False
b,100,es un,True
c,200,ejemplo,True


In [18]:
# Convert the pandas DataFrame to an Arrow table and write it to a Parquet file
tab_example = pa.Table.from_pandas(df)
pq.write_table(table=tab_example, where='./parquet-example.parquet')

In [21]:
# Read the Parquet file back into an Arrow table and show it as a pandas DataFrame
tab2 = pq.read_table(source='parquet-example.parquet')
tab2.to_pandas()

Unnamed: 0,lin1,lin2,l3
a,-20,este,False
b,100,es un,True
c,200,ejemplo,True
