In [1]:
# Save a DataFrame to Feather, pickle, CSV, and Excel files
import os

import pandas as pd
import pyarrow

In [2]:
# Load the land-temperature sample, replacing the file's own header row with the names below.
# The path is a raw string so the Windows backslashes are never read as escape sequences.
landtemps_raw = pd.read_csv(
    r'C:\dataClean\Python-Data-Cleaning-Cookbook\Chapter01\data\landtempssample.csv',
    names=['stationid','year','month','avgtemp','latitude','longitude','elevation','station','countryid','country'],
    skiprows=1,
    low_memory=False,
)

# Build the measurement date (first day of the month) from the year and month columns.
# Combining columns through read_csv's nested parse_dates list is deprecated since pandas 2.2.
measuredate = pd.to_datetime(landtemps_raw[['year', 'month']].assign(day=1))

# Replace month/year with the combined date as the first column, then drop rows without a reading.
landtemps = landtemps_raw.drop(columns=['month', 'year'])
landtemps.insert(0, 'measuredate', measuredate)
landtemps = landtemps.dropna(subset=['avgtemp'])

In [7]:
# Column dtypes of the working frame.
# NOTE(review): this cell's execution count (7) is higher than that of the set_index cell (4),
# and the output lists neither measuredate nor stationid, so it was evidently run after indexing.
landtemps.dtypes

avgtemp      float64
latitude     float64
longitude    float64
elevation    float64
station       object
countryid     object
country       object
dtype: object

In [8]:
# Frame dimensions as (rows, columns).
# NOTE(review): executed as In [8], i.e. after the set_index cell (In [4]); the 7 columns
# reported in the output exclude the two index levels.
landtemps.shape

(85554, 7)

In [4]:
# Promote the measurement date and the station id to a two-level row index.
index_levels = ['measuredate', 'stationid']
landtemps = landtemps.set_index(index_levels)

In [5]:
# Keep only the most extreme readings: avgtemp strictly below its 0.1th percentile
# or strictly above its 99.9th percentile.
avgtemp = landtemps['avgtemp']
lower_cutoff = avgtemp.quantile(.001)
upper_cutoff = avgtemp.quantile(.999)
is_extreme = avgtemp.lt(lower_cutoff) | avgtemp.gt(upper_cutoff)
extremevals = landtemps[is_extreme]
extremevals.shape

(171, 7)

In [9]:
# Show 7 randomly chosen extreme rows; no random_state is passed, so the rows can differ between runs.
extremevals.sample(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,avgtemp,latitude,longitude,elevation,station,countryid,country
measuredate,stationid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1998-01-01,RSM00024606,-39.12,63.58,103.97,209.0,KISLOKAN,RS,Russia
1943-12-01,RSM00024266,-48.18,67.5667,133.4,136.0,VERHOJANSK,RS,Russia
2003-04-01,CD000004700,34.86,12.13,15.03,295.0,NDJAMENA,CD,Chad
2018-09-01,AYM00089606,-63.35,-78.45,106.867,3488.0,VOSTOK,AY,Antarctica
1994-07-01,QAM00041170,35.58,25.261,51.565,10.7,DOHA_INTL,QA,Qatar
1960-12-01,RSM00024641,-37.42,63.7831,121.6167,110.0,VILJUJSK,RS,Russia
1963-08-01,SUM00062660,34.98,18.55,31.85,249.0,KARIMA,SU,Sudan


In [15]:
# Saving: the extreme values go to Excel and CSV, the full indexed frame goes to pickle.
# Create the output folder first so the writers do not fail when 'views' does not exist yet.
os.makedirs('views', exist_ok=True)
extremevals.to_excel('views/extremevals.xlsx')
extremevals.to_csv('views/extremevals.csv')
landtemps.to_pickle('views/landtemps.pkl')

In [16]:
# Saving to Feather requires the index to be reset first, so write a copy whose index
# levels have been moved back into ordinary columns.
# Using the returned copy (instead of inplace=True) leaves landtemps indexed, so this cell
# can be re-run without adding a spurious 'index' column to the saved file.
landtemps.reset_index().to_feather('views/landtemps.feather')

In [18]:
# Reloading: read the pickled frame back in.
# read_pickle should only be used on trusted files; this one is written by the saving cell in this notebook.
landp = pd.read_pickle('views/landtemps.pkl')
landp.head(2)                                    # the earlier index is preserved

Unnamed: 0_level_0,Unnamed: 1_level_0,avgtemp,latitude,longitude,elevation,station,countryid,country
measuredate,stationid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-04-01,USS0010K01S,5.27,39.9,-110.75,2773.7,INDIAN_CANYON,US,United States
1940-05-01,CI000085406,18.04,-18.35,-70.333,58.0,ARICA,CI,Chile


In [19]:
# Transpose the first two rows so that each column is displayed as a row.
landp.head(2).T

measuredate,2000-04-01,1940-05-01
stationid,USS0010K01S,CI000085406
avgtemp,5.27,18.040
latitude,39.90,-18.350
longitude,-110.75,-70.333
elevation,2773.70,58.000
station,INDIAN_CANYON,ARICA
countryid,US,CI
country,United States,Chile


In [20]:
# Reload the Feather copy. As the transposed output shows, measuredate and stationid come back
# as ordinary columns and the rows are labelled 0, 1, ... rather than by the two-level index.
landf = pd.read_feather('views/landtemps.feather')
landf.head(2).T

Unnamed: 0,0,1
measuredate,2000-04-01 00:00:00,1940-05-01 00:00:00
stationid,USS0010K01S,CI000085406
avgtemp,5.27,18.040
latitude,39.90,-18.350
longitude,-110.75,-70.333
elevation,2773.70,58.000
station,INDIAN_CANYON,ARICA
countryid,US,CI
country,United States,Chile


选择哪种文件类型来保存和共享数据，要视实际需求而定：CSV 和 Excel 通用、便于他人直接打开查看；pickle 会保留索引（见上文重新加载的结果），但只能在 Python 中读取；feather 需要先重置索引才能保存。