In [15]:
import pandas as pd
# Notebook-wide display settings: two-decimal floats with thousands
# separators, an 85-character output width, and at most 8 columns shown.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.options.display.width = 85
pd.options.display.max_columns = 8

In [24]:
# Load the land-temperature sample CSV. The file's own header row is skipped
# (skiprows=1) and replaced by the column names given here.
# NOTE: the 'conuntryid' spelling is kept as-is because the outputs recorded
# further down the notebook use that column name.
landtemps = pd.read_csv(
    # Raw string: the backslashes of the Windows path stay literal instead of
    # being read as (invalid) escape sequences.
    r'C:\dataClean\Python-Data-Cleaning-Cookbook\Chapter01\data\landtempssample.csv',
    sep=',',
    names=['stationid', 'year', 'month', 'avgtemp', 'latitude', 'longitude',
           'elevation', 'station', 'conuntryid', 'country'],
    skiprows=1,
    low_memory=False)

# Combine year and month into one datetime column explicitly; passing a nested
# list to parse_dates is deprecated in recent pandas releases. day=1 pins each
# measurement to the first day of its month.
landtemps.insert(
    0, 'month_year',
    pd.to_datetime(landtemps[['year', 'month']].assign(day=1)))
# The separate year/month columns are no longer needed once combined.
landtemps = landtemps.drop(columns=['year', 'month'])
type(landtemps)

pandas.core.frame.DataFrame

In [25]:
# Preview the first seven rows.
# The original note here says Jupyter applies its own truncation rules that
# override the max_columns value set through pd.set_option.
# NOTE(review): the output recorded below shows eight columns plus an
# ellipsis, which matches max_columns=8 -- confirm whether the note holds.
landtemps.head(n=7)

Unnamed: 0,month_year,stationid,avgtemp,latitude,...,elevation,station,conuntryid,country
0,2000-04-01,USS0010K01S,5.27,39.9,...,2773.7,INDIAN_CANYON,US,United States
1,1940-05-01,CI000085406,18.04,-18.35,...,58.0,ARICA,CI,Chile
2,2013-12-01,USC00036376,6.22,34.37,...,61.0,SAINT_CHARLES,US,United States
3,1963-02-01,ASN00024002,22.93,-34.28,...,65.5,BERRI_IRRIGATION,AS,Australia
4,2001-11-01,ASN00028007,,-14.78,...,79.4,MUSGRAVE,AS,Australia
5,1991-04-01,USW00024151,5.59,42.15,...,1362.5,MALAD_CITY,US,United States
6,1993-12-01,RSM00022641,-10.17,63.9,...,13.0,ONEGA,RS,Russia


In [16]:
# Temporarily override the display setting with pandas' context manager
# pd.option_context. display() is called explicitly so the frame is rendered
# while the override is still active.
with pd.option_context('display.max_columns', 8):
    display(landtemps.head(n=7))

Unnamed: 0,measuredate,stationid,avgtemp,latitude,...,elevation,station,conuntryid,country
0,2000-04-01,USS0010K01S,5.27,39.9,...,2773.7,INDIAN_CANYON,US,United States
1,1940-05-01,CI000085406,18.04,-18.35,...,58.0,ARICA,CI,Chile
2,2013-12-01,USC00036376,6.22,34.37,...,61.0,SAINT_CHARLES,US,United States
3,1963-02-01,ASN00024002,22.93,-34.28,...,65.5,BERRI_IRRIGATION,AS,Australia
5,1991-04-01,USW00024151,5.59,42.15,...,1362.5,MALAD_CITY,US,United States
6,1993-12-01,RSM00022641,-10.17,63.9,...,13.0,ONEGA,RS,Russia
7,1943-01-01,USC00470307,-10.43,43.33,...,317.0,ARLINGTON,US,United States


最左边那一列是索引列

In [5]:
# List the dtype pandas assigned to each column.
landtemps.dtypes

month_year    datetime64[ns]
stationid             object
avgtemp              float64
latitude             float64
longitude            float64
elevation            float64
station               object
conuntryid            object
country               object
dtype: object

object 类型：pandas 通常用它存放字符串，也可能表示该列包含混合类型的值。

In [6]:
# Show the frame's dimensions as (rows, columns).
landtemps.shape

(100000, 9)

In [8]:
# Give the combined date column a descriptive name. The result is assigned
# back instead of using inplace=True, which keeps the step chainable and
# avoids mutating the frame object behind any other reference to it.
landtemps = landtemps.rename(columns={'month_year': 'measuredate'})
landtemps.head(7)

Unnamed: 0,measuredate,stationid,avgtemp,latitude,longitude,elevation,station,conuntryid,country
0,2000-04-01,USS0010K01S,5.27,39.9,-110.75,2773.7,INDIAN_CANYON,US,United States
1,1940-05-01,CI000085406,18.04,-18.35,-70.333,58.0,ARICA,CI,Chile
2,2013-12-01,USC00036376,6.22,34.3703,-91.1242,61.0,SAINT_CHARLES,US,United States
3,1963-02-01,ASN00024002,22.93,-34.2833,140.6,65.5,BERRI_IRRIGATION,AS,Australia
4,2001-11-01,ASN00028007,,-14.7803,143.5036,79.4,MUSGRAVE,AS,Australia
5,1991-04-01,USW00024151,5.59,42.1492,-112.2872,1362.5,MALAD_CITY,US,United States
6,1993-12-01,RSM00022641,-10.17,63.9,38.1167,13.0,ONEGA,RS,Russia


In [23]:
# Summary statistics for the avgtemp column.
landtemps['avgtemp'].describe()

count   85,554.00
mean        10.92
std         11.52
min        -70.70
25%          3.46
50%         12.22
75%         19.57
max         39.95
Name: avgtemp, dtype: float64

count 不统计 NaN 行；NaN（Not a Number）表示缺失值。

In [20]:
# Count missing values per column: isnull() flags each cell, and sum() then
# totals the flags column by column.
landtemps.isnull().sum()

month_year        0
stationid         0
avgtemp       14446
latitude          0
longitude         0
elevation         0
station           0
conuntryid        0
country           5
dtype: int64

方法链示例：isnull() 返回一个由 True 和 False 组成的 DataFrame，sum() 再对每一列求和；求和时 True 按 1 计算，False 按 0 计算。

In [21]:
# Drop the rows that have no avgtemp value; missing values in other columns
# do not cause a row to be dropped (subset=['avgtemp']). The result is
# assigned back instead of using inplace=True, which keeps the step chainable.
landtemps = landtemps.dropna(subset=['avgtemp'])
landtemps.shape

(85554, 9)