## Pandas的数据转换函数map、apply、applymap

#### 区别:
1. map：只适用于Series，实现每一个值->值得映射
2. apply：只适用于Series实现每个值的处理，用于DataFrame实现某个轴的Serise的处理
3. applymap：只能用于DataFrame，用于处理该DataFrame的每一个元素

In [2]:
import pandas as pd
data = pd.read_csv(r'C:\Users\86158\scikit_learn\pollution.csv')
data.head(5)

Unnamed: 0,date,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
0,2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0
1,2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0
2,2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0
3,2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0
4,2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0


In [3]:
data['wnd_dir'].unique()

array(['SE', 'cv', 'NW', 'NE'], dtype=object)

In [5]:
wnd_dict = {'se':"西南",'cv':"西北",'nw':"东北",'ne':"西北"}

### map用于Series

In [6]:
# 方法一：
data["风向"] = data['wnd_dir'].str.lower().map(wnd_dict) 
data.head()

Unnamed: 0,date,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain,风向
0,2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0,西南
1,2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0,西南
2,2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0,西南
3,2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0,西南
4,2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0,西南


In [7]:
# 方法二：
data['方向1'] = data['wnd_dir'].apply(lambda x:wnd_dict[x.lower()])
data.head()

Unnamed: 0,date,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain,风向,方向1
0,2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0,西南,西南
1,2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0,西南,西南
2,2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0,西南,西南
3,2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0,西南,西南
4,2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0,西南,西南


### apply用于Series和DataFrame的转化

1. Series.appley(function)，函数参数是每一个值
2. DataFrame.apply(function),函数参数是Series

In [8]:
data['风向3'] = data['wnd_dir'].apply(lambda x:wnd_dict[x.lower()])
data.head()

Unnamed: 0,date,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain,风向,方向1,风向3
0,2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0,西南,西南,西南
1,2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0,西南,西南,西南
2,2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0,西南,西南,西南
3,2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0,西南,西南,西南
4,2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0,西南,西南,西南


In [9]:
data['风向4'] = data.apply(lambda x:wnd_dict[x['wnd_dir'].lower()],axis=1)
data.head(5)

Unnamed: 0,date,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain,风向,方向1,风向3,风向4
0,2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0,西南,西南,西南,西南
1,2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0,西南,西南,西南,西南
2,2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0,西南,西南,西南,西南
3,2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0,西南,西南,西南,西南
4,2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0,西南,西南,西南,西南


In [20]:
sub_data = data[['pollution','dew','wnd_spd','press']]
sub_data.head(5)

Unnamed: 0,pollution,dew,wnd_spd,press
0,129.0,-16,1.79,1020.0
1,148.0,-15,2.68,1020.0
2,159.0,-11,3.57,1021.0
3,181.0,-7,5.36,1022.0
4,138.0,-7,6.25,1022.0


In [21]:
sub_data.dtypes

pollution    float64
dew            int64
wnd_spd      float64
press        float64
dtype: object

In [22]:
# 将浮点数装化成整数
sub_data.applymap(lambda x:int(x))
sub_data.head()

Unnamed: 0,pollution,dew,wnd_spd,press
0,129.0,-16,1.79,1020.0
1,148.0,-15,2.68,1020.0
2,159.0,-11,3.57,1021.0
3,181.0,-7,5.36,1022.0
4,138.0,-7,6.25,1022.0


In [23]:
sub_data.dtypes

pollution    float64
dew            int64
wnd_spd      float64
press        float64
dtype: object

## groupby分组应用apply

![avatar](scikit_learn/groupby_apply.png)

#### GroupBy.apply(function)
1. function的第一个参数是dataframe
2. function的返回结果，可以是dataframe、series、单个值，甚至于输入dataframe完全没关系

In [27]:
# 按分组的归一化
import pandas as pd
data = data[['date','pollution','dew','wnd_dir','snow','rain','press']]
data['month'] = data['date'].str[:7]
data.head()

Unnamed: 0,date,pollution,dew,wnd_dir,snow,rain,press,month
0,2010-01-02 00:00:00,129.0,-16,SE,0,0,1020.0,2010-01
1,2010-01-02 01:00:00,148.0,-15,SE,0,0,1020.0,2010-01
2,2010-01-02 02:00:00,159.0,-11,SE,0,0,1021.0,2010-01
3,2010-01-02 03:00:00,181.0,-7,SE,1,0,1022.0,2010-01
4,2010-01-02 04:00:00,138.0,-7,SE,2,0,1022.0,2010-01


In [33]:
# 按照month进行分组，然后对pollution列归一化
def pollution_norm(df):
    min_value = df['pollution'].min()
    max_value = df['pollution'].max()
    df['pollution_norm'] = df['pollution'].apply(
        lambda x:(x-min_value)/(max_value-min_value))
    return df

pollution_norm_data = data.groupby('month').apply(pollution_norm)
pollution_norm_data

Unnamed: 0,date,pollution,dew,wnd_dir,snow,rain,press,month,pollution_norm
0,2010-01-02 00:00:00,129.0,-16,SE,0,0,1020.0,2010-01,0.265979
1,2010-01-02 01:00:00,148.0,-15,SE,0,0,1020.0,2010-01,0.305155
2,2010-01-02 02:00:00,159.0,-11,SE,0,0,1021.0,2010-01,0.327835
3,2010-01-02 03:00:00,181.0,-7,SE,1,0,1022.0,2010-01,0.373196
4,2010-01-02 04:00:00,138.0,-7,SE,2,0,1022.0,2010-01,0.284536
...,...,...,...,...,...,...,...,...,...
43795,2014-12-31 19:00:00,8.0,-23,NW,0,0,1034.0,2014-12,0.018018
43796,2014-12-31 20:00:00,10.0,-22,NW,0,0,1034.0,2014-12,0.022523
43797,2014-12-31 21:00:00,10.0,-22,NW,0,0,1034.0,2014-12,0.022523
43798,2014-12-31 22:00:00,8.0,-22,NW,0,0,1034.0,2014-12,0.018018


In [37]:
# 取每个分组的TOPN数据:每个月的pollution最大的时间
def get_pollution_TopN(df,topn):
    return df.sort_values(by="pollution")[['month','wnd_dir']]

data.groupby('month').apply(get_pollution_TopN,topn=2)

Unnamed: 0_level_0,Unnamed: 1_level_0,month,wnd_dir
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01,549,2010-01,NW
2010-01,567,2010-01,NW
2010-01,566,2010-01,NW
2010-01,565,2010-01,NW
2010-01,564,2010-01,NW
...,...,...,...
2014-12,43262,2014-12,SE
2014-12,43706,2014-12,NW
2014-12,43263,2014-12,cv
2014-12,43705,2014-12,NE


In [None]:
# 使用apply函数添加多列
def fun(row):
    new_col1,new_col2 = row['a'] + row['b'],row['a']-row['b']
    return new_col1,new_col2
df[['new_col1','new_col2']] = df.apply(func,axis=1,result_type="expand")  # 将func返回元组拆成的两列