In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tab-separated gapminder file into a DataFrame; the bare `df` on the
# last line makes the notebook render the frame as the cell output.
df = pd.read_csv('./data/gapminder.tsv', sep='\t')
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


In [3]:
# Split the rows by `year` and average `lifeExp` within each group; the result
# is a Series indexed by year.
avg_life_exp_by_year = df.groupby('year')['lifeExp'].mean()

In [4]:
# Show the per-year mean life expectancy.
avg_life_exp_by_year

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [5]:
# Distinct values of the `year` column, returned as a NumPy array.
years = df.year.unique()

In [6]:
# Show the distinct years.
years

array([1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002,
       2007], dtype=int64)

In [7]:
# Hand-written equivalent of groupby('year').lifeExp.mean(): for every year,
# keep only that year's rows and average their lifeExp, collecting the
# (year, mean) pairs in a list.
year_means = [(y, df[df.year == y].lifeExp.mean()) for y in years]
year_means

[(1952, 49.05761971830987),
 (1957, 51.507401126760534),
 (1962, 53.60924901408449),
 (1967, 55.67828957746479),
 (1972, 57.647386478873244),
 (1977, 59.57015746478873),
 (1982, 61.53319718309858),
 (1987, 63.21261267605636),
 (1992, 64.16033802816901),
 (1997, 65.01467605633805),
 (2002, 65.69492253521126),
 (2007, 67.00742253521126)]

In [8]:
# Turn the (year, mean) pairs into a two-column frame.
# The value column is named 'lifeExp' rather than the empty string: an
# empty-string label renders as a blank header and cannot be reached through
# attribute access. 'lifeExp' matches the name of the groupby result.
df2 = pd.DataFrame({
    'year': [y for y, m in year_means],
    'lifeExp': [m for y, m in year_means]
})
df2

Unnamed: 0,year,Unnamed: 2
0,1952,49.05762
1,1957,51.507401
2,1962,53.609249
3,1967,55.67829
4,1972,57.647386
5,1977,59.570157
6,1982,61.533197
7,1987,63.212613
8,1992,64.160338
9,1997,65.014676


In [9]:
def my_mean(values):
    """Arithmetic mean computed by hand, usable as a custom ``agg`` function.

    Parameters
    ----------
    values : sized iterable of numbers
        Must support both ``len()`` and iteration (for example a list or a
        pandas Series).

    Returns
    -------
    The running total of the elements divided by ``len(values)``.

    Raises
    ------
    ZeroDivisionError
        If ``values`` is empty.
    """
    count = len(values)
    # Local is called `total` so the builtin `sum` is not shadowed.
    total = 0
    for item in values:
        total += item
    return total / count

In [10]:
# agg calls my_mean once per year group and collects one value per group.
df.groupby('year').lifeExp.agg(my_mean)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [11]:
def my_mean_diff(values, diff_value):
    """Return the mean of ``values`` minus ``diff_value``.

    Parameters
    ----------
    values : sized iterable of numbers
        Must support both ``len()`` and iteration.
    diff_value : number
        Reference value subtracted from the mean.

    Returns
    -------
    ``sum(values) / len(values) - diff_value``.

    Raises
    ------
    ZeroDivisionError
        If ``values`` is empty.
    """
    count = len(values)
    # Local is called `total` so the builtin `sum` is not shadowed.
    total = 0
    for item in values:
        total += item
    average = total / count
    return average - diff_value

In [12]:
# Mean life expectancy over every row, ignoring the year grouping.
global_mean = df['lifeExp'].mean()
global_mean

59.47443936619714

In [13]:
# Keyword arguments given to agg are forwarded to the aggregation function,
# so each group is evaluated as my_mean_diff(group, diff_value=global_mean).
df.groupby('year').lifeExp.agg(my_mean_diff, diff_value=global_mean)

year
1952   -10.416820
1957    -7.967038
1962    -5.865190
1967    -3.796150
1972    -1.827053
1977     0.095718
1982     2.058758
1987     3.738173
1992     4.685899
1997     5.540237
2002     6.220483
2007     7.532983
Name: lifeExp, dtype: float64

In [14]:
# A list of aggregations produces one output column per entry.
# 'mean' and 'std' are passed as strings because handing np.mean / np.std to
# groupby agg is deprecated (pandas >= 2.1 emits a FutureWarning). pandas
# currently substitutes its own mean/std (sample std, ddof=1), but a later
# release will call the NumPy function directly, and np.std defaults to
# ddof=0. The string names keep the numbers shown here stable.
# np.count_nonzero stays a callable so the output column keeps its name.
df.groupby('year').lifeExp.agg(
    [np.count_nonzero, 'mean', 'std'])

Unnamed: 0_level_0,count_nonzero,mean,std
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,142,49.05762,12.225956
1957,142,51.507401,12.231286
1962,142,53.609249,12.097245
1967,142,55.67829,11.718858
1972,142,57.647386,11.381953
1977,142,59.570157,11.227229
1982,142,61.533197,10.770618
1987,142,63.212613,10.556285
1992,142,64.160338,11.22738
1997,142,65.014676,11.559439


In [15]:
# A dict maps each column to its own aggregation: mean of lifeExp, standard
# deviation of pop and mean of gdpPercap, all per year.
df.groupby('year').agg(
    {'lifeExp':'mean', 'pop':'std', 'gdpPercap':'mean'})

Unnamed: 0_level_0,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,49.05762,58100860.0,3725.276046
1957,51.507401,65504290.0,4299.408345
1962,53.609249,69788650.0,4725.812342
1967,55.67829,78375480.0,5483.653047
1972,57.647386,88646820.0,6770.082815
1977,59.570157,97481090.0,7313.166421
1982,61.533197,105098600.0,7518.901673
1987,63.212613,114756200.0,7900.920218
1992,64.160338,124502600.0,8158.608521
1997,65.014676,133417400.0,9090.175363


In [16]:
def my_zscore(x):
    """Standardise ``x``: subtract its mean and divide by its standard deviation.

    Parameters
    ----------
    x : object exposing ``mean()`` and ``std()`` and supporting elementwise
        ``-`` and ``/`` (for example a pandas Series).

    Returns
    -------
    ``(x - x.mean()) / x.std()``, with the same length as ``x``.
    """
    center = x.mean()
    spread = x.std()
    return (x - center) / spread

In [17]:
# transform returns one value per input row (not one per group), so the
# per-year z-scores line up with df's index and can be stored as a column.
# NOTE: this adds the column to `df` in place.
df['zscore'] = df.groupby('year').lifeExp.transform(my_zscore)

In [18]:
# Show the frame, now including the `zscore` column.
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,zscore
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,-1.656854
1,Afghanistan,Asia,1957,30.332,9240934,820.853030,-1.731249
2,Afghanistan,Asia,1962,31.997,10267083,853.100710,-1.786543
3,Afghanistan,Asia,1967,34.020,11537966,836.197138,-1.848157
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,-1.894173
...,...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306,-0.081621
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786,-0.336974
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960,-1.574962
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623,-2.093346


In [19]:
# agg                                        # transform
# groups by year and returns one value each  # returns a value for every element of the column

In [20]:
import seaborn as sns
import numpy as np

In [21]:
# Seed NumPy's global random state before sampling, then draw 10 rows from
# seaborn's `tips` example data set.
np.random.seed(1234)
tips_10 = sns.load_dataset('tips').sample(10)
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
187,30.46,2.0,Male,Yes,Sun,Dinner,5
179,34.63,3.55,Male,Yes,Sun,Dinner,2
31,18.35,2.5,Male,No,Sat,Dinner,4
52,34.81,5.2,Female,No,Sun,Dinner,4
71,17.07,3.0,Female,No,Sat,Dinner,3
6,8.77,2.0,Male,No,Sun,Dinner,2
95,40.17,4.73,Male,Yes,Fri,Dinner,4
131,20.27,2.83,Female,No,Thur,Lunch,2
157,25.0,3.75,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4


In [22]:
# Shuffle the index labels, take the first four and blank out `total_bill`
# for those rows to simulate missing data.
# np.nan is used because the `np.NaN` alias was removed in NumPy 2.0.
tips_10.loc[np.random.permutation(tips_10.index)[:4], 'total_bill'] = np.nan

In [23]:
# Show the sample with the missing `total_bill` values.
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
187,30.46,2.0,Male,Yes,Sun,Dinner,5
179,34.63,3.55,Male,Yes,Sun,Dinner,2
31,,2.5,Male,No,Sat,Dinner,4
52,,5.2,Female,No,Sun,Dinner,4
71,,3.0,Female,No,Sat,Dinner,3
6,8.77,2.0,Male,No,Sun,Dinner,2
95,40.17,4.73,Male,Yes,Fri,Dinner,4
131,20.27,2.83,Female,No,Thur,Lunch,2
157,25.0,3.75,Female,No,Sun,Dinner,4
5,,4.71,Male,No,Sun,Dinner,4


In [24]:
# Blank out `total_bill` for four explicitly listed index labels, so the
# missing rows do not depend on the random permutation.
# np.nan is used because the `np.NaN` alias was removed in NumPy 2.0.
tips_10.loc[[31, 52, 71, 5], 'total_bill'] = np.nan
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
187,30.46,2.0,Male,Yes,Sun,Dinner,5
179,34.63,3.55,Male,Yes,Sun,Dinner,2
31,,2.5,Male,No,Sat,Dinner,4
52,,5.2,Female,No,Sun,Dinner,4
71,,3.0,Female,No,Sat,Dinner,3
6,8.77,2.0,Male,No,Sun,Dinner,2
95,40.17,4.73,Male,Yes,Fri,Dinner,4
131,20.27,2.83,Female,No,Thur,Lunch,2
157,25.0,3.75,Female,No,Sun,Dinner,4
5,,4.71,Male,No,Sun,Dinner,4


In [25]:
def fill_na_mean(x):
    """Replace the missing entries of ``x`` with the mean of ``x``.

    Parameters
    ----------
    x : object exposing ``mean()`` and ``fillna()`` (for example a pandas
        Series).

    Returns
    -------
    The result of ``x.fillna(x.mean())``.
    """
    return x.fillna(x.mean())


In [26]:
# Fill each missing total_bill with the mean of its own (sex, time) group.
# transform keeps the original index, so the result can be stored as a column.
# observed=True is passed explicitly: with categorical keys the implicit
# default (observed=False) is deprecated and emits a FutureWarning in
# pandas >= 2.1. Category combinations with no rows contribute nothing to a
# transform, so the values are the same either way.
tips_10['fill_total_bill'] = (
    tips_10.groupby(['sex', 'time'], observed=True)
    .total_bill.transform(fill_na_mean)
)

In [27]:
# Show the sample with the new `fill_total_bill` column next to `total_bill`.
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,fill_total_bill
187,30.46,2.0,Male,Yes,Sun,Dinner,5,30.46
179,34.63,3.55,Male,Yes,Sun,Dinner,2,34.63
31,,2.5,Male,No,Sat,Dinner,4,28.5075
52,,5.2,Female,No,Sun,Dinner,4,25.0
71,,3.0,Female,No,Sat,Dinner,3,25.0
6,8.77,2.0,Male,No,Sun,Dinner,2,8.77
95,40.17,4.73,Male,Yes,Fri,Dinner,4,40.17
131,20.27,2.83,Female,No,Thur,Lunch,2,20.27
157,25.0,3.75,Female,No,Sun,Dinner,4,25.0
5,,4.71,Male,No,Sun,Dinner,4,28.5075


In [28]:
# Small toy frame used to compare apply / agg / transform.
# NOTE: this rebinds `df`, so the gapminder frame loaded earlier is no longer
# reachable under that name.
df = pd.DataFrame(
    {
        'a': [10, 20, 30],
        'b': [20, 30, 40],
    }
)
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [29]:
def plus(x):
    """Return ``x + 2``.

    ``x`` can be anything that supports ``+ 2``: a scalar, or a pandas
    Series / DataFrame (in which case the addition is elementwise).
    """
    offset = 2
    return x + offset

In [30]:
# apply hands each column to `plus`; because `plus` returns a value per
# element, the result has the same shape as `df`.
df.apply(plus)

Unnamed: 0,a,b
0,12,22
1,22,32
2,32,42


In [31]:
# agg also accepts a function that does not reduce; here it gives the same
# frame as apply.
df.agg(plus)

Unnamed: 0,a,b
0,12,22
1,22,32
2,32,42


In [32]:
# transform expects exactly this kind of function: output shaped like the input.
df.transform(plus)

Unnamed: 0,a,b
0,12,22
1,22,32
2,32,42


In [33]:
# The same result written as a plain vectorised expression.
df + 2

Unnamed: 0,a,b
0,12,22
1,22,32
2,32,42


In [34]:
# On a single column (a Series) all three methods give the same elementwise result.
df.a.apply(plus), df.a.agg(plus), df.a.transform(plus)

(0    12
 1    22
 2    32
 Name: a, dtype: int64,
 0    12
 1    22
 2    32
 Name: a, dtype: int64,
 0    12
 1    22
 2    32
 Name: a, dtype: int64)

In [35]:
def mean(x):
    """Return ``x.mean()``.

    ``x`` must expose a ``mean`` method (for example a pandas Series or
    DataFrame). An object without one, such as a plain ``int``, raises
    AttributeError.
    """
    result = x.mean()
    return result

In [36]:
# DataFrame.apply passes each column to `mean`, giving one value per column.
df.apply(mean)

a    20.0
b    30.0
dtype: float64

In [37]:
# Column `a` on its own, as a Series.
df.a

0    10
1    20
2    30
Name: a, dtype: int64

In [38]:
# Series.apply calls `mean` once per element, and a plain int has no .mean(),
# so this fails. The error is the point of the demonstration, so it is caught
# and printed; an uncaught exception would stop "Restart & Run All" here.
try:
    df.a.apply(mean)
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'int' object has no attribute 'mean'

In [39]:
# transform requires output shaped like its input; `mean` reduces each column
# to a single value, so pandas rejects it. The error is the point of the
# demonstration, so it is caught and printed; an uncaught exception would
# stop "Restart & Run All" here.
try:
    df.transform(mean)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Function did not transform

In [40]:
# Same restriction on a single column: `mean` reduces the Series to one value,
# so transform rejects it. The error is the point of the demonstration, so it
# is caught and printed; an uncaught exception would stop "Restart & Run All"
# here.
try:
    df.a.transform(mean)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Function did not transform

In [41]:
# agg accepts a reducing function: one mean per column.
df.agg(mean)

a    20.0
b    30.0
dtype: float64

In [42]:
# On a single column, agg with a reducing function returns a scalar.
df.a.agg(mean)

20.0

In [43]:
# axis=1 passes each row (instead of each column) to `mean`: one value per row.
df.apply(mean, axis=1)

0    15.0
1    25.0
2    35.0
dtype: float64