In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small 4x2 sample frame with deliberate gaps (NaN) used by the cells that follow.
data = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],  # row "c" is missing in both columns
    [0.75, -1.3],
]
df = pd.DataFrame(data, index=["a", "b", "c", "d"], columns=["one", "two"])

In [4]:
# Sum down each column (axis=0): one result per column. NaN entries are skipped,
# as the recorded result shows (one = 1.4 + 7.1 + 0.75).
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [5]:
# Display the frame; the missing values show up as empty/NaN cells.
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [6]:
# Sum across each row (axis=1): one result per index label.
# NOTE(review): the recorded output shows NaN for the all-NaN row "c"; newer pandas
# releases return 0.0 for an all-NaN sum unless min_count=1 is passed - confirm
# against the installed version.
df.sum(axis=1)

a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

In [7]:
# Sum of the single column "one"; same value as the "one" entry of the column-wise sum.
df["one"].sum()

9.25

In [8]:
# Sum of row "b" (7.1 + -4.5). The recorded 2.5999999999999996 is ordinary binary
# floating-point round-off of 2.6.
df.loc["b"].sum()

2.5999999999999996

In [9]:
# Mean of row "c": both of its values are NaN, so there is nothing to average and
# the result is NaN.
df.loc["c"].mean()

nan

In [10]:
# Row-wise mean with skipna=False: a row containing any NaN yields NaN (rows "a"
# and "c" in the recorded output) instead of averaging only the values present.
df.mean(axis=1, skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [11]:
# Show the frame once more before its missing values are filled in.
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [12]:
# Mean of column "one" (NaN entries are skipped); used as the fill value for that column.
one_mean = df["one"].mean()

In [14]:
# Smallest value in column "two" (NaN entries are skipped); used as the fill value for that column.
two_min = df["two"].min()

In [15]:
# Replace the missing entries of column "one" with the column mean (one_mean).
# The column is reassigned, so df itself changes from this cell onward.
df["one"] = df["one"].fillna(value=one_mean)

In [17]:
# Replace the missing entries of column "two" with the column minimum (two_min).
# The column is reassigned, so df itself changes from this cell onward.
df["two"] = df["two"].fillna(value=two_min)

In [18]:
# Display the frame after both fills; the recorded output has no missing cells left.
df

Unnamed: 0,one,two
a,1.4,-4.5
b,7.1,-4.5
c,3.083333,-4.5
d,0.75,-1.3


In [19]:
# Six daily rows starting 2016-07-01 and four columns of standard-normal draws.
# NOTE(review): no random seed is set in the visible cells, so the values differ
# on every run and will not match the recorded outputs.
df2 = pd.DataFrame(
    np.random.randn(6, 4),
    index=pd.date_range("20160701", periods=6),
    columns=["A", "B", "C", "D"],
)

In [20]:
# Display the randomly generated frame.
df2

Unnamed: 0,A,B,C,D
2016-07-01,0.302195,2.654065,-2.053054,3.14807
2016-07-02,-1.320869,0.044223,-0.026215,-1.057953
2016-07-03,1.422806,-1.246054,1.089207,-1.008025
2016-07-04,-0.215746,1.84344,-0.322345,1.863487
2016-07-05,0.510253,-0.124425,-1.192993,1.655948
2016-07-06,1.243566,-1.01208,-1.554496,-0.333358


In [22]:
# Correlation coefficient between columns A and B.
df2["A"].corr(df2["B"])

-0.43669273385161578

In [23]:
# Covariance between columns A and B.
df2["A"].cov(df2["B"])

-0.68959534700746417

In [24]:
# All pairwise correlations at once, as a column-by-column matrix.
df2.corr()

Unnamed: 0,A,B,C,D
A,1.0,-0.436693,-0.033521,-0.049682
B,-0.436693,1.0,-0.469057,0.84564
C,-0.033521,-0.469057,1.0,-0.666375
D,-0.049682,0.84564,-0.666375,1.0


In [25]:
# All pairwise covariances at once, as a column-by-column matrix.
df2.cov()

Unnamed: 0,A,B,C,D
A,1.017012,-0.689595,-0.038835,-0.087745
B,-0.689595,2.451944,-0.843764,2.319014
C,-0.038835,-0.843764,1.319713,-1.340667
D,-0.087745,2.319014,-1.340667,3.067077


In [26]:
# Shuffle the row order and rearrange the columns so the sorting cells have
# something to sort.
# NOTE(review): this cell reassigns df2 from itself, so running it again reshuffles
# the rows; no seed is set in the visible cells, so the order is not reproducible.
dates = df2.index
random_dates = np.random.permutation(dates)  # the index values in random order
df2 = df2.reindex(index=random_dates, columns=["D", "B", "C", "A"])

In [27]:
# Display the frame with its shuffled rows and reordered columns.
df2

Unnamed: 0,D,B,C,A
2016-07-01,3.14807,2.654065,-2.053054,0.302195
2016-07-05,1.655948,-0.124425,-1.192993,0.510253
2016-07-02,-1.057953,0.044223,-0.026215,-1.320869
2016-07-04,1.863487,1.84344,-0.322345,-0.215746
2016-07-03,-1.008025,-1.246054,1.089207,1.422806
2016-07-06,-0.333358,-1.01208,-1.554496,1.243566


In [29]:
# Sort the rows by index label (axis=0), ascending. The sorted frame is only
# displayed; df2 is not reassigned.
df2.sort_index(axis=0)

Unnamed: 0,D,B,C,A
2016-07-01,3.14807,2.654065,-2.053054,0.302195
2016-07-02,-1.057953,0.044223,-0.026215,-1.320869
2016-07-03,-1.008025,-1.246054,1.089207,1.422806
2016-07-04,1.863487,1.84344,-0.322345,-0.215746
2016-07-05,1.655948,-0.124425,-1.192993,0.510253
2016-07-06,-0.333358,-1.01208,-1.554496,1.243566


In [32]:
# Sort the columns by column label (axis=1); the row order is left as it is.
df2.sort_index(axis=1)

Unnamed: 0,A,B,C,D
2016-07-01,0.302195,2.654065,-2.053054,3.14807
2016-07-05,0.510253,-0.124425,-1.192993,1.655948
2016-07-02,-1.320869,0.044223,-0.026215,-1.057953
2016-07-04,-0.215746,1.84344,-0.322345,1.863487
2016-07-03,1.422806,-1.246054,1.089207,-1.008025
2016-07-06,1.243566,-1.01208,-1.554496,-0.333358


In [35]:
# Sort the rows by index label in descending order.
df2.sort_index(axis=0, ascending=False)

Unnamed: 0,D,B,C,A
2016-07-06,-0.333358,-1.01208,-1.554496,1.243566
2016-07-05,1.655948,-0.124425,-1.192993,0.510253
2016-07-04,1.863487,1.84344,-0.322345,-0.215746
2016-07-03,-1.008025,-1.246054,1.089207,1.422806
2016-07-02,-1.057953,0.044223,-0.026215,-1.320869
2016-07-01,3.14807,2.654065,-2.053054,0.302195


In [39]:
# Sort the rows by the values in column D (ascending, as the recorded output shows).
df2.sort_values(by="D")

Unnamed: 0,D,B,C,A
2016-07-02,-1.057953,0.044223,-0.026215,-1.320869
2016-07-03,-1.008025,-1.246054,1.089207,1.422806
2016-07-06,-0.333358,-1.01208,-1.554496,1.243566
2016-07-05,1.655948,-0.124425,-1.192993,0.510253
2016-07-04,1.863487,1.84344,-0.322345,-0.215746
2016-07-01,3.14807,2.654065,-2.053054,0.302195


In [40]:
# Sort the rows by the values in column B (ascending, as the recorded output shows).
df2.sort_values(by="B")

Unnamed: 0,D,B,C,A
2016-07-03,-1.008025,-1.246054,1.089207,1.422806
2016-07-06,-0.333358,-1.01208,-1.554496,1.243566
2016-07-05,1.655948,-0.124425,-1.192993,0.510253
2016-07-02,-1.057953,0.044223,-0.026215,-1.320869
2016-07-04,1.863487,1.84344,-0.322345,-0.215746
2016-07-01,3.14807,2.654065,-2.053054,0.302195


In [41]:
# Add two columns to df2 in place: E holds random integers from 0 to 5 (the upper
# bound 6 is exclusive) and F holds string labels.
# The F list is assigned positionally, i.e. in df2's current (shuffled) row order.
df2["E"] = np.random.randint(0, 6, size=6)
df2["F"] = ["alpha", "beta", "gamma", "gamma", "alpha", "gamma"]

In [42]:
# Display df again; it is unchanged since the fill cells.
df

Unnamed: 0,one,two
a,1.4,-4.5
b,7.1,-4.5
c,3.083333,-4.5
d,0.75,-1.3


In [43]:
# Display df2 with the newly added E and F columns.
df2

Unnamed: 0,D,B,C,A,E,F
2016-07-01,3.14807,2.654065,-2.053054,0.302195,3,alpha
2016-07-05,1.655948,-0.124425,-1.192993,0.510253,3,beta
2016-07-02,-1.057953,0.044223,-0.026215,-1.320869,4,gamma
2016-07-04,1.863487,1.84344,-0.322345,-0.215746,0,gamma
2016-07-03,-1.008025,-1.246054,1.089207,1.422806,2,alpha
2016-07-06,-0.333358,-1.01208,-1.554496,1.243566,1,gamma


In [45]:
# Sort by E first and break ties with F (in the recorded output the two E == 3
# rows come out as alpha, then beta).
df2.sort_values(by=["E","F"])

Unnamed: 0,D,B,C,A,E,F
2016-07-04,1.863487,1.84344,-0.322345,-0.215746,0,gamma
2016-07-06,-0.333358,-1.01208,-1.554496,1.243566,1,gamma
2016-07-03,-1.008025,-1.246054,1.089207,1.422806,2,alpha
2016-07-01,3.14807,2.654065,-2.053054,0.302195,3,alpha
2016-07-05,1.655948,-0.124425,-1.192993,0.510253,3,beta
2016-07-02,-1.057953,0.044223,-0.026215,-1.320869,4,gamma


In [46]:
# Distinct values that occur in column F.
df2["F"].unique()

array(['alpha', 'beta', 'gamma'], dtype=object)

In [47]:
# How many times each distinct value occurs in column F, most frequent first.
df2["F"].value_counts()

gamma    3
alpha    2
beta     1
Name: F, dtype: int64

In [50]:
# Boolean Series, one entry per row: True where F is one of the listed labels.
df2["F"].isin(["alpha", "beta"])

2016-07-01     True
2016-07-05     True
2016-07-02    False
2016-07-04    False
2016-07-03     True
2016-07-06    False
Name: F, dtype: bool

In [51]:
# Keep only the rows whose F label is "alpha" or "beta" (boolean-mask row selection,
# all columns retained).
df2[df2["F"].isin(["alpha", "beta"])]

Unnamed: 0,D,B,C,A,E,F
2016-07-01,3.14807,2.654065,-2.053054,0.302195,3,alpha
2016-07-05,1.655948,-0.124425,-1.192993,0.510253,3,beta
2016-07-03,-1.008025,-1.246054,1.089207,1.422806,2,alpha


In [52]:
# 4x3 frame of standard-normal draws, rows labelled by city, for the apply() examples.
df3 = pd.DataFrame(
    np.random.randn(4, 3),
    index=["Seoul", "Incheon", "Busan", "Daegu"],
    columns=["b", "d", "e"],
)

In [55]:
def func(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    User-defined function meant to be passed to ``DataFrame.apply``. It is written
    as a named ``def`` rather than a lambda bound to a name, so it shows up as
    ``func`` in reprs and tracebacks and can carry this docstring.

    Parameters
    ----------
    x : any object providing ``max()`` and ``min()`` methods
        For example the Series that ``DataFrame.apply`` passes for each column
        or row.

    Returns
    -------
    The value of ``x.max() - x.min()``.
    """
    return x.max() - x.min()

In [54]:
# Echo the function object to show its repr.
func

<function __main__.<lambda>>

In [56]:
# Apply func to each column (axis=0): max minus min per column, indexed by column label.
df3.apply(func, axis=0)

b    2.555092
d    1.791511
e    3.238324
dtype: float64

In [57]:
# Apply func to each row (axis=1): max minus min per row, indexed by row label.
df3.apply(func, axis=1)

Seoul      1.524957
Incheon    3.293642
Busan      2.674344
Daegu      1.099665
dtype: float64