# データの連結

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
arr1 = np.arange(9).reshape((3, 3))

In [3]:
arr1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

## 列方向に連結

In [4]:
np.concatenate([arr1, arr1], axis=1)

array([[0, 1, 2, 0, 1, 2],
       [3, 4, 5, 3, 4, 5],
       [6, 7, 8, 6, 7, 8]])

## 行方向に連結

In [5]:
np.concatenate([arr1, arr1], axis=0)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8],
       [0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [6]:
ser1 = Series([0, 1, 2], index=['T', 'U', 'V'])

In [7]:
ser2 = Series([3, 4], index=['X', 'Y'])

In [8]:
ser1

T    0
U    1
V    2
dtype: int64

In [9]:
ser2

X    3
Y    4
dtype: int64

In [10]:
pd.concat([ser1, ser2])

T    0
U    1
V    2
X    3
Y    4
dtype: int64

In [12]:
pd.concat([ser1, ser2], axis=1)

Unnamed: 0,0,1
T,0.0,
U,1.0,
V,2.0,
X,,3.0
Y,,4.0


In [13]:
pd.concat([ser1, ser2], keys=['cat1', 'cat2'])

cat1  T    0
      U    1
      V    2
cat2  X    3
      Y    4
dtype: int64

In [14]:
pd.concat([ser1, ser2], axis=1, keys=['cat1', 'cat2'])

Unnamed: 0,cat1,cat2
T,0.0,
U,1.0,
V,2.0,
X,,3.0
Y,,4.0


In [15]:
df1 = DataFrame(np.random.randn(4, 3), columns=['X', 'Y', 'Z'])

In [16]:
df1

Unnamed: 0,X,Y,Z
0,1.242353,-0.309249,0.289885
1,1.063451,0.128782,0.042441
2,-0.523881,-2.594776,-0.64701
3,-0.998205,-1.452273,-0.476235


In [17]:
df2 = DataFrame(np.random.randn(3, 3), columns=['Y', 'Q', 'X'])

In [18]:
df2

Unnamed: 0,Y,Q,X
0,-0.037129,0.059768,-1.649101
1,-2.482105,-1.802293,1.830953
2,-0.711873,0.158742,-0.147931


In [19]:
pd.concat([df1, df2])

Unnamed: 0,Q,X,Y,Z
0,,1.242353,-0.309249,0.289885
1,,1.063451,0.128782,0.042441
2,,-0.523881,-2.594776,-0.64701
3,,-0.998205,-1.452273,-0.476235
0,0.059768,-1.649101,-0.037129,
1,-1.802293,1.830953,-2.482105,
2,0.158742,-0.147931,-0.711873,


In [20]:
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,Q,X,Y,Z
0,,1.242353,-0.309249,0.289885
1,,1.063451,0.128782,0.042441
2,,-0.523881,-2.594776,-0.64701
3,,-0.998205,-1.452273,-0.476235
4,0.059768,-1.649101,-0.037129,
5,-1.802293,1.830953,-2.482105,
6,0.158742,-0.147931,-0.711873,


## データを組み合わせる

In [22]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [23]:
ser1 = Series([2, np.nan, 4, np.nan, 6, np.nan],
             index = ['Q', 'R', 'S', 'T', 'U', 'V'])

In [24]:
ser1

Q    2.0
R    NaN
S    4.0
T    NaN
U    6.0
V    NaN
dtype: float64

In [26]:
ser2 = Series(np.arange(len(ser1)), dtype=np.float64, index = ['Q', 'R', 'S', 'T', 'U', 'V'])

In [27]:
ser2

Q    0.0
R    1.0
S    2.0
T    3.0
U    4.0
V    5.0
dtype: float64

In [28]:
np.where(pd.isnull(ser1))

(array([1, 3, 5]),)

## ser1でnanの場合はser2の値を使う。ser2の値がnanだったらser1の値を使う

In [29]:
Series(np.where(pd.isnull(ser1), ser2, ser1))

0    2.0
1    1.0
2    4.0
3    3.0
4    6.0
5    5.0
dtype: float64

In [30]:
ser1.combine_first(ser2)

Q    2.0
R    1.0
S    4.0
T    3.0
U    6.0
V    5.0
dtype: float64

In [31]:
df_odds = DataFrame({'X': [1, np.nan, 3, np.nan],
                    'Y':[np.nan, 5, np.nan, 9],
                    'Z':[np.nan, 9, np.nan, 11]})

In [32]:
df_odds

Unnamed: 0,X,Y,Z
0,1.0,,
1,,5.0,9.0
2,3.0,,
3,,9.0,11.0


In [33]:
df_evens = DataFrame({'X': [2, 4, np.nan, 6, 8],
                    'Y':[np.nan, 10, 12, 14, 16]})

In [34]:
df_evens

Unnamed: 0,X,Y
0,2.0,
1,4.0,10.0
2,,12.0
3,6.0,14.0
4,8.0,16.0


In [35]:
df_odds.combine_first(df_evens)

Unnamed: 0,X,Y,Z
0,1.0,,
1,4.0,5.0,9.0
2,3.0,12.0,
3,6.0,9.0,11.0
4,8.0,16.0,


# SeriesとDataFrameのデータの変換

In [36]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [40]:
df1 = DataFrame(np.arange(8).reshape((2, 4)),
               index=pd.Index(['LA', 'SF'], name='city'),
               columns=pd.Index(['A', 'B', 'C', 'D'], name='letter'))

In [41]:
df1

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [42]:
df1.stack()

city  letter
LA    A         0
      B         1
      C         2
      D         3
SF    A         4
      B         5
      C         6
      D         7
dtype: int64

In [43]:
type(df1.stack())

pandas.core.series.Series

In [44]:
df1_st = df1.stack()

In [45]:
df1_st

city  letter
LA    A         0
      B         1
      C         2
      D         3
SF    A         4
      B         5
      C         6
      D         7
dtype: int64

In [46]:
df1_st.unstack()

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [47]:
df1_st.unstack(0)

city,LA,SF
letter,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,4
B,1,5
C,2,6
D,3,7


In [48]:
df1_st.unstack('city')

city,LA,SF
letter,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,4
B,1,5
C,2,6
D,3,7


In [49]:
df1_st.unstack('letter')

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [50]:
ser1 = Series([0, 1, 2], index=['Q', 'X', 'Y'])

In [51]:
ser1

Q    0
X    1
Y    2
dtype: int64

In [52]:
ser2 = Series([4, 5, 6], index=['X', 'Y', 'Z'])

In [53]:
ser2

X    4
Y    5
Z    6
dtype: int64

In [55]:
df = pd.concat([ser1, ser2], keys=['a', 'b'])

In [56]:
df

a  Q    0
   X    1
   Y    2
b  X    4
   Y    5
   Z    6
dtype: int64

In [57]:
df.unstack()

Unnamed: 0,Q,X,Y,Z
a,0.0,1.0,2.0,
b,,4.0,5.0,6.0


In [59]:
df.unstack().stack(dropna=False)

a  Q    0.0
   X    1.0
   Y    2.0
   Z    NaN
b  Q    NaN
   X    4.0
   Y    5.0
   Z    6.0
dtype: float64

# ピボットテーブルの作り方

In [60]:
import numpy as np
from pandas import DataFrame

In [61]:
import pandas.util.testing as tm
tm.N = 3

def unpivot(frame):
    N, K = frame.shape
    data = {'value' : frame.values.ravel('F'),
            'variable' : np.asarray(frame.columns).repeat(N),
            'date' : np.tile(np.asarray(frame.index), K)}
    return DataFrame(data, columns=['date', 'variable', 'value'])

df = unpivot(tm.makeTimeDataFrame())

In [62]:
df

Unnamed: 0,date,variable,value
0,2000-01-03,A,0.56728
1,2000-01-04,A,-0.102236
2,2000-01-05,A,-0.604334
3,2000-01-03,B,0.824941
4,2000-01-04,B,-0.775147
5,2000-01-05,B,-0.511161
6,2000-01-03,C,0.41384
7,2000-01-04,C,-0.027871
8,2000-01-05,C,0.051866
9,2000-01-03,D,0.422639


In [64]:
df_piv = df.pivot('date', 'variable', 'value')

In [65]:
df_piv

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,0.56728,0.824941,0.41384,0.422639
2000-01-04,-0.102236,-0.775147,-0.027871,-1.608636
2000-01-05,-0.604334,-0.511161,0.051866,1.121884


# 重複したデータの処理

In [66]:
from pandas import DataFrame

In [67]:
df = DataFrame({'key1': ['A'] * 2 + ['B'] * 3,
               'key2': [2, 2, 2, 3, 3]})

In [68]:
df

Unnamed: 0,key1,key2
0,A,2
1,A,2
2,B,2
3,B,3
4,B,3


In [69]:
df.duplicated()

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [70]:
df.drop_duplicates()

Unnamed: 0,key1,key2
0,A,2
2,B,2
3,B,3


In [71]:
df.drop_duplicates(['key1'])

Unnamed: 0,key1,key2
0,A,2
2,B,2


In [72]:
df.drop_duplicates(['key2'])

Unnamed: 0,key1,key2
0,A,2
3,B,3


In [73]:
df

Unnamed: 0,key1,key2
0,A,2
1,A,2
2,B,2
3,B,3
4,B,3


In [79]:
df.duplicated()

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [83]:
df.drop_duplicates(['key1'], keep='last')

Unnamed: 0,key1,key2
1,A,2
4,B,3


In [85]:
df.drop_duplicates(['key1'], keep='first')

Unnamed: 0,key1,key2
0,A,2
2,B,2


# マッピングを使った列の追加

In [86]:
from pandas import DataFrame

In [87]:
df = DataFrame({'city':['Alma','Brian Head','Fox Park'],
                    'altitude':[3158,3000,2762]})

In [88]:
df

Unnamed: 0,altitude,city
0,3158,Alma
1,3000,Brian Head
2,2762,Fox Park


In [89]:
state_map = {'Alma': 'Colorado', 'Brian Head': 'Utha', 'Fox Park': 'Wyoming'}

In [90]:
df['state'] = df['city'].map(state_map)

In [91]:
df

Unnamed: 0,altitude,city,state
0,3158,Alma,Colorado
1,3000,Brian Head,Utha
2,2762,Fox Park,Wyoming


# 置換

In [93]:
import numpy as np
from pandas import Series

In [94]:
ser1 = Series([1, 2, 3, 4, 1, 2, 3, 4])

In [95]:
ser1

0    1
1    2
2    3
3    4
4    1
5    2
6    3
7    4
dtype: int64

In [96]:
ser1.replace(1, np.nan)

0    NaN
1    2.0
2    3.0
3    4.0
4    NaN
5    2.0
6    3.0
7    4.0
dtype: float64

In [97]:
ser1

0    1
1    2
2    3
3    4
4    1
5    2
6    3
7    4
dtype: int64

In [98]:
ser1.replace([1, 4], [100, 400])

0    100
1      2
2      3
3    400
4    100
5      2
6      3
7    400
dtype: int64

In [99]:
ser1.replace({4: np.nan})

0    1.0
1    2.0
2    3.0
3    NaN
4    1.0
5    2.0
6    3.0
7    NaN
dtype: float64

# indexの変更

In [100]:
import numpy as np
from pandas import DataFrame

In [102]:
df = DataFrame(np.arange(12).reshape((3, 4)), index=['NY', 'LA', 'SF'], columns=['A', 'B', 'C', 'D'])

In [103]:
df

Unnamed: 0,A,B,C,D
NY,0,1,2,3
LA,4,5,6,7
SF,8,9,10,11


In [104]:
str.lower('A')

'a'

In [105]:
df.index.map(str.lower)

Index(['ny', 'la', 'sf'], dtype='object')

In [106]:
df.index = df.index.map(str.lower)

In [107]:
df

Unnamed: 0,A,B,C,D
ny,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


In [108]:
str.title('udemy is good')

'Udemy Is Good'

In [109]:
df.rename(index=str.title, columns=str.lower)

Unnamed: 0,a,b,c,d
Ny,0,1,2,3
La,4,5,6,7
Sf,8,9,10,11


In [110]:
df

Unnamed: 0,A,B,C,D
ny,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


In [111]:
df.rename(index={'ny': 'New York'}, columns={'A': 'AAA'})

Unnamed: 0,AAA,B,C,D
New York,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


In [112]:
df

Unnamed: 0,A,B,C,D
ny,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


In [113]:
df.rename(index={'ny': 'New York'}, columns={'A': 'AAA'}, inplace=True)

In [114]:
df

Unnamed: 0,AAA,B,C,D
New York,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


# ビニング

In [115]:
import pandas as pd

In [116]:
years = [1990,1991,1992,2008,2012,2015,1987,1969,2013,2008,1999]

In [117]:
len(years)

11

In [118]:
decade_bins = [1960, 1970, 1980, 1990, 2000, 2010, 2020]

In [121]:
decade_cut = pd.cut(years, decade_bins)

In [122]:
decade_cut

[(1980, 1990], (1990, 2000], (1990, 2000], (2000, 2010], (2010, 2020], ..., (1980, 1990], (1960, 1970], (2010, 2020], (2000, 2010], (1990, 2000]]
Length: 11
Categories (6, interval[int64]): [(1960, 1970] < (1970, 1980] < (1980, 1990] < (1990, 2000] < (2000, 2010] < (2010, 2020]]