# Pandasの基礎

In [1]:
import pandas as pd
import numpy as np

#### Pandasのバージョンをチェック

In [2]:
pd.__version__

'1.1.5'

## 一、ファイルの読み込みと書き出し
### 1. 読み込み
#### （a）csvファイル

In [4]:
df = pd.read_csv('../dataset/Pandas/table.csv')
df.head()

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
4,S_1,C_1,1105,F,street_4,159,64,84.8,B+


#### （b）txtファイル

In [5]:
df_txt = pd.read_table('../dataset/Pandas/table.txt')
df_txt

Unnamed: 0,col1,col2,col3,col4
0,2,a,1.4,apple
1,3,b,3.4,banana
2,6,c,2.5,orange
3,5,d,3.2,lemon


#### （c）Excelファイル

Excelファイルを読み込むには`xlrd`が必要です。ただし、xlrd 2.0以降は`.xlsx`形式に対応していないため、pandas 1.1系で`.xlsx`を読み込む場合は2.0未満のバージョンを指定してください。
```bash
pip install "xlrd<2.0"
```

In [6]:

df_excel = pd.read_excel('../dataset/Pandas/table.xlsx')
df_excel.head()

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
4,S_1,C_1,1105,F,street_4,159,64,84.8,B+


### 2. 書き出し

#### （a）csvファイル

In [7]:
df.to_csv('../dataset/Pandas/new_table.csv')
# To leave the row index out of the saved file, pass index=False:
# df.to_csv('../dataset/Pandas/new_table.csv', index=False)

#### （b）Excelファイル

Excelファイルを書き出すには`openpyxl`が必要です。
```bash
pip install openpyxl
```

In [8]:
df.to_excel('../dataset/Pandas/new_table2.xlsx', sheet_name='Sheet1')

## 二、基本的なデータ構造
### 1. Series
#### （a）作成

#### Seriesの属性のうち、よく使うのは値（values）、インデックス（index）、名前（name）、タイプ（dtype）

In [13]:
# Build a small labelled Series of five random floats.
# No seed is set in the visible cells, so the values differ from run to run.
s = pd.Series(
    data=np.random.randn(5),
    index=list('abcde'),
    dtype='float64',
    name='簡単な例',
)
s

a   -0.710989
b   -1.573044
c    0.411858
d    0.794574
e   -0.029676
Name: 簡単な例, dtype: float64

#### （b）Seriesのプロパティーにアクセスする

In [14]:
s.values

array([-0.71098932, -1.57304374,  0.41185786,  0.79457449, -0.02967619])

In [15]:
s.name

'簡単な例'

In [16]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [17]:
s.dtype

dtype('float64')

#### （c）エレメントにアクセスする

In [18]:
s['a']

-0.710989319109677

#### （d）メソッドの使用

In [19]:
s.mean()

-0.22145537904707271

In [21]:
# List every public (non-underscore) attribute and method name exposed by the Series.
list(filter(lambda name: not name.startswith('_'), dir(s)))

['T',
 'a',
 'abs',
 'add',
 'add_prefix',
 'add_suffix',
 'agg',
 'aggregate',
 'align',
 'all',
 'any',
 'append',
 'apply',
 'argmax',
 'argmin',
 'argsort',
 'array',
 'asfreq',
 'asof',
 'astype',
 'at',
 'at_time',
 'attrs',
 'autocorr',
 'axes',
 'b',
 'backfill',
 'between',
 'between_time',
 'bfill',
 'bool',
 'c',
 'clip',
 'combine',
 'combine_first',
 'compare',
 'convert_dtypes',
 'copy',
 'corr',
 'count',
 'cov',
 'cummax',
 'cummin',
 'cumprod',
 'cumsum',
 'd',
 'describe',
 'diff',
 'div',
 'divide',
 'divmod',
 'dot',
 'drop',
 'drop_duplicates',
 'droplevel',
 'dropna',
 'dtype',
 'dtypes',
 'duplicated',
 'e',
 'empty',
 'eq',
 'equals',
 'ewm',
 'expanding',
 'explode',
 'factorize',
 'ffill',
 'fillna',
 'filter',
 'first',
 'first_valid_index',
 'floordiv',
 'ge',
 'get',
 'groupby',
 'gt',
 'hasnans',
 'head',
 'hist',
 'iat',
 'idxmax',
 'idxmin',
 'iloc',
 'index',
 'infer_objects',
 'interpolate',
 'is_monotonic',
 'is_monotonic_decreasing',
 'is_monotonic_i

### 2. DataFrame
#### （a）作成

In [22]:
# Three columns (strings, ints, floats) with custom row labels instead of 0..4.
df = pd.DataFrame(
    dict(
        col1=['a', 'b', 'c', 'd', 'e'],
        col2=range(5, 10),
        col3=[1.3, 2.5, 3.6, 4.6, 5.8],
    ),
    index=['一', '二', '三', '四', '五'],
)
df

Unnamed: 0,col1,col2,col3
一,a,5,1.3
二,b,6,2.5
三,c,7,3.6
四,d,8,4.6
五,e,9,5.8


#### （b）DataFrameの列はSeries

In [23]:
df['col1']

一    a
二    b
三    c
四    d
五    e
Name: col1, dtype: object

In [24]:
type(df)

pandas.core.frame.DataFrame

In [25]:
type(df['col1'])

pandas.core.series.Series

#### （c）行名や列名を修正

In [26]:
df.rename(index={'一':'one'}, columns={'col1':'new_col1'})

Unnamed: 0,new_col1,col2,col3
one,a,5,1.3
二,b,6,2.5
三,c,7,3.6
四,d,8,4.6
五,e,9,5.8


#### （d）プロパティーとメソッド

In [27]:
df.index

Index(['一', '二', '三', '四', '五'], dtype='object')

In [28]:
df.columns

Index(['col1', 'col2', 'col3'], dtype='object')

In [29]:
df.values

array([['a', 5, 1.3],
       ['b', 6, 2.5],
       ['c', 7, 3.6],
       ['d', 8, 4.6],
       ['e', 9, 5.8]], dtype=object)

In [30]:
df.shape

(5, 3)

In [31]:
# Average the numeric columns only. Passing numeric_only=True makes this explicit
# instead of relying on pandas silently dropping the string column 'col1', which
# newer pandas releases no longer do (they raise instead).
df.mean(numeric_only=True)

col2    7.00
col3    3.56
dtype: float64

#### （e）インデックス

In [32]:
# Two frames holding the same values under differently ordered row labels.
df1 = pd.DataFrame(dict(A=[1, 2, 3]), index=[1, 2, 3])
df2 = pd.DataFrame(dict(A=[1, 2, 3]), index=[3, 1, 2])
# Subtraction pairs rows by index label, not by position.
df1.sub(df2)

Unnamed: 0,A
1,-1
2,-1
3,2


#### （f）列の削除と追加
#### 削除：`drop()`、`del`文、`pop()`

In [33]:
df.drop(index='五',columns='col1')

Unnamed: 0,col2,col3
一,5,1.3
二,6,2.5
三,7,3.6
四,8,4.6


In [34]:
df['col1']=[1,2,3,4,5]
del df['col1']
df

Unnamed: 0,col2,col3
一,5,1.3
二,6,2.5
三,7,3.6
四,8,4.6
五,9,5.8


In [35]:
df['col1']=[1,2,3,4,5]
df.pop('col1')

一    1
二    2
三    3
四    4
五    5
Name: col1, dtype: int64

In [36]:
df

Unnamed: 0,col2,col3
一,5,1.3
二,6,2.5
三,7,3.6
四,8,4.6
五,9,5.8


#### 直接追加、もしくは`assign()`

In [37]:
df1

Unnamed: 0,A
1,1
2,2
3,3


In [38]:
df1['B']=list('abc')
df1

Unnamed: 0,A,B
1,1,a
2,2,b
3,3,c


In [40]:
# The new Series gets a default index (0, 1, 2) while df1 is labelled 1, 2, 3.
# assign() aligns on index labels, so 'd' (label 0) is dropped and label 3 has no value.
df1.assign(C=pd.Series(list('def')))

Unnamed: 0,A,B,C
1,1,a,e
2,2,b,f
3,3,c,


In [41]:
df1 # assign() does not modify df1 itself

Unnamed: 0,A,B
1,1,a
2,2,b
3,3,c


#### （g）タイプで選択

In [42]:
df.select_dtypes(include=['number']).head()

Unnamed: 0,col2,col3
一,5,1.3
二,6,2.5
三,7,3.6
四,8,4.6
五,9,5.8


In [43]:
df.select_dtypes(include=['float']).head()

Unnamed: 0,col3
一,1.3
二,2.5
三,3.6
四,4.6
五,5.8


#### （h）SeriesをDataFrameへ変換

In [44]:
# Column means of df, given a name that becomes the column label after to_frame().
s = df.mean().rename('to_DataFrame')
s

col2    7.00
col3    3.56
Name: to_DataFrame, dtype: float64

In [45]:
s.to_frame()

Unnamed: 0,to_DataFrame
col2,7.0
col3,3.56


#### transpose

In [38]:
s.to_frame().T

Unnamed: 0,col2,col3
to_DataFrame,7.0,3.56


## 三、基本的な関数やメソッド

In [46]:
df = pd.read_csv('../dataset/Pandas/table.csv')

### 1. `head()`, `tail()`

In [47]:
df.head()

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
4,S_1,C_1,1105,F,street_4,159,64,84.8,B+


In [48]:
df.tail()

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
30,S_2,C_4,2401,F,street_2,192,62,45.3,A
31,S_2,C_4,2402,M,street_7,166,82,48.7,B
32,S_2,C_4,2403,F,street_6,158,60,59.7,B+
33,S_2,C_4,2404,F,street_2,160,84,67.7,B
34,S_2,C_4,2405,F,street_6,193,54,47.6,B


In [42]:
df.head(3) # the number of rows to show can be specified

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+


### 2. `unique()`, `nunique()`

#### `nunique()`はユニークな値の個数を返す

In [49]:
df['Physics'].nunique()

7

#### `unique()`はユニークな値をすべて列挙する

In [50]:
df['Physics'].unique()

array(['A+', 'B+', 'B-', 'A-', 'B', 'A', 'C'], dtype=object)

### 3. `count()`, `value_counts()`

#### `count()`は非NaN値の数を数える

In [51]:
df['Physics'].count()

35

#### `value_counts()`は各値の出現回数を数える

In [52]:
df['Physics'].value_counts()

B+    9
B     8
B-    6
A     4
A+    3
A-    3
C     2
Name: Physics, dtype: int64

### 4. `describe()`, `info()`

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   School   35 non-null     object 
 1   Class    35 non-null     object 
 2   ID       35 non-null     int64  
 3   Gender   35 non-null     object 
 4   Address  35 non-null     object 
 5   Height   35 non-null     int64  
 6   Weight   35 non-null     int64  
 7   Math     35 non-null     float64
 8   Physics  35 non-null     object 
dtypes: float64(1), int64(3), object(5)
memory usage: 2.6+ KB


In [54]:
df.describe() # default percentiles: 25%, 50%, 75%

Unnamed: 0,ID,Height,Weight,Math
count,35.0,35.0,35.0,35.0
mean,1803.0,174.142857,74.657143,61.351429
std,536.87741,13.541098,12.895377,19.915164
min,1101.0,155.0,53.0,31.5
25%,1204.5,161.0,63.0,47.4
50%,2103.0,173.0,74.0,61.7
75%,2301.5,187.5,82.0,77.1
max,2405.0,195.0,100.0,97.0


In [55]:
df.describe(percentiles=[.05, .25, .75, .95]) # the percentiles can be chosen explicitly

Unnamed: 0,ID,Height,Weight,Math
count,35.0,35.0,35.0,35.0
mean,1803.0,174.142857,74.657143,61.351429
std,536.87741,13.541098,12.895377,19.915164
min,1101.0,155.0,53.0,31.5
5%,1102.7,157.0,56.1,32.64
25%,1204.5,161.0,63.0,47.4
50%,2103.0,173.0,74.0,61.7
75%,2301.5,187.5,82.0,77.1
95%,2403.3,193.3,97.6,90.04
max,2405.0,195.0,100.0,97.0


In [56]:
df['Physics'].describe() # also works on a non-numeric column

count     35
unique     7
top       B+
freq       9
Name: Physics, dtype: object

### 5. `idxmax()`, `nlargest()`

In [57]:
df['Math'].idxmax() # index label of the maximum value

5

In [58]:
df['Math'].nlargest(3)

5     97.0
28    95.5
11    87.7
Name: Math, dtype: float64

### 6. `clip()`, `replace()`

In [63]:
df['Math'].head()

0    34.0
1    32.5
2    87.2
3    80.4
4    84.8
Name: Math, dtype: float64

In [64]:
df['Math'].clip(33,80).head() # values above 80 become 80; values below 33 become 33

0    34.0
1    33.0
2    80.0
3    80.0
4    80.0
Name: Math, dtype: float64

In [66]:
df['Address'].head()

0    street_1
1    street_2
2    street_2
3    street_2
4    street_4
Name: Address, dtype: object

In [67]:
df['Address'].replace(['street_1','street_2'],['one','two']).head()

0         one
1         two
2         two
3         two
4    street_4
Name: Address, dtype: object

In [68]:
df.replace({'Address': {'street_1': 'one', 'street_2': 'two'}}).head()

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,one,173,63,34.0,A+
1,S_1,C_1,1102,F,two,192,73,32.5,B+
2,S_1,C_1,1103,M,two,186,82,87.2,B+
3,S_1,C_1,1104,F,two,167,81,80.4,B-
4,S_1,C_1,1105,F,street_4,159,64,84.8,B+


### 7. `apply()`

In [69]:
df['Math'].apply(lambda x:str(x)+'!').head()

0    34.0!
1    32.5!
2    87.2!
3    80.4!
4    84.8!
Name: Math, dtype: object

In [70]:
# Append '!' to the string form of every cell: the outer apply walks the columns,
# the inner apply walks the values inside each column.
df.apply(lambda column: column.apply(lambda value: str(value) + '!')).head()

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1!,C_1!,1101!,M!,street_1!,173!,63!,34.0!,A+!
1,S_1!,C_1!,1102!,F!,street_2!,192!,73!,32.5!,B+!
2,S_1!,C_1!,1103!,M!,street_2!,186!,82!,87.2!,B+!
3,S_1!,C_1!,1104!,F!,street_2!,167!,81!,80.4!,B-!
4,S_1!,C_1!,1105!,F!,street_4!,159!,64!,84.8!,B+!


## 四、ソート

### 1. `sort_index()`

In [59]:
df.set_index('Math').head()

Unnamed: 0_level_0,School,Class,ID,Gender,Address,Height,Weight,Physics
Math,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
34.0,S_1,C_1,1101,M,street_1,173,63,A+
32.5,S_1,C_1,1102,F,street_2,192,73,B+
87.2,S_1,C_1,1103,M,street_2,186,82,B+
80.4,S_1,C_1,1104,F,street_2,167,81,B-
84.8,S_1,C_1,1105,F,street_4,159,64,B+


In [60]:
df.set_index('Math').sort_index().head()

Unnamed: 0_level_0,School,Class,ID,Gender,Address,Height,Weight,Physics
Math,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
31.5,S_1,C_3,1301,M,street_4,161,68,B+
32.5,S_1,C_1,1102,F,street_2,192,73,B+
32.7,S_2,C_3,2302,M,street_5,171,88,A
33.8,S_1,C_2,1204,F,street_5,162,63,B
34.0,S_1,C_1,1101,M,street_1,173,63,A+


### 2. `sort_values()`

In [61]:
df.sort_values(by='Class').head()

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
19,S_2,C_1,2105,M,street_4,170,81,34.2,A
18,S_2,C_1,2104,F,street_5,159,97,72.2,B+
16,S_2,C_1,2102,F,street_6,161,61,50.6,B+
15,S_2,C_1,2101,M,street_7,174,84,83.3,C


In [62]:
df.sort_values(by=['Address','Height']).head()

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
11,S_1,C_3,1302,F,street_1,175,57,87.7,A-
23,S_2,C_2,2204,M,street_1,175,74,47.2,B-
33,S_2,C_4,2404,F,street_2,160,84,67.7,B
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
