# 판다스
---

In [601]:
#!pip install pandas --upgrade --user

In [602]:
import numpy as np
import pandas as pd
pd.__version__

'1.5.2'

# Series

In [603]:
# Build a small labelled Series: values 1..5 under the index labels 'a'..'e', named 'Apple'.
sr = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=list('abcde'),
    name='Apple',
)
sr

a    1
b    2
c    3
d    4
e    5
Name: Apple, dtype: int64

In [604]:
sr.name

'Apple'

In [605]:
sr.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [606]:
# Select by position explicitly. On a Series with string labels, plain sr[2] relies on
# the integer-key-as-position fallback, which newer pandas versions deprecate.
sr.iloc[2]

3

In [607]:
sr['c']

3

In [608]:
sr.values

array([1, 2, 3, 4, 5], dtype=int64)

In [609]:
sr.reset_index()

Unnamed: 0,index,Apple
0,a,1
1,b,2
2,c,3
3,d,4
4,e,5


In [610]:
sr = pd.Series([1, np.nan, 2, 3, np.nan, 4, 5])
sr

0    1.0
1    NaN
2    2.0
3    3.0
4    NaN
5    4.0
6    5.0
dtype: float64

In [611]:
sr.reset_index()

Unnamed: 0,index,0
0,0,1.0
1,1,
2,2,2.0
3,3,3.0
4,4,
5,5,4.0
6,6,5.0


In [612]:
# fancy indexing

sr[[1, 2, 4]]

1    NaN
2    2.0
4    NaN
dtype: float64

In [613]:
# boolean indexing

idx = [False, True, True, False, True, False, False]  # the mask must have the same length as the Series, otherwise an error is raised
sr[idx]

1    NaN
2    2.0
4    NaN
dtype: float64

In [614]:
# 조건
sr[sr>3]

5    4.0
6    5.0
dtype: float64

In [615]:
# Missing values: isna() marks the NaN entries; methods prefixed with "is" return booleans (True/False)

sr.isna(), sr[sr.isna()]  # isna is the same as isnull

(0    False
 1     True
 2    False
 3    False
 4     True
 5    False
 6    False
 dtype: bool,
 1   NaN
 4   NaN
 dtype: float64)

In [616]:
sr.isna().sum()

2

In [617]:
x = sr.copy()
x

0    1.0
1    NaN
2    2.0
3    3.0
4    NaN
5    4.0
6    5.0
dtype: float64

In [618]:
# Overwrite the NaN slots with the mean of the remaining values; mean() skips NaN,
# so the fill value here is 3.0.
x[x.isna()] = x.mean()
x

0    1.0
1    3.0
2    2.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

In [619]:
#sr.dropna() # returns a new Series without the NaN entries; sr itself is not modified, so assign the result to another variable if you want to keep it

In [620]:
sr

0    1.0
1    NaN
2    2.0
3    3.0
4    NaN
5    4.0
6    5.0
dtype: float64

In [621]:
sr.dropna(inplace=True)  # inplace=True removes the NaN rows from sr itself instead of returning a new Series
sr

0    1.0
2    2.0
3    3.0
5    4.0
6    5.0
dtype: float64

In [622]:
w = sr.copy()
# fillna(mean) is the method form of the manual assignment: x[x.isna()] = x.mean()
# NOTE(review): sr already had its NaN rows dropped in place, so there is nothing left
# to fill here and the result is identical to w.
w.fillna(w.mean())

0    1.0
2    2.0
3    3.0
5    4.0
6    5.0
dtype: float64

# 슬라이싱

In [623]:
sr = pd.Series([10, 15, 12, 17, 13], index = ['john', 'eva', 'james', 'liam', 'zoe'])
sr

john     10
eva      15
james    12
liam     17
zoe      13
dtype: int64

In [624]:
sr[1:4]  # integer slice is positional: positions 1..3 are returned, the stop position 4 is excluded

eva      15
james    12
liam     17
dtype: int64

In [625]:
sr['eva':'liam']  # slicing by index label (not by position) includes the end label, unlike an integer slice

eva      15
james    12
liam     17
dtype: int64

In [626]:
sr[::-1]  # step slicing: a step of -1 walks the Series in reverse order

zoe      13
liam     17
james    12
eva      15
john     10
dtype: int64

In [627]:
sr.sort_values(ascending=False)[:3]  # ascending=False sorts in descending order; [:3] keeps the three largest

liam    17
eva     15
zoe     13
dtype: int64

In [628]:
sr.sort_index()

eva      15
james    12
john     10
liam     17
zoe      13
dtype: int64

# DataFrame

In [629]:
# Row-oriented sample data: each inner list is one record, in the order given by c_name.
doc = [['Joe', 20, 85.10, 'A', 'Swimming'],
        ['Nat', 21, 77.80, 'B', 'Reading'],
        ['Harry', 19, 91.54, 'A', 'Music'],
        ['Sam', 20, 88.78, 'A', 'Painting'],
        ['Monica', 22, 60.55, 'B', 'Dancing']]

# Column labels, in the same order as the fields of each row.
c_name = ['Name', 'Age', 'Marks', 'Grade', 'Hobby']
# NOTE(review): idx is not passed to pd.DataFrame in the cells shown here (the frame keeps
# the default 0..4 index), and it reuses the name of the boolean mask defined earlier.
idx = ['s1', 's2', 's3', 's4', 's5']

In [630]:
# Build the frame from the list of rows, labelling the columns with c_name.
df = pd.DataFrame(doc, columns=c_name)
df.head()  # head() returns the first 5 rows by default

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swimming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


In [631]:
# Column-oriented version of the same data: each key is a column name, each list holds that column's values.
# NOTE(review): 'Swmming' differs from 'Swimming' in the list-of-lists version above — probably a typo.
doc = {'Name' :['Joe','Nat','Harry','Sam','Monica'],
        'Age':[20, 21, 19, 20, 22],
        'Marks':[85.10, 77.80, 91.54, 88.78, 60.55],
        'Grade':['A', 'B', 'A', 'A', 'B',],
        'Hobby':['Swmming', 'Reading', 'Music', 'Painting', 'Dancing']}

In [632]:
df = pd.DataFrame(doc)
df.shape

(5, 5)

In [633]:
df.head(3)

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music


In [634]:
x = df.dtypes
x

Name      object
Age        int64
Marks    float64
Grade     object
Hobby     object
dtype: object

In [635]:
x['Name']

dtype('O')

In [636]:
df.columns

Index(['Name', 'Age', 'Marks', 'Grade', 'Hobby'], dtype='object')

In [637]:
df.columns[[0, 2, 3]]

Index(['Name', 'Marks', 'Grade'], dtype='object')

In [638]:
df[df.columns[[0, 2, 3]]]

Unnamed: 0,Name,Marks,Grade
0,Joe,85.1,A
1,Nat,77.8,B
2,Harry,91.54,A
3,Sam,88.78,A
4,Monica,60.55,B


In [639]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     5 non-null      int64  
 2   Marks   5 non-null      float64
 3   Grade   5 non-null      object 
 4   Hobby   5 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 328.0+ bytes


In [640]:
doc = {'Name' :['Joe' ,np.nan,'Harry','Sam','Monica'],
        'Age':[20, 21, 19, 20, 22],
        'Marks':[85.10, 77.80, np.nan, np.nan, 60.55],
        'Grade':['A', 'B', 'A', 'A', 'B',],
        'Hobby':['Swmming', 'Reading', 'Music', 'Painting', 'Dancing']}
df = pd.DataFrame(doc)
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,,21,77.8,B,Reading
2,Harry,19,,A,Music
3,Sam,20,,A,Painting
4,Monica,22,60.55,B,Dancing


In [641]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    4 non-null      object 
 1   Age     5 non-null      int64  
 2   Marks   3 non-null      float64
 3   Grade   5 non-null      object 
 4   Hobby   5 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 328.0+ bytes


### 컬럼 다루기

In [642]:
doc = {'Name' :['Joe' ,np.nan,'Harry','Sam','Monica'],
        'Age':[20, 21, 19, 20, 22],
        'Marks':[85.10, 77.80, np.nan, np.nan, 60.55],
        'Grade':['A', 'B', 'A', 'A', 'B',],
        'Hobby':['Swmming', 'Reading', 'Music', 'Painting', 'Dancing']}
df = pd.DataFrame(doc)
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,,21,77.8,B,Reading
2,Harry,19,,A,Music
3,Sam,20,,A,Painting
4,Monica,22,60.55,B,Dancing


In [643]:
df[['Name', 'Age']]  # selecting several columns requires a list of names (same idea as fancy indexing)

Unnamed: 0,Name,Age
0,Joe,20
1,,21
2,Harry,19
3,Sam,20
4,Monica,22


In [644]:
df[['Age']]  # a list of names returns a DataFrame even for a single column; use df['Age'] (no list) to get a Series

Unnamed: 0,Age
0,20
1,21
2,19
3,20
4,22


In [645]:
df.columns

Index(['Name', 'Age', 'Marks', 'Grade', 'Hobby'], dtype='object')

In [646]:
df.columns=['Name', 'Age', 'Score', 'Grade', 'Hobby']

In [647]:
df

Unnamed: 0,Name,Age,Score,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,,21,77.8,B,Reading
2,Harry,19,,A,Music
3,Sam,20,,A,Painting
4,Monica,22,60.55,B,Dancing


In [648]:
df.rename(columns={'Score':'Marks', 'Hobby':'ets'})

Unnamed: 0,Name,Age,Marks,Grade,ets
0,Joe,20,85.1,A,Swmming
1,,21,77.8,B,Reading
2,Harry,19,,A,Music
3,Sam,20,,A,Painting
4,Monica,22,60.55,B,Dancing


# 파일 입출력

In [649]:
#!pip install gdown

In [650]:
#!gdown https://raw.githubusercontent.com/devdio/datasets/main/doc.csv

In [651]:
# Load doc.csv from the working directory.
# NOTE(review): the download command in the cell above is commented out, so the file must already exist locally.
pd.read_csv('doc.csv')

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


In [652]:
#!gdown https://raw.githubusercontent.com/devdio/datasets/main/doc_idx.csv

In [653]:
# Without index_col, the row labels stored in the file's first column come back as a regular 'Unnamed: 0' column.
pd.read_csv('doc_idx.csv')

Unnamed: 0.1,Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,s1,Joe,20,85.1,A,Swmming
1,s2,Nat,21,77.8,B,Reading
2,s3,Harry,19,91.54,A,Music
3,s4,Sam,20,88.78,A,Painting
4,s5,Monica,22,60.55,B,Dancing


In [654]:
#!gdown https://raw.githubusercontent.com/devdio/datasets/main/doc_na.csv

In [655]:
# index_col=0 uses the first column as the row index; na_values=['?'] makes '?' entries parse as NaN.
df = pd.read_csv('doc_na.csv', index_col=0, na_values = ['?'])
df

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20.0,,A,Swmming
s2,Nat,21.0,77.8,B,Reading
s3,Harry,,91.54,A,Music
s4,Sam,20.0,88.78,A,
s5,Monica,22.0,60.55,B,Dancing


In [656]:
#!gdown https://raw.githubusercontent.com/devdio/datasets/main/titanic.csv

In [657]:
titanic = pd.read_csv('titanic.csv')
titanic.shape

(891, 12)

In [658]:
df = titanic.copy()

In [659]:
df.columns 

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [660]:
# Normalize every column name to lower case (upper -> lower).
df.columns = list(map(str.lower, df.columns))
df.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [661]:
df

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,lost,2nd,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,saved,1st,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,saved,1st,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [662]:
df.info()  # age and cabin report 714 and 204 non-null values out of 891 rows because the rest are missing

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    object 
 2   pclass       891 non-null    object 
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     889 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 83.7+ KB


In [663]:
df.describe()  # by default describe() summarizes only the numeric columns (integer and float); object columns are left out

Unnamed: 0,passengerid,age,sibsp,parch,fare
count,891.0,714.0,891.0,891.0,891.0
mean,446.0,29.699118,0.523008,0.381594,32.204208
std,257.353842,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,223.5,20.125,0.0,0.0,7.9104
50%,446.0,28.0,0.0,0.0,14.4542
75%,668.5,38.0,1.0,0.0,31.0
max,891.0,80.0,8.0,6.0,512.3292


In [664]:
df.describe().T  # .T transposes the summary, so each described column becomes a row

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
passengerid,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
sibsp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [665]:
df.head(10)  # show the first 10 rows; head() with no argument shows the first 5

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,lost,3rd,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,lost,1st,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,lost,3rd,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,saved,3rd,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,saved,2nd,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [666]:
df['embarked'].unique()  # unique() returns the distinct values of the column (NaN included)

array(['S', 'C', 'Q', nan], dtype=object)

In [667]:
df['embarked'].value_counts()  # value_counts() counts the rows per distinct value; NaN is not counted here

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [668]:
df['survived'].value_counts()

lost     549
saved    342
Name: survived, dtype: int64

In [669]:
df['survived']

0       lost
1      saved
2      saved
3      saved
4       lost
       ...  
886     lost
887    saved
888     lost
889    saved
890     lost
Name: survived, Length: 891, dtype: object

In [670]:
# 남여

df['sex'].value_counts()

male      577
female    314
Name: sex, dtype: int64

In [671]:
## 결측치 개수 계산

df.isnull()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,True,False
887,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,True,False,False,False,False,True,False
889,False,False,False,False,False,False,False,False,False,False,False,False


In [672]:
df.isna().sum(axis = 0)

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

In [673]:
df.isnull().sum(axis = 0)

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

### 인덱싱, 슬라이싱
- loc
- iloc

In [674]:
df.loc[5, 'pclass']

'3rd'

In [675]:
df.loc[5:10, ['pclass','name','survived']]

Unnamed: 0,pclass,name,survived
5,3rd,"Moran, Mr. James",lost
6,1st,"McCarthy, Mr. Timothy J",lost
7,3rd,"Palsson, Master. Gosta Leonard",lost
8,3rd,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",saved
9,2nd,"Nasser, Mrs. Nicholas (Adele Achem)",saved
10,3rd,"Sandstrom, Miss. Marguerite Rut",saved


In [676]:
df.iloc[5:10, [0, 3, 5]]

Unnamed: 0,passengerid,name,age
5,6,"Moran, Mr. James",
6,7,"McCarthy, Mr. Timothy J",54.0
7,8,"Palsson, Master. Gosta Leonard",2.0
8,9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0
9,10,"Nasser, Mrs. Nicholas (Adele Achem)",14.0


In [677]:
df['age'].min(), df['age'].max()

(0.42, 80.0)

In [678]:
# 조건

df.loc[(df['age'] < 30) & (df['sex'] == 'male'), ['name', 'age']]

Unnamed: 0,name,age
0,"Braund, Mr. Owen Harris",22.0
7,"Palsson, Master. Gosta Leonard",2.0
12,"Saundercock, Mr. William Henry",20.0
16,"Rice, Master. Eugene",2.0
23,"Sloper, Mr. William Thompson",28.0
...,...,...
877,"Petroff, Mr. Nedelio",19.0
883,"Banfield, Mr. Frederick James",28.0
884,"Sutehall, Mr. Henry Jr",25.0
886,"Montvila, Rev. Juozas",27.0


In [679]:
### 결측치 처리

df.isna().sum(axis=0)

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

In [680]:
# Select only the rows whose age is NaN, then count their embarked values.
# This chained form (.loc[rows] followed by ['embarked']) gives the same counts as the
# single .loc[rows, 'embarked'] call in the next cell.

df.loc[df['age'].isna()]['embarked'].value_counts()

S    90
Q    49
C    38
Name: embarked, dtype: int64

In [681]:
df.loc[df['age'].isna(),'embarked'].value_counts()

S    90
Q    49
C    38
Name: embarked, dtype: int64

In [682]:
df['embarked'].isna()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: embarked, Length: 891, dtype: bool

In [683]:
# 1. Inspect the passengers whose embarkation port is missing
# 2. Fill in the missing embarked values

df.loc[df['embarked'].isna()]

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
61,62,saved,1st,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,saved,1st,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [684]:
df['embarked'].value_counts() 

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [685]:
df['embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [686]:
df['embarked'].isna()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: embarked, Length: 891, dtype: bool

In [687]:
df['embarked'] = df['embarked'].fillna('S')  # fillna() replaces NaN with the given value; 'S' is the most frequent port in the counts above

In [688]:
df['embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: embarked, Length: 891, dtype: object

In [689]:
df.isna().sum(axis=0)

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         0
dtype: int64

In [690]:
df.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [691]:
df['embarked'].value_counts()  # S is now 646: the two filled rows were added to it

S    646
C    168
Q     77
Name: embarked, dtype: int64

In [692]:
df['embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [693]:
# Remove the cabin, ticket and passengerid columns from the working frame.
df = df.drop(columns=['cabin', 'ticket', 'passengerid'])

In [694]:
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [695]:
# family = sibsp + parch. Per the Kaggle Titanic data dictionary, sibsp counts
# siblings/spouses aboard and parch counts parents/children aboard.
df['family'] = df['sibsp']+df['parch']
df

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,1
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...
886,lost,2nd,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,0
887,saved,1st,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,0
888,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S,3
889,saved,1st,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,0


In [696]:
# Independent copy of df, used below to hold the encoded columns.
# NOTE(review): the copy is taken before the age column is filled, so x keeps its NaN ages.
x = df.copy()
x.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,1
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,0


In [697]:
df.isna().sum(axis=0)

survived      0
pclass        0
name          0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      0
family        0
dtype: int64

In [698]:
# np.median(x), np.mean(x)
# Replace the missing ages with the column mean and assign the result back to the column.
# Calling fillna(..., inplace=True) on df['age'] is chained in-place assignment: newer pandas
# versions warn about it, and under Copy-on-Write it no longer updates df.
df['age'] = df['age'].fillna(df['age'].mean())

In [699]:
df.isna().sum(axis=0)

survived    0
pclass      0
name        0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
family      0
dtype: int64

### groupby()

In [700]:
df

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.000000,1,0,7.2500,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.000000,0,0,7.9250,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,53.1000,S,1
4,lost,3rd,"Allen, Mr. William Henry",male,35.000000,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...
886,lost,2nd,"Montvila, Rev. Juozas",male,27.000000,0,0,13.0000,S,0
887,saved,1st,"Graham, Miss. Margaret Edith",female,19.000000,0,0,30.0000,S,0
888,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,23.4500,S,3
889,saved,1st,"Behr, Mr. Karl Howell",male,26.000000,0,0,30.0000,C,0


In [701]:
df.groupby(['sex','pclass'])['age'].mean().index

MultiIndex([('female', '1st'),
            ('female', '2nd'),
            ('female', '3rd'),
            (  'male', '1st'),
            (  'male', '2nd'),
            (  'male', '3rd')],
           names=['sex', 'pclass'])

In [702]:
df.groupby(['sex','pclass'])['age'].mean()

sex     pclass
female  1st       34.141405
        2nd       28.748661
        3rd       24.068493
male    1st       39.287717
        2nd       30.653908
        3rd       27.372153
Name: age, dtype: float64

In [703]:
df.groupby(['sex','pclass'])['survived'].value_counts()

sex     pclass  survived
female  1st     saved        91
                lost          3
        2nd     saved        70
                lost          6
        3rd     lost         72
                saved        72
male    1st     lost         77
                saved        45
        2nd     lost         91
                saved        17
        3rd     lost        300
                saved        47
Name: survived, dtype: int64

### 그룹 함수

- apply() 함수 / 문자열을 정수형 데이터로 변환

In [704]:
def myfunc(x):
    """Encode a sex label as an integer: 1 for 'male', 0 for any other value."""
    return 1 if x == 'male' else 0

In [705]:
x['sex_encoded'] = x['sex'].apply(myfunc)
x

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family,sex_encoded
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,1,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,0
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,0,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,1,0
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,lost,2nd,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,0,1
887,saved,1st,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,0,0
888,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S,3,0
889,saved,1st,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,0,1


In [706]:
def myfunc1(x):
    """Encode a survival label as an integer: 1 for 'saved', 0 for any other value."""
    # Guard clause: everything that is not exactly 'saved' maps to 0.
    if x != 'saved':
        return 0
    return 1

In [707]:
x['survived_encoded'] = x['survived'].apply(myfunc1)
x

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family,sex_encoded,survived_encoded
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,1,1,0
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,0,1
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,0,0,1
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,1,0,1
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,lost,2nd,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,0,1,0
887,saved,1st,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,0,0,1
888,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S,3,0,0
889,saved,1st,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,0,1,1


In [708]:
def my_func2(x):
    """Encode a passenger-class label as an integer: '1st' -> 1, '2nd' -> 2, anything else -> 3."""
    for label, code in (('1st', 1), ('2nd', 2)):
        if x == label:
            return code
    # Every other value (including '3rd') falls through to 3.
    return 3

In [709]:
x['pclass_encoded'] = x['pclass'].apply(my_func2)
x

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family,sex_encoded,survived_encoded,pclass_encoded
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,1,1,0,3
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,0,1,1
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,0,0,1,3
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,1,0,1,1
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,lost,2nd,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,0,1,0,2
887,saved,1st,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,0,0,1,1
888,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S,3,0,0,3
889,saved,1st,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,0,1,1,1


In [710]:
# Drop the original survived / sex / pclass label columns.
# NOTE(review): a is derived from df, not from x, so the *_encoded columns added to x above are not part of it.
a = df.drop(['survived','sex','pclass'],axis=1)
a


Unnamed: 0,name,age,sibsp,parch,fare,embarked,family
0,"Braund, Mr. Owen Harris",22.000000,1,0,7.2500,S,1
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,1,0,71.2833,C,1
2,"Heikkinen, Miss. Laina",26.000000,0,0,7.9250,S,0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,1,0,53.1000,S,1
4,"Allen, Mr. William Henry",35.000000,0,0,8.0500,S,0
...,...,...,...,...,...,...,...
886,"Montvila, Rev. Juozas",27.000000,0,0,13.0000,S,0
887,"Graham, Miss. Margaret Edith",19.000000,0,0,30.0000,S,0
888,"Johnston, Miss. Catherine Helen ""Carrie""",29.699118,1,2,23.4500,S,3
889,"Behr, Mr. Karl Howell",26.000000,0,0,30.0000,C,0


In [711]:
my_arr = a.values
my_arr

array([['Braund, Mr. Owen Harris', 22.0, 1, ..., 7.25, 'S', 1],
       ['Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 38.0, 1,
        ..., 71.2833, 'C', 1],
       ['Heikkinen, Miss. Laina', 26.0, 0, ..., 7.925, 'S', 0],
       ...,
       ['Johnston, Miss. Catherine Helen "Carrie"', 29.69911764705882, 1,
        ..., 23.45, 'S', 3],
       ['Behr, Mr. Karl Howell', 26.0, 0, ..., 30.0, 'C', 0],
       ['Dooley, Mr. Patrick', 32.0, 0, ..., 7.75, 'Q', 0]], dtype=object)

# 저장


In [712]:
# Save the array into a .npz archive under the key 'arr'.
# my_arr has dtype=object (mixed strings and numbers), which is why loading it back needs allow_pickle=True.
np.savez('my_arr.npz', arr = my_arr)

# 로딩

In [713]:
# allow_pickle=True is needed because the stored array has dtype=object.
# NOTE(review): unpickling can execute arbitrary code — only load .npz files you created or trust.
mydata = np.load('my_arr.npz', allow_pickle=True)

In [714]:
# Pull the saved array back out by the key used in np.savez.
# NOTE(review): this rebinds x, which previously referred to the DataFrame copy holding the encoded columns.
x = mydata['arr']
x

array([['Braund, Mr. Owen Harris', 22.0, 1, ..., 7.25, 'S', 1],
       ['Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 38.0, 1,
        ..., 71.2833, 'C', 1],
       ['Heikkinen, Miss. Laina', 26.0, 0, ..., 7.925, 'S', 0],
       ...,
       ['Johnston, Miss. Catherine Helen "Carrie"', 29.69911764705882, 1,
        ..., 23.45, 'S', 3],
       ['Behr, Mr. Karl Howell', 26.0, 0, ..., 30.0, 'C', 0],
       ['Dooley, Mr. Patrick', 32.0, 0, ..., 7.75, 'Q', 0]], dtype=object)