# 데이터프레임 

- 행과 열로 구성된 표 형태의 데이터

In [2]:
import pandas as pd

# 데이터 확인

In [5]:
df = pd.read_csv("./data/gapminder.tsv", sep = "\t")

In [6]:
# Show the first rows; head() returns 5 rows by default.
# Pass a count to change that, e.g. df.head(3) returns only 3 rows.
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [9]:
# Show the last rows; tail() returns 5 rows by default.
# Pass a count to change that, e.g. df.tail(3) returns only 3 rows.
df.tail()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


In [10]:
type(df)

pandas.core.frame.DataFrame

In [13]:
# shape is a (rows, columns) tuple
df.shape

(1704, 6)

- 행
    - 가로로 나열되는 각 데이터의 단위
        - 로우(row) 또는 케이스(case)라고도 불림
- 열
    - 세로로 나열되는 속성
        - 컬럼(column) 또는 변수(variable)라고도 불림

In [16]:
# 데이터프레임의 컬럼명 확인
df.columns

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

- 판다스와 파이썬 자료형
    - 문자열
        - 파이썬 : str
        - 판다스 : object
    - 정수 / 실수 / datetime
        - 파이썬 : int, float, datetime
        - 판다스 : int64, float64, datetime64

In [18]:
# 데이터프레임 값의 자료형 확인
df.dtypes

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


# 데이터 추출

In [23]:
df["country"]

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object

In [25]:
type(df["country"])

pandas.core.series.Series

In [28]:
country_se = df["country"]

In [30]:
# 시리즈 데이터 앞부분 확인
country_se.head()

0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object

In [32]:
# 시리즈 데이터 뒷부분 확인
country_se.tail()

1699    Zimbabwe
1700    Zimbabwe
1701    Zimbabwe
1702    Zimbabwe
1703    Zimbabwe
Name: country, dtype: object

In [33]:
# 데이터프레임에서 여러 컬럼 추출
subset = df[["country", "continent", "year"]]

In [36]:
type(subset)

pandas.core.frame.DataFrame

In [37]:
subset.head()

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


## 행단위 데이터 추출

- loc
    - 인덱스를 기준으로 행 데이터 추출
- iloc
    - 행 번호를 기준으로 행 데이터 추출

### loc

- 인덱스(index) : 값의 위치를 나타낸 값
    - 데이터프레임을 만들면 자동으로 인덱스 번호가 부여됨
        - 현재 gapminder 데이터의 왼쪽에 세로로 나열된 0, 1, 2, 3, 4.....

In [42]:
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [44]:
df.loc[0]

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap     779.445314
Name: 0, dtype: object

In [46]:
df.loc[99]

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap    721.186086
Name: 99, dtype: object

In [48]:
# loc example 2: preparation for a DataFrame with letter index labels (built in the next cells).
# ord(): takes a character and returns its Unicode code point
# chr(): takes a Unicode code point and returns the matching character

print(ord("a"), ord("z"))

97 122


In [50]:
# Print the characters for code points 97..122 ("a".."z"), each followed by a space.
print(*(chr(code_point) for code_point in range(97, 123)), end=" ")

a b c d e f g h i j k l m n o p q r s t u v w x y z 

In [51]:
# Build a 26-row DataFrame whose index labels are the letters "a".."z"
# and whose single "value" column holds 0..25, to demonstrate label-based loc.
df_index = pd.DataFrame(
    {"value": list(range(26))},
    index=[chr(code) for code in range(ord("a"), ord("z") + 1)],
)

In [54]:
df_index.head()

Unnamed: 0,value
a,0
b,1
c,2
d,3
e,4


In [55]:
df_index.loc["a"]

value    0
Name: a, dtype: int64

In [58]:
df_index.loc["z"]

value    25
Name: z, dtype: int64

In [60]:
# 여러 행 추출
df_index.loc[["a", "z"]]

Unnamed: 0,value
a,0
z,25


In [62]:
# 하나의 행을 데이터프레임으로 추출
df_index.loc[["a"]]

Unnamed: 0,value
a,0


In [64]:
# Extracting the last row with loc: first use tail(1) to see the last row's index label
df.tail(1)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


In [65]:
df.loc[1703]

country        Zimbabwe
continent        Africa
year               2007
lifeExp          43.487
pop            12311143
gdpPercap    469.709298
Name: 1703, dtype: object

### iloc

In [69]:
df.iloc[1]

country      Afghanistan
continent           Asia
year                1957
lifeExp           30.332
pop              9240934
gdpPercap      820.85303
Name: 1, dtype: object

In [70]:
df.loc[1]

country      Afghanistan
continent           Asia
year                1957
lifeExp           30.332
pop              9240934
gdpPercap      820.85303
Name: 1, dtype: object

In [71]:
df.iloc[-1]

country        Zimbabwe
continent        Africa
year               2007
lifeExp          43.487
pop            12311143
gdpPercap    469.709298
Name: 1703, dtype: object

In [75]:
df_index.iloc[0]

value    0
Name: a, dtype: int64

In [77]:
df_index.iloc[10]

value    10
Name: k, dtype: int64

## 행과 열 추출

In [80]:
# All rows of the "year" column
# Indexer order is [rows, columns]
df.loc[:, "year"]

0       1952
1       1957
2       1962
3       1967
4       1972
        ... 
1699    1987
1700    1992
1701    1997
1702    2002
1703    2007
Name: year, Length: 1704, dtype: int64

In [82]:
# year열, pop열 전체 행
df.loc[:, ["year", "pop"]].head()

Unnamed: 0,year,pop
0,1952,8425333
1,1957,9240934
2,1962,10267083
3,1967,11537966
4,1972,13079460


In [83]:
df.iloc[:, [2, 4, -1]].head()

Unnamed: 0,year,pop,gdpPercap
0,1952,8425333,779.445314
1,1957,9240934,820.85303
2,1962,10267083,853.10071
3,1967,11537966,836.197138
4,1972,13079460,739.981106


In [86]:
df.iloc[:5, :3]

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


In [88]:
df.iloc[:, 0:6:2].head()

Unnamed: 0,country,year,pop
0,Afghanistan,1952,8425333
1,Afghanistan,1957,9240934
2,Afghanistan,1962,10267083
3,Afghanistan,1967,11537966
4,Afghanistan,1972,13079460


In [89]:
df.iloc[[0, 99, 999], [0, 3, 5]]

Unnamed: 0,country,lifeExp,gdpPercap
0,Afghanistan,28.801,779.445314
99,Bangladesh,43.453,721.186086
999,Mongolia,51.253,1226.04113


In [90]:
df.loc[[0, 99, 999], ["country", "lifeExp", "gdpPercap"]]

Unnamed: 0,country,lifeExp,gdpPercap
0,Afghanistan,28.801,779.445314
99,Bangladesh,43.453,721.186086
999,Mongolia,51.253,1226.04113


# 기초적인 통계 계산

In [95]:
df.describe()

Unnamed: 0,year,lifeExp,pop,gdpPercap
count,1704.0,1704.0,1704.0,1704.0
mean,1979.5,59.474439,29601210.0,7215.327081
std,17.26533,12.917107,106157900.0,9857.454543
min,1952.0,23.599,60011.0,241.165876
25%,1965.75,48.198,2793664.0,1202.060309
50%,1979.5,60.7125,7023596.0,3531.846988
75%,1993.25,70.8455,19585220.0,9325.462346
max,2007.0,82.603,1318683000.0,113523.1329


## 그룹화한 데이터의 평균 구하기

In [98]:
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [100]:
df.groupby("year")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000020977D33E90>

In [102]:
# 연도별 lifeExp의 평균
df.groupby("year")["lifeExp"].mean()

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [104]:
df.groupby(["year", "continent"])[["lifeExp", "gdpPercap"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,lifeExp,gdpPercap
year,continent,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,Africa,39.1355,1252.572466
1952,Americas,53.27984,4079.062552
1952,Asia,46.314394,5195.484004
1952,Europe,64.4085,5661.057435
1952,Oceania,69.255,10298.08565
1957,Africa,41.266346,1385.236062
1957,Americas,55.96028,4616.043733
1957,Asia,49.318544,5787.73294
1957,Europe,66.703067,6963.012816
1957,Oceania,70.295,11598.522455


## 그룹화한 데이터 개수 세기

In [107]:
# Number of distinct countries in each continent
df.groupby("continent")["country"].nunique()

continent
Africa      52
Americas    25
Asia        33
Europe      30
Oceania      2
Name: country, dtype: int64

In [109]:
# Number of rows for each country within each continent
df.groupby("continent")["country"].value_counts()

continent  country       
Africa     Algeria           12
           Angola            12
           Benin             12
           Botswana          12
           Burkina Faso      12
                             ..
Europe     Switzerland       12
           Turkey            12
           United Kingdom    12
Oceania    Australia         12
           New Zealand       12
Name: count, Length: 142, dtype: int64

In [111]:
# Number of rows per continent (count() tallies the non-null country values in each group)
df.groupby("continent")["country"].count()

continent
Africa      624
Americas    300
Asia        396
Europe      360
Oceania      24
Name: country, dtype: int64

# 데이터 생성

In [114]:
pd.Series(["banana", 42])

0    banana
1        42
dtype: object

In [116]:
pd.Series(["Wes McKinney", "Creator of Pandas"], index = ["Person", "Who"])

Person         Wes McKinney
Who       Creator of Pandas
dtype: object

In [118]:
# Create a DataFrame from a dict of columns.
# index sets the row labels; columns fixes the column order
# (Occupation, Born, Age, Died) instead of the dict's insertion order.
# NOTE(review): Age for William Gosset is 62 here but 61 in the
# scientists.csv data loaded later in this file — confirm which is intended.
pd.DataFrame(
    data = {"Occupation" : ["Chemist", "Statistician"],
            "Born" : ["1920-07-25", "1876-06-13"],
            "Died" : ["1958-04-16", "1937-10-16"],
            "Age" : [37, 62]},
    index = ["Rosaline Franklin", "William Gosset"],
    columns = ["Occupation", "Born", "Age", "Died"]
)

Unnamed: 0,Occupation,Born,Age,Died
Rosaline Franklin,Chemist,1920-07-25,37,1958-04-16
William Gosset,Statistician,1876-06-13,62,1937-10-16


In [120]:
# scientists 데이터 불러오기
scientists = pd.read_csv("./data/scientists.csv")

In [122]:
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


## 시리즈에 기초통계 메서드 사용

In [125]:
# Age 컬럼의 평균
scientists["Age"].mean()

59.125

In [127]:
# Age 컬럼의 최솟값
scientists["Age"].min()

37

In [129]:
# Age 컬럼의 최댓값
scientists["Age"].max()

90

In [131]:
# Age 컬럼의 표준편차
scientists["Age"].std()

18.325918413937288

## 시리즈 다루기 응용

### 필터링

In [135]:
ages = scientists["Age"]

In [137]:
ages

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [139]:
# 평균보다 나이가 많은 사람의 데이터 추출
ages[ages > ages.mean()]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

In [141]:
ages.mean()

59.125

In [143]:
ages > ages.mean()

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

In [145]:
ages[[True, True, False, False, True, True, False, True]]

0    37
1    61
4    56
5    45
7    77
Name: Age, dtype: int64

### 시리즈 연산

In [148]:
# 시리즈와 시리즈 연산
ages + ages

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

In [150]:
ages * ages

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64

In [152]:
# 시리즈와 숫자 연산
ages + 100

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64

In [154]:
ages * 2

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

In [156]:
# Adding Series of different lengths:
# only entries whose index labels match are computed; the rest become NaN
ages + pd.Series([1, 100])

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64

In [157]:
# ages를 인덱스의 역순으로 정렬
ages.sort_index(ascending = False)

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64

In [160]:
# Operations pair values by matching index label, not by displayed order,
# so adding the reverse-sorted Series gives the same result as ages * 2.
print(ages * 2)
print()
print(ages + ages.sort_index(ascending = False))

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


## 데이터프레임 다루기

In [163]:
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [164]:
# 필터링
scientists[scientists["Age"] > scientists["Age"].mean()]

Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [165]:
scientists["Age"] > scientists["Age"].mean()

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

In [168]:
# DataFrame arithmetic: * 2 doubles the numeric Age column and
# repeats the values of the string (object) columns twice
scientists * 2

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline FranklinRosaline Franklin,1920-07-251920-07-25,1958-04-161958-04-16,74,ChemistChemist
1,William GossetWilliam Gosset,1876-06-131876-06-13,1937-10-161937-10-16,122,StatisticianStatistician
2,Florence NightingaleFlorence Nightingale,1820-05-121820-05-12,1910-08-131910-08-13,180,NurseNurse
3,Marie CurieMarie Curie,1867-11-071867-11-07,1934-07-041934-07-04,132,ChemistChemist
4,Rachel CarsonRachel Carson,1907-05-271907-05-27,1964-04-141964-04-14,112,BiologistBiologist
5,John SnowJohn Snow,1813-03-151813-03-15,1858-06-161858-06-16,90,PhysicianPhysician
6,Alan TuringAlan Turing,1912-06-231912-06-23,1954-06-071954-06-07,82,Computer ScientistComputer Scientist
7,Johann GaussJohann Gauss,1777-04-301777-04-30,1855-02-231855-02-23,154,MathematicianMathematician


## 날짜 데이터 처리

In [172]:
scientists.dtypes

Name          object
Born          object
Died          object
Age            int64
Occupation    object
dtype: object

In [174]:
pd.to_datetime(scientists["Born"], format = "%Y-%m-%d")

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]

In [176]:
scientists.dtypes

Name          object
Born          object
Died          object
Age            int64
Occupation    object
dtype: object

In [178]:
# Convert Born to datetime64 and store it in a new born_at column
scientists["born_at"] = pd.to_datetime(scientists["Born"], format = "%Y-%m-%d")

In [180]:
# Convert Died to datetime64 and store it in a new died_st column.
# NOTE(review): the name presumably was meant to parallel born_at (i.e. died_at);
# later cells reference died_st, so any rename must update them too.
scientists["died_st"] = pd.to_datetime(scientists["Died"], format = "%Y-%m-%d")

In [182]:
scientists.dtypes

Name                  object
Born                  object
Died                  object
Age                    int64
Occupation            object
born_at       datetime64[ns]
died_st       datetime64[ns]
dtype: object

In [184]:
scientists.head()

Unnamed: 0,Name,Born,Died,Age,Occupation,born_at,died_st
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14


In [186]:
scientists["died_st"] - scientists["born_at"]

0   13779 days
1   22404 days
2   32964 days
3   24345 days
4   20777 days
5   16529 days
6   15324 days
7   28422 days
dtype: timedelta64[ns]

# 데이터 삭제

In [191]:
scientists.columns

Index(['Name', 'Born', 'Died', 'Age', 'Occupation', 'born_at', 'died_st'], dtype='object')

In [193]:
# Drop the Age column (axis = 1 targets columns).
# drop() returns a new DataFrame; scientists itself is left unchanged.
scientists.drop(["Age"], axis = 1)

Unnamed: 0,Name,Born,Died,Occupation,born_at,died_st
0,Rosaline Franklin,1920-07-25,1958-04-16,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,Biologist,1907-05-27,1964-04-14
5,John Snow,1813-03-15,1858-06-16,Physician,1813-03-15,1858-06-16
6,Alan Turing,1912-06-23,1954-06-07,Computer Scientist,1912-06-23,1954-06-07
7,Johann Gauss,1777-04-30,1855-02-23,Mathematician,1777-04-30,1855-02-23


In [195]:
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_at,died_st
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23


In [197]:
dropped = scientists.drop(["Age"], axis = 1)

In [199]:
dropped.head()

Unnamed: 0,Name,Born,Died,Occupation,born_at,died_st
0,Rosaline Franklin,1920-07-25,1958-04-16,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,Biologist,1907-05-27,1964-04-14


In [201]:
# inplace = True removes the Age column from scientists itself instead of returning a new DataFrame
scientists.drop(["Age"], axis = 1, inplace = True)

In [203]:
scientists.head()

Unnamed: 0,Name,Born,Died,Occupation,born_at,died_st
0,Rosaline Franklin,1920-07-25,1958-04-16,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,Biologist,1907-05-27,1964-04-14


In [205]:
# 여러 컬럼 삭제하기
scientists.drop(["Born", "Died"], axis = 1)

Unnamed: 0,Name,Occupation,born_at,died_st
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16
1,William Gosset,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,Nurse,1820-05-12,1910-08-13
3,Marie Curie,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,Biologist,1907-05-27,1964-04-14
5,John Snow,Physician,1813-03-15,1858-06-16
6,Alan Turing,Computer Scientist,1912-06-23,1954-06-07
7,Johann Gauss,Mathematician,1777-04-30,1855-02-23


In [207]:
# 행 삭제하기
scientists.drop([0], axis = 0)

Unnamed: 0,Name,Born,Died,Occupation,born_at,died_st
1,William Gosset,1876-06-13,1937-10-16,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,Biologist,1907-05-27,1964-04-14
5,John Snow,1813-03-15,1858-06-16,Physician,1813-03-15,1858-06-16
6,Alan Turing,1912-06-23,1954-06-07,Computer Scientist,1912-06-23,1954-06-07
7,Johann Gauss,1777-04-30,1855-02-23,Mathematician,1777-04-30,1855-02-23


# 데이터 입출력

In [210]:
# csv 저장
scientists.to_csv("scientists.csv", index = False)

In [212]:
# csv 불러오기 
pd.read_csv("scientists.csv")

Unnamed: 0,Name,Born,Died,Occupation,born_at,died_st
0,Rosaline Franklin,1920-07-25,1958-04-16,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,Biologist,1907-05-27,1964-04-14
5,John Snow,1813-03-15,1858-06-16,Physician,1813-03-15,1858-06-16
6,Alan Turing,1912-06-23,1954-06-07,Computer Scientist,1912-06-23,1954-06-07
7,Johann Gauss,1777-04-30,1855-02-23,Mathematician,1777-04-30,1855-02-23


In [214]:
# excel 저장
scientists.to_excel("scientists.xlsx", index = False)

In [216]:
# excel 불러오기
pd.read_excel("scientists.xlsx")

Unnamed: 0,Name,Born,Died,Occupation,born_at,died_st
0,Rosaline Franklin,1920-07-25,1958-04-16,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,Biologist,1907-05-27,1964-04-14
5,John Snow,1813-03-15,1858-06-16,Physician,1813-03-15,1858-06-16
6,Alan Turing,1912-06-23,1954-06-07,Computer Scientist,1912-06-23,1954-06-07
7,Johann Gauss,1777-04-30,1855-02-23,Mathematician,1777-04-30,1855-02-23
