# pandas의 고유한 자료구조 - Series와 DataFrame 이해하기

    pandas에서는 고유하게 정의한 자료 구조인 Series와 DataFrame을 사용하여, 빅 데이터 분석에 있어 높은 수준의 성능을 발휘합니다. 
    Series는 동일한 데이터형의 복수 개의 성분으로 구성된 자료 구조이며, DataFrame은 서로 같거나 다른 데이터형의 여러 개의 열에 
    대하여 복수 개의 성분으로 구성된 '표와 같은 형태'의 자료 구조입니다.

## Series
 - Python 리스트나, 혹은 numpy array 등이 함수의 인자로 입력
 - 앞서 다룬 numpy의 1차원 Array와 비슷하지만, 각 값마다 index가 붙어 있다고 생각하면 된다. 

In [2]:
import numpy as np
import pandas as pd

In [5]:
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [7]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [9]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [11]:
obj.dtypes

dtype('int64')

In [13]:
# NOTE: the call parentheses are missing, so this only references the method
# (the notebook echoes a "bound method" repr). Use obj.value_counts() to
# actually count how often each value occurs.
obj.value_counts

<bound method IndexOpsMixin.value_counts of 0    4
1    7
2   -5
3    3
dtype: int64>

In [15]:
obj2 = pd.Series([4,7,-5,3], index=['d','b','a','c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [17]:
sdata = {"Charles":35000, "Kilho":71000, "Hayoung":16000, "Sangjae":5000}
sdata

{'Charles': 35000, 'Hayoung': 16000, 'Kilho': 71000, 'Sangjae': 5000}

In [19]:
# A dict can be passed directly: its keys become the index labels and its
# values become the data.
obj3 = pd.Series(sdata)
obj3

Charles    35000
Hayoung    16000
Kilho      71000
Sangjae     5000
dtype: int64

In [22]:
obj3.name = "Salary"
obj3.index.name = "Names"
obj3

Names
Charles    35000
Hayoung    16000
Kilho      71000
Sangjae     5000
Name: Salary, dtype: int64

In [24]:
obj3.index = ['A','B','C','D']
obj3

A    35000
B    16000
C    71000
D     5000
Name: Salary, dtype: int64

## DataFrame
 - DataFrame에 입력할 데이터는 Python 딕셔너리 혹은 numpy의 2차원 array 등의 형태

In [26]:
# Column-oriented input: each key is a column name and each list holds that
# column's values, one entry per row.
data = dict(
    names=["Kilho", "Kilho", "Kilho", "Charles", "Charles"],
    year=[2014, 2015, 2016, 2015, 2016],
    points=[1.5, 1.7, 3.6, 2.4, 2.9],
)
# No index is given, so rows get the default integer index.
df = pd.DataFrame(data=data)
df

Unnamed: 0,names,points,year
0,Kilho,1.5,2014
1,Kilho,1.7,2015
2,Kilho,3.6,2016
3,Charles,2.4,2015
4,Charles,2.9,2016


In [31]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [33]:
df.columns

Index(['names', 'points', 'year'], dtype='object')

In [35]:
df.values

array([['Kilho', 1.5, 2014],
       ['Kilho', 1.7, 2015],
       ['Kilho', 3.6, 2016],
       ['Charles', 2.4, 2015],
       ['Charles', 2.9, 2016]], dtype=object)

In [37]:
df.index.name = "Num"
df.columns.name = "Info"
df

Info,names,points,year
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Kilho,1.5,2014
1,Kilho,1.7,2015
2,Kilho,3.6,2016
3,Charles,2.4,2015
4,Charles,2.9,2016


In [38]:
df.names

Num
0      Kilho
1      Kilho
2      Kilho
3    Charles
4    Charles
Name: names, dtype: object

 - NaN(Not a Number)은 값이 없다는 뜻이다. R의 NA와 같다고 생각하면 된다. 

In [41]:
# NOTE(review): "name" does not match the "names" key in data, so that column
# comes out all-NaN as well — likely a typo for "names"; confirm.
df2 = pd.DataFrame(data, columns=["year","name","points","penalty"], index=["one","two","three","four","five"])
df2  # "penalty" is not a key in data, so the column is filled with NaN.

Unnamed: 0,year,name,points,penalty
one,2014,,1.5,
two,2015,,1.7,
three,2016,,3.6,
four,2015,,2.4,
five,2016,,2.9,


## describe 함수 
 - df.describe()
 - 계산이 가능한 컬럼에 한해서 각 컬럼의 개수, 평균, 표준편차, 최솟/최댓값, 사분위수 등 기본 통계량을 산출한 결과를 보여줍니다.

In [45]:
df.describe()

Info,points,year
count,5.0,5.0
mean,2.42,2015.2
std,0.864292,0.83666
min,1.5,2014.0
25%,1.7,2015.0
50%,2.4,2015.0
75%,2.9,2016.0
max,3.6,2016.0


# DataFrame 인덱싱

In [46]:
import numpy as np
import pandas as pd

In [83]:
# Source data for the indexing examples: one list per column.
data = dict(
    names=["Kilho", "Kilho", "Kilho", "Charles", "Charles"],
    year=[2014, 2015, 2016, 2015, 2016],
    points=[1.5, 1.7, 3.6, 2.4, 2.9],
)
# Explicit row labels and column order; "penalty" has no entry in data, so
# that column starts out as NaN.
df = pd.DataFrame(
    data,
    index=["one", "two", "three", "four", "five"],
    columns=["year", "names", "points", "penalty"],
)
df

Unnamed: 0,year,names,points,penalty
one,2014,Kilho,1.5,
two,2015,Kilho,1.7,
three,2016,Kilho,3.6,
four,2015,Charles,2.4,
five,2016,Charles,2.9,


In [51]:
df["year"]

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [52]:
df.year

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [59]:
df[["year","points"]]

Unnamed: 0,year,points
one,2014,1.5
two,2015,1.7
three,2016,3.6
four,2015,2.4
five,2016,2.9


In [61]:
df[["year","points"]][:3]

Unnamed: 0,year,points
one,2014,1.5
two,2015,1.7
three,2016,3.6


In [84]:
# Broadcast a single scalar to every row of the existing "penalty" column.
# Bracket assignment is used rather than attribute assignment
# (df.penalty = ...): attribute assignment only works when the column already
# exists and otherwise creates a plain Python attribute instead of a column.
df["penalty"] = 0.5
df

Unnamed: 0,year,names,points,penalty
one,2014,Kilho,1.5,0.5
two,2015,Kilho,1.7,0.5
three,2016,Kilho,3.6,0.5
four,2015,Charles,2.4,0.5
five,2016,Charles,2.9,0.5


In [85]:
# Overwrite the "penalty" column with one value per row, in row order.
# Bracket assignment is used rather than attribute assignment
# (df.penalty = ...): attribute assignment only works when the column already
# exists and otherwise creates a plain Python attribute instead of a column.
df["penalty"] = [0.1, 0.2, 0.3, 0.4, 0.5]
df

Unnamed: 0,year,names,points,penalty
one,2014,Kilho,1.5,0.1
two,2015,Kilho,1.7,0.2
three,2016,Kilho,3.6,0.3
four,2015,Charles,2.4,0.4
five,2016,Charles,2.9,0.5


In [86]:
df["zeros"] = np.arange(5)
df

Unnamed: 0,year,names,points,penalty,zeros
one,2014,Kilho,1.5,0.1,0
two,2015,Kilho,1.7,0.2,1
three,2016,Kilho,3.6,0.3,2
four,2015,Charles,2.4,0.4,3
five,2016,Charles,2.9,0.5,4


In [87]:
val = pd.Series([-1.2, -1.5, -1.7], index=["two", "four", "five"])
# Assignment aligns on index labels: only the rows whose label appears in val
# ("two", "four", "five") receive a value; the remaining rows are left as NaN.
# (Author's note: in R the values would simply be filled in, or a join would
# be needed.)
df["debt"] = val
df

Unnamed: 0,year,names,points,penalty,zeros,debt
one,2014,Kilho,1.5,0.1,0,
two,2015,Kilho,1.7,0.2,1,-1.2
three,2016,Kilho,3.6,0.3,2,
four,2015,Charles,2.4,0.4,3,-1.5
five,2016,Charles,2.9,0.5,4,-1.7


In [88]:
df["net_points"] = df.points - df.penalty
df

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points
one,2014,Kilho,1.5,0.1,0,,1.4
two,2015,Kilho,1.7,0.2,1,-1.2,1.5
three,2016,Kilho,3.6,0.3,2,,3.3
four,2015,Charles,2.4,0.4,3,-1.5,2.0
five,2016,Charles,2.9,0.5,4,-1.7,2.4


In [89]:
df["high_points"] = df["net_points"] > 2.0
df

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points,high_points
one,2014,Kilho,1.5,0.1,0,,1.4,False
two,2015,Kilho,1.7,0.2,1,-1.2,1.5,False
three,2016,Kilho,3.6,0.3,2,,3.3,True
four,2015,Charles,2.4,0.4,3,-1.5,2.0,False
five,2016,Charles,2.9,0.5,4,-1.7,2.4,True


In [90]:
del df["high_points"]
df

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points
one,2014,Kilho,1.5,0.1,0,,1.4
two,2015,Kilho,1.7,0.2,1,-1.2,1.5
three,2016,Kilho,3.6,0.3,2,,3.3
four,2015,Charles,2.4,0.4,3,-1.5,2.0
five,2016,Charles,2.9,0.5,4,-1.7,2.4


In [91]:
del df["zeros"]
df

Unnamed: 0,year,names,points,penalty,debt,net_points
one,2014,Kilho,1.5,0.1,,1.4
two,2015,Kilho,1.7,0.2,-1.2,1.5
three,2016,Kilho,3.6,0.3,,3.3
four,2015,Charles,2.4,0.4,-1.5,2.0
five,2016,Charles,2.9,0.5,-1.7,2.4


In [92]:
df.columns

Index(['year', 'names', 'points', 'penalty', 'debt', 'net_points'], dtype='object')

In [93]:
df.index.name = "Order"
df.columns.name = "Info"
df

Info,year,names,points,penalty,debt,net_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,2014,Kilho,1.5,0.1,,1.4
two,2015,Kilho,1.7,0.2,-1.2,1.5
three,2016,Kilho,3.6,0.3,,3.3
four,2015,Charles,2.4,0.4,-1.5,2.0
five,2016,Charles,2.9,0.5,-1.7,2.4


In [98]:
df[0:3]  # Integer slices select rows by position, like a numpy array: positions 0, 1 and 2 (the stop position 3 is excluded).

Info,year,names,points,penalty,debt,net_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,2014,Kilho,1.5,0.1,,1.4
two,2015,Kilho,1.7,0.2,-1.2,1.5
three,2016,Kilho,3.6,0.3,,3.3


In [100]:
df["two":"four"]

Info,year,names,points,penalty,debt,net_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
two,2015,Kilho,1.7,0.2,-1.2,1.5
three,2016,Kilho,3.6,0.3,,3.3
four,2015,Charles,2.4,0.4,-1.5,2.0


### 가장 좋은 인덱싱
 - .loc, .iloc 인덱서를 활용하여 행(과 열)을 가지고 온다. .loc은 라벨 기준, .iloc은 정수 위치 기준이다.

In [106]:
df.loc["two"]

Info
year           2015
names         Kilho
points          1.7
penalty         0.2
debt           -1.2
net_points      1.5
Name: two, dtype: object

In [107]:
df.loc["two":"four"]

Info,year,names,points,penalty,debt,net_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
two,2015,Kilho,1.7,0.2,-1.2,1.5
three,2016,Kilho,3.6,0.3,,3.3
four,2015,Charles,2.4,0.4,-1.5,2.0


In [109]:
df.loc["two":"four","points"]

Order
two      1.7
three    3.6
four     2.4
Name: points, dtype: float64

In [111]:
df.loc[:,"year"]

Order
one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [113]:
df["year"]

Order
one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [115]:
df.loc[:,["year","names"]]

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,2014,Kilho
two,2015,Kilho
three,2016,Kilho
four,2015,Charles
five,2016,Charles


In [117]:
df.loc["three":"five","year":"penalty"]

Info,year,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
three,2016,Kilho,3.6,0.3
four,2015,Charles,2.4,0.4
five,2016,Charles,2.9,0.5


In [131]:
# Assigning through .loc to a row label that does not exist yet ("six")
# appends a new row holding the given values, one per column.
# NOTE(review): the recorded output also lists a "seven" row of NaN that this
# statement does not create — presumably left over from another run; confirm.
df.loc["six", :] = [2013, "Hayoung", 4.0, 0.1, 2.1,2.2]
df

Info,year,names,points,penalty,debt,net_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,2014.0,Kilho,1.5,0.1,,1.4
two,2015.0,Kilho,1.7,0.2,-1.2,1.5
three,2016.0,Kilho,3.6,0.3,,3.3
four,2015.0,Charles,2.4,0.4,-1.5,2.0
five,2016.0,Charles,2.9,0.5,-1.7,2.4
six,2013.0,Hayoung,4.0,0.1,2.1,2.2
seven,,,,,,


### .iloc
 - Array의 index와 비슷하다. 라벨이 아니라 0부터 시작하는 정수 위치로 행과 열을 선택한다.

In [133]:
df.iloc[3]

Info
year             2015
names         Charles
points            2.4
penalty           0.4
debt             -1.5
net_points          2
Name: four, dtype: object

In [135]:
df.iloc[3:5,0:2]

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
four,2015.0,Charles
five,2016.0,Charles


In [137]:
df.iloc[[0, 1, 3], [1, 2]]

Info,names,points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,Kilho,1.5
two,Kilho,1.7
four,Charles,2.4


In [139]:
df.iloc[:,1:4]

Info,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,Kilho,1.5,0.1
two,Kilho,1.7,0.2
three,Kilho,3.6,0.3
four,Charles,2.4,0.4
five,Charles,2.9,0.5
six,Hayoung,4.0,0.1
seven,,,


In [141]:
df.iloc[1,1]

'Kilho'

### boolean Indexing

In [143]:
df["year"] > 2014

Order
one      False
two       True
three     True
four      True
five      True
six      False
seven    False
Name: year, dtype: bool

In [146]:
df.loc[df["year"] > 2014,:]

Info,year,names,points,penalty,debt,net_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
two,2015.0,Kilho,1.7,0.2,-1.2,1.5
three,2016.0,Kilho,3.6,0.3,,3.3
four,2015.0,Charles,2.4,0.4,-1.5,2.0
five,2016.0,Charles,2.9,0.5,-1.7,2.4


In [149]:
df.loc[df["names"]=="Kilho", ["names","points"]]

Info,names,points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,Kilho,1.5
two,Kilho,1.7
three,Kilho,3.6


In [151]:
df.loc[(df["points"]>2) & (df["points"]<3),:]

Info,year,names,points,penalty,debt,net_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
four,2015.0,Charles,2.4,0.4,-1.5,2.0
five,2016.0,Charles,2.9,0.5,-1.7,2.4


In [152]:
df.loc[df["points"]>3, "penalty"] = 0
df

Info,year,names,points,penalty,debt,net_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,2014.0,Kilho,1.5,0.1,,1.4
two,2015.0,Kilho,1.7,0.2,-1.2,1.5
three,2016.0,Kilho,3.6,0.0,,3.3
four,2015.0,Charles,2.4,0.4,-1.5,2.0
five,2016.0,Charles,2.9,0.5,-1.7,2.4
six,2013.0,Hayoung,4.0,0.0,2.1,2.2
seven,,,,,,
