In [2]:
import pandas as pd
import numpy as np

In [2]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'

# seperate = 크기에 상관 없이 모든 빈공간
# url에 header는 포함되어있지 않다
df = pd.read_csv(url, sep='\s+', header=None)

# 처음 다섯개 출력 (기본값 n=5)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [5]:
# columns를 할당 가능
df.columns = []

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')

### series

In [8]:
l = [1, 2, 3, 4, 5]
# index를 직접 지정 가능
obj = pd.Series(data = l, index = ['a', 'b', 'c', 'd', 'e'])
obj

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [10]:
# dict로 index : data 한꺼번에 설정
dict_data = {'a':1, 'b':2, 'c':3}
obj2 = pd.Series(dict_data, name='ex')
obj2

a    1
b    2
c    3
Name: ex, dtype: int64

In [11]:
obj2.index

Index(['a', 'b', 'c'], dtype='object')

In [12]:
obj2.values

array([1, 2, 3], dtype=int64)

In [14]:
type(obj2.values)

numpy.ndarray

### data frame

In [1]:
raw_data = {'first name': ['jason', 'molly', 'tina', 'jake', 'amy'],
           'age': [33, 41, 23, 42, 55],
           'country': ['USA', 'korea', 'japan', 'china', 'italy']}

In [3]:
df = pd.DataFrame(raw_data)
df

Unnamed: 0,first name,age,country
0,jason,33,USA
1,molly,41,korea
2,tina,23,japan
3,jake,42,china
4,amy,55,italy


In [4]:
# 일부 column만 지정 가능
df = pd.DataFrame(raw_data, columns = ['first name', 'age'])
df

Unnamed: 0,first name,age
0,jason,33
1,molly,41
2,tina,23
3,jake,42
4,amy,55


In [5]:
# 접근 방식1
df.age

0    33
1    41
2    23
3    42
4    55
Name: age, dtype: int64

In [6]:
# 접근 방식2
df['first name']

0    jason
1    molly
2     tina
3     jake
4      amy
Name: first name, dtype: object

In [7]:
type(df.age)  # series 타입

pandas.core.series.Series

In [8]:
# indexing
s = pd.Series(np.nan, index=[10, 11, 12, 1, 2, 3, 4, 5])

# 처음부터 지정된 인덱스 3번까지
print(s.loc[:3])

# 처음부터 3개 (순서대로의 숫자로 치환된 인덱스)
print(s.iloc[:3])

10   NaN
11   NaN
12   NaN
1    NaN
2    NaN
3    NaN
dtype: float64
10   NaN
11   NaN
12   NaN
dtype: float64


In [9]:
# 새로운 데이터 할당
df.age > 40

0    False
1     True
2    False
3     True
4     True
Name: age, dtype: bool

In [10]:
df.debt = df.age > 40
df.debt

  df.debt = df.age > 40


0    False
1     True
2    False
3     True
4     True
Name: age, dtype: bool

In [11]:
df.temp = pd.Series([0, 0, 1, 1, 1])
df.temp

  df.temp = pd.Series([0, 0, 1, 1, 1])


0    0
1    0
2    1
3    1
4    1
dtype: int64

In [13]:
# json 데이터
json_data = {'a': {2001: 1.0, 2002: 1.5}, 'b': {2000: 1.4, 2001: 1.2}}

pd.DataFrame(json_data)

Unnamed: 0,a,b
2001,1.0,1.2
2002,1.5,
2000,,1.4


### selection & drop

In [15]:
# index가 하나 더 생김
df.reset_index()

Unnamed: 0,index,first name,age
0,0,jason,33
1,1,molly,41
2,2,tina,23
3,3,jake,42
4,4,amy,55


In [16]:
# 기존의 index를 없앰
df.reset_index(drop=True)

Unnamed: 0,first name,age
0,jason,33
1,molly,41
2,tina,23
3,jake,42
4,amy,55


In [21]:
# df 자체에 변화가 일어남
df.reset_index(inplace=True)
df

Unnamed: 0,level_0,index,first name,age
0,0,0,jason,33
1,1,1,molly,41
2,2,2,tina,23
3,3,3,jake,42
4,4,4,amy,55


In [22]:
df.drop('index', axis=1, inplace=True)
df

Unnamed: 0,level_0,first name,age
0,0,jason,33
1,1,molly,41
2,2,tina,23
3,3,jake,42
4,4,amy,55


In [23]:
df.drop('level_0', axis=1, inplace=True)
df

Unnamed: 0,first name,age
0,jason,33
1,molly,41
2,tina,23
3,jake,42
4,amy,55


### dataframe operation

In [24]:
df1 = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [25]:
df2 = pd.DataFrame(np.arange(16).reshape(4, 4), columns=list('abcd'))
df2

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [26]:
# fill value 사용 불가능 => NaN 결과가 나옴
df1 + df2

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,
1,7.0,9.0,11.0,
2,14.0,16.0,18.0,
3,,,,


In [27]:
# fill value 사용. df1의 없는 부분을 0으로 대체.
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,3.0
1,7.0,9.0,11.0,7.0
2,14.0,16.0,18.0,11.0
3,12.0,13.0,14.0,15.0


In [29]:
# series, dataframe 연산
s2 = pd.Series(np.arange(4))
s2

0    0
1    1
2    2
3    3
dtype: int32

In [30]:
# 제대로 연산이 되지 않음.
df2 + s2

Unnamed: 0,a,b,c,d,0,1,2,3
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,


In [31]:
# axis를 지정하여 연산
# axis 0의 index를 기준으로 연산
# broadcasting이 발생
df2.add(s2, axis=0)

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,5,6,7,8
2,10,11,12,13
3,15,16,17,18


In [43]:
# axis 1의 index를 기준으로 연산
df2.add(s2, axis=1)

Unnamed: 0,a,b,c,d,0,1,2,3
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,


### map

In [33]:
s1 = pd.Series(np.arange(10))
s1

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [34]:
s1.map(lambda x: x ** 2)

0     0
1     1
2     4
3     9
4    16
5    25
6    36
7    49
8    64
9    81
dtype: int64

In [36]:
d = {1: 'A', 2: 'B', 3: 'C'}
s1.map(d)

0    NaN
1      A
2      B
3      C
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
dtype: object

In [37]:
s1.map(s2)

0    0.0
1    1.0
2    2.0
3    3.0
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
dtype: float64

In [46]:
df = pd.DataFrame({'sex': ['male', 'female', 'female', 'male', 'male']})
df

Unnamed: 0,sex
0,male
1,female
2,female
3,male
4,male


In [40]:
df.sex.unique()

array(['male', 'female'], dtype=object)

In [49]:
df['sex_code'] = df.sex.map({'male': 0, 'female': 1})

In [42]:
df

Unnamed: 0,sex,sex_code
0,male,0
1,female,1
2,female,1
3,male,0
4,male,0


In [48]:
# replace
df.sex.replace({'male': 0, 'female': 1})

0    0
1    1
2    1
3    0
4    0
Name: sex, dtype: int64

In [None]:
# input : target list, conversion list
df.sex.replace(['male', 'female'], [0, 1])

In [45]:
# inplace를 True로 하면 데이터 프레임에 있는 데이터를 변환시켜버림
df.sex.replace({'male': 0, 'female': 1}, inplace=True)
df

Unnamed: 0,sex,sex_code
0,0,0
1,1,1
2,1,1
3,0,0
4,0,0


In [47]:
# 함수 사용 가능 (df는 안 바뀜)
def change_sex(x):
    if x == 'male':
        return 0
    return 1

df.sex.map(change_sex)

0    0
1    1
2    1
3    0
4    0
Name: sex, dtype: int64

### apply

In [51]:
df_info = df[['sex', 'sex_code']]
type(df_info)

pandas.core.frame.DataFrame

In [56]:
def f(x):
    m = 0
    f = 0
    for i in x:
        if i == 'male':
            m += 1
        else:
            f += 1
    return pd.Series([m, f], index=['male count', 'female count'])

dd = df.apply(f)

Unnamed: 0,sex,sex_code
male count,3,0
female count,2,5


In [58]:
# applymap

f = lambda x: -x
dd.applymap(f)

Unnamed: 0,sex,sex_code
male count,-3,0
female count,-2,-5


### built-in function

In [59]:
type(df.sex.unique())

numpy.ndarray

In [61]:
print(df.sex.unique())

['male' 'female']


In [65]:
# isnull : null 여부를 반환.
arr = np.array([np.NaN, 1, 2, np.NaN])
df3 = pd.DataFrame(arr)
df3.isnull()

Unnamed: 0,0
0,True
1,False
2,False
3,True


In [None]:
# null의 갯수
df3.isnull().sum()

In [66]:
# sort_vaules : 정렬
df3.sort_values([0], ascending=True)

Unnamed: 0,0
1,1.0
2,2.0
0,
3,


In [68]:
df.dtypes

sex         object
sex_code     int64
dtype: object

In [70]:
df.sex.value_counts()

male      3
female    2
Name: sex, dtype: int64