# 할당과 복사

In [3]:
# Create a list object holding 1, 2, 3 and bind the name my_list1 to it.
my_list1 = [1,2,3]

my_list1은 1,2,3 원소를 가진 리스트([1,2,3])를 참조하고 있다.

In [4]:
# Plain assignment does not copy the list: my_list2 becomes a second name
# for the very same list object that my_list1 refers to.
my_list2 = my_list1

In [5]:
# Both names refer to one list, so they print the same contents.
print(my_list1)
print(my_list2)

[1, 2, 3]
[1, 2, 3]


In [6]:
# append() mutates the list in place; the change is made through my_list2
# but my_list1 refers to the same object and will show it as well.
my_list2.append(4)

In [7]:
# The appended 4 is visible through both names.
print(my_list1)
print(my_list2)

[1, 2, 3, 4]
[1, 2, 3, 4]


## 복사
* copy()
* slice[:]
* list()

In [8]:
# Source list for the copy examples.
my_list_original = [10,20,30]

In [9]:
# 1. Copy the list with its copy() method (a shallow copy: a new list object
#    holding the same elements).
my_list_copy = my_list_original.copy()
# Appending to the copy does not affect the original list.
my_list_copy.append(4)

print(my_list_original)
print(my_list_copy)

[10, 20, 30]
[10, 20, 30, 4]


In [10]:
# 2. Copy the list with a full slice: [:] builds a new list with the same elements.
my_list_slice = my_list_original[:]
# Appending to the sliced copy does not affect the original list.
my_list_slice.append(4)

print(my_list_original)
print(my_list_slice)

[10, 20, 30]
[10, 20, 30, 4]


# 딕셔너리
딕셔너리를 다루는 방법은 데이터 분석에 사용하는 pandas의 사용법과 매우 유사하다.

In [11]:
# A dictionary maps keys to values; here two string keys are defined.
my_dict = {
    "name": "소민호",
    "age": 33,
}
print(my_dict)

{'name': '소민호', 'age': 33}


In [12]:
# Square brackets with a key return the value stored under that key.
print(my_dict['name'])

소민호


In [13]:
# Same lookup syntax, this time for the 'age' key.
print(my_dict['age'])

33


문법상 `딕셔너리[키] = 값` 형태로 대입하면,
해당 키가 없으면 새로 만들고, 있으면 값이 수정된다.

In [14]:
# 'email' is not yet a key, so this assignment adds a new key/value pair.
my_dict['email'] = 'mhso.dev@gmail.com'
print(my_dict)

{'name': '소민호', 'age': 33, 'email': 'mhso.dev@gmail.com'}


In [15]:
# 'age' already exists, so the same syntax overwrites its value.
my_dict['age'] = 34
print(my_dict)

{'name': '소민호', 'age': 34, 'email': 'mhso.dev@gmail.com'}


딕셔너리의 결합

In [16]:
# Merge two dictionaries whose keys do not overlap.
dict_a = {
    'a': 1,
    'b': 2,
}
dict_b = {
    'c': 3,
    'd': 4,
}

# update() copies every key/value pair of dict_b into dict_a, in place.
dict_a.update(dict_b)

print(dict_a)

{'a': 1, 'b': 2, 'c': 3, 'd': 4}


In [18]:
# Merge two dictionaries that share the key 'b'.
dict_a = {
    'a': 1,
    'b': 2,
}
dict_b = {
    'b': 3,
    'd': 4,
}

# For a key present in both, update() keeps the value from dict_b.
dict_a.update(dict_b)

print(dict_a)

{'a': 1, 'b': 3, 'd': 4}


In [19]:
# Rebind my_dict to a small dictionary for the membership and lookup examples.
my_dict = {'a' : 10, 'b' : 20}

In [20]:
'a' in my_dict # `in` tests the keys: True because 'a' is a key of my_dict

True

In [21]:
# 'c' is not a key of my_dict, so the membership test is False.
'c' in my_dict

False

In [22]:
# Looking up an existing key returns its value.
my_dict['a']

10

In [23]:
# Looking up a key that does not exist raises KeyError
# (my_dict.get('c') would return None instead of raising).
my_dict['c']

KeyError: 'c'

In [24]:
# keys() returns a view of the dictionary's keys.
my_dict.keys()

dict_keys(['a', 'b'])

In [25]:
# values() returns a view of the dictionary's values.
my_dict.values()

dict_values([10, 20])

In [26]:
# items() returns a view of (key, value) pairs.
my_dict.items()

dict_items([('a', 10), ('b', 20)])

In [29]:
# Delete one entry: del removes the key 'a' together with its value
# (the dictionary itself is kept).
del my_dict['a']
my_dict

{'b': 20}

In [31]:
# Remove every entry: clear() empties the dictionary in place;
# the name my_dict still refers to the (now empty) dictionary.
my_dict.clear()

In [32]:
# Display the dictionary after clear(): it is empty.
my_dict

{}

# Set(집합)
* 중복된 데이터는 저장되지 않는다.

In [33]:
# A set keeps each distinct value once; the repeated values in the literal are dropped.
set1 = {1,2,3,3,4,6,7,1,2,3,4,1,2,3,9,1,1,2,3}
print(set1)

{1, 2, 3, 4, 6, 7, 9}


In [34]:
# Building a set from a list removes the duplicate elements.
my_list = [1,5,1,2,3,4,1,20,1,2,3,1,2,3,5,4,6,7,8,9]
set2 = set(my_list)
print(set2)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 20}


In [36]:
# set() iterates the string character by character, so the result holds the
# distinct letters of 'letter'. Sets are unordered; the printed order may vary.
my_str = 'letter'
set3 = set(my_str)
print(set3)

{'t', 'e', 'l', 'r'}


In [37]:
# Convert the set back into a list (element order follows the set's iteration order).
list(set3)

['t', 'e', 'l', 'r']

데이터 분석을 하기 위한 자료구조

* 통계적, 수학적 분석 (보통 pandas)
* 머신러닝이나 딥러닝 분석 (tensor 개념)

In [38]:
# A two-dimensional list is a list whose elements are themselves lists (rows).
first_row = [10, 20, 30]
second_row = [40, 50, 60]
list_2dim = [first_row, second_row]
print(list_2dim)

[[10, 20, 30], [40, 50, 60]]


In [39]:
# Index 1 selects the second inner list (the second row).
list_2dim[1]

[40, 50, 60]

In [40]:
# Chained indexing: take row 1 first, then element 1 inside that row.
list_2dim[1][1]

50

# 판다스 사용하기
* 설치 : pip install pandas

In [41]:
import pandas as pd
import numpy as np

In [43]:
# A Series is a one-dimensional labelled array; np.nan marks a missing value
# and makes the whole Series float64.
s = pd.Series(data=[1, 2, 3, np.nan, 6, 7])
s

0    1.0
1    2.0
2    3.0
3    NaN
4    6.0
5    7.0
dtype: float64

In [45]:
# Ten consecutive daily timestamps starting at 2020-08-28.
dates = pd.date_range(start='20200828', periods=10)
dates

DatetimeIndex(['2020-08-28', '2020-08-29', '2020-08-30', '2020-08-31',
               '2020-09-01', '2020-09-02', '2020-09-03', '2020-09-04',
               '2020-09-05', '2020-09-06'],
              dtype='datetime64[ns]', freq='D')

In [46]:
# Build a 5x4 DataFrame of random values with explicit row and column labels.
row_labels = [1, 2, 3, 4, 5]
column_labels = ['A', 'B', 'C', 'D']
random_values = np.random.rand(5, 4)
df = pd.DataFrame(random_values, index=row_labels, columns=column_labels)
df

Unnamed: 0,A,B,C,D
1,0.493838,0.842812,0.200522,0.492447
2,0.168047,0.342746,0.635836,0.606635
3,0.647717,0.009465,0.206832,0.709503
4,0.435998,0.011946,0.419373,0.61675
5,0.996153,0.663407,0.401474,0.752816


In [47]:
# A single column label selects that column and returns it as a Series.
df['A']

1    0.493838
2    0.168047
3    0.647717
4    0.435998
5    0.996153
Name: A, dtype: float64

In [49]:
# A list of column labels selects several columns and returns a DataFrame.
df[['A', 'C']]

Unnamed: 0,A,C
1,0.493838,0.200522
2,0.168047,0.635836
3,0.647717,0.206832
4,0.435998,0.419373
5,0.996153,0.401474


In [50]:
# The list of column labels can be stored in a variable and used the same way.
column_names = ['A', 'C']
df[column_names]

Unnamed: 0,A,C
1,0.493838,0.200522
2,0.168047,0.635836
3,0.647717,0.206832
4,0.435998,0.419373
5,0.996153,0.401474


In [51]:
# loc selects by index label: the row labelled 3, returned as a Series.
df.loc[3]

A    0.647717
B    0.009465
C    0.206832
D    0.709503
Name: 3, dtype: float64

In [52]:
# loc[rows, columns]: ':' keeps every row, the list picks columns A and C.
df.loc[:, ['A','C']]

Unnamed: 0,A,C
1,0.493838,0.200522
2,0.168047,0.635836
3,0.647717,0.206832
4,0.435998,0.419373
5,0.996153,0.401474


In [53]:
# Label slices in loc include both endpoints, so rows 1, 2 and 3 are returned.
df.loc[1:3, ['B','D']]

Unnamed: 0,B,D
1,0.842812,0.492447
2,0.342746,0.606635
3,0.009465,0.709503


In [55]:
# Comparing a column with a scalar yields a boolean Series: one True/False per row.
df['A'] > 0.5

1    False
2    False
3     True
4    False
5     True
Name: A, dtype: bool

In [56]:
# Indexing with a boolean Series keeps only the rows where it is True (here A > 0.3).
df[ df['A'] > 0.3 ]

Unnamed: 0,A,B,C,D
1,0.493838,0.842812,0.200522,0.492447
3,0.647717,0.009465,0.206832,0.709503
4,0.435998,0.011946,0.419373,0.61675
5,0.996153,0.663407,0.401474,0.752816


In [57]:
df[ df > 0.3 ] # compare every cell of the frame; cells that fail the test become NaN

Unnamed: 0,A,B,C,D
1,0.493838,0.842812,,0.492447
2,,0.342746,0.635836,0.606635
3,0.647717,,,0.709503
4,0.435998,,0.419373,0.61675
5,0.996153,0.663407,0.401474,0.752816


B 컬럼의 값이 0.3보다 큰 행에서 A, D 컬럼만 뽑아주세요.

In [63]:
# Boolean row mask plus column list: rows where B > 0.3, columns A and D only.
df.loc[df['B'] > 0.3, ['A','D']]

Unnamed: 0,A,D
1,0.493838,0.492447
2,0.168047,0.606635
5,0.996153,0.752816
