# Python 주요 패키지 소개

# numpy

In [1]:
import numpy as np 

## 1차원 벡터 생성하기

In [2]:
ar1 = np.array([1, 2, 3]) 

In [3]:
type(ar1)

numpy.ndarray

In [4]:
print(ar1)

[1 2 3]


In [5]:
ar1.ndim

1

In [6]:
ar1.shape

(3,)

In [7]:
np.arange(11)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [8]:
np.arange(1, 11)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [9]:
np.arange(1, 21, 2)

array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19])

## 벡터 원소의 형 변환

In [10]:
a = [1, 2, 3.0]

In [11]:
ar = np.array(a)

In [12]:
print(ar)

[1. 2. 3.]


In [13]:
ar.astype(int)

array([1, 2, 3])

In [14]:
ar.astype(str)

array(['1.0', '2.0', '3.0'], dtype='<U32')

## 2차원 행렬 생성하기

In [15]:
ar2 = np.array([[1, 2, 3], 
                [4, 5, 6]]) 

In [16]:
type(ar2)

numpy.ndarray

In [17]:
print(ar2)

[[1 2 3]
 [4 5 6]]


In [18]:
ar2.ndim

2

In [19]:
ar2.shape

(2, 3)

In [20]:
np.zeros([2, 3], dtype = 'int')

array([[0, 0, 0],
       [0, 0, 0]])

In [21]:
np.zeros([3, 2], dtype = 'float')

array([[0., 0.],
       [0., 0.],
       [0., 0.]])

## 배열의 차원 변환

In [22]:
ar1 = np.arange(12)

In [23]:
print(ar1)

[ 0  1  2  3  4  5  6  7  8  9 10 11]


In [24]:
ar1.reshape(2, 6)

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11]])

In [25]:
ar1.reshape(3, 4)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [26]:
ar1.reshape(3, 5)  # Raises ValueError: the 12 elements cannot fill a 3 x 5 (15-element) shape.

ValueError: cannot reshape array of size 12 into shape (3,5)

In [31]:
ar2 = ar1.reshape(4, 3)

In [32]:
print(ar2)

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]


In [33]:
ar2.reshape(6, -1)

array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11]])

## 배열의 인덱싱

In [34]:
ar1 = np.arange(6)

In [35]:
ar1[1]

1

In [36]:
ar2 = ar1.reshape(2, 3)

In [37]:
ar2[0, 0]

0

In [38]:
ar2[0, 1]

1

## 배열의 슬라이싱

In [39]:
ar1[:2]

array([0, 1])

In [40]:
ar1[2:]

array([2, 3, 4, 5])

In [41]:
ar1[:]

array([0, 1, 2, 3, 4, 5])

In [42]:
ar2[0:1, 0:1]

array([[0]])

In [43]:
ar2[:, 1:3]

array([[1, 2],
       [4, 5]])

In [56]:
ar2[:, :]

array([[0, 1, 2],
       [3, 4, 5]])

## 배열의 원소 정렬하기

In [None]:
ar3 = np.array([1, 5, 6, 3, 4, 7, 2]) 

In [None]:
print(ar3)

In [None]:
np.sort(ar3)

In [None]:
print(ar3)

In [None]:
ar3.sort()

In [None]:
print(ar3)

In [None]:
ar3[::-1]

## 벡터의 덧셈, 뺄셈 연산

In [None]:
a = np.array([1, 1])
b = np.array([-1, 1])

In [None]:
a + b

In [None]:
a - b

In [None]:
c = [1, 1]
d = [-1, 1]

In [None]:
c + d

## 벡터의 스칼라배 연산

In [None]:
a * 0

In [None]:
a * 2

In [None]:
b * 2

In [None]:
(a + b) * 2

In [None]:
a * 2 + b * 2

In [None]:
a * (2 + 3)

In [None]:
a * 2 + a * 3

## 벡터의 내적

In [None]:
def innerProd(x, y):
    """Return the inner (dot) product of two equal-length vectors.

    The result is the sum of the element-wise products of ``x`` and ``y``.

    Args:
        x: 1-D array-like of numbers (ndarray, list or tuple).
        y: 1-D array-like of numbers with the same length as ``x``.

    Returns:
        The scalar sum of ``x[i] * y[i]`` over all positions ``i``.
    """
    # Convert first so plain lists work too: ``list * list`` raises TypeError.
    x = np.asarray(x)
    y = np.asarray(y)
    return np.sum(x * y)

In [None]:
innerProd(a, b)

In [None]:
np.dot(a, b)

## 벡터의 크기

In [None]:
def vecLen(x):
    """Return the Euclidean length (L2 norm) of a vector.

    Args:
        x: 1-D array-like of numbers (ndarray, list or tuple).

    Returns:
        The square root of the sum of the squared elements.
    """
    # Convert first so plain lists work too: ``list ** 2`` raises TypeError.
    x = np.asarray(x)
    return np.sqrt(np.sum(x ** 2))

In [None]:
vecLen(a)

In [None]:
np.dot(a, a)**(1/2)

In [None]:
# Cosine of the angle between a and b: inner product divided by the product of the two lengths.
innerProd(a, b) / (vecLen(a) * vecLen(b))

## 행렬의 덧셈, 뺄셈 연산

In [None]:
A = np.array([[1, 2], [3, 4]])
B = np.array([[1, 3], [2, 4]])
C = np.ones([2, 2])

In [None]:
A + B

In [None]:
A - B

In [None]:
(A + B) + C

In [None]:
A + (B + C)

## 행렬의 스칼라배 연산

In [None]:
A * 0

In [None]:
A * 2

In [None]:
B * 2

In [None]:
A * 2 + B * 2

In [None]:
(A + B) * 2

In [None]:
A * (2 + 3)

In [None]:
A * 2 + A * 3

## 행렬의 곱셈 연산

In [None]:
np.matmul(A, B)

In [None]:
np.matmul(B, A)

## 역행렬

In [None]:
import numpy.linalg as lin

In [None]:
AI = lin.inv(A)

In [None]:
np.matmul(A, AI)

In [None]:
np.matmul(AI, A)

## 행렬식

In [None]:
lin.det(A)

In [None]:
lin.det(B)

In [None]:
M = np.array([[1, 2], [2, 4]])

In [None]:
lin.det(M)

## 가상의 키(신장) 데이터 생성

In [None]:
np.random.seed(seed = 1234)

In [None]:
heights = np.random.normal(170, 10, size = 100)

## 기술 통계량

In [None]:
np.mean(heights)

In [None]:
np.median(heights)

In [None]:
np.min(heights)

In [None]:
np.max(heights)

In [None]:
np.max(heights) - np.min(heights)

In [27]:
np.percentile(heights, 25)

NameError: name 'heights' is not defined

In [None]:
np.percentile(heights, 75)

In [None]:
np.percentile(heights, 75) - np.percentile(heights, 25)

In [None]:
np.percentile(heights, [0, 25, 50, 75, 100])

In [None]:
np.var(heights)

In [None]:
np.std(heights)

In [None]:
math = [82, 58, 64, 76, 55, 89, 98, 92, 46, 87]
korean = [98, 78, 87, 78, 72, 87, 99, 83, 76, 90]

In [None]:
np.cov(math, korean)

In [None]:
np.corrcoef(math, korean)

## [참고] 데이터 표준화

In [None]:
import scipy.stats as stats

In [None]:
scaled = stats.zscore(heights)

In [None]:
np.mean(scaled)

In [None]:
np.std(scaled)

## 도수분포표

In [None]:
bins = np.arange(130, 211, 10)

In [None]:
hist, bins = np.histogram(heights, bins)

In [None]:
print(hist)

In [None]:
total = len(heights)

In [None]:
prop = hist / total

In [None]:
print(prop)

In [None]:
sum(prop)

# matplotlib

## 히스토그램

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.hist(heights, bins)
plt.xlabel('Height(cm)', fontsize = 14)
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.show()

## 상자수염그림

In [None]:
plt.boxplot(heights, sym = 'bo')
plt.title('Boxplot of Heights', fontsize = 16)
plt.xticks([1], ['Heights'], fontsize = 12)
plt.show()

## 산점도

In [None]:
plt.scatter(math, korean, color = 'b', marker = 'o')
plt.title('Scatter plot', fontsize = 16)
plt.xlabel('Math', fontsize = 14)
plt.ylabel('Korean', fontsize = 14)
plt.show()

# pandas

In [None]:
import pandas as pd

## 1차원 시리즈 생성하기

In [None]:
sr = pd.Series([1, 3, 5, 7, 9])

In [None]:
print(sr)

In [None]:
print(sr.values)

In [None]:
print(sr.index)

In [None]:
sr.index = ['a', 'b', 'c', 'd', 'e']
print(sr)

In [None]:
sr = pd.Series([1, 3, 5, 7, 9], index = ['a', 'b', 'c', 'd', 'e'])
print(sr)

In [None]:
sr = pd.Series({'a':1, 'b':3, 'c':5, 'd':7, 'e':9})
print(sr)

## 시리즈의 인덱싱과 슬라이싱

In [None]:
# Positional access: the index holds string labels, so a bare integer key
# relies on a deprecated fallback. Use .iloc to select by position explicitly.
sr.iloc[0]

In [None]:
sr['a']

In [None]:
# Select the 2nd and 3rd elements by position; .iloc avoids the deprecated
# integer-key fallback on a string-labelled Series.
sr.iloc[[1, 2]]

In [None]:
sr[['b', 'c']]

In [None]:
sr[:2]

In [None]:
sr[2:]

In [None]:
sr[sr >= 3]

## 시리즈의 값 변경

In [None]:
sr + 1

In [None]:
sr * 2

In [None]:
# Cast to object first: writing strings into an integer Series is an
# incompatible-dtype assignment that recent pandas versions warn about.
sr = sr.astype(object)
sr[sr >= 5] = ['가', '나', '다']

In [28]:
sr[sr == '다'] = np.nan

NameError: name 'sr' is not defined

In [None]:
sr.fillna(0)

In [None]:
sr.dropna()

In [None]:
nr = pd.Series('라', index = ['f'])

In [None]:
# Series.append was removed in pandas 2.0; pd.concat is the replacement.
sr = pd.concat([sr, nr])
print(sr)

## 2차원 데이터프레임 생성하기

In [None]:
df = pd.DataFrame([[85, 79, 92], [57, 76, 69], [98, 89, 74]], 
                  index = [1, 2, 3], 
                  columns = ['A반', 'B반', 'C반']) 

In [None]:
print(df) 

In [None]:
# A dict is read column by column, so each list holds one column's values.
# The values are arranged so this builds the same table as the
# list-of-rows construction earlier in this section.
df = pd.DataFrame({'A반':[85, 57, 98], 'B반':[79, 76, 89], 'C반':[92, 69, 74]},
                  index = [1, 2, 3]) 

In [None]:
print(df) 

## [참고] 파일과 폴더 다루기

In [None]:
import os, shutil 

In [None]:
os.getcwd()

In [None]:
os.mkdir('../temp')

In [None]:
shutil.copy('test.txt', 'test_new.txt')

In [None]:
shutil.copy2('test.txt', '../temp')

In [None]:
shutil.move('test_new.txt', '../temp')

In [None]:
os.chdir('../temp')

In [None]:
os.rename('test.txt', 'test_old.txt')

In [None]:
os.listdir()

In [None]:
os.remove('test_old.txt')

In [None]:
# Delete every entry in the current working directory (the temp folder
# entered with os.chdir earlier in this section). os.remove only handles
# files, so a subdirectory here would raise an error.
for i in os.listdir():
    os.remove(i)

In [None]:
os.chdir('../') 

In [None]:
os.getcwd()

In [None]:
os.rmdir('./temp')

In [None]:
# shutil.rmtree('./temp')

## 엑셀 파일을 데이터프레임으로 불러오기

In [None]:
df = pd.read_excel('./data/2019_KBO_Win.xlsx', 
                   sheet_name = 'Sheet1')

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.index

In [None]:
df.dtypes

## 데이터프레임 다루기 : 인덱스로 일부 확인

In [None]:
df.head(5)

In [None]:
df.tail(5)

In [None]:
df.iloc[0]

In [None]:
df.iloc[0:10]

In [None]:
df.iloc[0:10, 0:12]

In [None]:
df.loc[0]

In [None]:
df.loc[0:10]

In [None]:
df.loc[0:10, '선수명':'도루']

## 데이터프레임 다루기 : 일부 선택 또는 삭제

In [None]:
df.팀명

In [None]:
df['팀명'] = pd.Categorical(df['팀명'])

In [29]:
df[['선수명', '팀명']]

NameError: name 'df' is not defined

In [None]:
df.drop(range(100, 300), axis = 0)

In [None]:
df.drop('BABIP', axis = 1)
print(df)

In [None]:
df.drop('BABIP', axis = 1, inplace = True)
print(df)

In [None]:
df.drop(['wOBA', 'WAR'], axis = 1, inplace = True)
print(df)

In [None]:
df.drop(range(0, 12), axis = 1)  # Raises an error: with axis = 1, drop() looks up column labels, not positions.

## 데이터프레임 다루기 : 새로운 컬럼 생성

In [None]:
df['팀명'].value_counts()

In [None]:
df['닉네임'] = '별명'
print(df)

In [None]:
df['타수*'] = df['타석'] - df['타수']
print(df)

In [None]:
df['타율*'] = np.round(df['안타'] / df['타수'], 3)
print(df)

In [None]:
df['OPS*'] = df['출루율'] + df['장타율']
print(df)

In [None]:
df = df.loc[ : , '선수명':'OPS']
print(df)

In [None]:
df.isna().sum()

In [None]:
df = df.dropna()
df.shape

## 데이터프레임 다루기 : 조건을 만족하는 행 확인

In [None]:
df = df[df.타석 >= 144 * 3.1]
print(df)

In [None]:
# drop = True discards the old index directly instead of first turning it
# into an 'index' column and then dropping that column.
df = df.reset_index(drop = True)

In [None]:
df.sort_values(by = '안타', ascending = False).head(5)

In [None]:
df.sort_values(by = ['홈런', '삼진'], ascending = False)

In [None]:
df.sort_values(by = ['홈런', '삼진'], ascending = [False, True])

In [None]:
df[(df.홈런 >= 20) & (df.OPS >= 0.9)]

In [None]:
df[(df.홈런 >= 20) & (df.도루 >= 20)]

## 다른 데이터 타입으로 변경하기

In [None]:
df.values

In [None]:
df.values.tolist()

In [None]:
df.to_dict('list')

In [None]:
df.to_excel('./data/test.xlsx', sheet_name = '2019_KBO')

## [참고] csv, txt 파일 입출력

In [None]:
df.to_csv('./data/test.csv', index = None)

In [None]:
df1 = pd.read_csv('./data/test.csv')

In [None]:
df1.head(5)

In [None]:
df.to_csv('./data/test.txt', sep = ' ', index = None) 

In [None]:
df2 = pd.read_csv('./data/test.txt', sep = ' ')

In [None]:
df2.head(5)

# scikit-learn

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz 
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score

## 분석 데이터 준비

In [None]:
iris = load_iris()
print(iris)

In [None]:
type(iris)

In [None]:
iris.data

In [None]:
iris.target

In [None]:
iris.target_names

In [30]:
iris.feature_names

NameError: name 'iris' is not defined

In [None]:
df = pd.DataFrame(data = iris.data, 
                  columns = iris.feature_names)
print(df)

In [None]:
df['label'] = iris.target
print(df)

In [None]:
df = df[df.label != 2]
print(df)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 0:4], 
                                                    df.iloc[:, 4],
                                                    test_size = 0.3, 
                                                    random_state = 1)

## 분류모형 적합 및 추정값 생성

In [None]:
tree = DecisionTreeClassifier(random_state = 1, 
                              criterion = 'gini',
                              max_depth = 10, 
                              min_samples_leaf = 1)

In [None]:
tree.fit(X_train, y_train)

In [None]:
tree.get_depth()

In [None]:
tree.get_n_leaves()

In [None]:
tree.get_params()

## 나무모형 시각화

In [None]:
export_graphviz(tree, out_file = './data/tree_iris.dot', 
                class_names = ['Setosa', 'Versicolor'], 
                feature_names = iris.feature_names, 
                impurity = False, 
                filled = True)

In [None]:
import graphviz

In [None]:
# Read with an explicit encoding so the result does not depend on the
# platform default (scikit-learn writes .dot files as UTF-8).
with open('./data/tree_iris.dot', encoding = 'utf-8') as file:
    dot_graph = file.read()

In [None]:
display(graphviz.Source(dot_graph))

## 나무모형 이미지로 저장

In [None]:
# exist_ok lets the notebook be re-run without failing on the existing folder.
os.makedirs('./image', exist_ok = True)

In [None]:
import pydot

In [None]:
(graph,) = pydot.graph_from_dot_file('./data/tree_iris.dot')
graph.write_png('./image/tree_iris.png')

In [None]:
from IPython.display import Image

In [None]:
Image('./image/tree_iris.png')

## 변수의 중요도 시각화

In [None]:
n_feature = iris.data.shape[1]

In [None]:
importance = tree.feature_importances_

In [None]:
plt.barh(np.arange(n_feature), importance, align = 'center')
plt.yticks(np.arange(n_feature), iris.feature_names)
plt.xlabel('Feature Importances')
plt.ylabel('Feature')
plt.ylim(-1, n_feature)
plt.show()

In [None]:
plt.barh(np.arange(n_feature), importance, align = 'center')
plt.yticks(np.arange(n_feature), iris.feature_names)
plt.xlabel('Feature Importances')
plt.ylabel('Feature')
plt.ylim(-1, n_feature)
plt.savefig('./image/varImp.png', bbox_inches = 'tight')

In [None]:
Image('./image/varImp.png')

## 모형의 성능 평가하기

In [None]:
pred = tree.predict(X_test)

In [None]:
confusion_matrix(y_test, pred)

In [None]:
accuracy_score(y_test, pred)

In [None]:
precision_score(y_test, pred)

In [None]:
recall_score(y_test, pred)

In [None]:
f1_score(y_test, pred)

## ROC 곡선 시각화 및 AUC 확인

In [None]:
proba = tree.predict_proba(X_test)[:, 1]

In [None]:
fprs, tprs, thresholds = roc_curve(y_test, proba)

In [None]:
plt.plot(fprs, tprs, label = 'ROC')
plt.plot([0, 1], [0, 1], 'r--', label = 'Random')
plt.xlabel('FPR'); plt.ylabel('TPR')
plt.legend(loc = 'lower right')
plt.show()

In [None]:
# AUC is defined over ranking scores, so pass the predicted probabilities
# (the same `proba` used to draw the ROC curve), not the hard 0/1 labels.
roc_auc_score(y_test, proba)

## End of Document