## 데이터 읽어오기

In [None]:
# Read the lecture dataset from a CSV file hosted on GitHub

import pandas

df1 = pandas.read_csv(
    "https://raw.githubusercontent.com/YONESI-DBIS/DS_Lecture/main/UB_Data.csv"
)
# Dump the whole frame followed by a separator line in a single call.
print(df1, "=" * 20, sep="\n")
# Final expression of the cell: preview of the first rows.
df1.head()

## 기본적인 데이터 통계 (Revisit)

In [None]:
# Statistical summary of df1

df1.describe()

In [None]:
# Grouping: summarise Income separately for each distinct Age value

df1.groupby('Age')['Income'].describe()

- 여기서 잠깐! 함수(function)를 배워봅시다.

In [None]:
def sum(a,b):
    """Return ``a + b``.

    Note: this definition is bound to the name ``sum``, so code that runs
    after it sees this two-argument function instead of the builtin ``sum``.
    """
    total = a + b
    return total

# Example call: `sum` here is the two-argument function defined just above in
# this cell (the builtin `sum` would not accept two integers like this).
a = 3
b = 4

c = sum(a,b)
print(c)

In [None]:
def sum_and_mul(a,b):
    """Return the sum and the product of ``a`` and ``b`` as a 2-tuple."""
    added = a + b
    multiplied = a * b
    return added, multiplied

# The two returned values are unpacked into separate names in one assignment.
c, d = sum_and_mul(4,5) 

print(c, d)

In [None]:
# A simple function written with lambda:
# the lambda expression is bound to the name `add` and then called like any
# other function.

add = lambda a, b : a + b
add(2,3)

## t-검정

- 모집단의 분산이나 표준편차를 알지 못할 때
- 모집단을 대표하는 표본으로부터 추정된 분산이나 표준편차를 가지고 검정하는 방법
- “두 모집단의 평균간의 차이는 없다”라는 귀무가설과 “두 모집단의 평균 간에 차이가 있다”라는 대립가설 중에 하나를 선택할 수 있도록 하는 통계적 검정방법


In [None]:
from scipy import stats

# Income values of the rows whose Age is 27 and 28 respectively.
age1_income = df1['Income'][df1['Age'] == 27]
age2_income = df1['Income'][df1['Age'] == 28]

# t statistic and p-value for two independent samples.
# (Rule of thumb used in this lecture: with p > 0.05 the two samples are
# judged not to differ significantly.)
print(stats.ttest_ind(age1_income, age2_income))

In [None]:
# Distinct values found in the Age column.
ages = df1['Age'].unique()
print(ages)

# Sorted in place (no reassignment), then shown again in ascending order.
ages.sort()
print(ages)

In [None]:
# t-test of Income for every ordered pair of age groups. This covers each age
# against itself as well as both (i, j) and (j, i).
for i in ages:
    # The first group's incomes depend only on i, so select them once per
    # outer iteration instead of once per (i, j) pair.
    age1_income = df1.loc[df1['Age'] == i, 'Income']
    for j in ages:
        age2_income = df1.loc[df1['Age'] == j, 'Income']
        print(i, "-", j, " : ", stats.ttest_ind(age1_income, age2_income))

In [None]:
def getAgeIncome(a, b, data=None):
    """Return the Income values of two age groups.

    Args:
        a: Age value that selects the first group.
        b: Age value that selects the second group.
        data: DataFrame with ``Age`` and ``Income`` columns. When omitted,
            the module-level ``df1`` is used, so existing two-argument calls
            behave exactly as before.

    Returns:
        A tuple ``(age1_income, age2_income)`` holding the ``Income`` Series
        of the rows where ``Age == a`` and ``Age == b`` respectively.
    """
    if data is None:
        # Looked up at call time, so the default follows whatever df1
        # currently refers to.
        data = df1
    age1_income = data.loc[data['Age'] == a, 'Income']
    age2_income = data.loc[data['Age'] == b, 'Income']
    return age1_income, age2_income

# Same pairwise comparison as before, with the row selection delegated to
# getAgeIncome().
for i in ages:
    for j in ages:
        age1, age2 = getAgeIncome(i, j)
        ttest_result = stats.ttest_ind(age1, age2)
        print(i, "-", j, " : ", ttest_result)


## 카이제곱 검정

In [None]:
import scipy.stats
import numpy as np

# chi2_contingency expects a contingency table of observed frequencies, not the
# raw observations, so first count how often each (Age, Income) combination
# occurs and test that table.
# NOTE(review): if Income is continuous this table will be very sparse; binning
# Income (e.g. with pandas.cut) before cross-tabulating may be more
# appropriate — confirm against the dataset.
contingency_table = pandas.crosstab(df1['Age'], df1['Income'])

chi2, pvalue, dof, expected = scipy.stats.chi2_contingency(contingency_table)
print('검정통계량', chi2)
print('p값', pvalue)
print('자유도', dof)

## 선형회귀분석

In [None]:
# 데이터 준비

from statsmodels.formula.api import ols, glm
import seaborn

# Per-age summary statistics of Income. reset_index() turns Age from the group
# index back into an ordinary column so it can be referenced by name later.
df1_desc = df1.groupby('Age')['Income'].describe().reset_index()

In [None]:
# Visual check of Age against the per-age mean Income.
# x and y are passed by keyword: current seaborn releases accept only `data`
# positionally, so the positional form `regplot('Age', 'mean', ...)` raises a
# TypeError there. The keyword form works on older releases as well.
# lowess=True fits a LOWESS (locally weighted scatterplot smoothing) curve.

seaborn.regplot(x='Age', y='mean', lowess=True, data=df1_desc)

In [None]:
# Ordinary Least Squares
# Formula form: dependent_variable ~ independent_1 + independent_2 + ...

res = ols('mean ~ Age', data=df1_desc).fit()   # fit() estimates the weights (coefficients)
res.summary()

In [None]:
# OLS on the full dataset: Income explained by Age, Experience and Education.
res = ols('Income ~ Age + Experience + Education', data=df1).fit()
res.summary()

In [None]:
# Generalized Linear Model
# Same formula as the OLS fit; no `family` argument is passed, so the
# library's default family applies.

res = glm('Income ~ Age + Experience + Education', data=df1).fit()
res.summary()


In [None]:
## Prediction

# Data preparation: every column except Income, restricted to the first five
# rows, to be used as prediction input.

df_test = df1[df1.columns.difference(['Income'])][0:5]
print(df_test)

In [None]:
# Fit OLS on the full data, predict for df_test, then show the predictions
# next to the Income values of the first five rows of df1.
res = ols('Income ~ Age + Experience + Education', data=df1).fit()
predict_result = res.predict(df_test)

print(predict_result, '=' * 20, df1[0:5]['Income'], sep='\n')

In [None]:
# Fit the GLM on the full data, predict for df_test, then show the predictions
# next to the Income values of the first five rows of df1.
res = glm('Income ~ Age + Experience + Education', data=df1).fit()
predict_result = res.predict(df_test)

print(predict_result, '=' * 20, df1[0:5]['Income'], sep='\n')

In [None]:
# Age against Income for every row, with a LOWESS curve.
# x and y are passed by keyword because current seaborn releases accept only
# `data` positionally; the positional form raises a TypeError there.
seaborn.regplot(x='Age', y='Income', lowess=True, data=df1)

## 로지스틱 회귀분석

In [None]:
import numpy
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score


# Load the data.
# The loader is imported explicitly: a bare `import sklearn` is not guaranteed
# to expose the `sklearn.datasets` submodule as an attribute, in which case
# `sklearn.datasets.load_breast_cancer()` fails with AttributeError.
from sklearn.datasets import load_breast_cancer

cancer_data = load_breast_cancer()
# Uncomment any of these to explore the returned object:
#print(cancer_data)
#print(cancer_data.DESCR)
#print(cancer_data.feature_names)
#print(cancer_data.target)

In [None]:
# Build the DataFrame: one column per feature name, plus the target values
# appended as a `diagnosis` column.
cancer_df = pandas.DataFrame(
    cancer_data.data, columns=cancer_data.feature_names
).assign(diagnosis=cancer_data.target)

# Inspect the resulting DataFrame.
cancer_df.info()

In [None]:
# Data transformation: standardise each feature to mean 0 and variance 1.
# Only the feature columns are scaled. `diagnosis` is the prediction target;
# if it stayed in the scaled matrix, the label itself would become one of the
# model's input features and the evaluation would be meaningless.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_cancer_df = scaler.fit_transform(cancer_df.drop(columns=['diagnosis']))

#print(cancer_df)
#print(scaled_cancer_df)

In [None]:
# Model training

# Inputs are the scaled matrix; the target is the diagnosis column.
X = scaled_cancer_df
Y = cancer_df['diagnosis']

# 30% of the rows are held out for testing; the seed is fixed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1
)

model = LogisticRegression()
model.fit(X_train, Y_train)

In [None]:
# Prediction

predict = model.predict(X_test)

# Actual labels first, predicted labels second, each preceded by a separator.
print("=" * 20, list(Y_test), "=" * 20, predict, sep="\n")

In [None]:
# Analysis of the prediction results

# Print the confusion matrix explicitly: as a bare expression that is not the
# last statement of the cell, its value was computed and then discarded.
print(confusion_matrix(Y_test, predict))

# `acccuracy` keeps its original spelling so any other cell that refers to
# this name keeps working.
acccuracy = accuracy_score(Y_test, predict)
precision = precision_score(Y_test, predict)
recall = recall_score(Y_test, predict)
f1 = f1_score(Y_test, predict)
# ROC AUC measures how well the scores rank the samples, so it is computed
# from the predicted probability of class 1 rather than the hard 0/1 labels.
roc_auc = roc_auc_score(Y_test, model.predict_proba(X_test)[:, 1])

print('정확도: {0:.3f}, 정밀도: {1:.3f}, 재현율: {2:.3f},  F1: {3:.3f}'.format(acccuracy,precision,recall,f1))
# roc_auc was previously computed but never reported.
print('ROC AUC: {0:.3f}'.format(roc_auc))

## 상관관계 분석

In [None]:
# Whole table: pairwise correlation between the columns of df1

print(df1.corr())

In [None]:
# A specific column: only the Income column of the correlation table

print(df1.corr()['Income'])

In [None]:
column_names = list(df1.columns)

# The correlation table is the same on every iteration, so compute it once
# instead of once per column.
corr_matrix = df1.corr()

for name in column_names:
    print("==============", name)
    print(corr_matrix[name])

In [None]:
# Correlation of every column with Age under three different coefficients,
# each printed under its own heading.
for heading, method in (
    ("=== Pearson correlation coefficient (DEFAULT)", 'pearson'),
    ("=== Spearman rank correlation", 'spearman'),
    ("=== Kendall Tau correlation coefficient", 'kendall'),
):
    print(heading)
    print(df1.corr(method=method)['Age'])