# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
# Read the CSV exactly once. When the AIDU platform module is in scope its
# configured data directory is used; otherwise the file is read from the
# working directory. (`aidu` is not imported in the visible part of this file,
# so referencing it unconditionally would raise NameError.)
if 'aidu' in globals():
    data_path = aidu.framework.config.data_dir + '/data_v1.csv'
else:
    data_path = 'data_v1.csv'
df = pd.read_csv(data_path)

# Count missing values per column
df.isna().sum()

# Summary statistics (numeric columns only)
df.describe()

# Drop a single column
df.drop(columns=['customerID'], inplace=True)

# Distribution of the values in one column
df['Churn'].value_counts()

# Missing-value handling ('%%', '&&', '##', ... are placeholder column names)
# Float column: fill with the number 0, not the string '0', so the column
# stays numeric. The result is assigned back because calling fillna with
# inplace=True on a selected column may act on a temporary copy and leave df
# unchanged.
df['%%'] = df['%%'].fillna(0)
# Object column: fill with a single space
df['&&'] = df['&&'].fillna(' ')
# Column with only a few missing values: drop those rows from df itself
# (dropna on the selected column alone never removes rows from the frame)
df.dropna(subset=['##'], inplace=True)

# Drop several columns at once
df.drop(columns=['~~', '@@', '$$'], inplace=True)

# Visualization
# Bar chart of the value counts of one column
df['gender'].value_counts().plot(kind='bar')
plt.title('gender')
# Finish this figure here; otherwise the first chart of the loop below is
# drawn onto the same axes as this one.
plt.show()

# Bar chart for every object-dtype column, one figure per column
object_list = df.select_dtypes(object).columns.values
for col in object_list:
    df[col].value_counts().plot(kind='bar')
    plt.title(col)
    plt.show()

# Histogram with matplotlib
plt.hist(df['수주여부'])
# Each chart is shown before the next one is drawn so they do not pile up on
# a single set of axes.
plt.show()

# seaborn histplot - usual seaborn arguments: data=df, x='aaa', y='bbb', hue='ccc' (hue: grouping column)
sns.histplot(data=df, x='tenure', hue='Churn')
plt.show()
# The same distribution drawn as a smooth density curve
sns.kdeplot(data=df, x='tenure', hue='Churn')
plt.show()
# countplot: number of rows per category (the column goes in `x`, not `s`)
sns.countplot(data=df, x='MultipleLines', hue='Churn')
plt.show()

# Correlation heatmap
# Restrict to numeric/bool columns first: corr() on a frame that still holds
# object columns raises on recent pandas versions.
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), annot=True)
plt.show()
# Correlation of a few selected columns only
corr = df[['tenure','MonthlyCharges','TotalCharges']].corr()
sns.heatmap(corr, annot=True)
plt.show()

# Save the result (row index is not written)
df.to_csv('data_v1_save.csv', index=False)

# LabelEncoder: categorical -> numeric codes
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# The same encoder object is refit for each column, so afterwards it only
# remembers the classes of the last column it was fitted on.
df['선주사'] = le.fit(df['선주사']).transform(df['선주사'])
df['선종'] = le.fit(df['선종']).transform(df['선종'])

# One-hot encoding: categorical -> 0/1 indicator columns
# (scikit-learn alternative, not used here)
# from sklearn.preprocessing import OneHotEncoder
# ohe = OneHotEncoder(sparse=False)
df1 = pd.get_dummies(
    df,
    columns=['유사선박수주경험', '중국입찰여부', '국내경쟁사입찰여부'],
    drop_first=True,
)

# Train / test split
# Target is the 'Churn' column; features are every other column of df1
y = df1['Churn'].values
X = df1.drop(columns='Churn').values
X.shape, y.shape

from sklearn.model_selection import train_test_split
# 70/30 split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Normalization / scaling
# NOTE(review): both scalers below are applied one after the other to the same
# arrays; if they are meant as alternatives, keep only one - confirm intent.
# In both cases the scaler is fitted on the training data only and the test
# data is transformed with the fitted parameters.

# MinMaxScaler, kept in `scaler`
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# StandardScaler, kept in `std`
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
std.fit(X_train)
X_train = std.transform(X_train)
X_test = std.transform(X_test)

# Model training
# 1) LogisticRegression classifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# fit() returns the estimator itself, so construction and training are chained
lg = LogisticRegression().fit(X_train, y_train)
# Score on the test split
lg.score(X_test, y_test)
# Predictions for the test split
lg_pred = lg.predict(X_test)
# Confusion matrix
confusion_matrix(y_test, lg_pred)
# Print the classification report
print(classification_report(y_test, lg_pred))
# Draw the performance chart
# (recall_eval is not defined in the visible part of this file)
recall_eval('LogisticRegression', lg_pred, y_test)

# 2) K-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier
# fit() returns the estimator itself, so construction and training are chained
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
# Predictions for the test split
knn_pred = knn.predict(X_test)
# Draw the performance chart
# (recall_eval is not defined in the visible part of this file)
recall_eval('K-Nearest Neighbor', knn_pred, y_test)

# 3) Decision tree
from sklearn.tree import DecisionTreeClassifier
# Depth capped at 10 and seed fixed; fit() returns the estimator itself
dt = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_train, y_train)
# Predictions for the test split
dt_pred = dt.predict(X_test)
# Draw the performance chart
# (recall_eval is not defined in the visible part of this file)
recall_eval('DecisionTree', dt_pred, y_test)

# Ensemble methods
# <RandomForest>
# random_state: fixed random seed. Keep it fixed while tuning!
# n_jobs: number of CPUs to use
# max_depth: maximum depth a tree may reach. Guards against overfitting
# n_estimators: number of trees in the ensemble
# max_features: maximum number of features considered. Guards against overfitting
# min_samples_split: minimum number of samples needed to split a node. default=2. Guards against overfitting
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, max_features=9, max_depth=15, random_state=42)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
cm = confusion_matrix(y_test, rfc_pred)
# Set the figure size
plt.figure(figsize=(10,10))
# fmt='d' prints the integer counts in full; the default annotation format
# keeps only two significant digits.
sns.heatmap(cm, annot=True, fmt='d')
# Show the heatmap so later plots do not draw onto this figure
plt.show()
print(classification_report(y_test, rfc_pred))

# Deep-learning model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

model = Sequential()
# Take the input width from the training data instead of hard-coding it, so
# the model keeps matching X_train when the number of feature columns changes.
model.add(Dense(4, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(3, activation='relu'))
model.add(Dropout(0.3))
# Single sigmoid unit: binary classification output
model.add(Dense(1, activation='sigmoid'))
# Inspect the architecture
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Compile - multi-class model (when y is one-hot encoded)
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Compile - multi-class model (when y is NOT one-hot encoded)
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Compile - regression model: model.compile(optimizer='adam', loss='mse')

# Stop when val_loss has not improved for 5 epochs
es = EarlyStopping(monitor='val_loss', mode='min', patience=5, verbose=1)
# Keep only the weights with the lowest val_loss seen so far
cp = ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=1)
# NOTE(review): the test split doubles as validation data here, so early
# stopping and checkpoint selection are tuned on it - confirm this is intended.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=20, callbacks=[es, cp])

# Performance evaluation: training curves
# A model compiled with metrics=['accuracy'] records its history under
# 'accuracy' / 'val_accuracy' on current Keras; older releases used
# 'acc' / 'val_acc'. Pick whichever key is present instead of assuming 'acc'.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])

plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Acc')
# Legend entries follow the order of the plot calls above
plt.legend(['loss', 'val_loss', 'acc', 'val_acc'])
plt.show()