# 빅분기 실기 2유형

## import 파일

In [1]:
# NOTE(review): MAP_EXECUTABLE is not referenced in any visible cell — looks like
# an accidental IDE auto-import; confirm before removing.
from mmap import MAP_EXECUTABLE
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Splitting the data into training / validation sets
from sklearn.model_selection import train_test_split

# Models
# =======================================================
# - Regression (random forest)
from sklearn.ensemble import RandomForestRegressor
# - Classification (random forest)
from sklearn.ensemble import RandomForestClassifier
# =======================================================

# Model performance metrics
# =======================================================
# - Regression
from sklearn.metrics import mean_squared_error as MSE   # RMSE = MSE ** 0.5
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_log_error as MSLE # RMSLE = MSLE ** 0.5 (square root)
# - Classification
# =======================================================
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score


# pandas display settings: show up to 30 columns, format floats with 4 decimals
pd.set_option('display.max_columns', 30)
pd.set_option('display.float_format', '{:.4f}'.format)

## sklearn help

### sklearn 전체 라이브러리 확인

In [2]:
import sklearn
# Alternative overview of the public names: print(sklearn.__all__)
# help() writes the package documentation itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
help(sklearn)

Help on package sklearn:

NAME
    sklearn - Configure global settings and get information about the working environment.

PACKAGE CONTENTS
    __check_build (package)
    _build_utils (package)
    _built_with_meson
    _config
    _distributor_init
    _isotonic
    _loss (package)
    _min_dependencies
    base
    calibration
    cluster (package)
    compose (package)
    conftest
    covariance (package)
    cross_decomposition (package)
    datasets (package)
    decomposition (package)
    discriminant_analysis
    dummy
    ensemble (package)
    exceptions
    experimental (package)
    externals (package)
    feature_extraction (package)
    feature_selection (package)
    gaussian_process (package)
    impute (package)
    inspection (package)
    isotonic
    kernel_approximation
    kernel_ridge
    linear_model (package)
    manifold (package)
    metrics (package)
    mixture (package)
    model_selection (package)
    multiclass
    multioutput
    naive_bayes
    neig

### 전처리 라이브러리

In [9]:
import sklearn.preprocessing

# List every name exposed by the preprocessing subpackage (encoders, scalers, ...).
# As the last expression of the cell, the notebook displays the resulting list.
dir(sklearn.preprocessing)

['Binarizer',
 'FunctionTransformer',
 'KBinsDiscretizer',
 'KernelCenterer',
 'LabelBinarizer',
 'LabelEncoder',
 'MaxAbsScaler',
 'MinMaxScaler',
 'MultiLabelBinarizer',
 'Normalizer',
 'OneHotEncoder',
 'OrdinalEncoder',
 'PolynomialFeatures',
 'PowerTransformer',
 'QuantileTransformer',
 'RobustScaler',
 'SplineTransformer',
 'StandardScaler',
 'TargetEncoder',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_csr_polynomial_expansion',
 '_data',
 '_discretization',
 '_encoders',
 '_function_transformer',
 '_label',
 '_polynomial',
 '_target_encoder',
 '_target_encoder_fast',
 'add_dummy_feature',
 'binarize',
 'label_binarize',
 'maxabs_scale',
 'minmax_scale',
 'normalize',
 'power_transform',
 'quantile_transform',
 'robust_scale',
 'scale']

### 성능지표

In [7]:
import sklearn.metrics

# Model performance metrics
# dir(sklearn.metrics) lists everything in the module; keep only the names that
# start with 'mean' and end with 'error'.
print([
    metric
    for metric in dir(sklearn.metrics)
    if metric.startswith('mean') and metric.endswith('error')
])

['mean_absolute_error', 'mean_absolute_percentage_error', 'mean_squared_error', 'mean_squared_log_error']


In [5]:
import sklearn.ensemble
# Show every name exposed by the ensemble subpackage (the available ensemble models).
# dir() already returns a list, so it is passed to display() as-is.
display(dir(sklearn.ensemble))

['AdaBoostClassifier',
 'AdaBoostRegressor',
 'BaggingClassifier',
 'BaggingRegressor',
 'BaseEnsemble',
 'ExtraTreesClassifier',
 'ExtraTreesRegressor',
 'GradientBoostingClassifier',
 'GradientBoostingRegressor',
 'HistGradientBoostingClassifier',
 'HistGradientBoostingRegressor',
 'IsolationForest',
 'RandomForestClassifier',
 'RandomForestRegressor',
 'RandomTreesEmbedding',
 'StackingClassifier',
 'StackingRegressor',
 'VotingClassifier',
 'VotingRegressor',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_bagging',
 '_base',
 '_forest',
 '_gb',
 '_gradient_boosting',
 '_hist_gradient_boosting',
 '_iforest',
 '_stacking',
 '_voting',
 '_weight_boosting']

## 분류 모델

In [3]:
# [1] Load the files (two of them: XX_train.csv, XX_test.csv)
XY = pd.read_csv('https://raw.githubusercontent.com/Soyoung-Yoon/bigdata/main/customer_train.csv')
X_submission = pd.read_csv('https://raw.githubusercontent.com/Soyoung-Yoon/bigdata/main/customer_test.csv')

# print(XY.head(2))
# print(X_submission.head(2))

# Separate the features from the target column ('성별').
# `columns=` alone selects the axis; no `axis` argument is needed.
X = XY.drop(columns = ['성별'])
Y = XY['성별']
# print(X.shape, Y.shape, X_submission.shape)   # (3500, 10) (3500,) (2482, 10)

# [2] Explore the data (XY.info(), X_submission.info())
# XY.info()           # object columns: 주구매상품, 주구매지점
# XY.isnull().sum()   # missing values: 환불금액
# XY.select_dtypes(include = ['object']).columns  # list the object columns

# [3] Preprocessing
# [3-1] X, X_submission -> X_all
# [3-2] X_all : drop columns, fill missing values, encode (categorical -> numeric)
# [3-3] X_all : scale (MinMaxScaler is applied below; StandardScaler is the alternative)
# [3-4] X_all : split back into X, X_submission
# NOTE(review): the encoders and the scaler are fit on the training and submission
# rows together — confirm that this is acceptable for the task.
X_all = pd.concat([X, X_submission], axis = 0)
# Drop column: 회원ID
X_all = X_all.drop(columns = ['회원ID'])

# Encoding: object columns -> LabelEncoder
X_all['주구매상품'] = LabelEncoder().fit_transform(X_all['주구매상품'])
X_all['주구매지점'] = LabelEncoder().fit_transform(X_all['주구매지점'])

# One-hot encoding (alternative to the label encoding above)
# X_all = pd.get_dummies(X_all)

# Missing values: fill 환불금액 with 0
X_all['환불금액'] = X_all['환불금액'].fillna(0)
# Scaling
temp = MinMaxScaler().fit_transform(X_all)
# print(type(temp))   # <class 'numpy.ndarray'>
# The scaler returns a bare ndarray; rebuilding the frame restores the column
# names and gives the rows a fresh 0..n-1 index (training rows first).
X_all = pd.DataFrame(temp, columns = X_all.columns)

# X_all.info()
# The first len(X) rows are the training rows, the rest are the submission rows.
X = X_all.iloc[:len(X),]
X_submission = X_all.iloc[len(X):,]
# print(X.shape, X_submission.shape)    # (3500, 9) (2482, 9)

# [4] Modeling
# [4-1] train_test_split : (X, Y) -> (x_train, x_test, y_train, y_test)
# [4-2] create the model object and fit it on (x_train, y_train)
# [4-3] evaluate on (x_train, y_train), (x_test, y_test)

# stratify=Y is passed because this is a classification model.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state = 123)
# print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)   # (2800, 9) (700, 9) (2800,) (700,)
model = RandomForestClassifier(n_estimators=500, max_depth=3, random_state = 0)
model.fit(x_train, y_train)
pred = model.predict(x_test)

acc = accuracy_score(y_test, pred)
# Column 1 of predict_proba is the probability of the second class (label 1 here).
proba = model.predict_proba(x_test)[:, 1]
roc = roc_auc_score(y_test, proba)

# print(acc, roc)

# [5] Choose the final model, predict on X_submission, create the submission file
# NOTE(review): ROC-AUC above is computed from probabilities, while hard labels are
# written to the file — confirm which of the two the task expects.
y_pred = model.predict(X_submission)
pd.DataFrame({'pred' : y_pred}).to_csv('result.csv', index = False)

# [6] Check the submitted file: one 'pred' row per submission row
df4 = pd.read_csv('result.csv')
assert df4.shape == (len(X_submission), 1), df4.shape   # (2482, 1)
# df4.head(10)

# Compare the class ratio of the training target with that of the predictions.
print(Y.value_counts(normalize=True))
print(df4['pred'].value_counts(normalize=True))

성별
0   0.6240
1   0.3760
Name: proportion, dtype: float64
pred
0   0.8131
1   0.1869
Name: proportion, dtype: float64
