# 데이터 전처리

In [53]:
import pandas as pd

train_df = pd.read_csv('./data_base/train.csv')
test_df = pd.read_csv('./data_base/test.csv')

In [None]:
train_df.isnull().sum()

In [None]:
train_df.head()
# user_id -> 삭제

In [None]:
train_df.drop('user_id',axis=1,inplace=True)
train_df.head()

### 범주형 문자 데이터 -> Label Encoding

In [None]:
# dtype_ object -> preferred_difficulty_level	preferred_difficulty_level -> label encoding
train_df['preferred_difficulty_level'].value_counts()


In [None]:
train_df['preferred_difficulty_level'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

le_sub = LabelEncoder()
le_sub = le_sub.fit(train_df['subscription_type'])

train_df['subscription_type'] = le_sub.transform(train_df['subscription_type'])
train_df['subscription_type'].head()

In [None]:
le_lev = LabelEncoder()
le_lev = le_lev.fit(train_df['preferred_difficulty_level'])

train_df['preferred_difficulty_level'] = le_lev.transform(train_df['preferred_difficulty_level'])
train_df['preferred_difficulty_level'].head()

In [None]:
train_df.info()

In [None]:
train_df

### 이상치 확인

In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(4,3, figsize = (20,30))

for idx,col in enumerate(train_df.columns[1:13]):
    
    row_index = idx // 3
    col_index = idx % 3

    sns.boxplot(x = train_df['target'],y = train_df[col], color = 'darkseagreen', ax = ax[row_index,col_index])
    ax[row_index,col_index].set_title('boxplot for %s' % col)
plt.show()

plt.tight_layout(pad=5)
plt.subplots_adjust(wspace=0, hspace=5)

#### 이상치 제거 없이 머신러닝 해보기

In [None]:
# import pandas as pd
# import numpy as np

# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# y_train = train_df['target']
# X_train = train_df.drop('target',axis=1)

# train_X,test_X,train_y,test_y = train_test_split(X_train,y_train,
#                                                test_size=0.2,random_state=11)

# dt_clf = DecisionTreeClassifier(random_state=11)
# rf_clf = RandomForestClassifier(random_state=11)
# lr_clf = LogisticRegression(random_state=11)
    
# dt_clf.fit(train_X,train_y)
# rf_clf.fit(train_X,train_y)
# lr_clf.fit(train_X,train_y)
                        
# accuracy_dt = accuracy_score(dt_clf.predict(test_X),test_y)
# accuracy_rf = accuracy_score(rf_clf.predict(test_X),test_y)
# accuracy_lr = accuracy_score(lr_clf.predict(test_X),test_y)


In [None]:
# # 충격적인 결과
# print(accuracy_dt)
# print(accuracy_rf)
# print(accuracy_lr)

#### 이상치 제거 : IQR

In [None]:
train_df['customer_inquiry_history'].index

In [None]:
# outlier 찾기

weight = 1.5
Q1 = train_df['customer_inquiry_history'].quantile(0.25)
Q3 = train_df['customer_inquiry_history'].quantile(0.75)
IQR = Q3-Q1
max_value = Q3 + (IQR * weight)
min_value = Q1 - (IQR * weight)

outlier = []
outlier_index = []
for i in train_df['customer_inquiry_history'].index:
    value = train_df['customer_inquiry_history'][i]

    if value > max_value or value < min_value:
        outlier.append(value)       
        outlier_index.append(i)

In [None]:
def outlier_feature(dataFrame):
    columns = dataFrame.columns
    outlier = []
    outlier_index = []
    
    for column in columns:
        # 칼럼별 이상치 범위 계산
        weight = 1.5
        Q1 = dataFrame[column].quantile(0.25)
        Q3 = dataFrame[column].quantile(0.75)
        IQR = Q3-Q1
        max_value = Q3 + (IQR * weight)
        min_value = Q1 - (IQR * weight)

        # 칼럼별 이상치 여부 확인 후 인덱스 담기
        for i in train_df[column].index:
            value = train_df[column][i]

            if value > max_value or value < min_value:
                outlier.append(value)       
                outlier_index.append(i)
                
    # 칼럼별 이상치가 있는 행 인덱스 중복 제거
    outlier_index = list(set(outlier_index))
    
    # 이상치 있는 행 제거
    dataFrame.drop(outlier_index, axis=0, inplace=True)
        
    return dataFrame

In [None]:
train_df.info()

### 제거 확인

In [None]:
outlier_feature(train_df)
train_df.info()

In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(4,3, figsize = (20,30))

for idx,col in enumerate(train_df.columns[1:13]):
    
    row_index = idx // 3
    col_index = idx % 3

    sns.boxplot(x = train_df['target'],y = train_df[col], color = 'darkseagreen', ax = ax[row_index,col_index])
    ax[row_index,col_index].set_title('boxplot for %s' % col)
plt.show()

plt.tight_layout(pad=5)
plt.subplots_adjust(wspace=0, hspace=5)

### 전처리 내용 정리
 1. user_id 칼럼 삭제
 2. object 타입 칼럼 라벨인코딩
 3. 이상치 제거 -> IQR

In [54]:

from sklearn import preprocessing

def drop_feature(dataFrame):
    dataFrame.drop('user_id',axis=1,inplace=True)
    return dataFrame
    
def encoding_feature(dataFrame):    
    object_columns = dataFrame.select_dtypes(include='object')
    for column in object_columns.columns:
        le = preprocessing.LabelEncoder()
        le = le.fit(dataFrame[column])
        dataFrame[column] = le.transform(dataFrame[column])
    return dataFrame

def outlier_feature(dataFrame):
    columns = dataFrame.columns
    outlier = []
    outlier_index = []
    
    for column in columns:
        # 칼럼별 이상치 범위 계산
        weight = 1.5
        Q1 = dataFrame[column].quantile(0.25)
        Q3 = dataFrame[column].quantile(0.75)
        IQR = Q3-Q1
        max_value = Q3 + (IQR * weight)
        min_value = Q1 - (IQR * weight)

        # 칼럼별 이상치 여부 확인 후 인덱스 담기
        for i in train_df[column].index:
            value = train_df[column][i]

            if value > max_value or value < min_value:
                outlier.append(value)       
                outlier_index.append(i)
                
    # 칼럼별 이상치가 있는 행 인덱스 중복 제거
    outlier_index = list(set(outlier_index))
    
    # 이상치 있는 행 제거
    dataFrame.drop(outlier_index, axis=0, inplace=True)
        
    return dataFrame
        
        
def preprocessing_df(dataFrame):
    dataFrame = drop_feature(dataFrame)
    dataFrame = encoding_feature(dataFrame)
    dataFrame = outlier_feature(dataFrame)
    return dataFrame

In [55]:
train_df = preprocessing_df(train_df)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9115 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   subscription_duration              9115 non-null   int64  
 1   recent_login_time                  9115 non-null   int64  
 2   average_login_time                 9115 non-null   float64
 3   average_time_per_learning_session  9115 non-null   float64
 4   monthly_active_learning_days       9115 non-null   int64  
 5   total_completed_courses            9115 non-null   int64  
 6   recent_learning_achievement        9115 non-null   float64
 7   abandoned_learning_sessions        9115 non-null   int64  
 8   community_engagement_level         9115 non-null   int64  
 9   preferred_difficulty_level         9115 non-null   int64  
 10  subscription_type                  9115 non-null   int64  
 11  customer_inquiry_history           9115 non-null   int64  
 1

In [56]:
test_df = preprocessing_df(test_df)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   subscription_duration              10000 non-null  int64  
 1   recent_login_time                  10000 non-null  int64  
 2   average_login_time                 10000 non-null  float64
 3   average_time_per_learning_session  10000 non-null  float64
 4   monthly_active_learning_days       10000 non-null  int64  
 5   total_completed_courses            10000 non-null  int64  
 6   recent_learning_achievement        10000 non-null  float64
 7   abandoned_learning_sessions        10000 non-null  int64  
 8   community_engagement_level         10000 non-null  int64  
 9   preferred_difficulty_level         10000 non-null  int64  
 10  subscription_type                  10000 non-null  int64  
 11  customer_inquiry_history           10000 non-null  int6

# 머신러닝

## 기본 머신러닝

In [None]:
train_df['target'].head(20)

In [58]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

y_train = train_df['target']
X_train = train_df.drop('target',axis=1)

train_X,test_X,train_y,test_y = train_test_split(X_train,y_train,
                                               test_size=0.2,random_state=11)

dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression(random_state=11)
    
dt_clf.fit(train_X,train_y)
rf_clf.fit(train_X,train_y)
lr_clf.fit(train_X,train_y)
                        
accuracy_dt = accuracy_score(dt_clf.predict(test_X),test_y)
accuracy_rf = accuracy_score(rf_clf.predict(test_X),test_y)
accuracy_lr = accuracy_score(lr_clf.predict(test_X),test_y)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [59]:
print(accuracy_dt)
print(accuracy_rf)
print(accuracy_lr)

0.5117937465715853
0.5655512890839276
0.578167855183763


## 교차검증

In [60]:
from sklearn.model_selection import cross_val_score

CV_scores_dt = cross_val_score(dt_clf,train_X,train_y,scoring='accuracy',cv=5)
CV_scores_rf = cross_val_score(rf_clf,train_X,train_y,scoring='accuracy',cv=5)
CV_scores_lr = cross_val_score(lr_clf,train_X,train_y,scoring='accuracy',cv=5)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [61]:
print(np.mean(CV_scores_dt))
print(np.mean(CV_scores_rf))
print(np.mean(CV_scores_lr))

0.5128881705811618
0.5807720115718998
0.6060066133200954


# 추가 전처리

## 피쳐 스케일링

In [85]:
categorical = ['preferred_difficulty_level','subscription_type','target']
uncategorical = train_df.drop(categorical,axis=1)


In [86]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train_df)
train_scaled = scaler.transform(train_df)
train_df_scaled = pd.DataFrame(data=train_scaled, columns=train_df.columns)

train_df_scaled[categorical] = train_df[categorical]

In [87]:
train_df_scaled.describe()

Unnamed: 0,subscription_duration,recent_login_time,average_login_time,average_time_per_learning_session,monthly_active_learning_days,total_completed_courses,recent_learning_achievement,abandoned_learning_sessions,community_engagement_level,preferred_difficulty_level,subscription_type,customer_inquiry_history,payment_pattern,target
count,9115.0,9115.0,9115.0,9115.0,9115.0,9115.0,9115.0,9115.0,9115.0,8300.0,8300.0,9115.0,9115.0,8300.0
mean,5.924438e-17,4.6771880000000007e-17,6.920289e-16,7.951219000000001e-17,4.5212810000000005e-17,-2.408752e-16,6.587039e-16,1.492802e-16,1.262841e-16,1.099759,0.383012,8.574844e-18,1.24725e-17,0.598313
std,1.000055,1.000055,1.000055,1.000055,1.000055,1.000055,1.000055,1.000055,1.000055,0.707439,0.48615,1.000055,1.000055,0.490269
min,-1.646819,-1.676844,-2.735967,-1.178933,-1.672269,-2.593915,-2.737837,-1.808359,-2.249703,0.0,0.0,-1.43321,-1.51547,0.0
25%,-0.8904038,-0.8410551,-0.6739461,-0.8100372,-0.8052603,-0.6080169,-0.6967423,-0.5918871,-0.6781178,1.0,0.0,-0.7108597,-1.083014,0.0
50%,0.01729423,-0.005265845,-0.005818882,-0.2822042,0.06174798,-0.04061746,0.004971651,0.01634863,0.1076747,1.0,0.0,0.01149104,0.2143539,1.0
75%,0.9249923,0.8305234,0.6813112,0.5584841,0.9287562,0.8104817,0.6840769,0.6245844,0.8934672,2.0,1.0,0.7338418,1.079266,1.0
max,1.681407,1.666313,2.74184,3.075431,1.651263,2.79638,2.749703,2.449292,0.8934672,2.0,1.0,2.900894,1.511722,1.0


In [88]:
train_df_scaled.fillna(0,inplace=True)

In [89]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

y_train = train_df_scaled['target']
X_train = train_df_scaled.drop('target',axis=1)

train_X,test_X,train_y,test_y = train_test_split(X_train,y_train,
                                               test_size=0.2,random_state=11)

# print(train_X)
# print(test_X)
# print(train_y)
# print(test_y)



dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression(random_state=11)
    
dt_clf.fit(train_X,train_y)
rf_clf.fit(train_X,train_y)
lr_clf.fit(train_X,train_y)
                        
accuracy_dt = accuracy_score(dt_clf.predict(test_X),test_y)
accuracy_rf = accuracy_score(rf_clf.predict(test_X),test_y)
accuracy_lr = accuracy_score(lr_clf.predict(test_X),test_y)


In [90]:
print(accuracy_dt)
print(accuracy_rf)
print(accuracy_lr)

0.5095995611629183
0.5792649478880966
0.5929786066922655


# 하이퍼 파라미터 튜닝

* 의사결정나무

In [62]:
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth':[2,5,10],'min_samples_leaf':[2,5,10]}
grid_dt_clf = GridSearchCV(dt_clf,param_grid = parameters,cv=5, refit=True)
grid_dt_clf.fit(X_train,y_train)

scores_df_dt = pd.DataFrame(grid_dt_clf.cv_results_)


In [63]:
scores_df_dt[['params','mean_test_score','rank_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'max_depth': 2, 'min_samples_leaf': 2}",0.600549,1
1,"{'max_depth': 2, 'min_samples_leaf': 5}",0.600549,1
2,"{'max_depth': 2, 'min_samples_leaf': 10}",0.600549,1
3,"{'max_depth': 5, 'min_samples_leaf': 2}",0.598354,4
4,"{'max_depth': 5, 'min_samples_leaf': 5}",0.598025,5
5,"{'max_depth': 5, 'min_samples_leaf': 10}",0.597257,6
6,"{'max_depth': 10, 'min_samples_leaf': 2}",0.580143,9
7,"{'max_depth': 10, 'min_samples_leaf': 5}",0.580911,7
8,"{'max_depth': 10, 'min_samples_leaf': 10}",0.580911,8


In [69]:
print('GridSearchCV 최적 하이퍼 파라미터: ',grid_dt_clf.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_dt_clf.best_score_))
best_dt_clf = grid_dt_clf.best_estimator_

GridSearchCV 최적 하이퍼 파라미터:  {'max_depth': 2, 'min_samples_leaf': 2}
GridSearchCV 최고 정확도: 0.6005


In [72]:
# GridSearchCV의 최적 하이퍼 파라미터로 학습된 Estimator로 예측 및 평가 수행

best_dt_pred = best_dt_clf.predict(test_X)

accuracy = accuracy_score(test_y,best_dt_pred)

print('테스트 세트에서의 DecisionTreeClassifier 정확도: {0:.4f}'.format(accuracy))

테스트 세트에서의 DecisionTreeClassifier 정확도: 0.5782


In [73]:
# 교차검증
best_dt_clf_score = cross_val_score(best_dt_clf,train_X,train_y,scoring='accuracy',cv=5)
print(np.round(np.mean(best_dt_clf_score),4))

0.6059


* 로지스틱회귀

In [92]:
from sklearn.model_selection import GridSearchCV

params = {'penalty': ['l2'], 'C': [0.01, 0.1, 1, 5, 10]}
grid_lr_clf = GridSearchCV(lr_clf, param_grid=params, cv=5, refit=True)
grid_lr_clf.fit(X_train,y_train)

scores_df_lr = pd.DataFrame(grid_lr_clf.cv_results_)


In [93]:
scores_df_lr[['params','mean_test_score','rank_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'C': 0.01, 'penalty': 'l2'}",0.595283,1
1,"{'C': 0.1, 'penalty': 'l2'}",0.591991,2
2,"{'C': 1, 'penalty': 'l2'}",0.591772,3
3,"{'C': 5, 'penalty': 'l2'}",0.591662,4
4,"{'C': 10, 'penalty': 'l2'}",0.591662,4


# to do
* 추가 전처리
1. feature들 중에 연관있는 애들끼리 묶어보기 (학습시간 / 커뮤니티 참여 등)
2. 