In [193]:
import pandas as pd
# Read the train and test CSV files from the local input directory.
train_df = pd.read_csv(filepath_or_buffer='./input/train.csv')
test_df = pd.read_csv(filepath_or_buffer='./input/test.csv')

# Pull the target column out of the training frame so only features remain.
y = train_df.pop('Transported')

In [194]:
# Stack the train rows on top of the test rows so the preprocessing below is
# applied to both at once. ignore_index=True gives the combined frame a fresh
# 0..n-1 index; without it the test rows reuse the train rows' labels and any
# later label-based alignment would run into duplicate index entries.
all_data = pd.concat([train_df, test_df], ignore_index=True)


In [195]:
# Replace missing values in every column with that column's most frequent value.
for column in all_data.columns:
    column_modes = all_data[column].mode()  # most frequent value(s), NaN excluded
    if column_modes.empty:
        # The column has no non-missing value at all, so there is nothing to fill with.
        continue
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the in-place call may act on a temporary copy and
    # leave all_data unchanged.
    all_data[column] = all_data[column].fillna(column_modes[0])

In [196]:
# Group id = the part of PassengerId before the first underscore, as an integer.
all_data['Passenger_Group_ID'] = [
    int(passenger_id.split('_')[0]) for passenger_id in all_data['PassengerId']
]

def _split_cabin(cabin):
    """Return ``(deck, num, side)`` for one cabin value, or three ``None`` if it is missing."""
    # pd.isna covers None, NaN and pd.NA; comparing str(value) with 'nan'
    # only recognised float NaN and crashed on the other missing markers.
    if pd.isna(cabin):
        return None, None, None
    parts = str(cabin).split('/')
    return parts[0], int(parts[1]), parts[2]


def cabin_feature_engineering(df):
    """Split the ``Cabin`` column into ``Cabin_Deck``, ``Cabin_Num`` and ``Cabin_Side``.

    Each non-missing cabin value is split on ``'/'``: the first part becomes
    the deck, the second part is converted to ``int`` and becomes the number,
    and the third part becomes the side. Missing cabin values produce ``None``
    in all three new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Cabin`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three new columns added.

    Raises
    ------
    IndexError
        If a non-missing cabin value has fewer than three ``'/'``-separated parts.
    ValueError
        If the second part of a cabin value is not an integer.
    """
    split_rows = [_split_cabin(cabin) for cabin in df['Cabin']]

    # Plain lists are assigned positionally, so this also works when the
    # frame's index contains repeated labels.
    df['Cabin_Deck'] = [deck for deck, _, _ in split_rows]
    df['Cabin_Num'] = [num for _, num, _ in split_rows]
    df['Cabin_Side'] = [side for _, _, side in split_rows]
    return df

# Add the Cabin_Deck / Cabin_Num / Cabin_Side columns derived from Cabin.
all_data = cabin_feature_engineering(all_data)

In [197]:
# Remove the raw columns that are not passed on to the model.
# NOTE(review): 'Spa' is dropped here as well — confirm that this is intended.
all_data = all_data.drop(['PassengerId', 'Cabin', 'Spa', 'Name'], axis=1)

In [198]:
# Boolean-valued features that are re-encoded as 0 / 1.
binary_encoding_list = ['CryoSleep', 'VIP']

# Map False -> 0 and True -> 1 in each listed column; any other value becomes NaN.
all_data = all_data.assign(**{
    feature: all_data[feature].map({False: 0, True: 1})
    for feature in binary_encoding_list
})


In [199]:
from sklearn.preprocessing import OneHotEncoder
# Categorical columns that get one-hot encoded:
# HomePlanet, Destination, Cabin_Deck, Cabin_Side
one_hot_encoding_list = ['HomePlanet', 'Destination', 'Cabin_Deck','Cabin_Side']

# Learn the categories first, then encode the same columns.
one_hot_encoder = OneHotEncoder()
one_hot_encoder.fit(all_data[one_hot_encoding_list])
encoded_matrix = one_hot_encoder.transform(all_data[one_hot_encoding_list])

# Show (number of rows, number of one-hot columns).
print(encoded_matrix.shape)

(12970, 16)


In [200]:
# The one-hot source columns now live in encoded_matrix, so remove them from the frame.
all_data = all_data.drop(one_hot_encoding_list, axis=1)

In [202]:
# Preview the first five rows, transposed so each feature is one line.
all_data.head(5).transpose()

Unnamed: 0,0,1,2,3,4
CryoSleep,0.0,0.0,0.0,0.0,0.0
Age,39.0,24.0,58.0,33.0,16.0
VIP,0.0,0.0,1.0,0.0,0.0
RoomService,0.0,109.0,43.0,0.0,303.0
FoodCourt,0.0,9.0,3576.0,1283.0,70.0
ShoppingMall,0.0,25.0,0.0,371.0,151.0
VRDeck,0.0,44.0,49.0,193.0,2.0
Passenger_Group_ID,1.0,2.0,3.0,3.0,4.0
Cabin_Num,0.0,0.0,0.0,0.0,1.0


In [201]:
# Encode the target as integers: 1 where the label equals True, otherwise 0.
train_y = [int(bool(label == True)) for label in y]

In [163]:
from scipy import sparse

# Place the remaining frame columns on the left and the one-hot columns on the
# right, producing a single CSR matrix.
all_data_sprs = sparse.hstack(
    blocks=(sparse.csr_matrix(all_data), encoded_matrix),
    format='csr',
)

In [192]:
# NOTE(review): in file order X_train is first assigned in a later cell, so on a
# fresh kernel run top-to-bottom this line raises NameError; move it below that
# assignment or remove it.
X_train.T

<25x8693 sparse matrix of type '<class 'numpy.float64'>'
	with 75788 stored elements in Compressed Sparse Column format>

In [175]:
# Number of rows that came from the training file.
num_train = train_df.shape[0]

# The combined matrix holds the train rows first, so a positional cut recovers
# both parts: rows [0, num_train) are train, the rest are test.
X_train, X_test = all_data_sprs[:num_train], all_data_sprs[num_train:]

# Integer-encoded labels for the training rows.
y_train = train_y

In [173]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation, stratified by label.
# The split always starts from the full training slice and the 0/1 encoded
# labels (train_y) rather than from X_train / the raw y column, so re-running
# this cell yields the same partition instead of shrinking X_train again, and
# the labels match the encoding prepared for training.
X_train, X_valid, y_train, y_valid = train_test_split(
    all_data_sprs[:num_train],
    train_y,
    test_size=0.1,
    stratify=train_y,
    random_state=10,
)

In [182]:
# Show the transposed training matrix (features x rows).
X_train.transpose()

<25x8693 sparse matrix of type '<class 'numpy.float64'>'
	with 75788 stored elements in Compressed Sparse Column format>

In [176]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Base estimator whose hyperparameters are searched.
logistic_model = LogisticRegression()

# Candidate values for each hyperparameter.
lr_params = dict(
    C=[0.1, 0.125, 0.2],
    max_iter=[800, 900, 1000],
    solver=['liblinear'],
    random_state=[42],
)

# Grid search with 5-fold cross-validation, scored by accuracy.
gridsearch_logistic_model = GridSearchCV(
    logistic_model,
    lr_params,
    scoring='accuracy',
    cv=5,
)

# Run the search on the training data.
gridsearch_logistic_model.fit(X_train, y_train)

# Report the best parameter combination found.
print('최적 하이퍼파라미터: ', gridsearch_logistic_model.best_params_)

최적 하이퍼파라미터:  {'C': 0.125, 'max_iter': 800, 'random_state': 42, 'solver': 'liblinear'}
Wall time: 373 ms


In [178]:
# Predict labels for the test rows with the fitted grid-search object.
y_pred = gridsearch_logistic_model.predict(X_test)

# Print how many predictions were produced.
print(y_pred.shape[0])

4277


In [132]:
# Show the first five target values.
y.head(5)

0    0
1    1
2    0
3    0
4    1
Name: Transported, dtype: int64

In [179]:
# Load the submission template.
submission_df = pd.read_csv(filepath_or_buffer='./input/sample_submission.csv')

# Convert each prediction to a Python bool: False when it equals 0, True otherwise.
y_pred_real = [not (prediction == 0) for prediction in y_pred]

In [180]:
# Put the boolean predictions into the Transported column of the submission frame.
submission_df = submission_df.assign(Transported=y_pred_real)

In [181]:
# Write the submission file without the index column.
submission_df.to_csv(path_or_buf='./output/jhkim.csv', index=False)

In [137]:
# Preview the first five training rows, transposed so each column is one line.
train_df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
CryoSleep,0.0,0.0,0.0,0.0,0.0
Age,39.0,24.0,58.0,33.0,16.0
VIP,0.0,0.0,1.0,0.0,0.0
FoodCourt,0.0,9.0,3576.0,1283.0,70.0
VRDeck,0.0,44.0,49.0,193.0,2.0
Passenger_Group_ID,1.0,2.0,3.0,3.0,4.0
Cabin_Num,0.0,0.0,0.0,0.0,1.0
