In [1]:
import pandas as pd
train_df = pd.read_csv('./input/train.csv')
train_df = train_df.dropna(axis=1)
test_df = pd.read_csv('./input/test.csv')

train_df = train_df.drop(columns=['Transported'])

In [2]:
all_data = pd.concat([train_df, test_df])


In [3]:
for column in all_data.columns:
    mode = all_data[column].mode()[0]  # 최빈값 계산
    all_data[column].fillna(mode, inplace=True)  # 결측치 대체
    if column == 'Transported':
        continue
    all_data[column].fillna(mode, inplace=True)  # 결측치 대체

In [4]:
all_data['Passenger_Group_ID'] = all_data['PassengerId'].apply(lambda x: int(x.split('_')[0]))

def cabin_feature_engineering(df):
    deck_list = []
    num_list = []
    side_list = []

    for tmp_cabin in df['Cabin']:
        tmp_cabin = str(tmp_cabin)
        if tmp_cabin == 'nan':
            deck_list.append(None)
            num_list.append(None)
            side_list.append(None)
        else:
            tmp_cabin_str = str(tmp_cabin).split('/')
            deck_list.append(tmp_cabin_str[0])
            num_list.append(int(tmp_cabin_str[1]))
            side_list.append(tmp_cabin_str[2])

    df['Cabin_Deck'] = deck_list
    df['Cabin_Num'] = num_list
    df['Cabin_Side'] = side_list
    return df

all_data = cabin_feature_engineering(all_data)

In [5]:
all_data = all_data.drop(columns=['PassengerId','Cabin','Spa','Name'])

In [6]:
binary_encoding_list = ['CryoSleep', 'VIP']
for tmp_feature in binary_encoding_list:
    all_data[tmp_feature] = all_data[tmp_feature].map({False: 0,True: 1})


In [7]:
from sklearn.preprocessing import OneHotEncoder
one_hot_encoding_list = ['HomePlanet', 'Destination', 'Cabin_Deck','Cabin_Side']

one_hot_encoder = OneHotEncoder()
encoded_matrix = one_hot_encoder.fit_transform(all_data[one_hot_encoding_list])

# 원핫 인코딩 리스트
# HomePlanet, Destination,  Cabin_Deck, Cabin_Side
print(encoded_matrix.shape)

(12970, 16)


In [8]:
all_data = all_data.drop(columns=one_hot_encoding_list)

In [9]:
all_data.head().T

Unnamed: 0,0,1,2,3,4
CryoSleep,0.0,0.0,0.0,0.0,0.0
Age,18.0,18.0,18.0,18.0,18.0
VIP,0.0,0.0,0.0,0.0,0.0
RoomService,0.0,0.0,0.0,0.0,0.0
FoodCourt,0.0,0.0,0.0,0.0,0.0
ShoppingMall,0.0,0.0,0.0,0.0,0.0
VRDeck,0.0,0.0,0.0,0.0,0.0
Passenger_Group_ID,1.0,2.0,3.0,3.0,4.0
Cabin_Num,160.0,160.0,160.0,160.0,160.0


In [10]:
train_y = [1 if tmp==True else 0 for tmp in y]

In [11]:
from scipy import sparse

all_data_sprs = sparse.hstack([sparse.csr_matrix(all_data),
                               encoded_matrix],
                               format='csr')

In [13]:
all_data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4267,4268,4269,4270,4271,4272,4273,4274,4275,4276
CryoSleep,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
Age,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,...,3.0,20.0,43.0,43.0,40.0,34.0,42.0,18.0,18.0,43.0
VIP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RoomService,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,47.0,0.0,0.0,0.0,0.0,0.0,0.0
FoodCourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,601.0,0.0,0.0,865.0,0.0,847.0,0.0,2680.0,0.0
ShoppingMall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,103.0,0.0,3851.0,0.0,0.0,17.0,0.0,0.0,0.0
VRDeck,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,144.0,0.0,523.0,0.0
Passenger_Group_ID,1.0,2.0,3.0,3.0,4.0,5.0,6.0,6.0,7.0,8.0,...,9260.0,9262.0,9263.0,9265.0,9266.0,9266.0,9269.0,9271.0,9273.0,9277.0
Cabin_Num,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,...,1503.0,1795.0,1495.0,278.0,1796.0,1496.0,160.0,296.0,297.0,1498.0


In [14]:
num_train = len(train_df) # 훈련 데이터 개수

# 훈련 데이터와 테스트 데이터 나누기
X_train = all_data_sprs[:num_train] # 0 ~ num_train -1 행
X_test = all_data_sprs[num_train:] # num_train ~ 마지막 행

y_train = train_y

In [15]:
from sklearn.model_selection import train_test_split

# 훈련 데이터, 검증 데이터 분리
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y,
                                                      test_size=0.1,
                                                      stratify=y,
                                                      random_state=10)

In [16]:
X_train.T

<25x7823 sparse matrix of type '<class 'numpy.float64'>'
	with 54761 stored elements in Compressed Sparse Column format>

In [17]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# 로지스틱 회귀 모델 생성
logistic_model = LogisticRegression()

# 하이퍼 파라미터 값 목록
lr_params = {
    'C': [0.1, 0.125, 0.2], 'max_iter':[800, 900, 1000],
    'solver': ['liblinear'], 'random_state': [42]
}

# 그리드 서치 객체 생성
gridsearch_logistic_model = GridSearchCV(
    estimator = logistic_model,
    param_grid=lr_params,
    scoring='accuracy',
    cv=5
)

# 그리드서치 수행
gridsearch_logistic_model.fit(X_train, y_train)

print('최적 하이퍼파라미터: ', gridsearch_logistic_model.best_params_)

최적 하이퍼파라미터:  {'C': 0.1, 'max_iter': 800, 'random_state': 42, 'solver': 'liblinear'}
Wall time: 1.27 s


In [18]:
# 최적의 모델에 대해서 predict 수행
y_pred = gridsearch_logistic_model.predict(X_test)

# 0 또는 1로 예측된 결과 출력
print(len(y_pred))

4277


In [19]:
y[:5]

0    False
1     True
2    False
3    False
4     True
Name: Transported, dtype: bool

In [20]:
submission_df = pd.read_csv('./input/sample_submission.csv')

y_pred_real = [False if y==0 else True for y in y_pred]

In [21]:
submission_df['Transported'] = y_pred_real

In [22]:
submission_df.to_csv('./output/jhkim.csv', index=False)

In [23]:
train_df.head().T

Unnamed: 0,0,1,2,3,4
PassengerId,0001_01,0002_01,0003_01,0003_02,0004_01
