# 加载库和数据

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [3]:
print(train_df['CryoSleep'].dtype)

object


# 探索性数据分析

In [4]:
# train_len:8693,test_len:4277
# print("train_len:{},test_len:{}".format(len(train_df), len(test_df)))
# print(train_df.head())
# print(train_df.columns) 
# print(train_df.isnull().sum())
# print(test_df.isnull().sum())

## HomePlanet: 缺失(train,test)201,87, 来自的星球(Earth, Europa, Mars)
HomePlanet_mode = train_df['HomePlanet'].mode()[0]
train_df['HomePlanet'].fillna(HomePlanet_mode)
test_df['HomePlanet'].fillna(HomePlanet_mode)

## CryoSleep: 缺失(train,test)217,93, (取值为True或False, 表示乘客是否选择在航程期间进入休眠状态。处于冷冻睡眠状态的乘客被限制在舱内。)
train_df['CryoSleep'] = train_df['CryoSleep'].astype(bool)
test_df['CryoSleep'] = test_df['CryoSleep'].astype(bool)
CryoSleep_mode = train_df['CryoSleep'].mode()[0]
train_df['CryoSleep'].fillna(CryoSleep_mode)
test_df['CryoSleep'].fillna(CryoSleep_mode)

## Cabin: 缺失(train,test)199,100,船舱号

## Destination: 缺失(train,test)182,92, 目标星球, 不能跟出发地一样
Destination_mode = train_df['Destination'].mode()[0]
train_df['Destination'].fillna(Destination_mode)
test_df['Destination'].fillna(Destination_mode)

## Age: 缺失(train,test)179,91，年龄
age_median = train_df['Age'].median()
train_df['Age'].fillna(age_median)
test_df['Age'].fillna(age_median)

## VIP: 缺失(train,test)203,93，取值为True或False，钞能力，可以根据船舱号推断插补
train_df['VIP'] = train_df['VIP'].astype(bool)
test_df['VIP'] = test_df['VIP'].astype(bool)
train_df['VIP'].fillna(0)
test_df['VIP'].fillna(0)
# 奢侈品账单
## RoomService: 缺失(train,test)181,82
## FoodCourt: 缺失(train,test)183,106
## ShoppingMall: 缺失(train,test)208,98
## Spa: 缺失(train,test)183,101
## VRDeck: 缺失(train,test)188,80
def fill0(df, cols):
    for col in cols:
        df[col].fillna(0)
luxury_cols=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
fill0(train_df, luxury_cols)
fill0(test_df, luxury_cols)
train_df['LuxurySum'] = train_df['RoomService']+train_df['FoodCourt']+train_df['ShoppingMall']+train_df['Spa']+train_df['VRDeck']
test_df['LuxurySum'] = test_df['RoomService']+test_df['FoodCourt']+test_df['ShoppingMall']+test_df['Spa']+test_df['VRDeck']

## Name: 缺失(train,test)200,94 - 无需插补

# 特征工程

In [5]:
dummy_cols = ['HomePlanet', 'Destination']
train_df = pd.get_dummies(train_df, columns=dummy_cols)
test_df = pd.get_dummies(test_df, columns=dummy_cols)
# 对齐训练集和测试集的特征, 避免有的值没有出现训练集和测试集的特征不匹配问题
train_df, test_df = train_df.align(test_df, join='left', axis = 1, fill_value = 0)

In [6]:
features=['HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
          'CryoSleep',
          # , 'Cabin', 
          'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
          'Age',
           'VIP',
          # 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'
          'LuxurySum'
         ]

# 模型拟合和调参

In [7]:
X = train_df[features]
y = train_df['Transported']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_cv_score = cross_val_score(rf_model, X, y, cv=5).mean()
print(f"交叉验证准确率:{rf_cv_score:4f}")

交叉验证准确率:0.683999


In [9]:
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_cv_score = cross_val_score(xgb_model, X, y, cv=5).mean()
print(f"交叉验证准确率:{xgb_cv_score:4f}")

交叉验证准确率:0.715635


# 预测

In [10]:
X_test = test_df[features]
y_pred = rf_model.predict(X_test)
# submission_df = {"PassengerId":test_df["PassengerId"], "Transported":y_pred}
# submission_df.to_csv("Spaceship_titanic_submission.csv", index=False)