In [246]:
import re
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier

In [247]:
def loadData(trainPath="train.csv", testPath="test.csv"):
    """Read the Titanic train/test CSV files and return numeric feature tables.

    Cleaning applied to both files:
      * missing ``Age`` / ``Fare`` are filled with the *training-set* median
        and missing ``Embarked`` with ``"S"``, so both tables are imputed
        with the same values;
      * ``Sex`` and ``Embarked`` are replaced by integer codes;
      * ``Title`` (integer code of the honorific found in ``Name``; 0 when the
        honorific is unknown or absent) and ``NameLength`` (character count
        of ``Name``) are appended;
      * ``PassengerId``, ``Cabin``, ``Ticket`` and ``Name`` are dropped.

    Args:
        trainPath: CSV file with the labelled rows; must contain ``Survived``.
        testPath: CSV file with the unlabelled rows.

    Returns:
        tuple: ``(trainData, trainLabel, testData, pids)`` where ``trainData``
        and ``testData`` are DataFrames with identical feature columns,
        ``trainLabel`` is the ``Survived`` column of the training file and
        ``pids`` is the list of ``PassengerId`` values of the test file.

    Raises:
        KeyError: if one of the columns used above is missing from a file.
    """
    sexCodes = {"male": 0, "female": 1}
    embarkedCodes = {"S": 0, "C": 1, "Q": 2}
    # Honorifics with a similar meaning share one code.
    titleCodes = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8,
                  "Mme": 8, "Don": 9, "Dona": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7,
                  "Ms": 2}
    unknownTitleCode = 0

    fTrainData = pd.read_csv(trainPath)
    fTestData = pd.read_csv(testPath)
    pids = fTestData["PassengerId"].tolist()

    # Imputation values are learned from the training file only and reused for
    # the test file.
    ageFill = fTrainData["Age"].median()
    fareFill = fTrainData["Fare"].median()

    def PreProcessData(data):
        """Return a cleaned copy of ``data``; the input frame is not modified."""
        names = data["Name"]
        # First "<space>Word." token of the name, e.g. "Mr" in "Braund, Mr. Owen";
        # NaN when the name contains no such token.
        titles = names.str.extract(r" ([A-Za-z]+)\.", expand=False)

        cleaned = data.assign(
            Age=data["Age"].fillna(ageFill),
            Fare=data["Fare"].fillna(fareFill),
            # .map yields a numeric column instead of an object column that
            # merely holds ints.
            Sex=data["Sex"].map(sexCodes),
            Embarked=data["Embarked"].fillna("S").map(embarkedCodes),
            # Unknown or missing honorifics become 0 instead of crashing int().
            Title=titles.map(titleCodes).fillna(unknownTitleCode).astype(int),
            NameLength=names.str.len(),
        )

        # These columns were dropped by the original author as too weakly
        # correlated with the label; SibSp and Parch are deliberately kept.
        # The correlations can be inspected with
        # sns.heatmap(data.corr(), vmin=-1, vmax=1, annot=True, square=True).
        return cleaned.drop(columns=["PassengerId", "Cabin", "Ticket", "Name"])

    cleanTrain = PreProcessData(fTrainData)
    testData = PreProcessData(fTestData)

    # Select the label by name so the split does not depend on column order.
    trainLabel = cleanTrain["Survived"]
    trainData = cleanTrain.drop(columns=["Survived"])

    return trainData, trainLabel, testData, pids

## 数据预处理

In [248]:
# Load both CSV files once; every later cell works from these four objects.
trainData, trainLabel, testData, pids = loadData()

## 特征工程

In [249]:
def FeatureEngineering(data, COMPONENT_NUM=0.8):
    """Standardise ``data`` and return the scaled result.

    A new ``StandardScaler`` is fitted on ``data`` itself on every call, so
    the scaling statistics always come from the table that is passed in.

    Args:
        data: numeric feature table accepted by ``StandardScaler``.
        COMPONENT_NUM: currently unused; it is the ``n_components`` value of
            the PCA step that is disabled below.

    Returns:
        The output of ``StandardScaler.fit_transform(data)``.
    """
    standardized = preprocessing.StandardScaler().fit_transform(data)

    # Dimensionality reduction is switched off: accuracy went up without it.
    # To try it again, enable the lines below and return pca_data instead.
    # pca = PCA(n_components=COMPONENT_NUM, whiten=False)
    # pca.fit(standardized)
    # pca_data = pca.transform(standardized)
    # print("explained variance:\n", pca.explained_variance_,
    #       "explained variance ratio:\n", pca.explained_variance_ratio_)
    # print("number of components: %s" % pca.n_components_)
    # print("total explained variance ratio: %s" % sum(pca.explained_variance_ratio_))

    return standardized

In [250]:
# The "pca" prefix is historical: FeatureEngineering currently only
# standardises, its PCA step is commented out.
pcaTrainData = FeatureEngineering(trainData)

# Scale the test features with the mean/variance learned from the training
# set. Fitting a second scaler on the test set would shift and stretch it by
# different amounts, so the model would be applied to inputs that are not in
# the feature space it was trained on.
pcaTestData = preprocessing.StandardScaler().fit(trainData).transform(testData)

## 模型训练&模型融合

In [251]:
def TrainModelByLR(trainData, trainLabel):
    """Cross-validate a logistic regression and print its ROC AUC scores.

    Runs ``cross_val_score`` with ``cv=5`` and ``scoring="roc_auc"`` on a
    ``LogisticRegression(random_state=1)``, then prints the mean score
    followed by the per-fold scores. Nothing is returned.

    Args:
        trainData: feature matrix passed to ``cross_val_score``.
        trainLabel: target values passed to ``cross_val_score``.
    """
    fold_auc = cross_val_score(
        LogisticRegression(random_state=1),
        trainData,
        trainLabel,
        cv=5,
        scoring="roc_auc",
    )
    print(fold_auc.mean(), "\n", fold_auc)

In [252]:
# Baseline: 5-fold ROC AUC of logistic regression on the standardised features.
TrainModelByLR(trainData=pcaTrainData, trainLabel=trainLabel)

0.8592520159240402 
 [0.86126482 0.84131016 0.86831551 0.84157754 0.88379205]
