In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
%matplotlib inline

# Load the Titanic training and test splits from local CSV files.
train_data = pd.read_csv("C:/AI_Projects/Titanic/train.csv")
test_data = pd.read_csv("C:/AI_Projects/Titanic/test.csv")
# Stack both splits so imputation and feature engineering are applied to
# them uniformly. ignore_index=True gives the combined frame unique row
# labels; without it the row labels of the two inputs are carried over
# verbatim and repeat, which makes any label-based alignment ambiguous.
all_data = pd.concat([train_data, test_data], sort=False, ignore_index=True)


# Quick look at the training data: the DataFrame.info() summary followed by
# the number of missing values in each column.
print("数据概览：")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
train_data.info()
print("\n缺失值统计：")
print(train_data.isnull().sum())


# Bar chart of the training data with 'Survived' on the x axis and 'Sex' on
# the y axis. barplot returns the Axes it drew on, so the title is set there.
sex_axes = sns.barplot(data=train_data, y='Sex', x='Survived')
sex_axes.set_title("Survival Rate by Sex")
plt.show()


# Bar chart of the training data with 'Survived' on the x axis and 'Pclass'
# on the y axis. Both columns are numeric, so the horizontal orientation is
# requested explicitly instead of being left to seaborn to infer.
pclass_axes = sns.barplot(data=train_data, y='Pclass', x='Survived', orient='h')
pclass_axes.set_title("Survival Rate by Pclass")
plt.show()


# --- Missing-value imputation on the combined frame ---

# Age: fill gaps with the median age of the passenger's own Pclass group.
age_by_class = all_data.groupby('Pclass')['Age']
all_data['Age'] = age_by_class.transform(lambda ages: ages.fillna(ages.median()))

# Remaining columns each get a single column-wide replacement value:
# the median fare, the most frequent port of embarkation, and a literal
# placeholder for passengers with no recorded cabin.
simple_fills = {
    'Fare': all_data['Fare'].median(),
    'Embarked': all_data['Embarked'].mode()[0],
    'Cabin': 'Unknown',
}
for column, fill_value in simple_fills.items():
    all_data[column] = all_data[column].fillna(fill_value)


# Pull the honorific out of each name: the run of ASCII letters that follows
# a space and is terminated by a period. The pattern is a raw string so the
# backslash reaches the regex engine untouched; in a normal string literal
# "\." is an invalid escape sequence and makes Python emit a warning.
all_data['Title'] = all_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse infrequent honorifics into a single 'Rare' bucket and fold the
# variant spellings into 'Miss' / 'Mrs'. No key of the mapping is also one of
# its values, so one replace() call is equivalent to applying the
# substitutions one after another.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_map = {title: 'Rare' for title in rare_titles}
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
all_data['Title'] = all_data['Title'].replace(title_map)


# --- Derived features ---

# Family size: siblings/spouses plus parents/children plus the passenger.
family_size = all_data['SibSp'].add(all_data['Parch']).add(1)
all_data['FamilySize'] = family_size

# 1 when the passenger's family size is exactly one, otherwise 0.
all_data['IsAlone'] = all_data['FamilySize'].eq(1).astype(int)

# First character of the cabin string.
all_data['CabinLetter'] = all_data['Cabin'].str.get(0)

# Fare split into 4 quantile-based bins, stored as integer bin codes.
all_data['FareBin'] = pd.qcut(all_data['Fare'], q=4, labels=False)

# Age cast to int, then split into 5 equal-width bins, stored as bin codes.
whole_years = all_data['Age'].astype(int)
all_data['AgeBin'] = pd.cut(whole_years, bins=5, labels=False)

# Replace each categorical column with integer codes. A single LabelEncoder
# instance is re-fitted for every column, so after the loop it only retains
# the classes of the last column processed.
categorical_columns = ['Sex', 'Embarked', 'Title', 'CabinLetter']
label = LabelEncoder()
for col in categorical_columns:
    encoded_values = label.fit_transform(all_data[col])
    all_data[col] = encoded_values

# Undo the earlier concatenation by position: the first len(train_data) rows
# are the training split, everything after them is the test split.
n_train_rows = len(train_data)
train = all_data.iloc[:n_train_rows]
test = all_data.iloc[n_train_rows:]

# Columns excluded from the model inputs: the target itself plus the
# identifier / free-text columns.
non_feature_columns = ['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin']
X = train.drop(columns=non_feature_columns)
y = train['Survived']
X_test = test.drop(columns=non_feature_columns)

# Tune a random forest with an exhaustive grid search: 2 x 2 x 2 parameter
# combinations, each scored by accuracy under 5-fold cross-validation.
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[5, 10],
    min_samples_split=[2, 5],
)
rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_params, scoring='accuracy', cv=5)
rf_grid.fit(X, y)

# Report the winning parameter combination and its cross-validated score.
print("随机森林最佳参数：", rf_grid.best_params_)
print("随机森林最佳得分：", rf_grid.best_score_)

# Tune an XGBoost classifier with the same grid-search setup: 2 x 2 x 2
# parameter combinations, accuracy scoring, 5-fold cross-validation.
# NOTE(review): use_label_encoder appears to be deprecated in recent XGBoost
# releases and may only trigger a warning there - confirm against the
# installed version before removing it.
xgb_params = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5],
    learning_rate=[0.01, 0.1],
)
xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)
xgb_grid = GridSearchCV(estimator=xgb, param_grid=xgb_params, scoring='accuracy', cv=5)
xgb_grid.fit(X, y)

# Report the winning parameter combination and its cross-validated score.
print("XGBoost最佳参数：", xgb_grid.best_params_)
print("XGBoost最佳得分：", xgb_grid.best_score_)

# Combine the best estimator from each grid search into a soft-voting
# ensemble and fit it on the full training set.
tuned_models = [
    ('rf', rf_grid.best_estimator_),
    ('xgb', xgb_grid.best_estimator_),
]
voting_clf = VotingClassifier(estimators=tuned_models, voting='soft')
voting_clf.fit(X, y)

# Predict the test split and cast the labels to int for the output file.
predictions = voting_clf.predict(X_test).astype(int)

# Two-column submission table: the passenger ids from the original test CSV
# next to the predicted labels, written without the index column.
output = test_data[['PassengerId']].assign(Survived=predictions)
output.to_csv('submission_advanced.csv', index=False)
print("高级提交文件已生成：submission_advanced.csv")

ModuleNotFoundError: No module named 'seaborn'