In [None]:
# Colab-only step: open the browser upload dialog and keep the returned value
# in `uploaded`.
# NOTE(review): the captured output below shows these uploads being saved as
# "train (1).csv" / "test (1).csv", while the later cells read "train.csv" /
# "test.csv" -- presumably files left over from an earlier upload. Confirm the
# later cells are reading the intended copies.
from google.colab import files
uploaded = files.upload()


Saving test.csv to test (1).csv
Saving train.csv to train (1).csv


In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline model: logistic regression on the numeric columns plus one-hot
# encoded Sex / Embarked, followed by writing a submission file.

# Load the data. The untouched test frame is kept so that PassengerId can be
# attached to the predictions later without reading the file a second time.
train_df = pd.read_csv("train.csv")
test_df_raw = pd.read_csv("test.csv")

# Drop the identifier / free-text columns this baseline does not use.
unused_cols = ["PassengerId", "Name", "Ticket", "Cabin"]
train_df = train_df.drop(columns=unused_cols)
test_df = test_df_raw.drop(columns=unused_cols)

# Impute missing values. The fill values are computed on the training set only
# and reused for the test set, so both frames are preprocessed with the same
# statistics and nothing is derived from the test set.
age_fill = train_df["Age"].median()
fare_fill = train_df["Fare"].median()
embarked_fill = train_df["Embarked"].mode()[0]
train_df["Age"] = train_df["Age"].fillna(age_fill)
test_df["Age"] = test_df["Age"].fillna(age_fill)
train_df["Embarked"] = train_df["Embarked"].fillna(embarked_fill)
test_df["Fare"] = test_df["Fare"].fillna(fare_fill)

# One-hot encode the categorical columns.
train_df = pd.get_dummies(train_df, columns=["Sex", "Embarked"])
test_df = pd.get_dummies(test_df, columns=["Sex", "Embarked"])

# Split into features and target.
X_train = train_df.drop(columns="Survived")
y_train = train_df["Survived"]

# Report whether the encoded frames ended up with the same column set.
print("カラム一致:", set(X_train.columns) == set(test_df.columns))

# Enforce (not just report) agreement: give the test matrix exactly the
# training columns in the training order. A dummy column missing from the test
# set is filled with 0; a column unseen during training is dropped.
X_test = test_df.reindex(columns=X_train.columns, fill_value=0)

# Build and fit the model.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Accuracy on the rows the model was fitted on -- an optimistic sanity check,
# not an estimate of performance on unseen data.
y_pred = model.predict(X_train)
print("訓練データでの精度:", accuracy_score(y_train, y_pred))

# Predict on the test set and pair each prediction with its PassengerId.
y_test_pred = model.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": test_df_raw["PassengerId"],
    "Survived": y_test_pred
})

submission.to_csv("submission.csv", index=False)
print("提出ファイルを作成しました")


カラム一致: True
訓練データでの精度: 0.8002244668911336
提出ファイルを作成しました


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Random forest with engineered features (FamilySize, IsAlone, AgeBand),
# evaluated with 5-fold cross-validation, followed by a submission file.

# Load the data; keep the raw test frame so PassengerId survives the column drop.
train_df = pd.read_csv("train.csv")
test_df_raw = pd.read_csv("test.csv")
passenger_ids = test_df_raw["PassengerId"]

# Drop the identifier / free-text columns that are not used as features.
unused_cols = ["PassengerId", "Name", "Ticket", "Cabin"]
train_df = train_df.drop(columns=unused_cols)
test_df = test_df_raw.drop(columns=unused_cols)

# Impute missing values with statistics computed on the training set only, so
# train and test are filled with the same values.
age_fill = train_df["Age"].median()
fare_fill = train_df["Fare"].median()
embarked_fill = train_df["Embarked"].mode()[0]
train_df["Age"] = train_df["Age"].fillna(age_fill)
test_df["Age"] = test_df["Age"].fillna(age_fill)
train_df["Embarked"] = train_df["Embarked"].fillna(embarked_fill)
test_df["Fare"] = test_df["Fare"].fillna(fare_fill)

# AgeBand edges. The last bin is open-ended and the first includes 0, so an age
# of exactly 0 or above 80 still gets a band instead of becoming NaN (which
# would make the integer cast below fail). Ages in (0, 80] map to the same
# bands as the closed [0, 16, 32, 48, 64, 80] edges.
age_bins = [0, 16, 32, 48, 64, np.inf]

for frame in (train_df, test_df):
    # Feature engineering: family size = siblings/spouses + parents/children + self.
    frame["FamilySize"] = frame["SibSp"] + frame["Parch"] + 1
    # Feature engineering: 1 when the passenger has no family aboard.
    frame["IsAlone"] = (frame["FamilySize"] == 1).astype(int)
    # Feature engineering: ordinal age band 0..4.
    frame["AgeBand"] = pd.cut(
        frame["Age"], bins=age_bins, labels=False, include_lowest=True
    ).astype(int)

# One-hot encode the categorical columns.
train_df = pd.get_dummies(train_df, columns=["Sex", "Embarked"])
test_df = pd.get_dummies(test_df, columns=["Sex", "Embarked"])

# Split into features and target.
X_train = train_df.drop(columns="Survived")
y_train = train_df["Survived"]

# Align the test frame to the training columns and order: missing dummy
# columns are added as 0, columns unseen in training are dropped.
test_df = test_df.reindex(columns=X_train.columns, fill_value=0)
X_test = test_df

model = RandomForestClassifier(n_estimators=100, random_state=42)

# Evaluate with 5-fold cross-validation. cross_val_score fits clones of the
# estimator, so `model` itself is still unfitted after this call.
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print(f"交差検証スコアの平均: {cv_scores.mean():.4f}")

# Fit the final model on the full training set.
model.fit(X_train, y_train)

# Accuracy on the rows the model was fitted on -- optimistic by construction;
# compare it with the cross-validation mean printed above.
y_pred_train = model.predict(X_train)
print("訓練データでの精度:", accuracy_score(y_train, y_pred_train))

# Predict on the test set.
y_test_pred = model.predict(X_test)

# Build and write the submission file.
submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Survived": y_test_pred
})
submission.to_csv("submission.csv", index=False)
print("提出ファイルを作成しました")


交差検証スコアの平均: 0.8047
訓練データでの精度: 0.9797979797979798
提出ファイルを作成しました


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Random forest variant: same engineered features, with AgeBand built from
# five equal-width bins, evaluated with 5-fold cross-validation.

# Load the data; keep the raw test frame so PassengerId is available for the
# submission after it is removed from the feature frame.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
test_df_raw = test_df.copy()

# Drop columns that must not be used as features. PassengerId is included
# here: it is a row identifier, and leaving it in feeds it to the model as if
# it were a predictor.
unused_cols = ["PassengerId", "Name", "Ticket", "Cabin"]
train_df = train_df.drop(columns=unused_cols)
test_df = test_df.drop(columns=unused_cols)

# Impute missing values with statistics computed on the training set only, so
# train and test are filled with the same values.
age_fill = train_df["Age"].median()
fare_fill = train_df["Fare"].median()
embarked_fill = train_df["Embarked"].mode()[0]
train_df["Age"] = train_df["Age"].fillna(age_fill)
test_df["Age"] = test_df["Age"].fillna(age_fill)
train_df["Embarked"] = train_df["Embarked"].fillna(embarked_fill)
test_df["Fare"] = test_df["Fare"].fillna(fare_fill)

# Added features that depend only on the row itself.
for df in [train_df, test_df]:
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

# AgeBand: learn five equal-width bin edges from the training ages, then apply
# the SAME edges to the test ages. Cutting each frame independently would give
# each its own edges, so one band code would mean different age ranges in
# train and test.
train_df["AgeBand"], age_edges = pd.cut(
    train_df["Age"], bins=5, labels=False, retbins=True
)
# Open the outer edges so test ages outside the training range fall into the
# first / last band instead of becoming NaN.
age_edges[0], age_edges[-1] = -np.inf, np.inf
test_df["AgeBand"] = pd.cut(test_df["Age"], bins=age_edges, labels=False).astype(int)

# One-hot encode the categorical columns.
train_df = pd.get_dummies(train_df, columns=["Sex", "Embarked"])
test_df = pd.get_dummies(test_df, columns=["Sex", "Embarked"])

# Split into features and target.
X_train = train_df.drop(columns="Survived")
y_train = train_df["Survived"]

# Align the test frame to the training columns and order: missing dummy
# columns are added as 0, columns unseen in training are dropped.
test_df = test_df.reindex(columns=X_train.columns, fill_value=0)
X_test = test_df

model = RandomForestClassifier(n_estimators=100, random_state=42)

# Evaluate with 5-fold cross-validation. cross_val_score fits clones of the
# estimator, so `model` itself is still unfitted after this call.
cv_score = cross_val_score(model, X_train, y_train, cv=5)
print("CV平均スコア:", cv_score.mean())

# Fit the final model on the full training set and predict on the test set.
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

# Build and write the submission file.
submission = pd.DataFrame({
    "PassengerId": test_df_raw["PassengerId"],
    "Survived": y_test_pred
})
submission.to_csv("submission.csv", index=False)
print("提出ファイルを作成しました")


CV平均スコア: 0.7980541083422258
提出ファイルを作成しました
