<a href="https://colab.research.google.com/github/bekasberkisah/tugasfundamentaldataanalyst/blob/main/implementasiCRISP-DM_DataSetTitanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===============================================
# CRISP-DM IMPLEMENTATION — TITANIC DATASET
# ===============================================

# 1. BUSINESS UNDERSTANDING
# Problem statement for the project. The text itself is program output and is
# therefore kept verbatim; only its construction differs (adjacent literals
# instead of one triple-quoted string).
business_goal = (
    "\n"
    "Memprediksi apakah seorang penumpang Titanic selamat atau tidak.\n"
    "Tujuan bisnis: meningkatkan pemahaman faktor-faktor keselamatan.\n"
    "Tujuan ML: binary classification (Survived = 0 atau 1)\n"
)

# Echo the statement at the top of the notebook output.
print(business_goal)


# 2. DATA UNDERSTANDING
import pandas as pd

# Placeholder: replace with the raw GitHub URL of your own copy of the dataset.
url = "URL_RAW_GITHUB/Titanic-Dataset.csv"

# Load the CSV and take a first look at its contents.
df = pd.read_csv(url)
print("Preview Data:")
# NOTE(review): `display` is not imported in this file; presumably it is the
# builtin injected by the notebook (IPython/Colab) runtime -- confirm before
# running this as a plain script.
display(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
print(df.describe())
# Per-column count of missing entries, used to decide on imputation later.
print("\nMissing Value:")
print(df.isnull().sum())


# 3. DATA PREPARATION
# Drop columns that are not used as features.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Fill missing values: median for Age, most frequent value for Embarked.
# NOTE(review): these statistics are still computed on the full dataset;
# consider deriving them from the training rows only as well.
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

# Encoding: turn the remaining categorical columns into indicator columns.
df = pd.get_dummies(df, drop_first=True)

# Separate the features (X) from the target (y).
X = df.drop(columns=["Survived"])
y = df["Survived"]

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split BEFORE scaling. Fitting the scaler on the whole dataset lets the
# test rows influence the mean/variance applied to the training features,
# which leaks test information into training and inflates the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling parameters from the training rows only, then apply the
# same fitted transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so that any code still referring to X_scaled keeps working; it is now
# scaled with the statistics learned from the training rows.
X_scaled = scaler.transform(X)


# 4. MODELING
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator; the seed is fixed so repeated runs build the same forest.
model = RandomForestClassifier(random_state=42)

# Hyperparameter grid: every combination of the values below is tried.
params = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
)

# 5-fold cross-validated grid search, selecting by accuracy.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring="accuracy",
    cv=5,
)
grid.fit(X_train, y_train)

# The estimator exposed by the search for the winning combination.
best_model = grid.best_estimator_
print("Best model:", best_model)


# 5. EVALUATION
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict on the held-out split with the tuned model.
y_pred = best_model.predict(X_test)

# Print each multi-line metric under its own heading, in a fixed order.
for heading, metric in (
    ("\nConfusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(metric(y_test, y_pred))

# Overall accuracy is printed on a single line.
print("\nAccuracy =", accuracy_score(y_test, y_pred))


# 6. DEPLOYMENT
import joblib

# Persist the tuned model and the fitted scaler side by side in the current
# working directory; both are needed to score new rows later.
for artifact, filename in (
    (best_model, "titanic_model.pkl"),
    (scaler, "titanic_scaler.pkl"),
):
    joblib.dump(artifact, filename)

print("\nModel dan scaler telah disimpan!")
