In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# 데이터 CSV 파일 읽기
file_path = "https://raw.githubusercontent.com/MyungKyuYi/AI-class/refs/heads/main/titanic.csv"
df = pd.read_csv(file_path)

# 데이터프레임 확인
display(df.head())
print(df.columns)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [53]:
# 결측치 확인
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [54]:
# 결측치 처리: 'Age'의 결측치를 평균값으로 채움
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [55]:
# 레이블 확인인
df['Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [56]:
# 불필요한 컬럼 삭제
df = df.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)

In [57]:
# LabelEncoder 객체 생성
encoder = LabelEncoder()

# 'Sex' 열을 숫자로 변환
print(df['Sex'].value_counts())
df['Sex'] = encoder.fit_transform(df['Sex'])
print(df['Sex'].value_counts())

Sex
male      577
female    314
Name: count, dtype: int64
Sex
1    577
0    314
Name: count, dtype: int64


In [58]:
# 레이블 확인
display(df.head())

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,1,22.0,1,0,7.25
1,2,1,1,0,38.0,1,0,71.2833
2,3,1,3,0,26.0,0,0,7.925
3,4,1,1,0,35.0,1,0,53.1
4,5,0,3,1,35.0,0,0,8.05


In [59]:
# 'Survived'를 예측하기 위한 특성과 레이블 분리
X = df.drop('Survived', axis=1)  # 특성 (X)
y = df['Survived']               # 레이블 (y)

# 데이터 나누기 (학습 80%, 테스트 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [60]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree 모델
print("Decision Tree 코드")
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
print("Decision Tree 결과: Accuracy =", accuracy_score(y_test, y_pred_dt))
print("\n", classification_report(y_test, y_pred_dt))
print("Confusion Matrix:")
cm_dt = confusion_matrix(y_test, y_pred_dt)
print(cm_dt)

Decision Tree 코드
Decision Tree 결과: Accuracy = 0.7374301675977654

               precision    recall  f1-score   support

           0       0.76      0.80      0.78       105
           1       0.70      0.65      0.67        74

    accuracy                           0.74       179
   macro avg       0.73      0.72      0.73       179
weighted avg       0.74      0.74      0.74       179

Confusion Matrix:
[[84 21]
 [26 48]]


In [61]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression 모델
print("Logistic Regression 코드")
# 데이터 정규화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 로지스틱 회귀 모델 학습
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train_scaled, y_train)

# 예측 및 평가
y_pred_lr = lr_model.predict(X_test_scaled)
print("Logistic Regression 결과: Accuracy =", accuracy_score(y_test, y_pred_lr))
print("\n", classification_report(y_test, y_pred_lr))
print("Confusion Matrix:")
cm_lr = confusion_matrix(y_test, y_pred_lr)
print(cm_lr)

Logistic Regression 코드
Logistic Regression 결과: Accuracy = 0.8100558659217877

               precision    recall  f1-score   support

           0       0.81      0.88      0.84       105
           1       0.80      0.72      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179

Confusion Matrix:
[[92 13]
 [21 53]]


In [62]:
from sklearn.svm import SVC

# Support Vector Machine 모델
print("SVM 코드")
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("SVM 결과: Accuracy =", accuracy_score(y_test, y_pred_svm))
print("\n", classification_report(y_test, y_pred_svm))
print("Confusion Matrix:")
cm_svm = confusion_matrix(y_test, y_pred_svm)
print(cm_svm)

SVM 코드
SVM 결과: Accuracy = 0.5977653631284916

               precision    recall  f1-score   support

           0       0.60      0.98      0.74       105
           1       0.67      0.05      0.10        74

    accuracy                           0.60       179
   macro avg       0.63      0.52      0.42       179
weighted avg       0.62      0.60      0.48       179

Confusion Matrix:
[[103   2]
 [ 70   4]]


In [46]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest 모델
print("Random Forest 코드")
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("Random Forest 결과: Accuracy =", accuracy_score(y_test, y_pred_rf))
print("\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
cm_rf = confusion_matrix(y_test, y_pred_rf)
print(cm_rf)

Random Forest 코드
Random Forest 결과: Accuracy = 0.8212290502793296

               precision    recall  f1-score   support

           0       0.82      0.89      0.85       105
           1       0.82      0.73      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

Confusion Matrix:
[[93 12]
 [20 54]]


In [47]:
from sklearn.neighbors import KNeighborsClassifier

# 4데이터 정규화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# KNN 모델 생성 및 학습
k = 3  # 최근접 이웃 개수
knn_model = KNeighborsClassifier(n_neighbors=k)
knn_model.fit(X_train_scaled, y_train)

# 예측 및 평가
y_pred_knn = knn_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred_knn)

print(f"KNN 결과: Accuracy = {accuracy:.4f}")
print("\n", classification_report(y_test, y_pred_knn))
print("Confusion Matrix:")
cm_knn = confusion_matrix(y_test, y_pred_knn)
print(cm_knn)

KNN 결과: Accuracy = 0.7989

               precision    recall  f1-score   support

           0       0.82      0.84      0.83       105
           1       0.76      0.74      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

Confusion Matrix:
[[88 17]
 [19 55]]
