In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

1. 데이터 로딩

In [None]:
# Titanic.csv 파일의 raw URL
url = 'https://github.com/Yoon-Kyuhyun/AI-introduction/blob/main/week2/titanic.csv'

# 데이터 로딩
df = pd.read_csv('titanic.csv')

# 데이터 확인
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


2. 결측치 확인

In [4]:
# 결측치 확인
df.isnull().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

3. 결측치 제거 (Age인 경우 평균나이로)

In [None]:
# 'Age' 컬럼의 평균값 계산
mean_age = df['Age'].mean()

# 'Age' 컬럼의 결측치를 평균값으로 채우기
df['Age'] = df['Age'].fillna(mean_age)

# 결측치 확인 
print(df.isnull().sum())



PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


4. 레이블 확인 (imbalanced data)

In [8]:
# 레이블 컬럼 분포 확인 (imbalanced 여부)
df['Survived'].value_counts()


0    549
1    342
Name: Survived, dtype: int64

5. 불필요한 컬럼 제거

In [9]:
# 불필요한 컬럼 제거 (예: 'Name', 'Ticket', 'Cabin', 'Embarked')
df = df.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'])

# 데이터 확인
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,male,22.0,1,0,7.25
1,2,1,1,female,38.0,1,0,71.2833
2,3,1,3,female,26.0,0,0,7.925
3,4,1,1,female,35.0,1,0,53.1
4,5,0,3,male,35.0,0,0,8.05


6. 엔코딩 (숫자형으로 변환)

In [20]:
# LabelEncoder 초기화
label_encoder = LabelEncoder()

# 인코딩할 컬럼 리스트 
columns_to_encode = ['Sex']  

# 각 컬럼에 대해 인코딩 수행
for column in columns_to_encode:
    df[column] = label_encoder.fit_transform(df[column])

# 결과 출력
print(df.head())



   PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare
0            1         0       3    1  22.0      1      0   7.2500
1            2         1       1    0  38.0      1      0  71.2833
2            3         1       3    0  26.0      0      0   7.9250
3            4         1       1    0  35.0      1      0  53.1000
4            5         0       3    1  35.0      0      0   8.0500


5가지 분류 (RF, DT, LR, KNN, SVM)

In [22]:
# X와 y 분리
X = df.drop(columns=['Survived'])  # 'Survived'는 레이블 컬럼
y = df['Survived']

# 훈련 세트와 테스트 세트로 분할 (80% 훈련, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 정의
models = {
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC()
}

# 각 모델의 정확도 및 confusion matrix 출력
for name, model in models.items():
    # 모델 훈련
    model.fit(X_train, y_train)
    
    # 예측
    y_pred = model.predict(X_test)
    
    # 정확도 계산
    accuracy = accuracy_score(y_test, y_pred)
    
    # confusion matrix 계산
    cm = confusion_matrix(y_test, y_pred)
    
    print(f'\n{name} Model')
    print(f'Accuracy: {accuracy:.4f}')
    print('Confusion Matrix:')
    print(cm)
    print('-' * 50)


Random Forest Model
Accuracy: 0.8212
Confusion Matrix:
[[93 12]
 [20 54]]
--------------------------------------------------

Decision Tree Model
Accuracy: 0.7318
Confusion Matrix:
[[84 21]
 [27 47]]
--------------------------------------------------

Logistic Regression Model
Accuracy: 0.7765
Confusion Matrix:
[[89 16]
 [24 50]]
--------------------------------------------------

KNN Model
Accuracy: 0.6592
Confusion Matrix:
[[89 16]
 [45 29]]
--------------------------------------------------

SVM Model
Accuracy: 0.5978
Confusion Matrix:
[[103   2]
 [ 70   4]]
--------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
