### 데이터 로딩

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# 1. 데이터 로딩
file_path = "C:\\Users\\axhtl\\OneDrive\\바탕 화면\\학교\\인공지능개론\\titanic.csv"
df = pd.read_csv(file_path)

# 데이터프레임 확인
print(df.head())
# 컬럼명 확인
print(df.columns)

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
In

In [2]:
# 2. 결측치 확인
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
# 3. 결측치 제거
# 'Age' 칼럼의 평균값 계산
mean_age = df['Age'].mean()

# 'Age' 칼럼의 결측치를 평균값으로 채우기
df['Age'] = df['Age'].fillna(mean_age)

# 변환된 데이터프레임 확인
print(df.head())

# 4. 레이블 확인
df['Survived'].value_counts()

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


Survived
0    549
1    342
Name: count, dtype: int64

In [4]:
# 5. 불필요한 컬럼 제거
columns_to_drop = ['Name', 'Ticket', 'Cabin']  # 삭제할 컬럼 목록
df = df.drop(columns=columns_to_drop)

# 변환된 데이터프레임 확인
print(df.head())

   PassengerId  Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0            1         0       3    male  22.0      1      0   7.2500        S
1            2         1       1  female  38.0      1      0  71.2833        C
2            3         1       3  female  26.0      0      0   7.9250        S
3            4         1       1  female  35.0      1      0  53.1000        S
4            5         0       3    male  35.0      0      0   8.0500        S


In [5]:
# 숫자 변환 전 레이블 갯수
df['Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder

# 6. 엔코딩
# LabelEncoder 초기화
label_encoder = LabelEncoder()

#  컬럼에 LabelEncoder 적용
columns_to_encode = ['Sex','Embarked'] # 인코딩할 컬럼 리스트
for column in columns_to_encode:
    df[column] = label_encoder.fit_transform(df[column])

print(df)

# 숫자 변환 후 레이블 갯수
df['Survived'].value_counts()

     PassengerId  Survived  Pclass  Sex        Age  SibSp  Parch     Fare  \
0              1         0       3    1  22.000000      1      0   7.2500   
1              2         1       1    0  38.000000      1      0  71.2833   
2              3         1       3    0  26.000000      0      0   7.9250   
3              4         1       1    0  35.000000      1      0  53.1000   
4              5         0       3    1  35.000000      0      0   8.0500   
..           ...       ...     ...  ...        ...    ...    ...      ...   
886          887         0       2    1  27.000000      0      0  13.0000   
887          888         1       1    0  19.000000      0      0  30.0000   
888          889         0       3    0  29.699118      1      2  23.4500   
889          890         1       1    1  26.000000      0      0  30.0000   
890          891         0       3    1  32.000000      0      0   7.7500   

     Embarked  
0           2  
1           0  
2           2  
3          

Survived
0    549
1    342
Name: count, dtype: int64

In [7]:
# 특성과 타겟 분리
X = df.iloc[:, :-1]  # 마지막 열 제외 (특성)
y = df.iloc[:, -1]   # 마지막 열 (클래스)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# 데이터 표준화
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# 모델 학습 및 평가
# 결정 트리 학습 및 평가
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train) # 학습

dt_pred = dt_model.predict(X_test) # 예측

print("\n--- Decision Tree ---")
print(accuracy_score(y_test, dt_pred))
print(confusion_matrix(y_test, dt_pred))


--- Decision Tree ---
0.7541899441340782
[[ 21   1  20   1]
 [  1  13   3   0]
 [ 15   3 101   0]
 [  0   0   0   0]]


In [10]:
# 랜덤 포레스트 학습 및 평가
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

rf_pred = rf_model.predict(X_test)

print("\n--- Random Forest ---")
print(accuracy_score(y_test, rf_pred))
print(confusion_matrix(y_test, rf_pred))


--- Random Forest ---
0.7374301675977654
[[  8   0  35]
 [  0  11   6]
 [  4   2 113]]


In [11]:
# SVM 학습 및 평가
svm_model = SVC()
svm_model.fit(X_train, y_train)

svm_pred = svm_model.predict(X_test)

print("\n--- SVM ---")
print(accuracy_score(y_test, svm_pred))
print(confusion_matrix(y_test, svm_pred))


--- SVM ---
0.6536312849162011
[[  0   0  43]
 [  0   0  17]
 [  2   0 117]]


In [12]:
# 로지스틱 회귀 학습 및 평가
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train, y_train)

lr_pred = lr_model.predict(X_test)

print("\n--- Logistic regression ---")
print(accuracy_score(y_test, lr_pred))
print(confusion_matrix(y_test, lr_pred))


--- Logistic regression ---
0.6703910614525139
[[  2   0  41]
 [  0   0  17]
 [  1   0 118]]


In [13]:
# knn으로 붓꽃 분류

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# 5. 예측 및 평가
y_pred = knn.predict(X_test)

print("\n--- KNN ---")
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


--- KNN ---
0.6871508379888268
[[  8   1  34]
 [  2   8   7]
 [  9   3 107]]
