# EDA and Simple Classifications

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, MinMaxScaler, RobustScaler, StandardScaler

In [None]:
# Min-max scaler instance, reused later to scale the model features.
scaler = MinMaxScaler()

> According to the search results, the test is described as follows.
> 1. Some packages (numpy, pandas, matplotlib, sklearn) are allowed.
> 2. The datasets are Titanic, Pima, or similar simple tabular datasets.
> 3. Candidates are asked to implement regression and classification models.

## data load and basic data check

In [None]:
# NOTE: absolute, machine-specific location of the Titanic CSV file.
TITANIC_CSV = 'E:/RESEARCH/Datasets/kaggle/titanic/titanic_dataset.csv'
df = pd.read_csv(TITANIC_CSV)

In [None]:
# Preview the first five rows (call df.info() instead for dtypes and
# non-null counts).
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Drop four columns by name, then show the remaining (rows, columns).
df = df.drop(columns=['Cabin', 'Name', 'Ticket', 'PassengerId'])
df.shape

> Option 1: simply drop every row that contains a NaN value.

In [None]:
# New frame containing only the rows of df without any NaN; df itself is
# left untouched.
df_nadrop = df.dropna()

In [None]:
# (rows, columns) left after dropping the rows with NaN values.
df_nadrop.shape

In [None]:
# Class balance of the target in the row-dropped frame.
df_nadrop['Survived'].value_counts()

> Option 2: impute the NaN values of the Age variable instead of dropping those rows.

In [None]:
# Work on a copy so the original frame stays available for the drop-only variant.
df_interpolate = df.copy()

# Mean Age of each Pclass group, broadcast back to one value per row so it
# aligns with the frame's index.
age_mean_pclass = df_interpolate.groupby('Pclass')['Age'].transform('mean')

# Assign the filled column back explicitly. Calling fillna(inplace=True) on the
# column selection is chained assignment: it raises a FutureWarning on
# pandas 2.x and leaves the frame unchanged under Copy-on-Write.
df_interpolate['Age'] = df_interpolate['Age'].fillna(age_mean_pclass)

In [None]:
# Class balance of the target in the imputed frame.
df_interpolate['Survived'].value_counts()

In [None]:
# Discard the rows that still contain a NaN in any column.
df_interpolate = df_interpolate.dropna()

In [None]:
# (rows, columns) of the imputed frame after the dropna() above.
df_interpolate.shape

In [None]:
# Column dtypes and non-null counts of the imputed frame.
df_interpolate.info()

## convert categorical

In [None]:
# Alternative input (row-dropped variant): data = df_nadrop.copy()
# Default: continue with the Age-imputed frame.
data = df_interpolate.copy()

In [None]:
# One-hot encode Sex and Embarked as integer indicator columns; the first
# level of each variable is dropped.
df_final = pd.get_dummies(
    data,
    columns=['Sex', 'Embarked'],
    drop_first=True,
    dtype=int,
)

In [None]:
# Column dtypes and non-null counts after the dummy encoding.
df_final.info()

In [None]:
# Preview the first rows of the encoded frame.
df_final.head()

***

## Visualization

In [None]:
# Separate copy of the encoded frame used by the plotting cells.
data_vis = df_final.copy()

### count bar plot

In [None]:
# Number of rows per Survived value.
survival_counts = data_vis.Survived.value_counts()

In [None]:
# Bar chart of the Survived counts on a 5x5 inch figure.
plt.figure(figsize=(5, 5))
plt.bar(
    x=survival_counts.index,
    height=survival_counts.values,
    color='green',
)

### Correlation

In [None]:
# Pairwise correlation matrix of all columns, drawn as a heatmap.
corr_matrix = data_vis.corr()

plt.figure(figsize=(8, 5))
plt.imshow(
    corr_matrix,
    cmap='coolwarm',
    interpolation='none',
    aspect='auto',
)
plt.colorbar()  # add the colour bar

# Label both axes with the column names, one tick per column.
plt.xticks(ticks=range(len(corr_matrix)), labels=corr_matrix.columns, rotation=45)
plt.yticks(ticks=range(len(corr_matrix)), labels=corr_matrix.columns)

In [None]:
# Correlation of every column with Survived, smallest value first.
data_vis.corr()['Survived'].sort_values()

### histogram

In [None]:
# Column names grouped by dtype: int/float columns first, then object/bool ones.
numeric_cols = data_vis.select_dtypes(include=['int','float']).columns
# NOTE(review): presumably empty here, since the categorical columns were
# already encoded as integers - confirm against the data.
categ_cols = data_vis.select_dtypes(include=['object', 'bool']).columns

In [None]:
# Histogram of each column listed in numeric_cols.
data_vis[numeric_cols].hist()

***

## Classification models

In [None]:
# Target vector (Survived) and feature matrix (every other column).
y = df_final['Survived']
x = df_final.drop(columns='Survived')

In [None]:
# Split BEFORE scaling. Fitting the scaler on all rows lets the test rows
# influence the min/max used for preprocessing (data leakage), which makes
# the test scores optimistic.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2024)

# Learn the per-column min/max from the training rows only.
scaler.fit(x_train)

# Build new float DataFrames instead of writing into `x[:]`: assigning scaled
# floats in place into integer columns is deprecated in recent pandas.
# Test values outside the training range may fall slightly outside [0, 1];
# that is expected.
x_train = pd.DataFrame(
    scaler.transform(x_train).round(decimals=6),
    columns=x_train.columns,
    index=x_train.index,
)
x_test = pd.DataFrame(
    scaler.transform(x_test).round(decimals=6),
    columns=x_test.columns,
    index=x_test.index,
)

# Keep `x` scaled as well, as it was before, for any later cell that reads it.
x = pd.DataFrame(
    scaler.transform(x).round(decimals=6),
    columns=x.columns,
    index=x.index,
)

### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#### Basic Model

In [None]:
# Baseline logistic regression with the library's default settings,
# fitted on the training split.
logis_model = LogisticRegression()
logis_model.fit(x_train, y_train)

In [None]:
# Class predictions of the baseline model on the test split.
predictions = logis_model.predict(x_test)

In [None]:
# Confusion matrix followed by the per-class report for the baseline model.
print(
    confusion_matrix(y_test, predictions),
    classification_report(y_test, predictions),
    sep='\n',
)

#### Models with L1 and L2 regularization

In [None]:
# Two penalised logistic regressions with the same C and seed, differing only
# in the penalty type. The liblinear solver is selected explicitly for the L1 model.
model_l1 = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, random_state=2024)
model_l2 = LogisticRegression(penalty='l2', C=0.1, random_state=2024)

In [None]:
# Fit both penalised models on the training split.
model_l1.fit(x_train, y_train)
model_l2.fit(x_train, y_train)

In [None]:
# Test-split predictions of the L1 model and the L2 model, in that order.
log_pred1, log_pred2 = (
    fitted.predict(x_test) for fitted in (model_l1, model_l2)
)

In [None]:
# Confusion matrix followed by the per-class report for the L1 model.
print(
    confusion_matrix(y_test, log_pred1),
    classification_report(y_test, log_pred1),
    sep='\n',
)

In [None]:
# Confusion matrix followed by the per-class report for the L2 model.
print(
    confusion_matrix(y_test, log_pred2),
    classification_report(y_test, log_pred2),
    sep='\n',
)

***

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree fitted on the training split. It uses the same seed as the
# train/test split and the other seeded models, so results are reproducible
# from run to run instead of varying with the tree's internal randomness.
dt_model = DecisionTreeClassifier(random_state=2024)
dt_model.fit(x_train, y_train)

In [None]:
# Class predictions of the decision tree on the test split.
dt_pred = dt_model.predict(x_test)

In [None]:
# Confusion matrix followed by the per-class report for the decision tree.
print(
    confusion_matrix(y_test, dt_pred),
    classification_report(y_test, dt_pred),
    sep='\n',
)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 500 trees fitted on the training split. It is seeded like
# the train/test split and the other seeded models, so the bootstrap samples
# and feature sub-sampling, and therefore the scores, are reproducible.
rf = RandomForestClassifier(n_estimators=500, random_state=2024)
rf.fit(x_train, y_train)

In [None]:
# Class predictions of the random forest on the test split.
rf_pred=rf.predict(x_test)

In [None]:
# Confusion matrix followed by the per-class report for the random forest.
print(
    confusion_matrix(y_test, rf_pred),
    classification_report(y_test, rf_pred),
    sep='\n',
)

### Integrated (non DNN models)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# 다양한 분류 모델 정의
# Classifiers to compare, keyed by the display name used when printing results.
# The tree-based models draw random numbers while fitting, so they are seeded
# with the same value as the train/test split to keep the comparison
# reproducible between runs.
non_dnn_models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=2024),
    'Random Forest': RandomForestClassifier(random_state=2024),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
}

In [None]:
# 모델 학습 및 평가
# Fit every model on the training split and print its test accuracy.
for name, model in non_dnn_models.items():
    # fit() returns the estimator itself, so prediction can be chained.
    y_pred = model.fit(x_train, y_train).predict(x_test)

    # Fraction of test rows classified correctly.
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{name} Accuracy: {accuracy:.2f}')

### Simple DNN

In [None]:
from sklearn.neural_network import MLPClassifier

# Small multi-layer perceptron: three hidden layers of 16, 8 and 4 units,
# ReLU activation, Adam solver, at most 500 iterations, fixed seed.
mlp_model = MLPClassifier(hidden_layer_sizes=(16, 8, 4), 
                      activation='relu',
                      solver='adam', 
                      max_iter=500, 
                      random_state=2024)

In [None]:
# Train the network, then predict on the test split
# (fit() returns the estimator, so the calls are chained).
y_pred = mlp_model.fit(x_train, y_train).predict(x_test)

# Report the test accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f'DNN model accuracy: {accuracy:.2f}')