In [1]:
import numpy as np
from scipy.stats import mode
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# 데이터 불러오기

In [5]:
# Load the train and test CSVs from data/clean_data.
# Forward slashes are used so the relative paths resolve on both Windows and POSIX
# (a backslash is an ordinary file-name character on POSIX).
# NOTE: the 'clearn' spelling is kept on purpose; it has to match the file names on disk.
df_train = pd.read_csv('data/clean_data/train_clearn.csv')
df_test = pd.read_csv('data/clean_data/test_clearn.csv')

1. 표준화 : 평균으로부터 떨어진 정도 (s = (d-m)/std)
2. 정규화 : 0~1로 변환 (s = (d-min)/(max-min))

In [198]:
class knn:
    """Binary (labels 0 / 1) k-nearest-neighbours classifier with optional min-max scaling.

    Parameters
    ----------
    n : int
        Number of nearest neighbours that take part in the vote.
    normalize : bool
        When True, every feature is rescaled with the per-column minimum and
        maximum of the training data before distances are computed.

    Inputs may be NumPy arrays, nested lists or pandas objects; they are
    converted to NumPy arrays so that all indexing is positional.
    """
    __slots__ = ['n','normalize', 'data_max', 'data_min', 'data', 'target']

    # Probabilities for labels (0, 1) used to break a tied vote
    # (ratio obtained from the training set, per the original author).
    TIE_BREAK_PROBS = (0.6, 0.4)

    def __init__(self, n = 2, normalize = True):
        self.n = n
        self.normalize = normalize

    def fit(self, data, target):
        """Store the training samples (scaled when ``normalize`` is set) and their labels.

        Parameters
        ----------
        data : array-like, one row per sample
        target : array-like, one label per row of ``data``

        Raises
        ------
        ValueError
            If ``data`` and ``target`` do not contain the same number of rows.
        """
        # np.array copies, so later changes to the caller's objects do not leak in.
        features = np.array(data, dtype=float)
        labels = np.array(target)
        if len(features) != len(labels):
            raise ValueError('data and target must have the same number of rows')

        if self.normalize:
            self.data_max = features.max(axis=0)
            self.data_min = features.min(axis=0)
            self.data = self.__normalize(features)
        else:
            self.data = features

        self.target = labels

    def predict(self, unknown):
        """Predict the label (0 or 1) of a single sample.

        The ``n`` training rows with the smallest squared Euclidean distance
        vote; rows at equal distance are ordered by label, as a sort of
        ``(distance, label)`` pairs would do. A tied vote is settled randomly
        with ``TIE_BREAK_PROBS``.
        """
        query = np.asarray(unknown, dtype=float)
        if self.normalize:
            query = self.__normalize(query)

        # Squared L2 distance to every stored sample at once (the square root
        # is skipped because it does not change the ranking).
        diffs = self.data - query
        sq_dists = (diffs ** 2).sum(axis=1)

        # lexsort uses the LAST key as the primary one: distance first, label second.
        nearest = np.lexsort((self.target, sq_dists))[:self.n]
        votes = self.target[nearest]

        dead = int(np.count_nonzero(votes == 0))
        alive = int(np.count_nonzero(votes == 1))

        if dead > alive:
            return 0  # die
        if dead < alive:
            return 1  # live
        return np.random.choice([0, 1], p=self.TIE_BREAK_PROBS)

    def score(self, unknowns, real):
        """Predict every row of ``unknowns`` and compare against ``real``.

        Returns
        -------
        (r2, predicts) : tuple
            ``r2`` is ``1 - SS_res / SS_tot`` computed on the labels and
            ``predicts`` is the list of per-row predictions. When every value
            in ``real`` is identical ``SS_tot`` is 0 and the NumPy division
            yields nan or -inf.
        """
        samples = np.asarray(unknowns, dtype=float)
        predicts = [self.predict(sample) for sample in samples]

        # Positional arrays: a pandas Series with a shuffled index would
        # otherwise be looked up by label instead of by position.
        y_true = np.asarray(real, dtype=float)
        y_pred = np.asarray(predicts, dtype=float)

        ss_res = np.sum((y_true - y_pred) ** 2)
        ss_tot = np.sum((y_true - y_true.mean()) ** 2)
        return 1 - (ss_res / ss_tot), predicts

    def __normalize(self, data):
        """Min-max scale ``data`` with the column ranges learned in ``fit``."""
        span = self.data_max - self.data_min
        # A constant column has zero range; divide by 1 instead so it maps to 0
        # rather than turning every distance into NaN.
        span = np.where(span == 0, 1.0, span)
        return (np.asarray(data, dtype=float) - self.data_min) / span

In [2]:
def split_set(df, p=0.6, random_state=None):
    """Shuffle ``df`` and split it into a train part and a test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    p : float
        Fraction of rows (0..1) that goes to the train part; the row count is
        truncated with ``int``.
    random_state : int or None
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be
        reproduced. ``None`` (default) keeps the shuffle unseeded.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame

    Raises
    ------
    ValueError
        If ``p`` is outside [0, 1].
    """
    if not 0 <= p <= 1:
        raise ValueError('p must be between 0 and 1')

    # sample(frac=1) returns every row in random order as a new frame,
    # so no explicit copy of the input is needed.
    shuffled = df.sample(frac=1, random_state=random_state)

    cut = int(shuffled.shape[0] * p)
    return shuffled.iloc[:cut], shuffled.iloc[cut:]

def split_id_data_target(df):
    """Split a frame into (PassengerId column, feature columns, Survived column).

    The features are ``df`` without the 'PassengerId' and 'Survived' columns;
    ``df`` itself is left untouched.
    """
    passenger_ids = df.PassengerId
    labels = df.Survived
    features = df.drop(columns=['PassengerId', 'Survived'])
    return passenger_ids, features, labels

In [286]:
# Shuffle df_train and keep 99% of the rows for fitting; only the remaining ~1% is held out.
train, test = split_set(df_train,p=0.99)
# Drop the id column; at this point `data` / `target` hold the TRAINING features and labels.
_, data, target = split_id_data_target(train)

In [287]:
# Decision-tree classifier with class_weight='balanced' requested explicitly.
# NOTE(review): no random_state is passed, so the fitted tree may differ between runs — consider a fixed seed.
model = DecisionTreeClassifier(min_samples_split = 2, class_weight = 'balanced')

In [288]:

# Fit on the current `data` / `target` (the training split), passed as plain NumPy arrays.
model.fit(data.to_numpy(), target.to_numpy())


DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [289]:
# Rebind `data` / `target` to the hold-out split; from here on they no longer refer to the training rows.
_, data, target = split_id_data_target(test)

In [290]:
# Score the fitted tree on the hold-out rows, passed as NumPy arrays to match
# the representation the model was fitted with.
model.score(data.to_numpy(), target.to_numpy())

0.7777777777777778

# Test set에 적용
1. sample 수가 적기 때문에, train set을 모두 이용한다.

In [6]:
# Use every row of df_train for the final fit (no hold-out split in this cell).
_, train_data, train_target = split_id_data_target(df_train)
# Keep the test passenger ids aside for the result table and drop them from the test features.
test_id = df_test.PassengerId
test_data = df_test.drop(['PassengerId'], axis=1)

# New classifier built with the same arguments as the earlier model, fitted on the full training frame.
# NOTE(review): no random_state is passed — TODO confirm whether a reproducible tree is required.
model = DecisionTreeClassifier(min_samples_split = 2, class_weight = 'balanced')
model.fit(train_data, train_target)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [7]:
# Predict on the test features as a DataFrame, with columns selected in the
# training order, so the input matches what the model was fitted on and a
# missing column raises a KeyError instead of silently misaligning features.
predicts = model.predict(test_data[train_data.columns])

In [9]:
# Two-column result table: each test passenger id next to its predicted label.
result = pd.DataFrame(
    {
        'PassengerId': test_id.values,
        'Survived': predicts,
    }
)

In [13]:
# Write the result table without the index column. A forward slash is used so
# the file lands inside the data directory on both Windows and POSIX.
result.to_csv('data/result.csv', index = False)

In [14]:
# Display the (rows, columns) size of the result table.
result.shape

(418, 2)

In [15]:
# Preview the first rows of the result table.
result.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
