In [1]:
import numpy as np
from scipy.stats import mode
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# 데이터 불러오기

In [2]:
df_train = pd.read_csv(r'data\clean_data\train_clearn.csv')
df_test = pd.read_csv(r'data\clean_data\test_clearn.csv')

1. 표준화 : 평균으로부터 떨어진 정도 (s = (d-m)/std)
2. 정규화 : 0~1로 변환 (s = (d-min)/max-min)

In [3]:
class knn:
    __slots__ = ['n','normalize', 'data_max', 'data_min', 'data', 'target']
    def __init__(self, n = 2, normalize = True):
        self.n = n
        self.normalize = normalize
    
    def fit(self, data, target):
        if self.normalize:
            self.data_max = data.max(axis=0)
            self.data_min = data.min(axis=0)
            
            self.data = self.__normalize(data)
        else:
            self.data = data.copy()
        
        self.target = target.copy()
            
    
    def predict(self, unknown):    
        
        def l2_distance(v1, v2):
            dist_vector = np.abs(v1-v2)
            return dist_vector.dot(dist_vector)
        
        def l1_distance(v1, v2):
            dist_vector = np.abs(v1-v2)
            return np.sum(dist_vector)
        
        if self.normalize:
            unknown = self.__normalize(unknown)
            
        neighbors = []
        for idx, data in enumerate(self.data) :
            neighbors.append((l2_distance(data, unknown), self.target[idx]))
        
        neighbors.sort()    
        neighbors = neighbors[:self.n]
        
        preq = [0,0]
        for _, survived in neighbors:
            preq[survived]+=1
        
        if preq[0] > preq[1]:
            return 0 #Die
        elif preq[0] < preq[1]:
            return 1 # live
        else:
            return np.random.choice([0,1], p=[0.6, 0.4]) #train에서 획득한 비율
        
    
    def score(self, unknowns, real):
        predicts = []
        for unknown in unknowns:
            predicts.append(self.predict(unknown))
        
        def r2_score(y_true, y_pred):
            SS_total = 0
            SS_reg = 0
    
            for i in range(len(y_true)):
                x = (y_true[i] - y_pred[i])**2
                SS_reg = SS_reg+x
    
            y_true_mean = np.mean(y_true)
            for i in range(len(y_true)):
                x = (y_true[i]-y_true_mean)**2
                SS_total = SS_total+x
            return 1 - (SS_reg/SS_total)
        
        r2_score = r2_score(real, predicts)
        return r2_score, predicts
        
    def __normalize(self, data):
        data = data.copy()
        return (data-self.data_min)/(self.data_max-self.data_min)

In [4]:
def split_set(df, p=0.6):
    df = df.copy(deep=True)
    df = df.sample(frac = 1) # 반환하는 row 비율
    
    cut = int(df.shape[0]*p)
    train = df.iloc[:cut]
    test = df.iloc[cut:]
    return train, test

def split_id_data_target(df):
    id = df.PassengerId
    data = df.drop(['PassengerId', 'Survived'], axis = 1)
    target = df.Survived
    return id, data, target

# X-Validation
1. best model들을 선정한 뒤, 이를 모두 이용하여 경향성(6:4)를 보고 최종 결과를 판단한다.

In [80]:
def x_validation(model, data, target, k=-1):
    
    #suffle
    join = np.hstack((data, target.reshape(data.shape[0],1)))
    np.random.shuffle(join)
    target = join[:, join.shape[1]-1]
    data = join[:, 0:join.shape[1]-1]
    
    if k == -1:
        volumn = 1
        k=data.shape[0]
    else:
        volumn = int(data.shape[0]/k)
    cuts = [volumn*i for i in range(k)]
    cuts.append(data.shape[0])
    predict_data = [data[cuts[i]:cuts[i+1]] for i in range(k)]
    predict_target = [target[cuts[i]:cuts[i+1]] for i in range(k)]
    
    train_data = [np.delete(data, list(range(cuts[i], cuts[i+1])), axis=0) for i in range(k)]
    train_target = [np.delete(target, list(range(cuts[i], cuts[i+1])), axis=0) for i in range(k)]
    
    fitted_models = [model.fit(train_data[i], train_target[i]) for i in range(k)]
    
    
    scores = np.round([fitted_models[i].score(predict_data[i], predict_target[i]) for i in range(k)], decimals = 3)
    print("scores : {}".format(scores))
    
    best_score = np.array(scores).max()
    best_index = np.array(scores).argmax()
    indexes = [idx for idx, score in enumerate(scores) if best_score - 0.025 <= score <= best_score + 0.025]
    
    print("best score : {} and index = {}".format(best_score, best_index))
    print("Selected Index : {}".format(indexes))
    best_models = [fitted_models[i] for i in indexes]
    return best_models

In [86]:
#model = DecisionTreeClassifier(min_samples_split = 2, class_weight = 'balanced')
model = RandomForestClassifier(n_estimators=100, n_jobs=-1, class_weight = 'balanced')

In [87]:
#Find best model params
best_models = x_validation(model = model, data = df_train.drop(['Survived', 'PassengerId'], axis=1).to_numpy(), target = df_train.Survived.to_numpy(), k=10)

scores : [0.843 0.876 0.921 0.843 0.843 0.865 0.865 0.899 0.809 0.822]
best score : 0.921 and index = 2
Selected Index : [2, 7]


# Test set에 적용
1. best_models를 이용한다

In [88]:
id = df_test.PassengerId.to_numpy()
data = df_test.drop(['PassengerId'], axis=1).to_numpy()

In [89]:
def Predict_with_models(models,label, data, p = None):
    '''
        p는 가중 평균 or weighted choice를 위한 parms
    '''
    
    def get_predict(p, predicts):
        preq = {key:0 for key in predicts}
        for k in predicts:
            preq[k]+=1
        most = list(np.array(preq.values()).max())[0]
    
        most_preq = []
        for k in predicts:
            if preq[k] == most and k not in most_preq:
                most_preq.append(k)
                
        if len(most_preq) == 1:
            return most_preq[0]
        else:
            prob = [p[key] for key in most_preq]
            result = np.random.choice(most_preq, p=prob)
            return result
    
    
    if p is None:
        p = {key:1/len(models) for key in label}
    
    multi_predicts = np.array([model.predict(data) for model in models]).T # 행 일치

    result = []
    for predict in multi_predicts:
        result.append(get_predict(p, predict))
    
    return result
        
            

In [94]:
predicts = Predict_with_models(best_models, [0,1], data, p = {0:0.6, 1:0.4})

In [95]:
result = pd.DataFrame({'PassengerId':id,\
                       'Survived':predicts}, dtype = int)

In [96]:
result.to_csv(r'data\result.csv', index = False)

In [97]:
result.describe()

Unnamed: 0,PassengerId,Survived
count,418.0,418.0
mean,1100.5,0.38756
std,120.810458,0.487777
min,892.0,0.0
25%,996.25,0.0
50%,1100.5,0.0
75%,1204.75,1.0
max,1309.0,1.0


In [85]:
result.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
