In [1]:
import numpy as np
from utils import *
from tqdm.notebook import tqdm
from multiprocessing import Pool

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.preprocessing import normalize, StandardScaler
from itertools import combinations

def check_float(data):
    if(data.dtype == np.int64): data = data.astype(np.float64)
    return data

def NormJa(data):
    data = check_float(data)
    for index, row in enumerate(data):
        min = row.min()
        max = row.max()
        # mean = row.mean()
        row = (row.astype(np.float64) - min) / float(max - min)
        data[index] = row
        # print(row)
    return data

def StandardJa(data):
    data = check_float(data)
    for index, row in enumerate(data):
        mean = row.mean()
        std = row.std()
        row = (row - mean) / std
        data[index] = row
        # print(row)
    return data

# LogisticRegression

In [2]:
def search(p_id, X,y,comb_list):
    count = 0
    best_score = -100
    best_comb = None
    for comb in comb_list:
        if(count % 400 == 0):
            print(f"\tp_id:{p_id} running {count}/{len(comb_list)}")

        X_selected = X[:,comb].copy()
        model = LogisticRegression()
        cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
        cross = cross_val_score(model, X_selected, y, cv=cv)
        score = cross.mean()
        # print(f"{p_id}  {best_score} {score} {comb}")
        if(best_score < score):
            best_score = score
            best_comb = comb
            # print(f"{p_id} Update: {score} {comb}")
        count += 1
    print(f"\t{p_id} Done!!")
    return score, best_comb

_NORM_SK_NORM = 0
_NORM_SK_STD = 1
_NORM_MY_NORM = 2
_NORM_MY_STD = 3

In [3]:
for norm_type in [0,1,2,3]:
    X_ori, y_ori = load('X_ori'), load('y_ori')
    X,y = np.array(X_ori), np.array(y_ori)
    if(norm_type == _NORM_SK_NORM):
        print("Perform: sklearn normalize")
        X = normalize(X.copy(), axis=0)
    elif(norm_type == _NORM_SK_STD):
        print("Perform: sklearn standardize")
        X = StandardScaler().fit_transform(X.copy())
    elif(norm_type == _NORM_MY_NORM):
        print("Perform: my normalize")
        X = NormJa(X.copy().T).T
    elif(norm_type == _NORM_MY_STD):
        print("Perform: my standardize")
        X = StandardJa(X.copy().T).T
    X_shuff,y_shuff = shuffle(X,y)

    comb_list = []
    for feature_num in range(2,14):
        comb_list.extend(list(combinations(range(13),feature_num)))
    try:
        t_out = 60000
        pool = Pool()
        p_list = []
        ans_list = []
        for p_id in range(6):
            p_list.append(pool.apply_async( search, [p_id, X_shuff,y_shuff,comb_list[p_id::6]] ))
        for i in range(6):
            ans_list.append( p_list[i].get(timeout=t_out) )
        # ans_list
    finally:
        print("\t========= close ========")
        pool.close() 
        pool.terminate()
    
    best_score = -100
    best_grid = None
    best_comb = None
    for ans in ans_list:
        if(ans[0] > best_score):
            best_score = ans[0]
            best_comb = ans[1]
    print(f"\tBest Combo {best_comb} | The best score is {best_score:.2f}")
    print()
    print()

Perform: sklearn normalize
	p_id:0 running 0/1363
	p_id:2 running 0/1363	p_id:3 running 0/1363	p_id:1 running 0/1363


	p_id:4 running 0/1363
	p_id:5 running 0/1363
	p_id:1 running 400/1363
	p_id:2 running 400/1363
	p_id:0 running 400/1363
	p_id:5 running 400/1363
	p_id:4 running 400/1363
	p_id:3 running 400/1363
	p_id:2 running 800/1363
	p_id:1 running 800/1363
	p_id:4 running 800/1363
	p_id:0 running 800/1363
	p_id:5 running 800/1363
	p_id:3 running 800/1363
	p_id:2 running 1200/1363
	p_id:1 running 1200/1363
	p_id:4 running 1200/1363
	p_id:0 running 1200/1363
	p_id:5 running 1200/1363
	p_id:3 running 1200/1363
	2 Done!!
	1 Done!!
	4 Done!!
	5 Done!!
	0 Done!!
	3 Done!!
	Best Combo (5, 8, 9, 10, 11, 12) | The best score is 0.65


Perform: sklearn standardize
	p_id:0 running 0/1363
	p_id:1 running 0/1363
	p_id:2 running 0/1363	p_id:3 running 0/1363

	p_id:5 running 0/1363	p_id:4 running 0/1363

	p_id:1 running 400/1363
	p_id:3 running 400/1363
	p_id:5 running 400/1363
	p_id:4 running 

In [16]:
# sklearn Standardize
# Best Combo (0, 1, 2, 3, 6, 7, 10, 11, 12) | The best score is 0.73
X_ori, y_ori = load('X_ori'), load('y_ori')
X,y = np.array(X_ori), np.array(y_ori)
X = StandardScaler().fit_transform(X.copy())
comb = (0, 1, 2, 3, 6, 7, 10, 11, 12)
X_shuff,y_shuff = shuffle(X,y)
model = LogisticRegression()
model.fit(X_shuff[:,comb], y_shuff)
ans = model.predict(X_shuff[:,comb])
acc = sum(ans == y_shuff) / len(y_shuff)
cross = cross_val_score(model, X_shuff[:,comb], y_shuff, cv=10)
print(acc, cross.mean(), cross)
print(ans)

0.7263888888888889 0.7208333333333334 [0.72222222 0.72222222 0.75       0.63888889 0.73611111 0.73611111
 0.72222222 0.79166667 0.76388889 0.625     ]
[1 1 1 1 1 1 0 0 0 1 1 1 0 1 1 0 1 1 1 0 1 0 0 1 1 1 1 0 0 0 1 1 1 0 1 0 1
 1 0 1 0 1 1 0 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 0 1 0 1 1 0 0 0 1 0 1 1 0 0
 1 1 1 1 1 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 0 0 0 1 1 0 1 0 1 1 1 1
 1 1 1 0 0 0 1 1 0 0 1 0 1 1 1 1 1 1 1 0 0 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1
 1 1 1 0 1 0 1 1 0 0 0 1 1 1 0 1 0 1 1 1 1 1 1 0 1 0 0 1 0 0 1 0 0 1 0 1 1
 1 1 1 0 1 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 1 1 0 1 1 1 1 0
 1 0 1 1 1 0 1 0 0 0 0 1 1 1 1 0 1 0 1 1 0 1 0 0 1 1 0 0 1 0 1 0 1 0 1 0 0
 0 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 0 1 1 1 0 1 0 1 1 0 1
 1 0 1 1 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 0 0 0 1 1 1 0 0 0 1 0 0 1 1 0 0 1 1 1 1 1 1 0 0 1 0 1 0 1 0 0 0
 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 0 0 1 1 1 0 1 0 0 1 1 1 0 1
 0 0 1 0 0 0 0 1 1 0 1 1

# Cut 17-23

In [17]:
for norm_type in [0,1,2,3]:
    X_ori, y_ori = load('X_ori_cut17-23'), load('y_ori_cut17-23')
    X,y = np.array(X_ori), np.array(y_ori)
    if(norm_type == _NORM_SK_NORM):
        print("Perform: sklearn normalize")
        X = normalize(X.copy(), axis=0)
    elif(norm_type == _NORM_SK_STD):
        print("Perform: sklearn standardize")
        X = StandardScaler().fit_transform(X.copy())
    elif(norm_type == _NORM_MY_NORM):
        print("Perform: my normalize")
        X = NormJa(X.copy().T).T
    elif(norm_type == _NORM_MY_STD):
        print("Perform: my standardize")
        X = StandardJa(X.copy().T).T
    X_shuff,y_shuff = shuffle(X,y)

    comb_list = []
    for feature_num in range(2,14):
        comb_list.extend(list(combinations(range(13),feature_num)))
    try:
        t_out = 60000
        pool = Pool()
        p_list = []
        ans_list = []
        for p_id in range(6):
            p_list.append(pool.apply_async( search, [p_id, X_shuff,y_shuff,comb_list[p_id::6]] ))
        for i in range(6):
            ans_list.append( p_list[i].get(timeout=t_out) )
        # ans_list
    finally:
        print("\t========= close ========")
        pool.close() 
        pool.terminate()
    
    best_score = -100
    best_grid = None
    best_comb = None
    for ans in ans_list:
        if(ans[0] > best_score):
            best_score = ans[0]
            best_comb = ans[1]
    print(f"\tBest Combo {best_comb} | The best score is {best_score:.2f}")
    print()
    print()

Perform: sklearn normalize
	p_id:0 running 0/1363	p_id:1 running 0/1363
	p_id:3 running 0/1363	p_id:2 running 0/1363


	p_id:5 running 0/1363	p_id:4 running 0/1363

	p_id:0 running 400/1363
	p_id:2 running 400/1363
	p_id:4 running 400/1363
	p_id:1 running 400/1363
	p_id:3 running 400/1363
	p_id:5 running 400/1363
	p_id:4 running 800/1363
	p_id:0 running 800/1363
	p_id:2 running 800/1363
	p_id:5 running 800/1363
	p_id:1 running 800/1363
	p_id:3 running 800/1363
	p_id:2 running 1200/1363
	p_id:0 running 1200/1363
	p_id:4 running 1200/1363
	p_id:5 running 1200/1363
	p_id:1 running 1200/1363
	p_id:3 running 1200/1363
	2 Done!!
	0 Done!!
	4 Done!!
	5 Done!!
	1 Done!!
	3 Done!!
	Best Combo (8, 9, 10, 11) | The best score is 0.63


Perform: sklearn standardize
	p_id:3 running 0/1363	p_id:0 running 0/1363	p_id:1 running 0/1363	p_id:2 running 0/1363



	p_id:5 running 0/1363	p_id:4 running 0/1363

	p_id:5 running 400/1363
	p_id:1 running 400/1363
	p_id:2 running 400/1363
	p_id:0 running 400/136

In [18]:
# sklearn standardized
# Best Combo (1, 3, 4, 5, 11, 12) | The best score is 0.75
X_ori, y_ori = load('X_ori_cut17-23'), load('y_ori_cut17-23')
X,y = np.array(X_ori), np.array(y_ori)
X = StandardJa(X.copy().T).T
comb = (1, 3, 4, 5, 11, 12)
X_shuff,y_shuff = shuffle(X,y)
model = LogisticRegression()
model.fit(X_shuff[:,comb], y_shuff)
ans = model.predict(X_shuff[:,comb])
acc = sum(ans == y_shuff) / len(y_shuff)
cross = cross_val_score(model, X_shuff[:,comb], y_shuff, cv=5)
print(acc, cross.mean(), cross)
print(ans)

0.7803030303030303 0.7727272727272728 [0.70454545 0.83333333 0.78787879 0.79545455 0.74242424]
[0 1 1 1 1 1 1 1 0 0 0 1 1 0 0 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 0 1
 1 0 1 0 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1
 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1
 0 0 1 1 0 0 1 0 1 1 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 0 0 1 1 1 0 1 1 1 1 0
 1 1 1 0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 0 0 0 0 1 0 0 1 0 1 1 0 0
 1 1 0 0 1 1 1 1 0 0 0 0 0 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0 1 1 1 0 1 1 1 0
 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 1 0 1 0 1 1 1 0 0 1 1 0 1
 0 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1
 0 0 1 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1 1 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 1 1
 1 1 1 0 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 0 1 1 1 1
 1 1 1 1 1 1 0 0 0 1 0 1 1 0 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1
 0 1 