In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
def imputation(df):
    """Replace missing values with column means and report one imputed cell.

    Args:
        df: Tabular numeric data in which missing entries are ``np.nan``.
            The imputed result is indexed at position (1, 23), so it must
            have at least 2 rows and 24 columns.

    Returns:
        The imputed data as returned by ``SimpleImputer.fit_transform``.
    """
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp_data = imputer.fit_transform(df)
    # Report the value at row 1, column 23 with exactly 6 decimal places,
    # using the same fixed-point formatting as the other pipeline stages.
    print("{:.6f}".format(imp_data[1, 23]))
    return imp_data
    
    
def split_train_test(imp_data):
    """Separate features from the label and make a stratified 70/30 split.

    The last column of ``imp_data`` is used as the label and all preceding
    columns as features. The split keeps 70% of the rows for training, is
    seeded with ``random_state=0`` and is stratified on the label.

    Prints the value at position (0, 0) of the training features.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features, labels = imp_data[:, :-1], imp_data[:, -1]

    parts = train_test_split(
        features,
        labels,
        train_size=0.7,
        random_state=0,
        stratify=labels,
    )
    X_train, X_test, y_train, y_test = parts

    print(X_train[0, 0])
    return X_train, X_test, y_train, y_test
    
    
def normalization(X_train, X_test, y_train, y_test):
    """Standardize the features with a scaler fitted on the training set only.

    The ``StandardScaler`` is fitted on ``X_train`` and then applied to both
    ``X_train`` and ``X_test``. The labels are passed through untouched.

    Prints the value at position (0, 0) of the scaled test features with
    7 decimal places.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` holding the scaled
        feature sets and the original labels.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    scaled_train = fitted_scaler.transform(X_train)
    scaled_test = fitted_scaler.transform(X_test)

    print(format(scaled_test[0, 0], ".7f"))
    return scaled_train, scaled_test, y_train, y_test


def dimension_reduction_LR(X_train, X_test, y_train, y_test):
    """Keep only the features an L1-penalised logistic regression retains.

    A ``LogisticRegression`` with ``penalty='l1'``, ``solver='liblinear'``,
    ``random_state=0`` and ``C=0.01`` is fitted on the training data. Every
    feature whose coefficient in ``coef_[0]`` is exactly zero is dropped
    from both ``X_train`` and ``X_test``.

    Per the assignment notes, C was tuned by trial and error (1, 0.1, 0.01,
    0.001) so that two predictors survive. The code itself keeps however
    many coefficients are non-zero.

    Prints the regularisation parameter C, then the array of non-zero
    coefficients.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` where the feature
        sets contain only the retained columns.
    """
    selector = LogisticRegression(
        penalty='l1',
        solver='liblinear',
        random_state=0,
        C=0.01,
    )
    selector.fit(X_train, y_train)
    print(selector.C)

    # Column positions of the features that the L1 penalty did not zero out.
    weights = selector.coef_[0]
    kept = np.flatnonzero(weights)
    print(weights[kept])

    return X_train[:, kept], X_test[:, kept], y_train, y_test
    
def SVM(X_train, X_test, y_train, y_test):
    """Fit a support vector classifier and print train and test accuracy.

    The classifier is built with ``gamma=0.2`` and ``random_state=0``; the
    kernel and C are left at the library defaults (documented by
    scikit-learn as ``'rbf'`` and ``1.0``).

    Prints the training accuracy followed by the test accuracy, each with
    6 decimal places. Returns nothing.
    """
    classifier = SVC(gamma=0.2, random_state=0)
    classifier.fit(X_train, y_train)

    # Training accuracy first, then test accuracy.
    for features, labels in ((X_train, y_train), (X_test, y_test)):
        print(format(classifier.score(features, labels), ".6f"))

def test_0(df):
    """Run the pipeline through the imputation step only."""
    imputation(df)
    
def test_1(df):
    """Run the pipeline through the train/test split step."""
    split_train_test(imputation(df))
    
def test_2(df):
    """Run the pipeline through the normalization step."""
    splits = split_train_test(imputation(df))
    normalization(*splits)
    
def test_3(df):
    """Run the pipeline through the feature-selection step."""
    splits = split_train_test(imputation(df))
    scaled = normalization(*splits)
    dimension_reduction_LR(*scaled)
    
def test_4(df):
    """Run the full pipeline, ending with the SVM accuracy report."""
    splits = split_train_test(imputation(df))
    scaled = normalization(*splits)
    reduced = dimension_reduction_LR(*scaled)
    SVM(*reduced)

if __name__ == '__main__':
    # Local import: only the script entry point needs it.
    from ast import literal_eval

    # Input protocol on stdin: the test id, the number of data rows, a
    # comma-separated header line, then one comma-separated line per row.
    test_id = int(input().strip())
    row_num = int(input().strip())
    col_names = input().split(',')

    # Column 0 is dropped, the last header column is the label and every
    # column in between is a numeric feature. For the 66-column input this
    # gives features 1..64 and label 65.
    label_idx = len(col_names) - 1

    Data = []
    for _ in range(row_num):
        line = input().split(',')
        for j in range(1, label_idx):
            # Empty fields are missing values; they become NaN so that
            # imputation() can fill them in.
            line[j] = float(line[j]) if line[j] != '' else np.nan
        # SECURITY: this field was previously passed to eval(), which would
        # execute arbitrary code read from stdin. literal_eval accepts the
        # same literal forms (e.g. 0, 0.0, '0', b'0') without running code.
        line[label_idx] = int(literal_eval(line[label_idx].strip()))
        Data.append(line[1:])
    df = pd.DataFrame(Data, columns=col_names[1:])

    # Each test id runs the pipeline up to and including one more stage.
    # Unknown ids are ignored.
    runners = {0: test_0, 1: test_1, 2: test_2, 3: test_3, 4: test_4}
    runner = runners.get(test_id)
    if runner is not None:
        runner(df)