In [1]:
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error as mse
import warnings
warnings.filterwarnings('ignore')
train_data = pd.read_csv('train.csv')
train_data_2 = pd.read_csv('train_2.csv')
test_data = pd.read_csv('test.csv')

### feature engineering

In [3]:
data = pd.concat([train_data,train_data_2,test_data],axis = 0).reset_index()
data.drop(['index','id'],axis = 1,inplace=True)

In [6]:
def l1(alpha,ratio,penalty):
    if penalty == 'l1':
        return(alpha)
    elif penalty == 'elasticnet':
        return(alpha * ratio)
    else:
        return(0)
    
def l2(alpha,ratio,penalty):
    if penalty == 'l2':
        return(alpha)
    elif penalty == 'elasticnet':
        return(alpha * (1-ratio))
    else:
        return(0)

In [7]:
data['l1'] = data.apply(lambda row: l1(row['alpha'],row['l1_ratio'],row['penalty']),axis = 1)
data['l2'] = data.apply(lambda row: l2(row['alpha'],row['l1_ratio'],row['penalty']),axis = 1)
data['n_clusters'] = data['n_classes'] * data['n_clusters_per_class']
data['num'] = data['n_samples'] * data['n_features']

In [8]:
def rep_job(x):
    if x == -1:
        return(16)
    else:
        return(x)
def rep_ratio(ratio,penalty):
    if penalty == 'elasticnet':
        return(ratio)
    else:
        return(0)

def rep_alpha(alpha,penalty):
    if penalty == 'none':
        return(0)
    else:
        return(alpha)

data['l1_ratio'] = data.apply(lambda row:rep_ratio(row['l1_ratio'],row['penalty']),axis = 1)
data['alpha'] = data.apply(lambda row: rep_alpha(row['alpha'],row['penalty']),axis = 1)

In [9]:
data['efficiency'] = data['max_iter'] / data['n_jobs']
data['complexity'] = data['num'] * data['max_iter']
data['local'] = data['n_samples'] * data['max_iter']
data['num*iter_job'] = data['num'] * data['max_iter'] / data['n_jobs']
data['num_class*flip*iter'] = data['num']/data['n_classes'] * data['flip_y'] * data['max_iter']
data['num_class*flip*iter'] = data['num']/data['n_classes'] * data['flip_y'] * data['max_iter']
data['feature * iter'] = data['n_features'] * data['max_iter']

In [10]:
train = data[:440]
test = data[-len(test_data):]

In [11]:
large = train[train['time'] >= 10]

### resampling

In [12]:
def bootstrap(dataframe,count):  
    n = len(dataframe)
    num_list = []
    init_num = np.random.randint(0,n,1)
    num_list.append(int(init_num[0]))
    df = pd.DataFrame(data.loc[init_num])
    for i in range(count - 1):
        num = np.random.randint(0,n,1)
        num_list.append(int(num[0]))
        new_df = pd.DataFrame(data.loc[num])
        df = pd.concat([df,new_df], axis = 0)
    return df, num_list

In [13]:
large2,numlist = bootstrap(large,100)
train = pd.concat([train,large],axis = 0)

In [14]:
from sklearn.utils import shuffle

In [15]:
train = shuffle(train)
train = train.reset_index()

In [16]:
data = pd.concat([train,test],axis =0)
data['time'] = data['time'].apply(lambda x: np.log(x))

In [17]:
data = pd.get_dummies(data)

In [18]:
for feature in data.columns:
    if feature != 'time'and feature != 'penalty_elasticnet'and feature != 'penalty_l1' and feature != 'penalty_l2' and feature != 'penalty_none':
        data[feature] = (data[feature] - data[feature].mean())/data[feature].std()

### ensemble learning

In [47]:
train = data[:400]
test = data[400:469]
train_X = train[[ 'flip_y', 'max_iter', 'n_classes', 'n_features', 'n_informative', 'n_jobs','n_samples', 'random_state',
                 'scale', 'l1', 'l2','n_clusters', 'num', 'efficiency', 'complexity', 'local',
                 'num*iter_job', 'num_class*flip*iter', 'feature * iter']]
test_X = test[[ 'flip_y', 'max_iter', 'n_classes', 'n_features', 'n_informative', 'n_jobs',
                 'n_samples', 'random_state', 'scale', 'l1', 'l2','n_clusters', 'num', 'efficiency', 'complexity', 'local',
                 'num*iter_job', 'num_class*flip*iter', 'feature * iter']]
train_y = train['time']
test_y = test['time']

In [29]:
lgbm1 = lgbm.LGBMRegressor(random_state= 24,n_estimators=1800,learning_rate= 0.05,max_depth= 4, num_leaves = 31,min_child_samples= 15,
                                reg_lambda=0.5).fit(train_X,train_y)
print(mse(train_y,lgbm1.predict(train_X)),mse(test_y,lgbm1.predict(test_X)))
lgbm2 = lgbm.LGBMRegressor(random_state= 48,n_estimators=500,learning_rate= 0.03,max_depth= 4, num_leaves = 31,min_child_samples= 20,
                                reg_lambda=0.5).fit(train_X,train_y)
print(mse(train_y,lgbm2.predict(train_X)),mse(test_y,lgbm2.predict(test_X)))
lgbm3 = lgbm.LGBMRegressor(random_state= 48,n_estimators = 500,learning_rate= 0.03,max_depth= 5, num_leaves = 31,min_child_samples= 20,
                                reg_lambda=0.5).fit(train_X,train_y)
print(mse(train_y,lgbm3.predict(train_X)),mse(test_y,lgbm3.predict(test_X)))
lgbm4 = lgbm.LGBMRegressor(random_state= 48,n_estimators = 900,learning_rate= 0.06,max_depth= 3, num_leaves = 31,min_child_samples= 20,
                                reg_lambda=0.1).fit(train_X,train_y)
print(mse(train_y,lgbm4.predict(train_X)),mse(test_y,lgbm4.predict(test_X)))
lgbm5 = lgbm.LGBMRegressor(random_state= 100,n_estimators=700,learning_rate= 0.04,max_depth= 3, num_leaves = 31,min_child_samples= 20,
                                reg_lambda=0.2).fit(train_X,train_y)
print(mse(train_y,lgbm5.predict(train_X)),mse(test_y,lgbm5.predict(test_X)))
lgbm6 = lgbm.LGBMRegressor(random_state= 35,n_estimators=500,learning_rate= 0.1,max_depth= 4, num_leaves = 31,min_child_samples= 20,
                                reg_lambda=0.1).fit(train_X,train_y)
print(mse(train_y,lgbm6.predict(train_X)),mse(test_y,lgbm6.predict(test_X)))

1.3693106114451201e-05 0.11487283936640692
0.00796126106062299 0.1254320832159813
0.006706676097157976 0.122270450568218
0.0011624594171199233 0.11555510962135854
0.004305042631167184 0.1162556661803873
0.000515958085899282 0.12385763887677033


In [37]:
test = data[469:]
test_X = test[[ 'flip_y', 'max_iter', 'n_classes', 'n_features', 'n_informative', 'n_jobs',
                 'n_samples', 'random_state', 'scale', 'l1', 'l2','n_clusters', 'num', 'efficiency', 'complexity', 'local',
                 'num*iter_job', 'num_class*flip*iter', 'feature * iter']]

In [39]:
y = (lgbm1.predict(test_X)+lgbm2.predict(test_X)+lgbm3.predict(test_X)+lgbm4.predict(test_X)+lgbm5.predict(test_X)+lgbm6.predict(test_X))/6

In [40]:
y = np.exp(y)
final = pd.DataFrame(y)
final = final.reset_index()
final.columns = ['id','time']
final.to_csv('submission.csv',index = None)