In [2]:
## import packages
import time
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import gc
import time
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb

In [None]:
## import data 
start_time = time.time()

# NOTE(review): absolute, machine-specific path; the file is read as tab-separated text.
train_raw = pd.read_csv('/home/bsong/Python_Stuff/Data/Kaggle_Mercari/train.tsv',delimiter= '\t')
#test_raw = pd.read_csv('/home/bsong/Python_Stuff/Data/Kaggle_Mercari/test.tsv', delimiter='\t')
#train_raw = train_raw.iloc[0:10000,] # just a bit

# Shuffle the rows once so that the positional train/validation/test slices taken
# later in the notebook are random samples. The seed is fixed so that a
# Restart & Run All reproduces the same row order, and therefore the same splits.
SHUFFLE_SEED = 0
train_raw = train_raw.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Target: log1p(price), then standardised to zero mean / unit variance.
# mean_price_norm and std_price_norm are kept so predictions can be mapped back with `unwind`.
normalized_price = np.log1p(train_raw['price'].values)
mean_price_norm = np.mean(normalized_price)
std_price_norm = np.std(normalized_price) 
price_y = (normalized_price - mean_price_norm)/std_price_norm 

end_time = time.time()
print('import data took ' + str(end_time - start_time) + " seconds.")

In [None]:
# Number of most frequent brand names kept by `cutting`; every other brand is relabelled 'missing'.
NUM_BRANDS = 4004
# Presumably the cap on distinct `category_name` values to keep -- TODO confirm how it is meant to be applied.
NUM_CATEGORIES = 1001
# `min_df` passed to the CountVectorizer fitted on the `name` column.
NAME_MIN_DF = 10
# `max_features` passed to the TfidfVectorizer fitted on the `item_description` column.
MAX_FEATURES_ITEM_DESCRIPTION = 39000

def handle_missing_inplace(dataset):
    """Replace missing values in the text columns with the string 'missing'.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'category_name', 'brand_name' and
        'item_description'. Other columns are left untouched.
    """
    for column in ('category_name', 'brand_name', 'item_description'):
        # Assign the filled column back to the frame. Calling
        # `dataset[column].fillna(..., inplace=True)` acts on an intermediate
        # Series and does not update `dataset` when pandas copy-on-write is active.
        dataset[column] = dataset[column].fillna('missing')


def cutting(dataset):
    """Collapse rare brands and categories into the label 'missing', in place.

    Keeps the NUM_BRANDS most frequent `brand_name` values and the
    NUM_CATEGORIES most frequent `category_name` values (the existing
    'missing' label is excluded from both rankings); every other value is
    overwritten with 'missing'. Nothing is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain string columns 'brand_name' and 'category_name'.
    """
    brand_counts = dataset['brand_name'].value_counts()
    pop_brand = brand_counts.loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
    dataset.loc[~dataset['brand_name'].isin(pop_brand), 'brand_name'] = 'missing'

    # Categories are capped with NUM_CATEGORIES; the previous code reused NUM_BRANDS here.
    category_counts = dataset['category_name'].value_counts()
    pop_category = category_counts.loc[lambda x: x.index != 'missing'].index[:NUM_CATEGORIES]
    dataset.loc[~dataset['category_name'].isin(pop_category), 'category_name'] = 'missing'


def to_categorical(dataset):
    """Cast the discrete feature columns to pandas 'category' dtype, in place.

    Converts 'category_name', 'brand_name' and 'item_condition_id' (in that
    order); all other columns keep their dtype. Nothing is returned.
    """
    for column in ('category_name', 'brand_name', 'item_condition_id'):
        dataset[column] = dataset[column].astype('category')

## post-prediction functions

def rmsle(h, y):
    """Root mean squared logarithmic error between predictions `h` and targets `y`.

    Both inputs are shifted by +1 before the log so that a value of 0 is valid.
    The two inputs are combined with ordinary NumPy broadcasting.
    """
    log_gap = np.log(h + 1) - np.log(y + 1)
    return np.sqrt(np.mean(np.square(log_gap)))

def rmse(h,y):
    """Root mean squared error between predictions `h` and targets `y`."""
    residual = h - y
    return np.sqrt(np.mean(np.square(residual)))

def unwind(preds, mean_,std_, norm_ = True):
    """Undo the standardisation of `preds`, and optionally the log1p transform.

    preds : values on the standardised scale
    mean_, std_ : the mean and standard deviation used to standardise
    norm_ : when equal to True, the de-standardised values are additionally
            passed through np.expm1 (the inverse of np.log1p)
    """
    rescaled = preds*std_ + mean_
    return np.expm1(rescaled) if norm_ == True else rescaled

def optimal_weights_ensemble(Xs, y):
    """Least-squares blending weights for a set of model predictions.

    Finds the weight vector w minimising ||Xs @ w - y||^2 (no intercept and no
    constraint on the weights).

    Parameters
    ----------
    Xs : array of shape [n_samples, n_models]
        One column of predictions per model, on the same scale as `y`
        (i.e. the scale on which the RMSE will be computed).
    y : array-like with n_samples elements
        True values; reshaped to a column internally.

    Returns
    -------
    ndarray of shape [n_models, 1]
        Column of weights, so that `np.matmul(Xs, w)` gives the blended prediction.
    """
    Xs = np.asarray(Xs)
    y = np.reshape(y,(-1,1)) # to make it shape [n,1]

    # Solve the least-squares problem directly rather than forming inv(Xs^T Xs):
    # predictions from different models are often nearly collinear, which makes
    # the explicit inverse unstable or outright singular. lstsq returns the same
    # weights in the full-rank case and the minimum-norm solution otherwise.
    w_, _residuals, _rank, _singular_values = np.linalg.lstsq(Xs, y, rcond=None)
    return w_

In [None]:
# Build the sparse feature matrix: clean `train_raw` in place, vectorise each
# column separately, then stack the pieces column-wise. Each step prints the
# seconds elapsed since this cell started.
start_time = time.time()

# The next three helpers modify `train_raw` in place, so re-running this cell
# operates on the already-processed frame rather than on the raw data.
handle_missing_inplace(train_raw)
print('[{}] Finished to handle missing'.format(time.time() - start_time))

cutting(train_raw)
print('[{}] Finished to cut'.format(time.time() - start_time))

to_categorical(train_raw)
print('[{}] Finished to convert categorical'.format(time.time() - start_time))

# Token counts for the item name, keeping only terms with document frequency >= NAME_MIN_DF.
cv = CountVectorizer(min_df=NAME_MIN_DF)
X_name = cv.fit_transform(train_raw['name'])
print('[{}] Finished count vectorize `name`'.format(time.time() - start_time))

# NOTE(review): `cv` is rebound here, so the vectorizer fitted on `name` is no
# longer reachable after this line (e.g. to transform other data with it).
cv = CountVectorizer()
X_category = cv.fit_transform(train_raw['category_name'])
print('[{}] Finished count vectorize `category_name`'.format(time.time() - start_time))

# TF-IDF over 1- to 3-grams of the description, English stop words removed,
# capped at MAX_FEATURES_ITEM_DESCRIPTION features.
tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                     ngram_range=(1, 3),
                     stop_words='english')
X_description = tv.fit_transform(train_raw['item_description'])
print('[{}] Finished TFIDF vectorize `item_description`'.format(time.time() - start_time))

# Sparse indicator columns for the brand label.
lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(train_raw['brand_name'])
print('[{}] Finished label binarize `brand_name`'.format(time.time() - start_time))

# NOTE(review): `item_condition_id` was cast to category above, so get_dummies
# expands it; `shipping` is presumably numeric and would then pass through as a
# single column rather than being expanded -- confirm this is intended.
X_dummies = csr_matrix(pd.get_dummies(train_raw[['item_condition_id', 'shipping']],
                                      sparse=True).values)
print('[{}] Finished to get dummies on `item_condition_id` and `shipping`'.format(time.time() - start_time))

# Final column order: dummies, description, brand, category, name. Rows stay in `train_raw` order.
sparse_train_raw = hstack((X_dummies, X_description, X_brand, X_category, X_name)).tocsr()
print('[{}] Finished to create sparse train_raw'.format(time.time() - start_time))


In [None]:
## pickle block
# The triple-quoted string below is disabled code (a bare string literal, never executed).
# It shows how the two pickle files read further down were written.
'''
import pickle

pickle.dump(sparse_train_raw, open('/home/bsong/Python_Stuff/Data/Kaggle_Mercari/sparse_features.pkl','wb'))
pickle.dump(price_y, open('/home/bsong/Python_Stuff/Data/Kaggle_Mercari/labels_for_sparse_features.pkl','wb'))
'''

## read pickles here
# NOTE(review): these loads overwrite the `sparse_train_raw` and `price_y` computed
# earlier in the notebook with whatever the files contain. Features and labels must
# come from the same dump, since rows are matched purely by position.
# pickle.load can execute arbitrary code; only load files you created yourself.
import pickle
with open('/home/bsong/Python_Stuff/Data/Kaggle_Mercari/sparse_features.pkl','rb') as pickle_in:
    sparse_train_raw = pickle.load(pickle_in)

with open('/home/bsong/Python_Stuff/Data/Kaggle_Mercari/labels_for_sparse_features.pkl','rb') as pickle_in:
    price_y = pickle.load(pickle_in)


In [None]:
# Positional three-way split. The last 5% of rows are held out as the test
# set; the remaining rows are divided 90/10 into training and validation.
test_ind = round(0.95*sparse_train_raw.shape[0])
X_not_test, X_test = sparse_train_raw[:test_ind], sparse_train_raw[test_ind:]
Y_not_test, Y_test = price_y[:test_ind], price_y[test_ind:]

split_ratio = 0.9
ind_split = round(split_ratio*X_not_test.shape[0])
X_train, X_val = X_not_test[:ind_split], X_not_test[ind_split:]
Y_train, Y_val = Y_not_test[:ind_split], Y_not_test[ind_split:]

In [None]:
def normalize_or_standardize(dat, norm_ = 'standardize'):
    """Scale `dat` and return the scaled values with the statistics used.

    Works on the data passed in rather than on notebook globals, and returns
    its result (the previous version computed from `normalized_price` and
    returned nothing).

    Parameters
    ----------
    dat : array-like of numbers
    norm_ : str
        'standardize' -- subtract the mean and divide by the standard deviation.
        'normalize'   -- apply np.log1p first, then standardize as above.
        'none'        -- return the values unchanged.

    Returns
    -------
    (scaled, mean_norm, std_norm)
        `scaled` is an ndarray; `mean_norm` and `std_norm` are the statistics
        that were subtracted / divided by (0.0 and 1.0 for 'none'), so that
        `scaled*std_norm + mean_norm` (followed by np.expm1 for 'normalize')
        recovers `dat`.

    Raises
    ------
    ValueError
        If `norm_` is not one of the three modes above.
    """
    if norm_ not in ('none', 'standardize', 'normalize'):
        raise ValueError('The parameter "norm_" must be string "none", "standardize", or "normalize".')

    values = np.asarray(dat, dtype=float)
    if norm_ == 'none':
        return values, 0.0, 1.0
    if norm_ == 'normalize':
        values = np.log1p(values)

    mean_norm = np.mean(values)
    std_norm = np.std(values)
    return (values - mean_norm)/std_norm, mean_norm, std_norm
        
def blend_lgb_ridge(X_train, Y_train, X_test, Y_test, norm_ = 'none'):
    '''
    Fit LightGBM and Ridge on the same training data and return both models'
    predictions for X_test, side by side.

    Make sure Y_train and Y_test arent scaled when norm_ is 'standardize' or
    'normalize'; the scaling is done here, before fitting, and undone on the
    predictions.

    Parameters
    ----------
    X_train, X_test : feature matrices accepted by lgb.Dataset and Ridge
    Y_train : training target
    Y_test : not used by this function; kept so existing calls keep working.
             Score the returned predictions against it separately.
    norm_ : 'none'        -- fit on Y_train as given
            'standardize' -- fit on (Y_train - mean)/std
            'normalize'   -- fit on standardized np.log1p(Y_train)

    Returns
    -------
    ndarray of shape [n_test, 2]
        Column 0 holds the LightGBM predictions, column 1 the Ridge
        predictions, both mapped back to the scale of Y_train as passed in.

    Raises
    ------
    ValueError
        If norm_ is not one of the three accepted strings. This is checked
        before any model is trained.
    '''
    # Validate first: an explicit check is not stripped by `python -O`, does not
    # swallow unrelated errors, and avoids training 3000 rounds on a typo.
    set_norm_terms = set(('none','standardize','normalize'))
    if norm_ not in set_norm_terms:
        raise ValueError('Possible typo: The parameter "norm_" must be string "none", "standardize", or "normalize".')

    # Scale the target BEFORE fitting, using statistics of Y_train itself
    # rather than notebook-level globals.
    target = Y_train
    if norm_ != 'none':
        target = np.asarray(Y_train, dtype=float)
        if norm_ == 'normalize':
            target = np.log1p(target)
        mean_price_norm = np.mean(target)
        std_price_norm = np.std(target)
        target = (target - mean_price_norm)/std_price_norm

    # lgb regression
    lgb_dat = lgb.Dataset(X_train, label = target)
    params = {
        'learning_rate': 0.75,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 100,
        'verbosity': -1,
        'metric': 'RMSE',
        }

    # `verbose_eval` is not passed: it only controls logging for validation
    # sets (none are given here) and is rejected by LightGBM >= 4.0.
    model_lgb = lgb.train(params, train_set = lgb_dat, num_boost_round = 3000)
    preds_lgb_test = model_lgb.predict(X_test)

    # ridge regression
    model_ridge = Ridge(solver="sag", fit_intercept=True)
    model_ridge.fit(X_train, target)
    preds_ridge_test = model_ridge.predict(X_test)

    # Map predictions back to the scale of the Y_train that was passed in.
    if norm_ != 'none':
        preds_lgb_test = preds_lgb_test*std_price_norm + mean_price_norm
        preds_ridge_test = preds_ridge_test*std_price_norm + mean_price_norm
        if norm_ == 'normalize':
            preds_lgb_test = np.expm1(preds_lgb_test)
            preds_ridge_test = np.expm1(preds_ridge_test)

    return np.hstack((np.reshape(preds_lgb_test, (-1,1)), np.reshape(preds_ridge_test, (-1,1))))


In [11]:
# Scratch check of the `norm_` validation message, using a deliberately misspelled value.
norm_ = 'none1'

set_norm_terms = set(('none','standardize','normalize'))
# Plain membership test instead of `assert` inside a bare `except`: asserts are
# removed under `python -O`, and a bare except would also hide unrelated errors.
if norm_ not in set_norm_terms:
    print('Possible typo: The parameter "norm_" must be string "none", "standardize", or "normalize".')

Possible typo: The parameter "norm_" must be string "none", "standardize", or "normalize".


In [None]:
# lightgbm step

lgb_dat = lgb.Dataset(X_train, label = Y_train)
#lgb_dat1 = lgb.Dataset(X_train, label= Y_train, max_bin=8192)
# NOTE(review): with max_depth 3 a tree can have at most 8 leaves, so
# num_leaves = 100 is never reached -- confirm which limit is intended.
params = {
    'learning_rate': 0.75,
    'application': 'regression',
    'max_depth': 3,
    'num_leaves': 100,
    'verbosity': -1,
    'metric': 'RMSE',
    }

# `verbose_eval` is not passed: it only controls logging for validation sets
# (none are given here) and is rejected by LightGBM >= 4.0.
model_lgb = lgb.train(params, train_set = lgb_dat, num_boost_round = 3000)
preds_lgb_val = model_lgb.predict(X_val)
preds_lgb_test = model_lgb.predict(X_test)

In [None]:
# ridge regression
# Fit on the training split (fit returns the estimator itself), then predict
# for the validation and test splits with the same fitted model.
model_ridge = Ridge(fit_intercept=True, solver="sag").fit(X_train, Y_train)

preds_ridge_val, preds_ridge_test = (
    model_ridge.predict(features) for features in (X_val, X_test)
)

In [None]:
## pickle block numbah 2
# The triple-quoted string below is disabled code (a bare string literal, never executed).
'''
comb_pred_val = np.hstack((np.reshape(preds_lgb_val, (-1,1)),np.reshape(preds_ridge_val,(-1,1))))
comb_pred_test = np.hstack((np.reshape(preds_lgb_test, (-1,1)),np.reshape(preds_ridge_test,(-1,1))))

pickle.dump(comb_pred_val, open('/home/bsong/Python_Stuff/Data/Kaggle_Mercari/comb_pred_val.pkl','wb'))
pickle.dump(comb_pred_test, open('/home/bsong/Python_Stuff/Data/Kaggle_Mercari/comb_pred_test.pkl','wb'))
'''

# Persist both fitted models. The `with` blocks close (and flush) each file
# handle as soon as the dump finishes instead of leaving it open.
with open('/home/bsong/Python_Stuff/Data/Kaggle_Mercari/model_lgb.pkl','wb') as pickle_out:
    pickle.dump(model_lgb, pickle_out)
with open('/home/bsong/Python_Stuff/Data/Kaggle_Mercari/model_ridge.pkl','wb') as pickle_out:
    pickle.dump(model_ridge, pickle_out)

In [None]:
## ensemble 

# Map the standardised log-price predictions and labels back to the price scale
# (unwind with its default norm_=True also applies np.expm1).
preds_lgb_val_unw = unwind(preds_lgb_val, mean_price_norm, std_price_norm)
preds_lgb_test_unw = unwind(preds_lgb_test, mean_price_norm, std_price_norm)
preds_ridge_val_unw = unwind(preds_ridge_val, mean_price_norm, std_price_norm)
preds_ridge_test_unw = unwind(preds_ridge_test, mean_price_norm, std_price_norm)

true_val_unw = unwind(Y_val, mean_price_norm, std_price_norm)
true_test_unw = unwind(Y_test, mean_price_norm, std_price_norm)


rmsle_lgb_val = rmsle(preds_lgb_val_unw, true_val_unw)
rmsle_lgb_test = rmsle(preds_lgb_test_unw, true_test_unw)
rmsle_ridge_val = rmsle(preds_ridge_val_unw, true_val_unw)
rmsle_ridge_test = rmsle(preds_ridge_test_unw, true_test_unw)

print(rmsle_lgb_val)
print(rmsle_lgb_test)
print(rmsle_ridge_val)
print(rmsle_ridge_test)

# No visible cell creates `preds_rf_val` / `preds_rf_test`, so referencing them
# unconditionally raises NameError on a fresh Run All. They are scored only
# when an earlier cell has actually defined them; the print order is unchanged.
if 'preds_rf_val' in globals() and 'preds_rf_test' in globals():
    preds_rf_val_unw = unwind(preds_rf_val, mean_price_norm, std_price_norm)
    preds_rf_test_unw = unwind(preds_rf_test, mean_price_norm, std_price_norm)
    rmsle_rf_val = rmsle(preds_rf_val_unw, true_val_unw)
    rmsle_rf_test = rmsle(preds_rf_test_unw, true_test_unw)
    print(rmsle_rf_val)
    print(rmsle_rf_test)


In [None]:
# Fixed-weight blend of the unwound predictions: 60% LightGBM, 40% Ridge.
lgb_share, ridge_share = 0.6, 0.4
preds_blend_val = lgb_share*preds_lgb_val_unw + ridge_share*preds_ridge_val_unw
preds_blend_test = lgb_share*preds_lgb_test_unw + ridge_share*preds_ridge_test_unw

# Score the blend on the validation and test splits.
rmsle_blend_val = rmsle(preds_blend_val, true_val_unw)
rmsle_blend_test = rmsle(preds_blend_test, true_test_unw)

print(rmsle_blend_val, rmsle_blend_test, sep='\n')

In [None]:
def blending(combined_pred, true_labels , verbose = True):
    """Fit least-squares blending weights and report the RMSE of the blend.

    combined_pred : array with one column of predictions per model
    true_labels   : true values, on the same scale as the predictions
    verbose       : when equal to True, print the weights and the RMSE

    Returns the RMSE of the blended prediction against `true_labels`
    (the blended predictions themselves are not returned). The weights are
    fitted and scored on the same data passed in.
    """
    best_w = optimal_weights_ensemble(combined_pred,true_labels)
    # matmul gives a column; squeeze drops the trailing length-1 axis before scoring
    blended = np.squeeze(np.matmul(combined_pred, best_w))
    best_rmse = rmse(blended, true_labels)

    #technically not "best" due to computations, but prob will be off by 1e-4 rmse to the best
    if verbose == True:
        print('best weights in order of combined regressors: ' + str(best_w),
              ' ',
              'RMSE of the ensemble: ' + str(best_rmse),
              sep='\n')
    return best_rmse

In [None]:
## blending method start here

# norm_=False undoes only the standardisation, so everything below stays in log1p(price) space.
logpreds_lgb_val_unw, logpreds_lgb_test_unw, logpreds_ridge_val_unw, logpreds_ridge_test_unw = (
    unwind(scaled, mean_price_norm, std_price_norm, norm_= False)
    for scaled in (preds_lgb_val, preds_lgb_test, preds_ridge_val, preds_ridge_test)
)

logtrue_val_unw, logtrue_test_unw = (
    unwind(scaled, mean_price_norm, std_price_norm, norm_ = False)
    for scaled in (Y_val, Y_test)
)

# One column per model, then least-squares weights fitted on the validation split.
combined_predictions = np.concatenate(
    (np.reshape(logpreds_lgb_val_unw,(-1,1)), np.reshape(logpreds_ridge_val_unw,(-1,1))),
    axis=1,
)
best_w = optimal_weights_ensemble(combined_predictions,logtrue_val_unw)

final_prediction = combined_predictions @ best_w

# rmse because the values are already log'd; note it is scored on the same split the weights were fitted on
rmsle_blended = rmse(np.squeeze(final_prediction), logtrue_val_unw)
print(rmsle_blended)


In [None]:
## blending method start here

# Same least-squares blend, this time fitted on the unwound validation predictions.
combined_predictions = np.concatenate(
    (np.reshape(preds_lgb_val_unw,(-1,1)), np.reshape(preds_ridge_val_unw,(-1,1))),
    axis=1,
)
best_w = optimal_weights_ensemble(combined_predictions,true_val_unw)
print('best weights: ' + str(best_w))

# Blend with the fitted weights, and with the fixed 0.6 / 0.4 weights for comparison.
final_prediction = combined_predictions @ best_w
fp2 = combined_predictions @ np.array([[.6],[.4]])

# squeeze drops the trailing length-1 axis of each matmul result before scoring (fixes memory issues)
rmsle_blended = rmsle(np.squeeze(final_prediction), true_val_unw)
rmsle_b2 = rmsle(np.squeeze(fp2),true_val_unw)
print(rmsle_blended, rmsle_b2, sep='\n')
