In [1]:
import numpy as np 
import pandas as pd 
import plotly.express as px
from matplotlib import pyplot as plt


import seaborn as sns
import numpy.matlib
import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator
from scipy import stats
from scipy.stats import norm
from joblib import Parallel, delayed
import shutil
import glob
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
from sklearn.metrics import mean_squared_error
np.random.seed(0)
import os
import torch
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts
from scipy.stats import norm, skew
# Load the main feature table; it holds both training and submission rows
# (submission rows are the ones with train_type == -1, see the split below).
all_data = pd.read_csv('../data/feature_data_530.csv')

# GloVe-derived feature tables for tags and titles, appended column-wise.
glove_tags = pd.read_csv('../data/alltags_feature.csv')
glove_title = pd.read_csv('../data/title_feature.csv')
all_data = pd.concat([all_data, glove_tags, glove_title], axis=1)

# Numeric columns that are checked for skewness.
columns = [
    'Title_len', 'Title_number', 'Alltags_len', 'Alltags_number',
    'photo_count', 'totalTags', 'totalGeotagged', 'totalFaves',
    'totalInGroup', 'photoCount', 'meanView', 'meanTags', 'meanFaves',
    'followerCount', 'followingCount',
]
skew_features = all_data[columns].apply(skew).sort_values(ascending=False)
high_skew = skew_features[skew_features.abs() > 0.75]
skew_index = high_skew.index
# log1p-transform every column whose absolute skewness exceeds 0.75.
for skewed_col in skew_index:
    all_data[skewed_col] = np.log1p(all_data[skewed_col])

# Columns removed before modelling.
useless_columns = (
    ['Pid', 'mean_label']
    + ['user_fe_{}'.format(i) for i in range(399)]
    + ['loc_fe_{}'.format(i) for i in range(400)]
)
all_data = all_data.drop(columns=useless_columns)

# Split on train_type and give each part a fresh 0..n-1 index.
train_all_data = all_data[all_data['train_type'] != -1].reset_index(drop=True)
submit_all_data = all_data[all_data['train_type'] == -1].reset_index(drop=True)

# After the split, all_data keeps only the model input columns.
feature_columns = ['train_type', 'label']
all_data = all_data.drop(columns=feature_columns)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Targets: the 'label' column of each split, kept as single-column frames.
train_label_df = train_all_data[['label']]
submit_label_df = submit_all_data[['label']]

# Inputs: everything except the bookkeeping columns listed in feature_columns.
train_feature_df = train_all_data.drop(columns=feature_columns)
submit_feature_df = submit_all_data.drop(columns=feature_columns)

# Sanity check: row counts per split and the number of feature columns.
print(len(train_feature_df), len(submit_feature_df), train_feature_df.shape[1])
print(len(train_label_df), len(submit_label_df), train_feature_df.shape[1])

305613 180581 682
305613 180581 682


In [3]:
# Columns handled as categorical variables (label-encoded below); every other
# column remaining in all_data is treated as numerical.
categories_columns = ['Uid', 'Category', 'Subcategory', 'Concept', 'Mediatype', 'hour', 'day', 'weekday', 'week_hour', 'year_weekday','Geoaccuracy', 'ispro' , 'Ispublic']

CAT_COLS = list(categories_columns)
NUM_COLS = [c for c in all_data.columns if c not in CAT_COLS]
FEATURES = CAT_COLS + NUM_COLS

# Fit one LabelEncoder per categorical column on the training rows only.
encoders = {}
for cat_col in CAT_COLS:
    label_enc = LabelEncoder()
    train_feature_df[cat_col] = label_enc.fit_transform(train_feature_df[cat_col])
    encoders[cat_col] = label_enc

# Apply the fitted encodings to the submission rows.
for cat_col in CAT_COLS:
    label_enc = encoders[cat_col]
    # classes_[i] is encoded as i, so the lookup table can be built directly
    # without another transform() call.
    le_dict = {cls: code for code, cls in enumerate(label_enc.classes_)}
    # Values unseen during training fall back to the "low_frequency" class when
    # it exists, otherwise to the most common encoded training value.
    if le_dict.get("low_frequency") is not None:
        default_val = le_dict["low_frequency"]
    else:
        default_val = train_feature_df[cat_col].mode().values[0]
    # Vectorised lookup: unknown values map to NaN and are then replaced by the
    # default (instead of a per-row Python lambda).
    submit_feature_df[cat_col] = (
        submit_feature_df[cat_col].map(le_dict).fillna(default_val).astype(int)
    )

# Clip numerical features in the submission set to the range seen in training.
# The bounds are computed once for all columns and applied in a single call
# rather than recomputing min/max and reassigning one column at a time.
train_min = train_feature_df[NUM_COLS].min()
train_max = train_feature_df[NUM_COLS].max()
submit_feature_df[NUM_COLS] = submit_feature_df[NUM_COLS].clip(
    lower=train_min, upper=train_max, axis=1
)





KeyboardInterrupt: 

In [None]:
# Number of distinct encoded values per categorical column and the position of
# each categorical column inside FEATURES.
cat_dims = train_feature_df[CAT_COLS].nunique().to_list()
cat_idxs = [FEATURES.index(cat_col) for cat_col in CAT_COLS]

# Embedding width grows with log(cardinality). ceil(log(1)) is 0, which would
# give a single-valued column a zero-width embedding, so clamp to at least 1.
cat_emb_dims = np.maximum(np.ceil(np.log(cat_dims)), 1).astype(int).tolist()

# Dense arrays in FEATURES column order for training and for the submission set.
X = train_feature_df[FEATURES].values
y = train_label_df['label'].values

X_test = submit_feature_df[FEATURES].values

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

# TabNet hyper-parameters; these module-level names are also read by the
# supervised training cell.
N_D = 64 #64 # 32
N_A = 64 # 32
N_INDEP = 1 #2
N_SHARED = 1 #2
N_STEPS = 3 #2
MASK_TYPE = "sparsemax"
GAMMA = 1.2
BS = 256
MAX_EPOCH =  30
# Set to True to run the unsupervised pretraining step below.
PRETRAIN = False


if PRETRAIN:
    # Imported here because it is only needed when pretraining is enabled.
    # Without this import the branch fails with NameError on TabNetPretrainer.
    from pytorch_tabnet.pretraining import TabNetPretrainer

    pretrain_params = dict(n_d=N_D, n_a=N_A, n_steps=N_STEPS,  #0.2,
                           n_independent=N_INDEP, n_shared=N_SHARED,
                           device_name = 'cuda',
                           cat_idxs=cat_idxs,
                           cat_dims=cat_dims,
                           cat_emb_dim=cat_emb_dims,
                           gamma=GAMMA,
                           lambda_sparse=0., optimizer_fn=torch.optim.Adam,
                           optimizer_params=dict(lr=2e-2),
                           mask_type=MASK_TYPE,
                           scheduler_params=dict(mode="min",
                                                 patience=3,
                                                 min_lr=1e-5,
                                                 factor=0.5,),
                           scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                           verbose=1,
                          )

    pretrainer = TabNetPretrainer(**pretrain_params)

    # The pretrainer is fitted on the submission features (X_test) and
    # evaluated on the training features (X).
    pretrainer.fit(X_train=X_test,
                   eval_set=[X],
                   max_epochs=MAX_EPOCH,
                   patience=25, batch_size=BS, virtual_batch_size=BS, #128,
                   num_workers=1, drop_last=True,
                   pretraining_ratio=0.5 # The bigger your pretraining_ratio the harder it is to reconstruct
                  )

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, mean_absolute_error, mean_squared_error
from scipy import stats

# Settings for the supervised run (BS and MAX_EPOCH override earlier values).
BS = 2048
MAX_EPOCH = 100
LAMBDA_SPARSE = 1e-5
LR = 1e-1  # 5e-2
N_SPLITS = 5
NB_FOLDS = 5  # max N_SPLITS

# One flattened array of submission-set predictions is appended per fold.
submit_proba = []
kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2020)

fold_nb = 1
for train_index, valid_index in kfold.split(X, y):
    # Slice features and targets for this fold; targets become column vectors.
    X_train = X[train_index]
    X_valid = X[valid_index]
    y_train = y[train_index].reshape(-1, 1)
    y_valid = y[valid_index].reshape(-1, 1)

    # The parameter dict is rebuilt for every fold.
    tabnet_params = {
        'n_d': N_D,
        'n_a': N_A,
        'n_steps': N_STEPS,
        'gamma': GAMMA,
        'n_independent': N_INDEP,
        'n_shared': N_SHARED,
        'lambda_sparse': LAMBDA_SPARSE,
        'seed': 0,
        'cat_idxs': cat_idxs,
        'cat_dims': cat_dims,
        'cat_emb_dim': cat_emb_dims,
        'mask_type': MASK_TYPE,
        'device_name': 'auto',
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {'lr': LR, 'weight_decay': 1e-5},
        'scheduler_params': {
            'mode': 'min',
            'factor': 0.5,
            'patience': 3,
            'is_batch_level': False,
        },
        'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
        'verbose': 1,
    }
    model = TabNetRegressor(**tabnet_params)

    # Start from the pretrainer only when pretraining was enabled.
    unsupervised_model = pretrainer if PRETRAIN else None
    model.fit(
        X_train=X_train,
        y_train=y_train,
        from_unsupervised=unsupervised_model,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_name=["train", "valid"],
        eval_metric=["mae"],
        batch_size=BS,
        virtual_batch_size=256,
        max_epochs=MAX_EPOCH,
        drop_last=True,
        pin_memory=True,
        patience=10,
    )

    # Score the held-out part of this fold.
    valid_pred = model.predict(X_valid).reshape(-1)
    valid_mse = mean_squared_error(y_valid, valid_pred)
    valid_mae = mean_absolute_error(y_valid, valid_pred)
    valid_src = stats.spearmanr(y_valid, valid_pred)[0]
    print("MSE: %.4f, MAE: %.4f, SRC: %.4f" % (valid_mse, valid_mae, valid_src))

    # Keep this fold's submission-set predictions as a flat array.
    test_preds = model.predict(X_test)
    submit_proba.append(test_preds.reshape(-1))

    # Stop once NB_FOLDS folds have been trained.
    fold_nb += 1
    if fold_nb > NB_FOLDS:
        break

In [None]:
# Inspect the per-fold submission predictions collected by the CV loop
# (a list with one flattened prediction array appended per trained fold).
submit_proba

In [9]:
# Stack the per-fold prediction arrays into a frame: each list element becomes
# one row, so rows are folds and columns are submission samples.
submit_proba_df = pd.DataFrame(submit_proba)
submit_proba_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,180571,180572,180573,180574,180575,180576,180577,180578,180579,180580
0,8.450766,10.253364,11.358452,6.567711,6.567711,6.567711,6.669056,6.567711,6.653739,6.7001,...,6.695781,4.314606,3.995066,6.005832,9.465634,7.45383,5.221391,7.82745,6.394236,4.845299
1,7.733644,11.625751,9.973303,6.683363,6.683363,6.683363,6.689012,6.683363,6.775867,6.78211,...,4.566028,4.566028,4.450768,5.584325,10.464658,7.01316,6.180557,7.90469,7.008329,5.010685
2,8.111763,9.890415,10.094333,6.369346,6.369346,6.369346,6.717002,6.369346,6.773114,6.657667,...,7.529596,7.812297,7.555118,6.818549,11.348025,7.565997,7.882134,7.084733,6.603564,7.008011
3,7.525281,10.323838,10.341242,6.582438,6.582438,6.582438,6.582438,6.582438,6.582438,6.582438,...,6.683481,6.683481,6.215586,6.303994,12.324958,6.097549,7.779709,7.664347,8.275883,6.157398
4,8.174033,13.233686,13.113686,6.698243,6.698243,6.698243,6.698243,6.698243,6.659917,6.659917,...,8.089007,8.089007,8.086403,5.490835,10.057899,6.744744,7.853061,7.510925,7.215281,8.044188


In [11]:
# Ensemble the folds: column-wise mean gives one averaged prediction per sample.
submit_ans = submit_proba_df.mean(axis=0)
submit_ans

0          7.999097
1         11.065412
2         10.976202
3          6.580220
4          6.580220
            ...    
180576     6.975056
180577     6.983370
180578     7.598428
180579     7.099459
180580     6.213116
Length: 180581, dtype: float32

In [12]:
# Write the averaged predictions as a single-column CSV (header row, no index).
submit_ans_frame = pd.DataFrame(submit_ans)
submit_ans_frame.to_csv('./TabNet.csv', header=True, index=False)

In [None]:
import json

# 'Pid' was dropped from the feature table during loading and submit_label_df
# only holds 'label', so the post ids are re-read from the source file.
# NOTE(review): assumes 'Pid' and 'train_type' both live in
# feature_data_530.csv and that the row order matches submit_all_data - confirm.
submit_ids = pd.read_csv('../data/feature_data_530.csv', usecols=['Pid', 'train_type'])
submit_pids = submit_ids.loc[submit_ids['train_type'] == -1, 'Pid'].reset_index(drop=True)
assert len(submit_pids) == len(submit_ans), \
    "number of submission ids does not match number of predictions"

result = pd.DataFrame()
result['post_id'] = 'post' + submit_pids.astype(str)
# Widen to float64 before rounding so the JSON holds clean 4-decimal values
# instead of float32 representation noise; positional assignment avoids any
# dependence on submit_ans' index.
result['popularity_score'] = np.asarray(submit_ans, dtype='float64').round(decimals=4)

out_json = dict()
out_json["version"] = "VERSION 1.0"
out_json["result"] = result.to_dict(orient='records')
# NOTE(review): the "catboost" detail string and the output file name look
# carried over from another notebook although these are TabNet predictions;
# left unchanged because downstream consumers may rely on them - confirm.
out_json["external_data"] = {"used": "true", "details": "catboost"}
# Context manager guarantees the file is flushed and closed even on error.
with open('KFold_catboost.json', "w") as f:
    json.dump(out_json, f)

In [15]:
# Scratch cell: resets the throw-away list `ss` and re-predicts the submission
# set with whichever `model` object is currently bound in the kernel.
# NOTE(review): depends on `model` and `X_test` left over from earlier cells.
ss = []
test_preds = model.predict(X_test)

In [24]:
from scipy import stats
from sklearn.metrics import f1_score, mean_absolute_error, mean_squared_error

# Re-score the validation split using the `model`, `X_valid` and `y_valid`
# objects currently bound in the kernel.
valid_pred = model.predict(X_valid).reshape(-1)

valid_mse = mean_squared_error(y_valid, valid_pred)
valid_mae = mean_absolute_error(y_valid, valid_pred)
valid_src = stats.spearmanr(y_valid, valid_pred)[0]

print("MSE: %.4f, MAE: %.4f, SRC: %.4f" % (valid_mse, valid_mae, valid_src))

MSE: 1.0599, MAE: 0.5958, SRC: 0.9169


In [21]:
# Display the submission predictions flattened to one dimension.
test_preds.reshape(-1)

array([ 8.622696 , 12.716267 , 11.696694 , ...,  6.3496876, 10.743238 ,
        2.0040984], dtype=float32)

In [22]:
# Scratch check: a list holding the flattened predictions twice, then shown.
ss = [test_preds.reshape(-1), test_preds.reshape(-1)]
ss

[array([ 8.622696 , 12.716267 , 11.696694 , ...,  6.3496876, 10.743238 ,
         2.0040984], dtype=float32),
 array([ 8.622696 , 12.716267 , 11.696694 , ...,  6.3496876, 10.743238 ,
         2.0040984], dtype=float32)]

In [20]:
# Display the shape of the raw prediction array returned by model.predict.
test_preds.shape

(180581, 1)