In [1]:
import pandas as pd
import numpy as np
from utils.forest_infer import pred_multi
from utils.metrics import print_metrics
import glob
from tqdm import tqdm
from utils.forest_infer import pred_multi
from xgboost import XGBClassifier
import gc
from catboost import CatBoostClassifier

In [2]:
# Load the candidate pairs together with their pre-computed feature columns.
pairs_parquet = "/workspace/inference_data/df_pairs_features_NZL_30.parquet"
df_pairs = pd.read_parquet(pairs_parquet)

In [3]:
# Columns of df_pairs that are passed to every model. The order is exactly the
# original one; presumably it has to match the order used at training time
# -- TODO confirm before reordering anything.
features = [
    # similarity / haversine columns
    'similarity', 'haversine',
    # name_* columns
    'name_davies', 'name_leven', 'name_dice', 'name_jaro', 'name_set_ratio',
    # street_* columns
    'street_davies', 'street_leven', 'street_jaro',
    # email_* columns
    'email_davies', 'email_leven', 'email_jaro',
    # url_* columns
    'url_davies', 'url_leven', 'url_jaro',
    # brand(s)_* columns (note: the first one is spelled "brands_")
    'brands_davies', 'brand_leven', 'brand_jaro',
    # phone / sub-category columns
    'phone_lcs',
    'subcat_WRatio', 'subcat_ratio', 'subcat_token_set_ratio',
    # Is_*/is_* indicator columns, one per suffix
    'Is_direction_match_0', 'Is_direction_match_1', 'Is_direction_match_2',
    'Is_house_match_0', 'Is_house_match_1', 'Is_house_match_2',
    'Is_category_match_0', 'Is_category_match_1',
    'Is_subcategory_match_0', 'Is_subcategory_match_1', 'Is_subcategory_match_2',
    'Is_brand_match_0', 'Is_brand_match_1', 'Is_brand_match_2', 'Is_brand_match_3',
    'Is_related_cat_0', 'Is_related_cat_1',
    'Is_name_number_match_0', 'Is_name_number_match_1',
    'Is_name_number_match_2', 'Is_name_number_match_3',
    'is_phone_match_1', 'is_phone_match_2', 'is_phone_match_3', 'is_phone_match_4',
    'Is_email_match_0', 'Is_email_match_1', 'Is_email_match_2',
    'Is_url_match_0', 'Is_url_match_1', 'Is_url_match_2',
]

# LightGBM inference

In [4]:
# Collect the saved LightGBM model files. glob returns matches in arbitrary
# order, so sort them to make the model order reproducible between runs.
models = sorted(glob.glob("/workspace/models/model_duplicate_gsplit_lgb*"))

In [5]:
# Ensemble the LightGBM models: score every candidate pair with each saved
# model and average the per-model outputs.
if not models:
    # With no model files np.mean() would return NaN (with a RuntimeWarning) and
    # every pair would silently be labelled 0, so stop here instead.
    raise FileNotFoundError("no LightGBM model files matched the glob pattern")

all_pred = []
for model_file in tqdm(models):
    # pred_multi comes from utils.forest_infer; presumably it returns one
    # duplicate probability per row of the feature frame -- TODO confirm.
    prediction = pred_multi(model_file, df_pairs[features])
    all_pred.append(prediction)

all_pred = np.array(all_pred)
pred = np.mean(all_pred, axis=0)  # average across models (axis 0 = model axis)
df_pairs["prediction_lgb_probab"] = pred
# Hard label at the 0.5 cut-off, stored as 0/1 integers rather than booleans.
df_pairs["prediction_lgb"] = (df_pairs["prediction_lgb_probab"] > 0.5) * 1

  0%|          | 0/5 [00:00<?, ?it/s]

[W] [17:18:45.270800] Treelite currently does not support float64 model parameters. Accuracy may degrade slightly relative to native LightGBM invocation.


  0%|          | 0/3960 [00:00<?, ?it/s]

 20%|██        | 1/5 [00:26<01:46, 26.58s/it]

[W] [17:19:02.585530] Treelite currently does not support float64 model parameters. Accuracy may degrade slightly relative to native LightGBM invocation.


  0%|          | 0/3960 [00:00<?, ?it/s]

 40%|████      | 2/5 [00:58<01:29, 29.92s/it]

[W] [17:19:34.885353] Treelite currently does not support float64 model parameters. Accuracy may degrade slightly relative to native LightGBM invocation.


  0%|          | 0/3960 [00:00<?, ?it/s]

 60%|██████    | 3/5 [01:30<01:01, 30.65s/it]

[W] [17:20:06.369902] Treelite currently does not support float64 model parameters. Accuracy may degrade slightly relative to native LightGBM invocation.


  0%|          | 0/3960 [00:00<?, ?it/s]

 80%|████████  | 4/5 [01:57<00:29, 29.28s/it]

[W] [17:20:33.595332] Treelite currently does not support float64 model parameters. Accuracy may degrade slightly relative to native LightGBM invocation.


  0%|          | 0/3960 [00:00<?, ?it/s]

100%|██████████| 5/5 [02:31<00:00, 30.39s/it]


# XGBoost inference

In [6]:
# Collect the saved XGBoost model files. glob returns matches in arbitrary
# order, so sort them to make the model order reproducible between runs.
models = sorted(glob.glob("/workspace/models/xgboost_dedup_v2_*"))

In [7]:
def pred_multi_xgb(model_file,df):
    """Score `df` with one saved XGBoost classifier.

    Parameters
    ----------
    model_file : path of a model file readable by XGBClassifier.load_model.
    df : feature frame passed unchanged to predict_proba.

    Returns
    -------
    Column 1 of predict_proba, i.e. the probability of class index 1 per row.
    """
    classifier = XGBClassifier()
    classifier.load_model(model_file)
    class1_proba = classifier.predict_proba(df)[:, 1]
    # Drop the model and force a collection before returning so that loaded
    # models do not pile up in memory while the caller loops over many files.
    del classifier
    gc.collect()
    return class1_proba

In [8]:
# Ensemble the XGBoost models: score every candidate pair with each saved
# model and average the per-model probabilities.
if not models:
    # With no model files np.mean() would return NaN (with a RuntimeWarning) and
    # every pair would silently be labelled 0, so stop here instead.
    raise FileNotFoundError("no XGBoost model files matched the glob pattern")

all_pred = []
for model_file in tqdm(models):
    prediction = pred_multi_xgb(model_file, df_pairs[features])
    all_pred.append(prediction)

all_pred = np.array(all_pred)
pred = np.mean(all_pred, axis=0)  # average across models (axis 0 = model axis)
df_pairs["prediction_xgb_probab"] = pred
# Hard label at the 0.5 cut-off, stored as 0/1 integers rather than booleans.
df_pairs["prediction_xgb"] = (df_pairs["prediction_xgb_probab"] > 0.5) * 1

100%|██████████| 5/5 [00:51<00:00, 10.34s/it]


# CatBoost inference

In [9]:
# Collect the saved CatBoost model files. glob returns matches in arbitrary
# order, so sort them to make the model order reproducible between runs.
models = sorted(glob.glob("/workspace/models/catboost_dedup_*"))

In [10]:
def pred_multi_catboost(model_file,df):
    """Score `df` with one saved CatBoost classifier.

    Parameters
    ----------
    model_file : path of a model file readable by CatBoostClassifier.load_model.
    df : feature frame passed unchanged to predict_proba.

    Returns
    -------
    Column 1 of predict_proba, i.e. the probability of class index 1 per row.
    """
    classifier = CatBoostClassifier()
    classifier.load_model(model_file)
    class1_proba = classifier.predict_proba(df)[:, 1]
    # Drop the model and force a collection before returning so that loaded
    # models do not pile up in memory while the caller loops over many files.
    del classifier
    gc.collect()
    return class1_proba

In [11]:
# Ensemble the CatBoost models: score every candidate pair with each saved
# model and average the per-model probabilities.
if not models:
    # With no model files np.mean() would return NaN (with a RuntimeWarning) and
    # every pair would silently be labelled 0, so stop here instead.
    raise FileNotFoundError("no CatBoost model files matched the glob pattern")

all_pred = []
for model_file in tqdm(models):
    prediction = pred_multi_catboost(model_file, df_pairs[features])
    all_pred.append(prediction)

all_pred = np.array(all_pred)
pred = np.mean(all_pred, axis=0)  # average across models (axis 0 = model axis)
df_pairs["prediction_catboost_probab"] = pred
# Hard label at the 0.5 cut-off, stored as 0/1 integers rather than booleans.
df_pairs["prediction_catboost"] = (df_pairs["prediction_catboost_probab"] > 0.5) * 1

100%|██████████| 5/5 [00:25<00:00,  5.00s/it]


In [12]:
# Display the column index of df_pairs (inputs, features and prediction columns).
df_pairs.columns

Index(['country', 'placeId1', 'placeId2', 'sourceNames1', 'sourceNames2',
       'category1', 'category2', 'brands1', 'brands2', 'email1', 'email2',
       'latitude1', 'longitude1', 'latitude2', 'longitude2', 'houseNumber1',
       'houseNumber2', 'streets1', 'streets2', 'cities1', 'cities2',
       'subCategory1', 'subCategory2', 'phoneNumbers1', 'phoneNumbers2',
       'internet1', 'internet2', 'postalCode1', 'postalCode2', 'similarity',
       'haversine', 'name_davies', 'name_leven', 'name_dice', 'name_jaro',
       'name_set_ratio', 'street_davies', 'street_leven', 'street_jaro',
       'email_davies', 'email_leven', 'email_jaro', 'url_davies', 'url_leven',
       'url_jaro', 'brands_davies', 'brand_leven', 'brand_jaro', 'phone_lcs',
       'subcat_WRatio', 'subcat_ratio', 'subcat_token_set_ratio',
       'Is_direction_match_0', 'Is_direction_match_1', 'Is_direction_match_2',
       'Is_house_match_0', 'Is_house_match_1', 'Is_house_match_2',
       'Is_category_match_0', 'Is_cate

In [13]:
# Export only the pairs that all three ensembles label as duplicates (label 1).
consensus_mask = (
    (df_pairs["prediction_xgb"] == 1)
    & (df_pairs["prediction_lgb"] == 1)
    & (df_pairs["prediction_catboost"] == 1)
)
consensus_pairs = df_pairs[consensus_mask]
consensus_pairs.to_csv("/workspace/nzl_duplicate.csv", index=None)

In [14]:
# Display (rows, columns) of the full candidate-pair frame.
df_pairs.shape

(8256235, 89)

In [13]:
# Display (rows, columns) of the subset that all three ensembles label 1.
agree_mask = (
    (df_pairs["prediction_xgb"] == 1)
    & (df_pairs["prediction_lgb"] == 1)
    & (df_pairs["prediction_catboost"] == 1)
)
df_pairs[agree_mask].shape

(257513, 89)

In [1]:
import pandas as pd

In [2]:
# Reload the consensus-duplicate export so it can be re-thresholded.
duplicates_csv = "/workspace/nzl_duplicate.csv"
df_pairs = pd.read_csv(duplicates_csv)

In [3]:
# Recompute every ensemble's 0/1 label with a stricter 0.9 cut-off on its
# averaged probability column.
#
# The original code thresholded the XGBoost *label* column ("prediction_xgb")
# instead of "prediction_xgb_probab". The CSV loaded above was written with only
# the rows whose three labels were all 1, so `1 >= 0.9` was always true and the
# XGBoost cut-off was never applied. Each label is now derived from its own
# "*_probab" column.
for ensemble_name in ("catboost", "xgb", "lgb"):
    probab_column = f"prediction_{ensemble_name}_probab"
    label_column = f"prediction_{ensemble_name}"
    # Boolean comparison multiplied by 1 gives 0/1 integers.
    df_pairs[label_column] = (df_pairs[probab_column] >= 0.9) * 1

In [4]:
# Export the pairs that all three ensembles still label 1 after re-thresholding.
strict_mask = (
    (df_pairs["prediction_xgb"] == 1)
    & (df_pairs["prediction_lgb"] == 1)
    & (df_pairs["prediction_catboost"] == 1)
)
strict_pairs = df_pairs[strict_mask]
strict_pairs.to_csv("/workspace/nzl_duplicate_0.9.csv", index=None)