# Inference KNN + XLMRoberta

In [1]:
import os
import gc
import math
import time
import torch
import itertools
import numpy as np
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
from joblib import dump, load
from sklearn.metrics import f1_score
from torch.optim import Adam, SGD, AdamW
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel, AutoConfig, XLMRobertaConfig, XLMRobertaModel

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Static settings shared by all cells of this inference notebook.

    Later cells also attach ``CFG.tokenizer`` and ``CFG.device`` at runtime.
    """

    # --- model / tokenizer / scaler artefacts on disk ---
    model = 'xlm-roberta-base'                 # hub name used by CustomModel when no saved config is given
    config_path = 'config.pth'                 # saved config object read by CustomModel
    model_path = 'xlm-roberta-base_best.pth'   # checkpoint read by inferenceMain
    tokenizer_path = 'tokenizer/'              # directory passed to the tokenizer loader
    scaler_2 = 'std_scaler_main.bin'           # persisted scaler loaded with joblib in the main cell

    # --- inference settings ---
    max_len = 200        # max_length handed to the tokenizer
    batch_size = 500     # DataLoader batch size
    print_freq = 100     # infer_fn prints progress every print_freq steps
    th = 0.5             # probability threshold used by generateSubmission

    # --- run mode ---
    debug = True         # when True only the first 10000 rows are processed
    final = False        # when False the point_of_interest column is dropped

    # --- not referenced by the code visible in this notebook ---
    num_warmup_steps = 0
    use_scheduler = True
    epochs = 5


OUTPUT_DIR = './'

# Import dataset

In [3]:
# Load the training set, drop the columns that are not used downstream, order
# the rows by longitude and replace missing values with the literal 'unknown'.
init_data = (
    pd.read_csv('datasets/train.csv')
    .drop(columns=['address','city','state','zip','country','url','phone'])
    .sort_values(by=['longitude'])
    .reset_index(drop=True)
    .fillna('unknown')
)
init_data.head()

Unnamed: 0,id,name,latitude,longitude,categories,point_of_interest
0,E_22e8722af27f8f,"Tongatapu International Airport, Tongatapu, Tonga",-21.13943,-175.160248,Airports,P_94d933e731ebf7
1,E_77bd2381786179,Fuaʻamotu International Airport,-21.241393,-175.141634,Airports,P_94d933e731ebf7
2,E_4b77ccd07a7889,Fua'amotu International Airport (TBU),-21.244104,-175.137096,Airports,P_94d933e731ebf7
3,E_2088cfef49b8c5,Janes Beach Fales,-13.447485,-172.376063,Resorts,P_85041d895cbb1f
4,E_4515f166a43037,Etelinas Pizzeria,-13.442352,-172.358032,Pizza Places,P_8cc6a008838677


In [4]:
# Debug mode: keep only the first 10000 rows so the whole notebook runs quickly.
if CFG.debug:
    init_data = init_data.iloc[:10000]

In [5]:
# Outside "final" mode the point_of_interest column is removed, leaving the five
# columns (id, name, latitude, longitude, categories) that the pair builder names.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
if not CFG.final:
    init_data = init_data.drop(columns=['point_of_interest'], errors='ignore')

# Helper functions 

In [6]:
def get_results(predictions, th = 0.5):
    """Stack a sequence of probability arrays and binarise them.

    Args:
        predictions: sequence of arrays accepted by ``np.concatenate``.
        th: values greater than or equal to ``th`` become 1, all others 0.

    Returns:
        Integer array of 0/1 labels with the shape of the concatenated input.
    """
    stacked = np.concatenate(predictions)
    return np.where(stacked >= th, 1, 0)
def euclidianDistance(lat1,long1,lat2,long2):
    """Straight-line distance between (lat1, long1) and (lat2, long2), rounded to 6 decimals."""
    d_lat = lat2 - lat1
    d_long = long2 - long1
    squared = d_lat**2 + d_long**2
    return round(squared**0.5, 6)

def JaccartCoef(set1,set2):
    """Jaccard similarity of two sets: size of the intersection over size of the union.

    Args:
        set1, set2: Python sets.

    Returns:
        A float in [0, 1]. Two empty sets are treated as identical and score 1.0
        (previously this case raised ZeroDivisionError).
    """
    shared = len(set1.intersection(set2))
    union_size = len(set1) + len(set2) - shared
    if union_size == 0:
        # both sets empty: nothing to disagree on
        return 1.0
    return shared / union_size

    
def intersectionOverUnion(train_scoring_dict,predicted_df):
    """Mean Jaccard score between predicted and true matches.

    Args:
        train_scoring_dict: maps an entry id to an iterable of its true match ids.
        predicted_df: frame with an 'id' column and a 'match' column holding
            space-separated predicted ids.

    Returns:
        The average of the per-row Jaccard coefficients, or 0.0 when
        ``predicted_df`` has no rows (previously a ZeroDivisionError).

    True matches are restricted to ids present in ``predicted_df`` so that a
    prediction run on a subset of the data is not penalised for ids it never saw.
    """
    if len(predicted_df) == 0:
        return 0.0

    predicted_ids = set(predicted_df['id'].values)
    coefs = []
    # zip over the two columns avoids the per-row Series built by iterrows()
    for entry_id, match in zip(predicted_df['id'], predicted_df['match']):
        predicted_matches = set(match.split(' '))
        train_matches = predicted_ids.intersection(train_scoring_dict[entry_id])
        coefs.append(JaccartCoef(train_matches,predicted_matches))
    return sum(coefs)/len(coefs)

In [7]:
# Load the ground-truth table used for scoring and index its space-separated
# match strings by entry id.
answers_df = pd.read_csv('train_df_for_scoring.csv')
train_scoring_dict = answers_df.set_index('id')['matches'].to_dict()
answers_df.head()

Unnamed: 0,id,matches
0,E_22e8722af27f8f,E_22e8722af27f8f E_77bd2381786179 E_4b77ccd07a...
1,E_77bd2381786179,E_22e8722af27f8f E_77bd2381786179 E_4b77ccd07a...
2,E_4b77ccd07a7889,E_22e8722af27f8f E_77bd2381786179 E_4b77ccd07a...
3,E_2088cfef49b8c5,E_2088cfef49b8c5
4,E_4515f166a43037,E_4515f166a43037


In [8]:
# Turn every space-separated match string into a set of ids. Values that are
# already sets are kept as sets, so re-running this cell does not fail.
train_scoring_dict = {
    entry_id: set(matches.split()) if isinstance(matches, str) else set(matches)
    for entry_id, matches in train_scoring_dict.items()
}

In [9]:
# Spot-check one entry: after the conversion above the value is a set of ids.
train_scoring_dict['E_22e8722af27f8f']

{'E_22e8722af27f8f', 'E_4b77ccd07a7889', 'E_77bd2381786179'}

# Tokenizer & device

In [10]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer stored under CFG.tokenizer_path and publish it on CFG,
# which is where prepareInputs looks it up.
CFG.tokenizer = XLMRobertaTokenizer.from_pretrained(CFG.tokenizer_path)
tokenizer = CFG.tokenizer

In [11]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
CFG.device = device
print(device)

cuda


# Dataset

In [12]:
# ====================================================
# Dataset
# ====================================================
def prepareInputs(line):
    """Tokenize one candidate pair into fixed-length model inputs.

    Args:
        line: a row (mapping) with the keys 'name_1', 'categories_1',
            'distance', 'name_2' and 'categories_2'.

    Returns:
        The tokenizer output with every value converted to a ``torch.long``
        tensor of length ``CFG.max_len``.

    The five fields are joined with the literal text '[SEP]'.
    NOTE(review): that separator is plain text in the prompt; it is kept
    byte-identical because the checkpoint was presumably trained on this format.
    """
    text = str(line['name_1']) + '[SEP]' + str(line['categories_1']) + '[SEP]' + str(line['distance']) \
    + '[SEP]'+ str(line['name_2']) + '[SEP]'+ str(line['categories_2'])

    # truncation=True guarantees at most CFG.max_len tokens; with padding alone
    # an over-long text would yield a longer tensor that cannot be batched with
    # the padded ones.
    inputs = CFG.tokenizer(text,
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           truncation=True,
                           return_offsets_mapping=False)

    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs

class PairsDataset(Dataset):
    """Dataset over a frame of candidate pairs; items are tokenized on access."""

    def __init__(self, pairs):
        # pairs: object supporting len() and .iloc[idx] (a DataFrame in this notebook)
        self.pairs = pairs

    def __len__(self):
        """Number of candidate pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return the tokenized inputs for the pair at position ``idx``."""
        row = self.pairs.iloc[idx]
        return prepareInputs(row)

# Model

In [13]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Transformer backbone followed by dropout and a single-logit linear head.

    Args:
        config_path: path to a config object saved with ``torch.save``. When
            None, the config is fetched with
            ``AutoConfig.from_pretrained(CFG.model, output_hidden_states=True)``.
        pretrained: when True the backbone is created with
            ``AutoModel.from_pretrained(CFG.model)``; otherwise it is only built
            from the config and the caller is expected to load the weights.
    """
    def __init__(self, config_path=None, pretrained=False):
        super().__init__()

        if config_path is None:
            self.config = AutoConfig.from_pretrained(CFG.model, output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles the file, so only load configs
            # from a trusted source.
            self.config = torch.load(config_path)

        if pretrained:
            self.model = AutoModel.from_pretrained(CFG.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)

        # classification head: dropout + one output logit
        self.fc_dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise ``module`` in place using ``config.initializer_range``.

        Linear and Embedding weights are drawn from N(0, initializer_range);
        Linear biases and the Embedding padding row are zeroed; LayerNorm is
        reset to weight 1 and bias 0. Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and return the hidden state of the first token of each sequence."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs.last_hidden_state[:,0,:]
        return last_hidden_states

    def forward(self, inputs):
        """Return one raw logit per input sequence (no sigmoid applied here)."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

# Inference

In [14]:
def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work already done (must be non-zero).

    Returns:
        A string '<elapsed> (remain <remaining>)' formatted with asMinutes.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

def infer_fn(infer_loader, model, device):
    """Run ``model`` over every batch of ``infer_loader`` and collect probabilities.

    Args:
        infer_loader: iterable of batches; each batch is a mapping of tensors.
        model: module called as ``model(batch)`` and returning logits.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        A numpy array with the sigmoid of the logits of all batches,
        concatenated along the first axis.
    """
    model.eval()
    started = time.time()
    batch_probs = []
    for step, inputs in enumerate(infer_loader):
        # move the batch to the target device in place
        for name, tensor in inputs.items():
            inputs[name] = tensor.to(device)

        with torch.no_grad():
            logits = model(inputs)
        batch_probs.append(logits.sigmoid().to('cpu').numpy())

        is_last_step = step == (len(infer_loader)-1)
        if step % CFG.print_freq == 0 or is_last_step:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  .format(step, len(infer_loader),
                          remain=timeSince(started, float(step+1)/len(infer_loader))))

    return np.concatenate(batch_probs)

In [15]:
def inferenceMain(dataset):
    """Score every candidate pair in ``dataset`` with the saved model.

    Args:
        dataset: frame of pairs accepted by PairsDataset.

    Returns:
        1-D numpy array with one match probability per row of ``dataset``.

    The model is rebuilt from ``CFG.config_path`` and its weights are reloaded
    from ``CFG.model_path`` on every call.
    """
    # ====================================================
    # loader
    # ====================================================
    infer_dataset = PairsDataset(dataset)
    infer_dataloader = DataLoader(infer_dataset, batch_size=CFG.batch_size, shuffle=False, pin_memory = True)

    # ====================================================
    # model & prediction
    # ====================================================
    model = CustomModel(config_path = CFG.config_path, pretrained=False)
    state = torch.load(CFG.model_path,
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    model.to(CFG.device)

    start_time = time.time()

    # Use CFG.device for the batches too, so model and inputs always share a
    # device and the function does not depend on a separate global.
    predictions = infer_fn(infer_dataloader, model, CFG.device)

    # infer_fn already returns one stacked array; flatten it to one value per pair
    results = predictions.reshape(-1)

    elapsed = time.time() - start_time
    print(f'time: {elapsed:.0f}s')

    torch.cuda.empty_cache()
    gc.collect()
    return results

# Processing in chunks

In [16]:
def generatePairsIds(init_data, n_neighbours = 7 ):
    """Propose candidate pairs by nearest-neighbour search on coordinates.

    Args:
        init_data: frame with 'latitude' and 'longitude' columns.
        n_neighbours: neighbours requested per row (the row itself counts as
            one); capped at the number of rows.

    Returns:
        A list with one set per row, holding the positional indices of that
        row's partners. A row never lists itself and each unordered pair is
        kept in only one of the two sets. An empty frame yields ``[]``.
    """
    if len(init_data) == 0:
        print('0 pairs ids generated')
        return []

    # standardise the coordinates with a scaler fitted on this data
    num_features = ['latitude','longitude']
    X = StandardScaler().fit_transform(init_data[num_features])

    # apply KNN; never ask for more neighbours than there are rows
    k = min(n_neighbours, len(X))
    nbrs = NearestNeighbors(n_neighbors = k, algorithm='kd_tree').fit(X)
    indices = nbrs.kneighbors(X,return_distance=False)
    indices_set = list(map(set,indices))

    # remove self-pairs and mirrored duplicates: if we keep a-b, drop b-a
    for primary_id in tqdm(range(len(indices_set))):
        ind_set = indices_set[primary_id]
        ind_set.discard(primary_id)
        for secondary_id in ind_set:
            indices_set[secondary_id].discard(primary_id)

    count = sum(len(ind_set) for ind_set in indices_set)
    print(f'{count} pairs ids generated')
    return indices_set

def fillIdDict(matches, id_dict):
    """Record the predicted similarity of every pair on both of its entries.

    Args:
        matches: frame with 'id_1', 'id_2' and 'match' columns.
        id_dict: maps an id to an object with a ``neighbours`` dict; it is
            updated in place.

    Returns:
        The same ``id_dict`` object.
    """
    start_time = time.time()
    # iterate the three columns side by side instead of building a row Series
    # with iloc for every pair
    triples = zip(matches['id_1'].to_numpy(),
                  matches['id_2'].to_numpy(),
                  matches['match'].to_numpy())
    for id1, id2, similarity in tqdm(triples, total=len(matches)):
        id_dict[id1].neighbours[id2] = similarity
        id_dict[id2].neighbours[id1] = similarity
    elapsed = time.time() - start_time
    # '.0f' prints whole seconds; the previous '.0' spec rendered e.g. '5e+00'
    print(f'ID dict generated. Time elapsed: {elapsed:.0f} s')
    return id_dict

class EntryID:
    """Holder for the neighbour scores of one entry.

    ``neighbours`` maps an id to a score and starts with the entry itself at 1.
    (A plain dict would do the same job; the class is kept for its attribute.)
    """
    def __init__(self, index):
        self.neighbours = dict([(index, 1)])
        
def findMoreNeighbours(weight, primary_id, id_dict, current_id, visited, th):
    """Depth-first search for entries reachable from ``current_id`` with enough confidence.

    Args:
        weight: probability accumulated on the path leading to ``current_id``.
        primary_id: the entry the search was started for; never reported.
        id_dict: maps an id to an object with a ``neighbours`` dict of scores.
        current_id: the entry being expanded.
        visited: set of already expanded ids; updated in place.
        th: minimum accumulated probability for a neighbour to be reported.

    Returns:
        Dict mapping each reached id to its accumulated probability (the
        product of the scores along the path). When an id is reached again
        deeper in the search, the later value replaces the earlier one.
    """
    node = id_dict[current_id]
    if current_id in visited:
        return {}
    visited.add(current_id)

    found = {}
    for neighbour_id, edge_prob in node.neighbours.items():
        chained_prob = edge_prob*weight
        if chained_prob >= th and neighbour_id != primary_id and neighbour_id != current_id:
            found[neighbour_id] = chained_prob
            deeper = findMoreNeighbours(chained_prob, primary_id, id_dict, neighbour_id, visited, th)
            found.update(deeper)
    return found
    

def postProcess(id_dict, th = 0.5):
    """Link entries that are connected only through intermediate neighbours.

    The idea: with probabilities as scores, entries A and B that were never
    compared directly can still be connected when the product of the scores
    along a path between them stays at or above ``th``.

    Args:
        id_dict: maps an id to an object with a ``neighbours`` dict; updated in place.
        th: probability threshold for following and accepting links.

    Returns:
        The same ``id_dict`` object.
    """
    started = time.time()
    for entry_id, entry in id_dict.items():
        # iterate over a snapshot of the keys: the dict grows inside the loop
        for current_id in list(entry.neighbours.keys()):
            weight = entry.neighbours[current_id]
            if not (weight > th and current_id != entry_id):
                continue

            reached = findMoreNeighbours(weight, entry_id, id_dict, current_id, set(), th)

            neighbours_dict = entry.neighbours
            for key, candidate in reached.items():
                score = neighbours_dict.get(key)
                # add new links and raise existing scores, never lower them
                if not score or score < candidate:
                    neighbours_dict[key] = candidate
    elapsed = time.time() - started
    print(f'Post processing finished. Time elapsed: {elapsed}')
    return id_dict
        
def _buildPairsFrame(df, ids1, ids2):
    """Place rows ``ids1`` and rows ``ids2`` of ``df`` side by side, suffixing the columns with _1 and _2."""
    pairs = pd.concat([df.iloc[ids1].reset_index(drop=True),
                       df.iloc[ids2].reset_index(drop=True)],
                      axis = 1)
    pairs.columns = [f'{col}_1' for col in df.columns] + [f'{col}_2' for col in df.columns]
    return pairs


def pairsGenerator(df, ids, chunk_size = 500000):
    """Yield frames of candidate pairs in bounded chunks.

    Args:
        df: source frame; row positions correspond to the indices in ``ids``.
        ids: sequence where ``ids[i]`` is a collection of partner positions for row ``i``.
        chunk_size: a chunk is emitted as soon as it holds more than this many
            pairs; the remainder is emitted at the end.

    Yields:
        Frames whose columns are the columns of ``df`` suffixed with '_1' (the
        row itself) followed by the same columns suffixed with '_2' (the
        partner). Nothing is yielded when there are no pairs.
    """
    start_time = time.time()
    count = 0
    ids1 = []
    ids2 = []
    for entry_id in range(len(ids)):
        if len(ids[entry_id]) == 0:
            continue

        current_ids2 = list(ids[entry_id])
        ids1.extend([entry_id] * len(current_ids2))
        ids2.extend(current_ids2)
        count += len(current_ids2)

        if count > chunk_size:
            match_to_add = _buildPairsFrame(df, ids1, ids2)
            ids1 = []
            ids2 = []
            elapsed = time.time() - start_time
            print(f'{count} pairs generated. Time elapsed: {elapsed}')
            yield match_to_add
            # restart the timer and counter for the next chunk
            start_time = time.time()
            count = 0

    if count > 0:
        match_to_add = _buildPairsFrame(df, ids1, ids2)
        elapsed = time.time() - start_time
        print(f'{count} pairs generated. Time elapsed: {elapsed}')
        yield match_to_add


def generateSubmission(id_dict, th = None):
    """Build the submission frame from the neighbour scores.

    Args:
        id_dict: maps an id to an object with a ``neighbours`` dict of scores.
            It is only read; earlier versions overwrote its values with strings.
        th: a neighbour is kept when its score is strictly greater than this
            value. Defaults to ``CFG.th`` when omitted.

    Returns:
        DataFrame with the columns 'id' and 'match', where 'match' is the
        space-separated list of kept neighbour ids.
    """
    threshold = CFG.th if th is None else th

    ids = []
    matches = []
    for entry_id, entry in id_dict.items():
        kept = [ind for ind, score in entry.neighbours.items() if score > threshold]
        ids.append(entry_id)
        matches.append(' '.join(kept))

    return pd.DataFrame({'id': ids, 'match': matches})

# Main

In [17]:
if __name__ == '__main__':
    # one EntryID per row (keyed by the id as a string), then candidate pairs from KNN
    id_dict = {str(entry_id) : EntryID(str(entry_id)) for entry_id in init_data['id'].values}
    pair_ids = generatePairsIds(init_data)

    # the persisted scaler is the same for every chunk, so load it once
    scaler = load(CFG.scaler_2)
    num_features = [['latitude_1','longitude_1'],['latitude_2','longitude_2']]

    for test_pairs in pairsGenerator(init_data, pair_ids):

        # scale the coordinates of both sides of every pair
        for columns in num_features:
            test_pairs[columns] = scaler.transform(test_pairs[columns])

        # euclidean distance as a feature, computed on whole columns at once
        # instead of a Python call per row
        test_pairs['distance'] = euclidianDistance(test_pairs['latitude_1'],
                                                   test_pairs['longitude_1'],
                                                   test_pairs['latitude_2'],
                                                   test_pairs['longitude_2'])
        test_pairs = test_pairs.drop(columns = ['latitude_1','longitude_1','latitude_2','longitude_2'])

        # get predictions
        results = inferenceMain(test_pairs)
        test_pairs['match'] = results

        # store the pair scores on both entries of every pair
        id_dict = fillIdDict(test_pairs, id_dict)

    # process answers
    # NOTE(review): links are propagated with th = 0.9 here, while the final
    # cut in generateSubmission uses CFG.th — confirm the two values are intended.
    id_dict = postProcess(id_dict, th = 0.9)
    submission = generateSubmission(id_dict)
    score = intersectionOverUnion(train_scoring_dict,submission)
    print('IOU score: ', score)

    submission.to_csv('submission.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 237225.00it/s]


38882 pairs ids generated
38882 pairs generated. Time elapsed: 0.03725099563598633
EVAL: [0/78] Elapsed 0m 5s (remain 6m 43s) 
EVAL: [77/78] Elapsed 4m 4s (remain 0m 0s) 
time: 245s


100%|██████████████████████████████████████████████████████████████████████████| 38882/38882 [00:04<00:00, 8491.11it/s]


ID dict generated. Time elapsed: 5e+00 s
Post processing finished. Time elapsed: 0.37008166313171387
IOU score:  0.8596932123432113
