In [None]:
%pylab inline

plt.style.use("bmh")

In [None]:
import pathlib

from difflib import SequenceMatcher

import china_cities
import pandas as pd
import numpy as np
import scipy.sparse as sp

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier
from transliterate import translit

import warnings
warnings.filterwarnings('ignore')

In [None]:
from tqdm import tqdm
# Register tqdm's pandas integration; the `progress_apply` calls in the
# feature-generation cells rely on it.
tqdm.pandas()

# Загрузка данных

In [None]:
# Directory every CSV in this notebook is read from and the submission is
# written to (relative to the notebook's working directory).
DATA_DIR = pathlib.Path("../data")

In [None]:
# Raw pair tables, indexed by pair_id. These frames are kept untouched: later
# cells read the original casing and punctuation from them.
train_df = pd.read_csv(DATA_DIR / "train.csv", index_col="pair_id")
test_df = pd.read_csv(DATA_DIR / "test.csv", index_col="pair_id")

In [None]:
# Working copies that the cleaning cells mutate in place; `train_df` / `test_df`
# stay as loaded so raw-name features can still be computed from them.
train = train_df.copy()
test = test_df.copy()

# Очистка данных

In [None]:
import pycountry
import re

In [None]:
# Lowercased country names from pycountry, later stripped from company names.
countries = [entry.name.lower() for entry in pycountry.countries]

In [None]:
# Lowercased city names returned by china_cities.get_cities_en().
cities = [name.lower() for name in china_cities.get_cities_en()]

In [None]:
# Lowercased province names returned by china_cities.get_provinces().
provinces = [name.lower() for name in china_cities.get_provinces()]

In [None]:
# Lowercase both name columns of the working copies (the raw frames keep
# their original casing).
for frame in (train, test):
    for column in ("name_1", "name_2"):
        frame[column] = frame[column].str.lower()

In [None]:
def multi_str_replace(strings, debug=False):
    r"""Compile one regex that matches any of ``strings`` literally.

    Every string is escaped with ``re.escape``. A match must start at a word
    boundary (``\b``) and must be followed by whitespace or the end of the
    text (``(?!\S)``).

    Alternatives are de-duplicated and tried longest first. A longer phrase
    therefore wins over a shorter phrase that is its prefix ("hong kong" over
    "hong"), and the compiled pattern does not depend on the iteration order
    of ``strings`` (which may be a set). Empty strings are ignored.

    Args:
        strings: iterable of literal phrases to match.
        debug: when true, print the pattern source before compiling it.

    Returns:
        The compiled pattern (``re.UNICODE``). If no non-empty string is
        given, the pattern never matches.
    """
    # Longest first; ties broken alphabetically so the pattern is deterministic.
    alternatives = sorted({s for s in strings if s}, key=lambda s: (-len(s), s))
    if alternatives:
        re_str = r'\b(?:' + '|'.join(re.escape(s) for s in alternatives) + r')(?!\S)'
    else:
        # An empty negative lookahead always fails, so nothing is ever replaced.
        re_str = r'(?!)'
    if debug:
        print(re_str)
    return re.compile(re_str, re.UNICODE)

In [None]:
# Regex fragments for legal-form markers. They are written as raw strings so
# that `\.` reaches the regex engine as an escaped dot without relying on
# Python's invalid-escape fallback for non-raw literals.
legal_entities = [r"ltd\.", r"co\.", r"inc\.", r"b\.v\.", r"s\.c\.r\.l\.", "gmbh", r"pvt\.", "llc", "corp", r"corp\.",
                  "bv", r"s\.a\.", r"c\.v\.", "ltda", "de", "cv", "sa", "ca", r"c\.a\.", "ооо", "ooo", "гк"]
# A marker matches (with any whitespace before it) when it starts at a word
# boundary and is followed by whitespace or the end of the string.
legal_re = re.compile(r'\s*\b(?:' + '|'.join(legal_entities) + r')(?!\S)')

In [None]:
# Preview the working training frame after lowercasing.
train.head()

In [None]:
# Remove parenthesised fragments together with the whitespace before them.
# NOTE(review): `.*` is greedy, so everything from the first "(" to the last
# ")" in a value is dropped, including text between two separate groups.
parenthetical_re = re.compile(r"\s+\(.*\)")
for frame in (train, test):
    frame.replace(to_replace=parenthetical_re, value="", regex=True, inplace=True)

In [None]:
# One compiled matcher per geographic word list.
countries_re, cities_re, provinces_re = map(
    multi_str_replace, (countries, cities, provinces)
)

In [None]:
# Strip country, then city, then province names from every string cell.
for frame in (train, test):
    for geo_pattern in (countries_re, cities_re, provinces_re):
        frame.replace(to_replace=geo_pattern, value="", inplace=True, regex=True)

In [None]:
# Ordered clean-up passes: punctuation -> space, collapse whitespace runs,
# then drop single-character words (with the whitespace before them).
cleanup_steps = (
    (re.compile(r"[^\w\s]"), " "),
    (re.compile(r"\s+"), " "),
    (re.compile(r"\s*\b\w{1}\b"), ""),
)
for frame in (train, test):
    for step_pattern, replacement in cleanup_steps:
        frame.replace(to_replace=step_pattern, value=replacement, inplace=True, regex=True)

In [None]:
# Collect every word shared by both names of a NON-duplicate training pair.
# The list keeps repeats; later cells only use it through set(intersections).
non_duplicates = train[train['is_duplicate'] == 0]
intersections = [
    word
    for first_name, second_name in zip(non_duplicates['name_1'], non_duplicates['name_2'])
    for word in set(first_name.split()) & set(second_name.split())
]

In [None]:
# Treat every word collected in `intersections` as junk and remove it from
# both frames.
# NOTE(review): the junk list is derived from the training labels, so after
# this step non-duplicate training pairs should share no words, while test
# pairs carry no such guarantee -- check for label leakage.
junk_words = set(intersections)
junk_re = multi_str_replace(junk_words)
for frame in (train, test):
    frame.replace(to_replace=junk_re, value="", inplace=True, regex=True)

# Генерирование фичей

In [None]:
from strsimpy.levenshtein import Levenshtein
from strsimpy.qgram import QGram
from strsimpy.jaro_winkler import JaroWinkler
from strsimpy.normalized_levenshtein import NormalizedLevenshtein

In [None]:
# Add a `*_tr` column per name holding translit(name, 'ru').
# (presumably a Latin -> Cyrillic transliteration; confirm against the
# transliterate package docs)
for frame in (train, test):
    for source, target in (('name_1', 'name_1_tr'), ('name_2', 'name_2_tr')):
        frame[target] = frame[source].apply(lambda x: translit(x, 'ru'))

In [None]:
def _shared_word_count(x):
    """Number of distinct whitespace-separated words present in both names."""
    return len(set(x['name_1'].split()) & set(x['name_2'].split()))


for frame in (train, test):
    frame['n_intersect'] = frame.apply(_shared_word_count, axis=1)

In [None]:
# Count legal-form markers in the RAW (uncleaned) names, lowercased on the fly.
for frame, raw in ((train, train_df), (test, test_df)):
    for feature, column in (('coltdetc_n1', 'name_1'), ('coltdetc_n2', 'name_2')):
        frame[feature] = raw[column].apply(lambda x: len(re.findall(legal_re, x.lower())))

In [None]:
levenshtein = Levenshtein()

# (feature, left column, right column): plain pair, then each name against the
# transliterated version of the other one.
levenshtein_specs = (
    ("levenshtein", "name_1", "name_2"),
    ("levenshtein_tr21", "name_1", "name_2_tr"),
    ("levenshtein_tr12", "name_2", "name_1_tr"),
)
for feature, left, right in levenshtein_specs:
    for frame in (train, test):
        frame[feature] = frame.progress_apply(
            lambda r: levenshtein.distance(r[left], r[right]), axis=1
        )

In [None]:
normalized_levenshtein = NormalizedLevenshtein()

# Same three column pairings as the plain Levenshtein features, using the
# normalized distance.
norm_levenshtein_specs = (
    ("norm_levenshtein", "name_1", "name_2"),
    ("norm_levenshtein_tr21", "name_1", "name_2_tr"),
    ("norm_levenshtein_tr12", "name_2", "name_1_tr"),
)
for feature, left, right in norm_levenshtein_specs:
    for frame in (train, test):
        frame[feature] = frame.progress_apply(
            lambda r: normalized_levenshtein.distance(r[left], r[right]), axis=1
        )

In [None]:
# Levenshtein distance on the names with all spaces removed ("_cont").
levenshtein_cont_specs = (
    ("levenshtein_cont", "name_1", "name_2"),
    ("levenshtein_tr21_cont", "name_1", "name_2_tr"),
    ("levenshtein_tr12_cont", "name_2", "name_1_tr"),
)
for feature, left, right in levenshtein_cont_specs:
    for frame in (train, test):
        frame[feature] = frame.progress_apply(
            lambda r: levenshtein.distance(r[left].replace(' ', ''), r[right].replace(' ', '')),
            axis=1,
        )

In [None]:
# Normalized Levenshtein distance on the names with all spaces removed.
norm_levenshtein_cont_specs = (
    ("norm_levenshtein_cont", "name_1", "name_2"),
    ("norm_levenshtein_tr21_cont", "name_1", "name_2_tr"),
    ("norm_levenshtein_tr12_cont", "name_2", "name_1_tr"),
)
for feature, left, right in norm_levenshtein_cont_specs:
    for frame in (train, test):
        frame[feature] = frame.progress_apply(
            lambda r: normalized_levenshtein.distance(
                r[left].replace(' ', ''), r[right].replace(' ', '')
            ),
            axis=1,
        )

In [None]:
qgram = QGram()

# Q-gram distance for the plain pair and the two transliterated pairings.
qgram_specs = (
    ("qgram", "name_1", "name_2"),
    ("qgram_tr21", "name_1", "name_2_tr"),
    ("qgram_tr12", "name_2", "name_1_tr"),
)
for feature, left, right in qgram_specs:
    for frame in (train, test):
        frame[feature] = frame.progress_apply(
            lambda r: qgram.distance(r[left], r[right]), axis=1
        )

In [None]:
def count_upper(name_string):
    """Return how many characters of ``name_string`` are uppercase."""
    return sum(1 for char in name_string if char.isupper())

In [None]:
# Uppercase-letter counts come from the RAW names, since the working copies
# were lowercased.
for frame, raw in ((train, train_df), (test, test_df)):
    frame['upper_count1'] = raw['name_1'].apply(count_upper)
    frame['upper_count2'] = raw['name_2'].apply(count_upper)

In [None]:
def _count_punctuation(x):
    """Number of characters in ``x`` that are neither whitespace nor word characters."""
    return len(re.findall(r'[^\s\w]', x))


# Punctuation is counted on the RAW names; the cleaned copies have none left.
for frame, raw in ((train, train_df), (test, test_df)):
    frame['num_punct1'] = raw['name_1'].apply(_count_punctuation)
    frame['num_punct2'] = raw['name_2'].apply(_count_punctuation)

In [None]:
# Single matcher instance shared by both helpers and by the per-word ratio cell.
sm = SequenceMatcher()


def get_ratios(seq1, seq2):
    """Return the SequenceMatcher similarity ratio of two sequences (character level for strings)."""
    sm.set_seq1(seq1)
    sm.set_seq2(seq2)
    return sm.ratio()


def get_list_ratios(seq1, seq2):
    """Return the SequenceMatcher ratio of two strings compared word by word."""
    words1, words2 = seq1.split(), seq2.split()
    sm.set_seq1(words1)
    sm.set_seq2(words2)
    return sm.ratio()

In [None]:
# Character-level SequenceMatcher ratios: plain pair, then with name_1 or
# name_2 replaced by its transliteration.
ratio_specs = (
    ('ratios', 'name_1', 'name_2'),
    ('ratios_tr12', 'name_1_tr', 'name_2'),
    ('ratios_tr21', 'name_1', 'name_2_tr'),
)
for feature, left, right in ratio_specs:
    for frame in (train, test):
        frame[feature] = frame.apply(lambda x: get_ratios(x[left], x[right]), axis=1)

In [None]:
# Same ratios as above, computed after removing all spaces from both strings.
ratio_cont_specs = (
    ('ratios_cont', 'name_1', 'name_2'),
    ('ratios_tr12_cont', 'name_1_tr', 'name_2'),
    ('ratios_tr21_cont', 'name_1', 'name_2_tr'),
)
for feature, left, right in ratio_cont_specs:
    for frame in (train, test):
        frame[feature] = frame.apply(
            lambda x: get_ratios(x[left].replace(' ', ''), x[right].replace(' ', '')),
            axis=1,
        )

In [None]:
# Word-level SequenceMatcher ratios for the same three column pairings.
seq_ratio_specs = (
    ('seq_ratios', 'name_1', 'name_2'),
    ('seq_ratios_tr12', 'name_1_tr', 'name_2'),
    ('seq_ratios_tr21', 'name_1', 'name_2_tr'),
)
for feature, left, right in seq_ratio_specs:
    for frame in (train, test):
        frame[feature] = frame.apply(lambda x: get_list_ratios(x[left], x[right]), axis=1)

In [None]:
# Character length of each cleaned name.
for frame in (train, test):
    for feature, column in (('name1_len', 'name_1'), ('name2_len', 'name_2')):
        frame[feature] = frame[column].apply(len)

In [None]:
# Number of whitespace-separated words in each cleaned name.
for frame in (train, test):
    for feature, column in (('name1_nwords', 'name_1'), ('name2_nwords', 'name_2')):
        frame[feature] = frame[column].apply(lambda x: len(x.split()))

In [None]:
def _same_first_word(x):
    """Return 1 when both cleaned names are non-empty and start with the same word, else 0."""
    first_words, second_words = x['name_1'].split(), x['name_2'].split()
    if first_words and second_words and first_words[0] == second_words[0]:
        return 1
    return 0


for frame in (train, test):
    frame['first_w_inter'] = frame.apply(_same_first_word, axis=1)

In [None]:
# Largest word counts seen in the TRAINING names; they fix the width of the
# per-word feature grids built below for both train and test.
longest_n1 = max(train['name1_nwords'])
longest_n2 = max(train['name2_nwords'])

In [None]:
def _word_pair_levenshtein(frame):
    """Build one flattened longest_n1 x longest_n2 grid of word distances per row.

    Cell (j, k) holds the Levenshtein distance between word j of name_1 and
    word k of name_2; positions beyond either name's word count are filled
    with 0.
    """
    grids = []
    for first_name, second_name in zip(frame['name_1'], frame['name_2']):
        words_1, words_2 = first_name.split(), second_name.split()
        grids.append([
            levenshtein.distance(words_1[j], words_2[k])
            if j < len(words_1) and k < len(words_2) else 0
            for j in range(longest_n1)
            for k in range(longest_n2)
        ])
    return grids


train_words_lev_feats = _word_pair_levenshtein(train)
test_words_lev_feats = _word_pair_levenshtein(test)

In [None]:
# Convert the per-word distance grids to COO sparse matrices.
train_sparse_lev_feats, test_sparse_lev_feats = map(
    sp.coo_matrix, (train_words_lev_feats, test_words_lev_feats)
)

In [None]:
def _word_pair_norm_levenshtein(frame):
    """Build one flattened longest_n1 x longest_n2 grid of normalized word distances per row.

    Cell (j, k) holds the normalized Levenshtein distance between word j of
    name_1 and word k of name_2; positions beyond either name's word count
    are filled with 0.
    """
    grids = []
    for first_name, second_name in zip(frame['name_1'], frame['name_2']):
        words_1, words_2 = first_name.split(), second_name.split()
        grids.append([
            normalized_levenshtein.distance(words_1[j], words_2[k])
            if j < len(words_1) and k < len(words_2) else 0
            for j in range(longest_n1)
            for k in range(longest_n2)
        ])
    return grids


train_words_norm_lev_feats = _word_pair_norm_levenshtein(train)
test_words_norm_lev_feats = _word_pair_norm_levenshtein(test)

In [None]:
# Convert the normalized per-word distance grids to COO sparse matrices.
train_sparse_norm_lev_feats, test_sparse_norm_lev_feats = map(
    sp.coo_matrix, (train_words_norm_lev_feats, test_words_norm_lev_feats)
)

In [None]:
def _word_pair_ratio(first_word, second_word):
    """SequenceMatcher ratio of two words, computed with the shared matcher `sm`."""
    sm.set_seqs(first_word, second_word)
    return sm.ratio()


def _word_pair_ratios(frame):
    """Build one flattened longest_n1 x longest_n2 grid of word similarity ratios per row.

    Positions beyond either name's word count are filled with 0.
    """
    grids = []
    for first_name, second_name in zip(frame['name_1'], frame['name_2']):
        words_1, words_2 = first_name.split(), second_name.split()
        grids.append([
            _word_pair_ratio(words_1[j], words_2[k])
            if j < len(words_1) and k < len(words_2) else 0
            for j in range(longest_n1)
            for k in range(longest_n2)
        ])
    return grids


train_words_ratios_feats = _word_pair_ratios(train)
test_words_ratios_feats = _word_pair_ratios(test)

In [None]:
# Convert the per-word ratio grids to COO sparse matrices.
train_sparse_ratio_feats, test_sparse_ratio_feats = map(
    sp.coo_matrix, (train_words_ratios_feats, test_words_ratios_feats)
)

In [None]:
def _shared_words_text(x):
    """Space-joined set of words that occur in both cleaned names."""
    return ' '.join(set(x['name_1'].split()) & set(x['name_2'].split()))


# Text column later vectorised with its own TF-IDF vocabulary.
for frame in (train, test):
    frame['intersections'] = frame.apply(_shared_words_text, axis=1)

In [None]:
def _raw_first_words_match(x):
    """Return 1 when both raw names are non-empty and their first words match case-insensitively."""
    first_words, second_words = x['name_1'].split(), x['name_2'].split()
    if first_words and second_words and first_words[0].lower() == second_words[0].lower():
        return 1
    return 0


# feat1-feat3 are sums of already computed similarity columns; feat4 compares
# the first words of the RAW (uncleaned) names.
for frame, raw in ((train, train_df), (test, test_df)):
    frame['feat1'] = frame['seq_ratios'] + frame['norm_levenshtein']
    frame['feat2'] = frame['ratios_cont'] + frame['norm_levenshtein']
    frame['feat3'] = frame['qgram'] + frame['levenshtein']
    frame['feat4'] = raw.apply(_raw_first_words_match, axis=1)

In [None]:
def _first_words_match(x, left, right):
    """Return 1 when columns `left` and `right` are both non-empty and share their first word (case-insensitive)."""
    left_words, right_words = x[left].split(), x[right].split()
    if left_words and right_words and left_words[0].lower() == right_words[0].lower():
        return 1
    return 0


# feat5: transliterated name_1 vs name_2; feat6: name_1 vs transliterated name_2.
for frame in (train, test):
    frame['feat5'] = frame.apply(lambda x: _first_words_match(x, 'name_1_tr', 'name_2'), axis=1)
    frame['feat6'] = frame.apply(lambda x: _first_words_match(x, 'name_1', 'name_2_tr'), axis=1)

In [None]:
# Word counts of the RAW training names (before any cleaning) ...
train['name1_nwords_before'] = train_df['name_1'].apply(lambda x: len(x.split()))
train['name2_nwords_before'] = train_df['name_2'].apply(lambda x: len(x.split()))

# ... and their maxima, which fix the width of the junk-word indicator rows.
longest_n1_before = max(train['name1_nwords_before'])
longest_n2_before = max(train['name2_nwords_before'])

In [None]:
sint = set(intersections)


def _junk_flags(words, width):
    """Return `width` 0/1 flags: 1 where the word at that position exists and is in `sint`."""
    return [1 if pos < len(words) and words[pos] in sint else 0 for pos in range(width)]


def _junk_rows(raw):
    """Per-row junk-word flags for raw name_1 followed by raw name_2 (names lowercased, split on whitespace)."""
    return [
        _junk_flags(first_name.lower().split(), longest_n1_before)
        + _junk_flags(second_name.lower().split(), longest_n2_before)
        for first_name, second_name in zip(raw['name_1'], raw['name_2'])
    ]


train_words_junk_feats = _junk_rows(train_df)
test_words_junk_feats = _junk_rows(test_df)

In [None]:
# Convert the junk-word indicator rows to COO sparse matrices.
train_sparse_junk_feats, test_sparse_junk_feats = map(
    sp.coo_matrix, (train_words_junk_feats, test_words_junk_feats)
)

# Внешние данные

In [None]:
# External per-pair attributes, indexed by pair_id like the main tables.
train_external = pd.read_csv(DATA_DIR / "train_external.csv", index_col="pair_id")
test_external = pd.read_csv(DATA_DIR / "test_external.csv", index_col="pair_id")

In [None]:
def _attach_external(frame, external):
    """Append the columns of ``external`` to ``frame`` without changing its rows.

    ``pd.concat(..., axis=1)`` aligns on the index with an outer join, so an
    external table whose index differs in order or membership could reorder
    or add rows. The per-word sparse feature matrices built earlier are
    positional, so the rows of ``frame`` must stay exactly as they are. When
    the indexes differ, ``external`` is first reindexed to ``frame.index``
    (missing pair_ids become NaN rows, extra ones are dropped).
    """
    if not external.index.equals(frame.index):
        external = external.reindex(frame.index)
    return pd.concat((frame, external), axis=1)


train = _attach_external(train, train_external)
test = _attach_external(test, test_external)

In [None]:
# One-hot encoder for the two industry columns; handle_unknown='ignore' is set
# so that categories absent at fit time do not raise at transform time.
enc = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Learn the category set from the training rows only.
# (industry_n1 / industry_n2 are not created in this notebook -- presumably
# they come from the external CSVs; confirm)
enc.fit(train[['industry_n1', 'industry_n2']])

# Собираем датасет

In [None]:
# Dense feature columns fed to the model, in matrix column order (the same
# list is used for both the train and the test matrix).
# NOTE(review): "levenshtein" and "qgram_tr12" are computed above but are not
# listed here -- confirm the omission is intentional.
# "total employee estimate_n1" is not created in this notebook; presumably it
# comes from the external CSVs -- confirm.
FTS = ["first_w_inter", "norm_levenshtein", "ratios", "seq_ratios", "name1_len", "name2_len", "name1_nwords",
       "name2_nwords", "levenshtein_tr21", "levenshtein_tr12", "norm_levenshtein_tr21", "norm_levenshtein_tr12",
       "ratios_tr12", "ratios_tr21", "seq_ratios_tr12", "seq_ratios_tr21", "levenshtein_cont",
       "levenshtein_tr21_cont", "levenshtein_tr12_cont", "norm_levenshtein_cont", "norm_levenshtein_tr21_cont",
       "norm_levenshtein_tr12_cont", "ratios_cont", "ratios_tr12_cont", "ratios_tr21_cont", "qgram", "qgram_tr21",
       "coltdetc_n1", "coltdetc_n2", "n_intersect", "upper_count1", "upper_count2", "num_punct1", "num_punct2", 
       "feat1", "feat2", "feat3", "feat4", "feat5", "feat6", "total employee estimate_n1"]

In [None]:
# Three independent vectorisers: one each for name_1, name_2 and the
# shared-words text.
tfidf1, tfidf2, tfidf3 = (TfidfVectorizer() for _ in range(3))

In [None]:
# Fit each vocabulary on the training rows and keep the resulting matrices;
# the fitted vectorisers are reused with transform() on the test rows.
vecs1 = tfidf1.fit_transform(train['name_1'])
vecs2 = tfidf2.fit_transform(train['name_2'])
vecs3 = tfidf3.fit_transform(train['intersections'])

In [None]:
# First part of the design matrix: dense features followed by the two name
# TF-IDF blocks. The target is the is_duplicate label.
X = sp.hstack((train[FTS], vecs1, vecs2))
y = train['is_duplicate']

In [None]:
# Append the remaining blocks; the test matrix must stack the same blocks in
# the same order. This cell rebinds X from itself, so running it twice
# appends the blocks twice.
train_blocks = (
    X,
    train_sparse_lev_feats,
    train_sparse_norm_lev_feats,
    train_sparse_ratio_feats,
    train_sparse_junk_feats,
    vecs3,
    enc.transform(train[['industry_n1', 'industry_n2']]),
)
X = sp.hstack(train_blocks)

# Генерация сабмита

In [None]:
# Gradient-boosting classifier with library defaults; the seed is fixed.
model = LGBMClassifier(random_state=42)

In [None]:
# Train on the full training matrix (no validation split is made in this notebook).
model.fit(X, y)

In [None]:
# Test counterpart of the first part of X: same dense columns, then the name
# TF-IDF blocks produced by the vectorisers fitted on train.
X_test = sp.hstack((test[FTS], tfidf1.transform(test['name_1']), tfidf2.transform(test['name_2'])))

In [None]:
# Append the remaining blocks in the same order used for the training matrix.
# Like its training counterpart, this cell rebinds X_test from itself.
X_test = sp.hstack((X_test, 
                    test_sparse_lev_feats,
                    test_sparse_norm_lev_feats,
                    test_sparse_ratio_feats,
                    test_sparse_junk_feats,
                    tfidf3.transform(test['intersections']),
                    enc.transform(test[['industry_n1', 'industry_n2']])))

In [None]:
# Submission template, indexed by pair_id.
sample_sub = pd.read_csv(DATA_DIR / "sample_submission.csv", index_col="pair_id")

In [None]:
# Predictions are assigned by POSITION, not by pair_id.
# NOTE(review): this assumes sample_submission.csv lists the pairs in the same
# order as the rows of X_test -- confirm.
sample_sub["is_duplicate"] = model.predict(X_test)

In [None]:
# Write the submission next to the input data.
submission_path = DATA_DIR / "baseline_submission.csv"
sample_sub.to_csv(submission_path)

In [None]:
# Sanity check: how many rows were predicted per class.
sample_sub.value_counts()