## Imports

In [1]:
pip install fuzzywuzzy

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
pip install shap

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
pip install catboost

[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
import random
import os

import matplotlib.pyplot as plt
import seaborn as sns


from tqdm import tqdm

import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import shap

from catboost import CatBoostRegressor, CatBoostClassifier, Pool



## Downloading

In [5]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/upppm/titles.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv


In [6]:
train_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
test_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")
sub = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/sample_submission.csv")
titles = pd.read_csv("../input/upppm/titles.csv")

In [7]:
len(train_df['score'].unique())

5

## Baseline

In [8]:
context_dict = {
    'A': 'Human Necessities',
    'B': 'Operations and Transport',
    'C': 'Chemistry and Metallurgy',
    'D': 'Textiles',
    'E': 'Fixed Constructions',
    'F': 'Mechanical Engineering',
    'G': 'Physics',
    'H': 'Electricity',
    'Y': 'Emerging Cross-Sectional Technologies'
}

## Feature Engineering

In [9]:
def create_feature(df, cpc_codes_df):
  
    df['section'] = df['context'].str[:1]
    df['class'] = df['context'].str[1:]
    
    df['anchor_len'] = df['anchor'].apply(lambda x: len(x.split(' ')))
    df['target_len'] = df['target'].apply(lambda x: len(x.split(' ')))
    
    pattern = '[0-9]'
    mask = df['anchor'].str.contains(pattern, na=False)
    df['num_anchor'] = mask
    mask = df['target'].str.contains(pattern, na=False)
    df['num_target'] = mask
    
    df['context_desc'] = df['context'].map(cpc_codes_df.set_index('code')['title']).str.lower()
    
    fuzzy_anchor_target_scores = []
    fuzzy_anchor_context_scores = []
    fuzzy_taget_context_scores = []
    for index, row in df.iterrows():
        fuzzy_anchor_target_scores.append(fuzz.ratio(row['anchor'], row['target']))
        fuzzy_anchor_context_scores.append(fuzz.ratio(row['anchor'], row['context_desc']))
        fuzzy_taget_context_scores.append(fuzz.ratio(row['context_desc'], row['target']))
    df['fuzzy_at_score'] = fuzzy_anchor_target_scores
    df['fuzzy_ac_score'] = fuzzy_anchor_context_scores
    df['fuzzy_tc_score'] = fuzzy_taget_context_scores
    df['fuzzy_c_score'] = df['fuzzy_ac_score'] + df['fuzzy_tc_score']
    df['fuzzy_total'] = df['fuzzy_at_score'] + df['fuzzy_c_score']
    
    df.drop(['context', 'fuzzy_ac_score', 'fuzzy_tc_score'], 1, inplace=True)
    
    return df

In [10]:
train_df_wth_features = create_feature(train_df.copy(), titles)
test_df_wth_features = create_feature(test_df.copy(), titles)
train_df_wth_features.head()

In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only


Unnamed: 0,id,anchor,target,score,section,class,anchor_len,target_len,num_anchor,num_target,context_desc,fuzzy_at_score,fuzzy_c_score,fuzzy_total
0,37d61fd2272659b1,abatement,abatement of pollution,0.5,A,47,1,3,False,False,furniture; domestic articles or appliances; co...,58,39,97
1,7b9652b17b68b7a4,abatement,act of abating,0.75,A,47,1,3,False,False,furniture; domestic articles or appliances; co...,43,31,74
2,36d72442aefd8232,abatement,active catalyst,0.25,A,47,1,2,False,False,furniture; domestic articles or appliances; co...,33,29,62
3,5296b0c19e1ce60e,abatement,eliminating process,0.5,A,47,1,2,False,False,furniture; domestic articles or appliances; co...,29,33,62
4,54c1e3b9184cb5b6,abatement,forest region,0.0,A,47,1,2,False,False,furniture; domestic articles or appliances; co...,27,31,58


In [11]:
train = train_df_wth_features.sample(frac=0.8).copy()

In [12]:
val = train_df_wth_features[~train_df_wth_features.index.isin(train.index)].copy()

In [13]:
X_train = train.drop(['score'], 1)
y_train = train['score']

In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only


In [14]:
X_val = val.drop(['score'], 1)
y_val = val['score']

In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only


## Model

In [15]:
X_train.columns

Index(['id', 'anchor', 'target', 'section', 'class', 'anchor_len',
       'target_len', 'num_anchor', 'num_target', 'context_desc',
       'fuzzy_at_score', 'fuzzy_c_score', 'fuzzy_total'],
      dtype='object')

In [16]:
X_col = ['anchor', 
         'target', 
         'section', 
         'class', 
         'anchor_len',
         'target_len',
         'num_anchor',
         'num_target',
         'context_desc',
         'fuzzy_at_score',
         'fuzzy_c_score',
         'fuzzy_total']
         
y_col = ['score']

In [17]:
X_train.head()

Unnamed: 0,id,anchor,target,section,class,anchor_len,target_len,num_anchor,num_target,context_desc,fuzzy_at_score,fuzzy_c_score,fuzzy_total
7587,91aeb19c2b11c229,contain sulfur compounds,contain sulfates,C,11,3,2,False,False,"animal or vegetable oils, fats, fatty substanc...",65,47,112
32776,cf0adc935d3f3712,target pointer,emblaze,F,41,2,1,False,False,weapons,19,67,86
30901,344454a65236e043,smooth outer surface,predetermined diameter,A,45,3,2,False,False,hand or travelling articles,29,75,104
15373,d161b498cdeae595,her2 targeted,cd fr targeting,A,61,2,3,True,False,medical or veterinary science; hygiene,57,73,130
10395,487c380eabe2c782,duplex device,bandwidth receiver,H,4,2,2,False,False,electric communication technique,32,59,91


In [18]:
num_cols = X_train.select_dtypes(include=np.number).columns.tolist()
text_cols = ['anchor', 'target', 'context_desc']
cat_cols = list(set(X_train.columns) - set(num_cols) - set(text_cols))
print('numerical features:', num_cols)
print('text features:', text_cols)
print('categorical features:', cat_cols)

numerical features: ['anchor_len', 'target_len', 'fuzzy_at_score', 'fuzzy_c_score', 'fuzzy_total']
text features: ['anchor', 'target', 'context_desc']
categorical features: ['num_target', 'num_anchor', 'id', 'section', 'class']


In [19]:
ignore_cols = ['id']

In [20]:
# catboost_params = {
#     'iterations': 1000,
#     'learning_rate': 0.1,
#     'eval_metric': 'Logloss',
#     'task_type': 'GPU',
#     'early_stopping_rounds': 10,
#     'use_best_model': True,
#     'verbose': 100
# }

In [21]:
def fit_model(train_pool, test_pool, **kwargs):
    model = CatBoostClassifier(
        task_type='GPU',
        iterations=2000,
        eval_metric='Accuracy',
        od_type='Iter',
        od_wait=500,
        **kwargs
    )

    return model.fit(
        train_pool,
        eval_set=test_pool,
        verbose=100,
        plot=True,
        use_best_model=True)

In [22]:
train_pool = Pool(
                    X_train,
                    y_train,
                    cat_features = cat_cols,
                    text_features = text_cols,
                    feature_names = list(X_train)
                )

In [23]:
valid_pool = Pool(
                    X_val,
                    y_val,
                    cat_features = cat_cols,
                    text_features = text_cols,
                    feature_names = list(X_train)  
                  )

In [24]:
model = fit_model(
    train_pool, valid_pool,
    learning_rate=0.35,
    tokenizers=[
        {
            'tokenizer_id': 'Sense',
            'separator_type': 'BySense',
            'lowercasing': 'True',
            'token_types':['Word', 'Number', 'SentenceBreak'],
            'sub_tokens_policy':'SeveralTokens'
        }      
    ],
    dictionaries = [
        {
            'dictionary_id': 'Word',
            'max_dictionary_size': '50000'
        }
    ],
    feature_calcers = [
        'BoW:top_tokens_count=10000'
    ]
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.4870793	test: 0.4900617	best: 0.4900617 (0)	total: 28.3ms	remaining: 56.6s
100:	learn: 0.5596682	test: 0.5472241	best: 0.5472241 (98)	total: 2.13s	remaining: 40s
200:	learn: 0.5822195	test: 0.5557231	best: 0.5564085 (188)	total: 3.83s	remaining: 34.3s
300:	learn: 0.6008294	test: 0.5621659	best: 0.5624400 (296)	total: 5.77s	remaining: 32.6s
400:	learn: 0.6152581	test: 0.5679232	best: 0.5680603 (399)	total: 8s	remaining: 31.9s
500:	learn: 0.6283501	test: 0.5721727	best: 0.5727210 (498)	total: 9.68s	remaining: 28.9s
600:	learn: 0.6415793	test: 0.5751885	best: 0.5753256 (543)	total: 11.9s	remaining: 27.7s
700:	learn: 0.6513126	test: 0.5773818	best: 0.5782042 (682)	total: 13.6s	remaining: 25.2s
800:	learn: 0.6600521	test: 0.5816313	best: 0.5825908 (784)	total: 15.3s	remaining: 23s
900:	learn: 0.6688944	test: 0.5839616	best: 0.5850583 (868)	total: 17s	remaining: 20.8s
1000:	learn: 0.6758517	test: 0.5854695	best: 0.5865661 (995)	total: 18.7s	remaining: 18.7s
1100:	learn: 0.6837000

In [25]:
X_test = test_df_wth_features.copy()
preds = pd.DataFrame(model.predict(X_test), columns=['preds'])
preds.head()

Unnamed: 0,preds
0,0.5
1,0.5
2,0.25
3,0.25
4,0.0


In [26]:
sub['score'] = preds['preds']
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,id,score
0,4112d61851461f60,0.5
1,09e418c93a776564,0.5
2,36baf228038e314b,0.25
3,1f37ead645e7f0c8,0.25
4,71a5b6ad068d531f,0.0
