In [1]:
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
tqdm.pandas()



# Read Data and Add Index

In [2]:
# Paired sample names: ccag[i] is scored against company[i] further down.
ccag = [
    'QHP017 ADVANTAGE HEALTH NYC',
    'DISCOUNT CARD AMERICAN HEALTH COPR',
    'AMWINS MEDICARE MAPH QHP',
    'AARONS INC',
]
company = [
    'VANTAGE HEALTH PLAN',
    'AMERICAN HEALTH COPR',
    'AMWINS RX',
    'AARONS CONCRETE',
]

In [3]:
# One row per (ccag, company) pair; both lists must have the same length.
df = pd.DataFrame(dict(ccag=ccag, company=company))

In [4]:
# Copy the frame's index into a regular column, then move that column to the
# front. Assign-then-pop (rather than a direct insert) keeps this cell safe to
# re-run: insert() raises if the column already exists.
df['index'] = df.index
df.insert(loc=0, column='index', value=df.pop('index'))

In [5]:
# Preview the frame: 'index' column first, followed by the two name columns.
df

Unnamed: 0,index,ccag,company
0,0,QHP017 ADVANTAGE HEALTH NYC,VANTAGE HEALTH PLAN
1,1,DISCOUNT CARD AMERICAN HEALTH COPR,AMERICAN HEALTH COPR
2,2,AMWINS MEDICARE MAPH QHP,AMWINS RX
3,3,AARONS INC,AARONS CONCRETE


# TF-IDF Algorithm

In [6]:
def ngrams(string, n=3):
    """Split a string into overlapping character n-grams.

    Before splitting, commas, hyphens, periods and slashes are removed (the
    ``,-.`` range inside the character class spans exactly those first three
    characters), as is any whitespace character immediately followed by "BD".

    Args:
        string: Text to tokenize.
        n: Length of each n-gram (default 3).

    Returns:
        A list of every length-``n`` substring of the cleaned text, in order.
        The list is empty when the cleaned text is shorter than ``n`` or when
        ``n`` is not positive.
    """
    cleaned = re.sub(r'[,-./]|\sBD', r'', string)
    if n <= 0:
        # No window size to slide with, so there are no n-grams.
        return []
    window_count = len(cleaned) - n + 1
    return [cleaned[start:start + n] for start in range(window_count)]

In [7]:
# Word-level TF-IDF over both name columns. The token pattern keeps
# single-character tokens, which scikit-learn's default pattern would drop.
# min_df=1 retains every term, exactly as the former integer min_df=0 did;
# scikit-learn >= 1.2 rejects an integer min_df below 1 during validation.
# NOTE(review): ngrams() is defined in this notebook but is not passed as
# `analyzer`, so tokens are whole words, not character trigrams -- confirm intent.
vectorizer = TfidfVectorizer(min_df=1, token_pattern=r'(?u)\b\w+\b')
# Fit vocabulary and IDF weights on every name from both lists so that the
# ccag rows and the company rows share one feature space.
train_matrix = vectorizer.fit_transform(ccag + company)
query_matrix = vectorizer.transform(company)

In [8]:
# The first len(ccag) rows of train_matrix are the ccag names (they were
# concatenated first when fitting). Both matrices are densified so rows can
# be indexed and passed to np.dot directly.
ccag_matrix = train_matrix[:len(ccag)].toarray()
company_matrix = query_matrix.toarray()

In [9]:
def compute_similarity_tfidf(row, ccag_matrix, company_matrix):
    """Score one row's ccag/company pair from their TF-IDF vectors.

    Args:
        row: Mapping-like row whose 'index' entry selects the same row in
            both matrices.
        ccag_matrix: Indexable collection of ccag vectors.
        company_matrix: Indexable collection of company vectors.

    Returns:
        The dot product of the two selected vectors, multiplied by 100 and
        rounded. With L2-normalized vectors (TfidfVectorizer's default
        ``norm``) this is the cosine similarity as a percentage.
    """
    position = row['index']
    ccag_vector = ccag_matrix[position]
    company_vector = company_matrix[position]
    score = np.dot(ccag_vector, company_vector.T)
    return round(score * 100)

In [10]:
# Score every row's pair with the TF-IDF matrices. progress_apply is the
# tqdm-wrapped DataFrame.apply registered by tqdm.pandas().
df['%similarity_tfidf'] = df.progress_apply(
    compute_similarity_tfidf,
    axis=1,
    args=(ccag_matrix, company_matrix),
)

100%|██████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 2002.05it/s]


# Fuzzy String Matching (fuzzywuzzy)

In [11]:
def compute_similarity_lev(row):
    """Score one row's ccag/company pair with two fuzzywuzzy ratios.

    Args:
        row: Mapping-like row with 'ccag' and 'company' string entries.

    Returns:
        A two-element pandas Series: the ``fuzz.token_sort_ratio`` score
        followed by the ``fuzz.token_set_ratio`` score.
    """
    left, right = row['ccag'], row['company']
    scorers = (fuzz.token_sort_ratio, fuzz.token_set_ratio)
    return pd.Series([scorer(left, right) for scorer in scorers])

In [12]:
# Each row yields a two-element Series, so the apply result is a two-column
# frame assigned to the sort-ratio and set-ratio columns in that order.
df[['%similarity_lev_sort', '%similarity_lev_set']] = df.progress_apply(
    compute_similarity_lev,
    axis=1,
)

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 999.89it/s]


# Final Output

In [13]:
# Show the input pairs alongside the TF-IDF score and the two fuzzy-ratio scores.
df

Unnamed: 0,index,ccag,company,%similarity_tfidf,%similarity_lev_sort,%similarity_lev_set
0,0,QHP017 ADVANTAGE HEALTH NYC,VANTAGE HEALTH PLAN,14,35,65
1,1,DISCOUNT CARD AMERICAN HEALTH COPR,AMERICAN HEALTH COPR,69,74,100
2,2,AMWINS MEDICARE MAPH QHP,AMWINS RX,28,48,80
3,3,AARONS INC,AARONS CONCRETE,41,72,75
