<a href="https://colab.research.google.com/github/benMcCarthy87/jaroURLMatcher/blob/main/URLMatcher-v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import io
import numpy as np
from google.colab import files
from google.colab import drive

# If True, a new URL that is returned as an 'above average' match is removed
# from the list of candidates, which speeds up matching of the remaining URLs.
# This is a bad option if you have few URL changes, or if you have consolidated
# URLs so that several old URLs should match the same new URL.
remove_urls = True

# If True, a match that scores above average is returned immediately and the
# rest of the list is not checked. If this is False, the remove_urls option
# above becomes irrelevant, because URLs are only removed when such an early
# match is returned.
match_if_above_ave = True

In [2]:
#Upload URLs (CSV named urls.csv)
#Later cells read the columns 'old_URLs' and 'new_URLs' from this file.

uploaded = files.upload()


Saving urls.csv to urls (14).csv


In [3]:
#Create dataframe

# Prefer the documented file name, but fall back to the only uploaded file so
# that a differently named CSV does not fail with a bare KeyError.
if 'urls.csv' in uploaded:
    csv_bytes = uploaded['urls.csv']
elif len(uploaded) == 1:
    csv_bytes = next(iter(uploaded.values()))
else:
    raise KeyError("Expected an uploaded file named 'urls.csv'; got: "
                   + ', '.join(sorted(uploaded)))

df = pd.read_csv(io.BytesIO(csv_bytes))

# Fail early with a clear message: the matching cell reads both columns.
missing_cols = {'old_URLs', 'new_URLs'} - set(df.columns)
if missing_cols:
    raise ValueError('urls.csv is missing required column(s): '
                     + ', '.join(sorted(missing_cols)))

print(df[:10])

                                            old_URLs                                           new_URLs
0      http://www.examplegroup.com/en/group/profile/     http://www.example-group.com/en/group/profile/
1  https://www.examplegroup.com/en/media/mediacon...  https://www.example-group.com/en/media/media-c...
2  https://www.examplegroup.com/en/sustainability...  https://www.example-group.com/en/sustainabilit...
3             https://www.example.com/us/accessories       https://www.example.com/usa/shop/accessories
4         https://www.example.com/us/adicolorapparel  https://www.example.com/usa/shop/adicolor-apparel
5         https://www.example.com/us/example_outdoor           https://www.example.com/usa/shop/apparel
6                 https://www.example.com/us/apparel          https://www.example.com/usa/shop/baseball
7                https://www.example.com/us/baseball        https://www.example.com/usa/shop/basketball
8              https://www.example.com/us/basketball      https:

In [4]:
from math import floor, ceil 

#The Jaro similarity is a measure of how alike two strings are. The higher the Jaro score for two strings is, the more similar the strings are.
#The below function computes plain Jaro similarity (without the Winkler prefix bonus), where 1 means an exact match and 0 means there is no similarity.

def jaro_distance(s1, s2):
    """Return the Jaro similarity of two strings.

    The score is 1.0 for identical strings and 0.0 when no characters match;
    every other pair scores strictly between the two.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        float in the range [0.0, 1.0]; higher means more similar.
    """
    if s1 == s2:
        return 1.0

    len1 = len(s1)
    len2 = len(s2)
    if len1 == 0 or len2 == 0:
        return 0.0

    # Characters only count as matching when they are at most this far apart.
    max_dist = max(0, max(len1, len2) // 2 - 1)

    matched_s1 = [False] * len1
    matched_s2 = [False] * len2
    match = 0
    for i in range(len1):
        window_start = max(0, i - max_dist)
        window_end = min(len2, i + max_dist + 1)
        for j in range(window_start, window_end):
            if not matched_s2[j] and s1[i] == s2[j]:
                matched_s1[i] = True
                matched_s2[j] = True
                match += 1
                break

    if match == 0:
        return 0.0

    # Walk both strings' matched characters in order; every position where
    # they differ is half of a transposition.
    half_transpositions = 0
    point = 0
    for i in range(len1):
        if matched_s1[i]:
            while not matched_s2[point]:
                point += 1
            if s1[i] != s2[point]:
                half_transpositions += 1
            # Always advance, so the next matched character of s1 is compared
            # with the next matched character of s2.
            point += 1
    t = half_transpositions // 2

    return (match / len1 + match / len2 + (match - t) / match) / 3.0


In [5]:
i = 0  # progress counter used by the matching loop in a later cell
all_distances = [0]  # best scores recorded by get_url after a full scan; the seed 0 is part of the mean it compares against
hld = 0  # module-level value; get_url assigns its own local hld, so this one is not read there

#function finds the best match using the above jaro_distance function.

def get_url(urlO):
    """Return the entry of new_urls most similar to urlO.

    Candidates are scored with jaro_distance. When match_if_above_ave is set,
    more than 10 scores have been recorded in all_distances and new_urls holds
    more than 100 entries, the scan stops at the first candidate that is both
    the best so far and above the mean of all_distances. If remove_urls is
    also set, that candidate is removed from new_urls.

    Args:
        urlO: The old URL to find a match for (converted with str()).

    Returns:
        The best matching entry of new_urls, or 'NO URL' when no candidate
        scores above 0.
    """
    best_score = 0
    bm = 'NO URL'

    # Neither all_distances nor new_urls changes before this function returns,
    # so the early-exit preconditions and the mean can be computed once.
    can_stop_early = (match_if_above_ave
                      and len(all_distances) > 10
                      and len(new_urls) > 100)
    average = np.mean(all_distances) if can_stop_early else None

    for urlN in new_urls:
        score = jaro_distance(str(urlO), str(urlN))
        if score > best_score:
            best_score = score
            bm = urlN
            # str() keeps non-string cells (e.g. NaN from pandas) from raising.
            if can_stop_early and score > average and len(str(bm)) > 3:
                print('Found above average match')
                if remove_urls:
                    # Safe although we are iterating new_urls: we return
                    # straight after the removal.
                    new_urls.remove(bm)
                    print('removed ' + str(bm))
                    print(str(len(new_urls)) + ' left in new URLs')
                return bm

    # Only full scans contribute to the running average.
    all_distances.append(best_score)
    return bm
     

In [6]:
matched_urls = []
old_urls = df['old_URLs'].tolist()
new_urls = df['new_URLs'].tolist()

#Remove unchanged URLs
#The old URLs that still need matching are collected in a new list. Removing
#items from old_urls while looping over it would skip the element that follows
#each removal.

old_out = []
matched_out = []
urls_to_match = []
for old in old_urls:
    if old in new_urls:
        old_out.append(old)
        matched_out.append(old)
        new_urls.remove(old)
    else:
        urls_to_match.append(old)
old_urls = urls_to_match

#Loop through old URLs, using the get_url function to find the best match
#enumerate restarts the progress counter at 1 every time this cell is run.

for i, urlO in enumerate(old_urls, start=1):
    bm = get_url(urlO)
    matched_urls.append(bm)
    print(str(urlO) + ' matched with ' + str(bm))
    print(i, end = '')
    print(' of ', end = '')
    print(len(old_urls))


#Create new dataframe, add old urls and matched new ones.

df_Results = pd.DataFrame({'old_urls': old_urls, 'matched_urls': matched_urls})



http://www.examplegroup.com/en/group/profile/ matched with http://www.example-group.com/en/group/profile/
1 of 21
https://www.examplegroup.com/en/media/mediacontact/ matched with https://www.example-group.com/en/media/media-contact/
2 of 21
https://www.examplegroup.com/en/sustainability/people/factoryworkers/ matched with https://www.example-group.com/en/sustainability/people/factory-workers/
3 of 21
https://www.example.com/us/accessories matched with https://www.example.com/usa/shop/accessories
4 of 21
https://www.example.com/us/adicolorapparel matched with https://www.example.com/usa/shop/adicolor-apparel
5 of 21
https://www.example.com/us/example_outdoor matched with https://www.example.com/usa/shop/example-outdoor
6 of 21
https://www.example.com/us/apparel matched with https://www.example.com/usa/shop/apparel
7 of 21
https://www.example.com/us/baseball matched with https://www.example.com/usa/shop/baseball
8 of 21
https://www.example.com/us/basketball matched with https://www.examp

In [None]:
#Mount Google Drive at the relative path 'drive' so the results CSV can be copied into it

drive.mount('drive')

In [8]:
#Save results as CSV in Drive
#Writes df_Results to URL_Migration_Results.csv in the working directory, then
#copies that file into "drive/My Drive/" with a shell command.

df_Results.to_csv('URL_Migration_Results.csv')
!cp URL_Migration_Results.csv "drive/My Drive/"