In [123]:
# %%bash

# rm test.db
# sqlite3 test.db '.read data/RFRGdata.sql'

In [128]:
# %load soundex.py
"""
soundex module conforming to Knuth's algorithm
implementation 2000-12-24 by Gregory Jorgensen
public domain
available at:
http://code.activestate.com/recipes/52213-soundex-algorithm/
"""


def soundex(name, len=4):
    """Return the soundex code for ``name``.

    The code is the first letter of ``name`` followed by the soundex digits
    of the remaining letters, with consecutive duplicate digits collapsed and
    vowel-class letters (digit 0) dropped. The result is padded with '0' or
    truncated so that it is exactly ``len`` characters long.

    Accented letters are reduced to their base letter (e.g. 'É' -> 'E')
    before coding. Any character that is still outside A-Z afterwards
    (digits, punctuation, letters without an ASCII base such as 'Ł') is
    ignored instead of raising IndexError.

    NOTE: H and W are coded as 0, exactly like vowels, so they separate two
    consonants that share a digit ('Ashcraft' -> 'A226'). Soundex variants
    that ignore H/W between same-coded consonants give 'A261' for that name.

    Parameters:
        name: the string to encode.
        len:  length of the returned code (default 4). The name shadows the
              builtin and is kept only for backward compatibility.

    Returns:
        A string of exactly ``len`` characters; '0' * len when ``name``
        contains no usable letters.
    """
    # Local import so the function stays self-contained when %load-ed.
    import unicodedata

    # digits holds the soundex values for the alphabet
    digits = '01230120022455012623010202'
    sndx = ''
    fc = ''

    # Canonical decomposition splits an accented letter into its base letter
    # plus combining marks; the marks are then dropped by the A-Z filter.
    letters = unicodedata.normalize('NFD', name.upper())

    # translate alpha chars in name to soundex digits
    for c in letters:
        # Only A-Z have an entry in digits; anything else would index out of range.
        if 'A' <= c <= 'Z':
            if not fc:
                fc = c   # remember first letter
            d = digits[ord(c) - ord('A')]
            # duplicate consecutive soundex digits are skipped
            if not sndx or (d != sndx[-1]):
                sndx += d

    # replace first digit with first alpha character
    sndx = fc + sndx[1:]

    # remove all 0s from the soundex code
    sndx = sndx.replace('0', '')

    # return soundex code padded to len characters
    return (sndx + (len * '0'))[:len]

# Entity Matching

Approach:

1. Run SQL to find project pairs with matching URLs. Add these to the candidate list.
2. Run SQL to find project pairs with matching names. Add these to the candidate list.
3. For each candidate pair:
    1. Calculate the Levenshtein distance between the URLs.
    2. Calculate the Levenshtein distance between the names.
    3. Set Boolean: is the RubyForge name found in the RubyGems name?
    4. Set Boolean: is the RubyForge name found in the RubyGems URL?
    5. Set Boolean: is the RubyForge developer found on the list of RubyGems developers?

In [102]:
import sqlite3

In [103]:
# Open the SQLite database file 'test.db' (path is relative to the working directory).
conn = sqlite3.connect('test.db')

In [104]:
# List the tables in the database.
# The SQL string literal is single-quoted: SQLite reads a double-quoted token
# as an identifier first and only falls back to a string literal as a legacy
# quirk, which some builds disable.
conn.cursor().execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()

[('book_rf_entities',),
 ('book_rf_entity_people',),
 ('book_rf_entity_topics',),
 ('book_rg_entities',),
 ('book_rg_entity_people',)]

In [105]:
# NOTE: rf: RubyForge, rg: RubyGems.
# Create the candidate-pair table if it does not exist yet: one row per
# (rf_project_name, rg_project_name) pair, with the metric columns
# defaulting to NULL.
# Connection.execute() opens a temporary cursor and runs the statement on it;
# fetchone() on this DDL statement returns None, so the cell shows no output.
conn.execute('''
    CREATE TABLE IF NOT EXISTS book_entity_matches (
        rf_project_name varchar(100) NOT NULL,
        rg_project_name varchar(100) NOT NULL,
        url_levenshtein int(11) DEFAULT NULL,
        rf_name_soundex varchar(5) DEFAULT NULL,
        rg_name_soundex varchar(5) DEFAULT NULL,
        name_levenshtein int(11) DEFAULT NULL,
        rf_name_in_rg_name tinyint(1) DEFAULT NULL,
        rf_name_in_rg_url tinyint(1) DEFAULT NULL,
        rf_dev_in_rg_dev tinyint(1) DEFAULT NULL,
        PRIMARY KEY (rf_project_name, rg_project_name)
    )
''').fetchone()

In [106]:
# Cursor reused by the INSERT cells that populate book_entity_matches.
cursor = conn.cursor()

In [107]:
# Get all projects with matching URLs.
# INSERT OR IGNORE skips any pair that is already in book_entity_matches (or
# that the join yields more than once), so re-running this cell is harmless
# and a single duplicate no longer aborts the whole insert.
# `with conn` commits on success and rolls back if the statement raises.
with conn:
    cursor.execute('''
        INSERT OR IGNORE INTO book_entity_matches
            (rf_project_name, rg_project_name)
        SELECT rf.project_name, rg.project_name
        FROM book_rf_entities rf
        INNER JOIN book_rg_entities rg
        ON rf.url = rg.url
    ''')

In [115]:
# Get projects that have matching project names.
# RubyForge projects that already have a match (e.g. by URL) are excluded by
# the NOT IN subquery. INSERT OR IGNORE additionally skips any pair the join
# yields more than once, so one duplicate no longer aborts the whole insert.
# `with conn` commits on success and rolls back if the statement raises.
with conn:
    cursor.execute('''
        INSERT OR IGNORE INTO book_entity_matches
            (rf_project_name, rg_project_name)
        SELECT rf.project_name, rg.project_name
        FROM book_rf_entities rf
        INNER JOIN book_rg_entities rg
        ON rf.project_name = rg.project_name
        WHERE rf.project_name NOT IN (
            SELECT bem.rf_project_name
            FROM book_entity_matches bem
        )
    ''')

In [120]:
# Calculate the string metrics for each pair.
# First step: load every candidate pair together with both projects' URLs,
# ordered by RubyForge project name.
c = conn.cursor()
project_pairs = c.execute('''
    SELECT bem.rf_project_name,
           bem.rg_project_name,
           rfe.url,
           rge.url
    FROM book_entity_matches bem
    INNER JOIN book_rg_entities rge
        ON bem.rg_project_name = rge.project_name
    INNER JOIN book_rf_entities rfe
        ON bem.rf_project_name = rfe.project_name
    ORDER BY bem.rf_project_name
''').fetchall()
# Show the first (rf_name, rg_name, rf_url, rg_url) tuple as a sanity check.
project_pairs[0]

('aafc',
 'acts_as_flux_capacitor',
 'http://aafc.rubyforge.org',
 'http://aafc.rubyforge.org')

In [None]:
# Lower-case the names and URLs of each candidate pair so that later
# comparisons are case-insensitive.
for (rf_name, rg_name, rf_url, rg_url) in project_pairs:
    rf_name_lower = rf_name.lower()
    rg_name_lower = rg_name.lower()
    # A URL column that is NULL comes back as None; fall back to '' so that
    # .lower() cannot raise AttributeError.
    # NOTE(review): confirm whether the entity tables allow NULL URLs.
    rf_url_lower = (rf_url or '').lower()
    rg_url_lower = (rg_url or '').lower()

In [122]:
# Done with the database: close the connection.
conn.close()