### Task 1-1. Construct RLTK Datasets

In [28]:
import rltk
import csv

# You can use this tokenizer in case you need to manipulate some data
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

In [29]:
class GoodRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        return self.raw_object['ID']

    @rltk.cached_property
    def Title(self):
        return self.raw_object['Title']

    @rltk.cached_property
    def ISBN(self):
        return self.raw_object['ISBN13']
    
    @rltk.cached_property
    def Author(self):
        return self.raw_object['FirstAuthor']

class NobleRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        return self.raw_object['ID']

    @rltk.cached_property
    def Title(self):
        return self.raw_object['Title']
    
    @rltk.cached_property
    def ISBN(self):
        return self.raw_object['ISBN13']
    
    @rltk.cached_property
    def Author(self):
        return self.raw_object['Author1']

In [30]:
dir_ = ''
good_file = dir_ + 'goodreads.csv'
noble_file = dir_ + 'barnes_and_nobles.csv'

ds1 = rltk.Dataset(rltk.CSVReader(open(good_file, encoding='utf-8')), record_class=GoodRecord)
ds2 = rltk.Dataset(rltk.CSVReader(open(noble_file, encoding='utf-8')), record_class=NobleRecord)

In [31]:
# print some entries
print(ds1.generate_dataframe().head(5))
print(ds2.generate_dataframe().head(5))

  id                                       Title           ISBN  \
0  0          Managing My Life: My Autobiography  9780340728567   
1  1     I Remember: Sketch for an Autobiography  9780844627106   
2  2              Betty Boothroyd: Autobiography  9780712679480   
3  3  Caddie, A Sydney Barmaid: An Autobiography  9780725100148   
4  4     Nureyev: An Autobiography With Pictures  9780340014684   

            Author  
0    Alex Ferguson  
1  Boris Pasternak  
2  Betty Boothroyd  
3           Caddie  
4   Rudolf Nureyev  
  id                                              Title           ISBN  \
0  0          Pioneer Girl: The Annotated Autobiography  9780984504176   
1  1  American Sniper (Movie Tie-in Edition): The Au...  9780062376336   
2  2                     The Autobiography of Malcolm X  9780345350688   
3  3                           Assata: An Autobiography  9781556520747   
4  4                            Autobiography of a Yogi  9780876120798   

                  Author  

### Task 1-2. Blocking

First, we'll load dev set to evaluate both blocking (Task 1-2) and entity linking (Task 1-3).

In [32]:
dev_set_file = dir_ + 'dev.csv'
dev = []
with open(dev_set_file, encoding='utf-8', errors="replace") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if len(row) <= 1:
            continue
        if line_count == 0:
            columns = row
            line_count += 1
        else:
            dev.append(row)
    print(f'Column names are: {", ".join(columns)}')
    print(f'Processed {len(dev)} lines.')

gt = rltk.GroundTruth()
for row in dev:    
    r1 = ds1.get_record(row[0])
    r2  = ds2.get_record(row[1])
    if row[-1] == '1':
        gt.add_positive(r1.raw_object['ID'], r2.raw_object['ID'])
    else:
        gt.add_negative(r1.raw_object['ID'], r2.raw_object['ID'])

rltk.Trial(gt)

Column names are: goodreads.ID, barnes_and_nobles.ID, label
Processed 297 lines.


<rltk.evaluation.trial.Trial at 0x20f0b5c2130>

Then, you can build your own blocking techniques and evaluate it.

Hint:

- What is the total number of pairs without blocking? 
- what is the number of paris with blocking?
- After blocking, how many "correct" (matched) pairs presented in dev set?


In [50]:
correct_pairs = list(map(lambda x: (x[0], x[1]), filter(lambda x: x[2] == '1', dev)))

print('------ No Blocks ------')
pairs_without_blocking = rltk.get_record_pairs(ds1, ds2)
num_0 = 0
for r1, r2 in pairs_without_blocking:
    num_0 += 1
print(f'There are {num_0} pairs without blocking.')


print('------ block on Title ------')
bg = rltk.HashBlockGenerator()
block = bg.generate(bg.block(ds1, property_='Title'),
                    bg.block(ds2, property_='Title'))
pairs = rltk.get_record_pairs(ds1, ds2, block=block)
num = 0
matched_pairs_num = 0
for r1, r2 in pairs:
    num += 1
    if (r1.id, r2.id) in correct_pairs:
        matched_pairs_num += 1
    if num <= 5:
        print(r1.id, r1.Title, '\t', r2.id, r2.Title)
print(f'There are {num} pairs with blocking.')
print(f'There are {matched_pairs_num} matched pairs presented in dev set.')
reduction_ratio = num / num_0
pairs_completeness = matched_pairs_num / len(correct_pairs)
pair_quality = matched_pairs_num / num
print(f'reduction_ratio: {reduction_ratio}')
print(f'pairs_completeness: {pairs_completeness}')
print(f'pair_quality: {pair_quality}')


print('------ block on Title[:3] ------')
bg2 = rltk.HashBlockGenerator()
block2 = bg2.generate(
            bg2.block(ds1, function_=lambda r: r.Title[:3]),
            bg2.block(ds2, function_=lambda r: r.Title[:3]))
pairs = rltk.get_record_pairs(ds1, ds2, block=block2)
num = 0
matched_pairs_num = 0
for r1, r2 in pairs:
    num += 1
    if (r1.id, r2.id) in correct_pairs:
        matched_pairs_num += 1
    if num <= 5:
        print(r1.id, r1.Title, '\t', r2.id, r2.Title)
print(f'There are {num} pairs with blocking.')
print(f'There are {matched_pairs_num} matched pairs presented in dev set.')
reduction_ratio = num / num_0
pairs_completeness = matched_pairs_num / len(correct_pairs)
pair_quality = matched_pairs_num / num
print(f'reduction_ratio: {reduction_ratio}')
print(f'pairs_completeness: {pairs_completeness}')
print(f'pair_quality: {pair_quality}')


print('------ block on ISBN13 ------')
bg3 = rltk.HashBlockGenerator()
block3 = bg3.generate(
            bg3.block(ds1, property_='ISBN'),
            bg3.block(ds2, property_='ISBN'))
pairs = rltk.get_record_pairs(ds1, ds2, block=block3)
num = 0
matched_pairs_num = 0
for r1, r2 in pairs:
    num += 1
    if (r1.id, r2.id) in correct_pairs:
        matched_pairs_num += 1
    if num <= 5:
        print(r1.id, r1.Title, r1.ISBN, '\t', r2.id, r2.Title, r2.ISBN)
print(f'There are {num} pairs with blocking.')
print(f'There are {matched_pairs_num} matched pairs presented in dev set.')
reduction_ratio = num / num_0
pairs_completeness = matched_pairs_num / len(correct_pairs)
pair_quality = matched_pairs_num / num
print(f'reduction_ratio: {reduction_ratio}')
print(f'pairs_completeness: {pairs_completeness}')
print(f'pair_quality: {pair_quality}')


print('------ block on Title[:3] + 1st Author[:3] ------')
bg4 = rltk.HashBlockGenerator()
block4 = bg4.generate(
            bg4.block(ds1, function_=lambda r: r.Title[:3] + r.Author[:3]),
            bg4.block(ds2, function_=lambda r: r.Title[:3] + r.Author[:3]))
pairs = rltk.get_record_pairs(ds1, ds2, block=block4)
num = 0
matched_pairs_num = 0
for r1, r2 in pairs:
    num += 1
    if (r1.id, r2.id) in correct_pairs:
        matched_pairs_num += 1
    if num <= 5:
        print(r1.id, r1.Title, '\t', r2.id, r2.Title)
print(f'There are {num} pairs with blocking.')
print(f'There are {matched_pairs_num} matched pairs presented in dev set.')
reduction_ratio = num / num_0
pairs_completeness = matched_pairs_num / len(correct_pairs)
pair_quality = matched_pairs_num / num
print(f'reduction_ratio: {reduction_ratio}')
print(f'pairs_completeness: {pairs_completeness}')
print(f'pair_quality: {pair_quality}')

------ No Blocks ------
There are 14681867 pairs without blocking.
------ block on Title ------
0 Managing My Life: My Autobiography 	 409 Managing My Life: My Autobiography
8 Jenny Pitman: The Autobiography 	 1431 Jenny Pitman: The Autobiography
11 Call Me Anna: The Autobiography of Patty Duke 	 100 Call Me Anna: The Autobiography of Patty Duke
13 Chasing the Wind: The Autobiography of Steve Fossett 	 1152 Chasing the Wind: The Autobiography of Steve Fossett
22 Mad, Bad & Dangerous to Know: The Autobiography 	 192 Mad, Bad & Dangerous to Know: The Autobiography
There are 3484 pairs with blocking.
There are 40 matched pairs presented in dev set.
reduction_ratio: 0.00023729952055825054
pairs_completeness: 0.5970149253731343
pair_quality: 0.011481056257175661
------ block on Title[:3] ------
194 Man: An Autobiography 	 2625 Man Without A Face / Edition 1
194 Man: An Autobiography 	 1161 Man Of Everest - The Autobiography Of Tenzing
194 Man: An Autobiography 	 377 Man Who Lives in Paradis

**block4's performance is better**

In [35]:
pairs = rltk.get_record_pairs(ds1, ds2, block=block4)
blocked_pairs = []
for r1, r2 in pairs:
    blocked_pairs.append((r1.id, r2.id))

with open(dir_ + 'Zhenmin_Hua_blocked.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    header = ['goodreads.ID', 'barnes_and_nobles.ID']
    writer.writerow(header)
    for row in blocked_pairs:
        writer.writerow(row)

### Task 1-3. Entity Linking

In [36]:
def name_string_similarity_1(r1, r2):
    s1 = r1.Title
    s2 = r2.Title
    return rltk.jaro_winkler_similarity(s1, s2)
    
def name_string_similarity_2(r1, r2):
    s1 = r1.ISBN
    s2 = r2.ISBN
    if s1 == s2:
        return 1
    return 0

def name_string_similarity_3(r1, r2):
    for n1, n2 in zip(sorted(r1.Title), sorted(r2.Title)):
        if rltk.levenshtein_distance(n1, n2) > min(len(n1), len(n2)) / 3:
            return 0
    return 1

In [37]:
# threshold value to determine if we are confident the record match
MY_TRESH = 0.8

# entity linkage scoring function
def rule_based_method(r1, r2):
    score_1 = name_string_similarity_1(r1, r2)
    score_2 = name_string_similarity_2(r1, r2)
    score_3 = name_string_similarity_3(r1, r2)
    
    total = 0.8 * score_1 + 0.1 * score_2 + 0.1 * score_3
    
    # return two values: boolean if they match or not, float to determine confidence
    return total > MY_TRESH, total

In [38]:
trial = rltk.Trial(gt)
candidate_pairs = rltk.get_record_pairs(ds1, ds2, ground_truth=gt)
for r1, r2 in candidate_pairs:
    result, confidence = rule_based_method(r1, r2)
    trial.add_result(r1, r2, result, confidence)

In [39]:
trial.evaluate()
print('Trial statistics based on Ground-Truth from development set data:')
print(f'tp: {trial.true_positives:.06f} [{len(trial.true_positives_list)}]')
print(f'fp: {trial.false_positives:.06f} [{len(trial.false_positives_list)}]')
print(f'tn: {trial.true_negatives:.06f} [{len(trial.true_negatives_list)}]')
print(f'fn: {trial.false_negatives:.06f} [{len(trial.false_negatives_list)}]')

Trial statistics based on Ground-Truth from development set data:
tp: 0.835821 [56]
fp: 0.056522 [13]
tn: 0.943478 [217]
fn: 0.164179 [11]


In [40]:
trial.f_measure

0.823529411764706

### Save Test predictions
You will be evaluated on dev and test predictions, over a hidden ground truth.

In [41]:
test_set_file = dir_ + 'test.csv'
test = []
with open(test_set_file, encoding='utf-8', errors="replace") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if len(row) <= 1:
            continue
        if line_count == 0:
            columns = row
            line_count += 1
        else:
            test.append(row)
    print(f'Column names are: {", ".join(columns)}')
    print(f'Processed {len(test)} lines.')

Column names are: goodreads.ID, barnes_and_nobles.ID
Processed 90 lines.


In [42]:
predictions = []
for id1, id2 in test:
    r1 = ds1.get_record(id1)
    r2 = ds2.get_record(id2)
    result, confidence = rule_based_method(r1, r2)
    predictions.append((r1.id, r2.id, result, confidence))

In [43]:
len(predictions), len(ds1.generate_dataframe()), len(ds2.generate_dataframe())

(90, 3967, 3701)

In [44]:
with open(dir_ + 'Zhenmin_Hua_predictions.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for row in predictions:
        writer.writerow(row)

### Task 1-4. Record Linkage

In [54]:
i = 0
matching_pairs = []
for r1, r2 in rltk.get_record_pairs(ds1, ds2, block=block4):
    result, confidence = rule_based_method(r1, r2)
    if result:
        i += 1
        matching_pairs.append((r1.id, r2.id))

with open(dir_ + 'Zhenmin_Hua_valid_predictions.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    header = ['goodreads.ID', 'barnes_and_nobles.ID']
    writer.writerow(header)
    for row in matching_pairs:
        writer.writerow(row)

# Task 2: Using RDFLib for Knowledge Representation

In [1]:
from rdflib import Graph, URIRef, Literal, XSD, Namespace, RDF
import pandas as pd

In [58]:
df1 = pd.read_csv('goodreads.csv')
df2 = pd.read_csv('barnes_and_nobles.csv')
df = pd.read_csv('Zhenmin_Hua_valid_predictions.csv')

In [59]:
MYNS = Namespace('http://inf558.org/myfakenamespace#')
FOAF = Namespace('http://xmlns.com/foaf/0.1/')
SCHEMA = Namespace('http://schema.org/')

kg = Graph()
kg.bind('my_ns', MYNS)
kg.bind('foaf', FOAF)
kg.bind('schema', SCHEMA)

for i in range(100):    # len(df)
    gr_id = df['goodreads.ID'][i]
    bn_id = df['barnes_and_nobles.ID'][i]
    
    item_id = URIRef(str(gr_id))  # subject
    kg.add((item_id, RDF.type, MYNS['book']))
    kg.add((item_id, FOAF['name'], Literal(df1['Title'][gr_id])))
    kg.add((item_id, SCHEMA.description, Literal(df1['Description'][gr_id])))
    kg.add((item_id, MYNS.ISBN, Literal(df1['ISBN'][gr_id])))
    kg.add((item_id, MYNS.ISBN13, Literal(df1['ISBN13'][gr_id])))
    kg.add((item_id, MYNS.pageCount, Literal(df1['PageCount'][gr_id])))
    kg.add((item_id, SCHEMA.author, Literal(df1['FirstAuthor'][gr_id])))
    kg.add((item_id, SCHEMA.author, Literal(df1['SecondAuthor'][gr_id])))
    kg.add((item_id, SCHEMA.author, Literal(df1['ThirdAuthor'][gr_id])))
    kg.add((item_id, SCHEMA.contentRating, Literal(df1['Rating'][gr_id])))
    kg.add((item_id, SCHEMA.contentRating, Literal(df1['NumberofRatings'][gr_id])))
    kg.add((item_id, SCHEMA.review, Literal(df1['NumberofReviews'][gr_id])))
    kg.add((item_id, SCHEMA.publisher, Literal(df1['Publisher'][gr_id])))
    kg.add((item_id, SCHEMA.datePublished, Literal(df1['PublishDate'][gr_id])))
    kg.add((item_id, SCHEMA.encodingFormat, Literal(df1['Format'][gr_id])))
    kg.add((item_id, SCHEMA.inLanguage, Literal(df1['Language'][gr_id])))
    kg.add((item_id, SCHEMA.name, Literal(df1['FileName'][gr_id])))
           
    kg.add((item_id, MYNS.pages, Literal(df2['Pages'][bn_id])))
    kg.add((item_id, MYNS.dimensions, Literal(df2['Productdimensions'][bn_id])))
    kg.add((item_id, MYNS.Salesrank, Literal(df2['Salesrank'][bn_id])))
    kg.add((item_id, SCHEMA.contentRating, Literal(df2['Ratingvalue'][bn_id])))
    kg.add((item_id, MYNS.Paperbackprice, Literal(df2['Paperbackprice'][bn_id])))
    kg.add((item_id, MYNS.Hardcoverprice, Literal(df2['Hardcoverprice'][bn_id])))
    kg.add((item_id, MYNS.Nookbookprice, Literal(df2['Nookbookprice'][bn_id])))
    kg.add((item_id, MYNS.Audiobookprice, Literal(df2['Audiobookprice'][bn_id])))
           
kg.serialize('Zhenmin_Hua_model.ttl', format="turtle")

<Graph identifier=N551d17599e8c412380835e77877e55e0 (<class 'rdflib.graph.Graph'>)>