# Record Linkage - Blocking

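In this notebook, we use record linkage to match the JobPostings and Orbis datasets. Candidate pairs are generated by a sequence of blocking passes (ZIP code, city, state, and combinations with the company name), followed by a sorted-neighbourhood index on the company name and an ECM classifier.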

The notebook is organized in the following fashion:

0. Import libraries and define constants
1. Upload parts of JobPostings dataset
2. Upload parts of Orbis dataset
3. Records to match
4. Blocking on ZIP code and Name
5. Blocking on ZIP code
6. Blocking on City and Name
7. Blocking on partial ZIP codes
8. Blocking on City
9. Blocking on Bundeslands
10. SortedNeighbourhoodIndex on name
11. ECM 
12. Save processed data
13. NaN values


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd
import recordlinkage
# Import jellyfish.cjellyfish for record linkage
import jellyfish.cjellyfish # The import checks if C-version of string comparision of recordlinkage is installed

from linkage.model.utils import save_dataframe, read_dataframe
from linkage.model.record_matching import Linking, print_matched_counts, print_unmatched_counts
from linkage.model.record_linkage_utils import CompareZipCodes, CompareString
from linkage.model.examine_dataframe import print_dataframe_length

In [None]:
# Two variants of the data: the full dataset ('all') or only its first part
# ('part01', from part01.rar).
# 'part01' is meant for development, to check that everything works as it should.
TYPE = 'all'  # 'all' or 'part01'

# 'std' for standardized, 'std_dict_40k' for dictionary cleaning with the 40k most common words
NOTE = 'std'

In [None]:
# Specify paths to data directories.
# The JobPostings directory does not depend on TYPE; the Orbis and linkage
# directories have one sub-directory per TYPE.
PROCESSED_JP_DIR = "../data/processed/jobpostings"
PROCESSED_ORBIS_DIR = f"../data/processed/orbis/{TYPE}"
PROCESSED_DATA_DIR = f"../data/processed/linkage/{TYPE}"

# Specify file names to read from
# NOTE(review): the JobPostings file name is fixed to the 'std_dict_40k' sample
# and does not follow NOTE - confirm this is intended.
JP_FILE = 'jobpostings_test_sample_std_dict_40k.csv'
ORBIS_NAME_FILE = f'orbis_german_bvid_name_processed_{TYPE}_{NOTE}.csv'
ORBIS_ADDR_FILE = f'orbis_german_all_addresses_processed_{TYPE}_{NOTE}.csv'  # 'orbis_german_all_addresses_clean.csv'

# File collecting all matches found by the blocking passes
LINKED_DF = "linked_matches.csv"

# Columns
# JobPostings
JP_INDEX = 'jobposting_id'
JP_COMPANY_NAME, JP_COMPANY_NAME_STANDARDIZED, JP_COMPANY_NAME_DICT_CLEANED = 'company', 'company_standard', 'company_dict_clean'
JP_COMPANY_CITY, JP_COMPANY_ZIP, JP_COMPANY_STATE = 'company_city', 'company_zipcode', 'company_state'
JP_JOB_CITY, JP_JOB_ZIP, JP_JOB_STATE = 'job_city', 'job_zipcode', 'job_state'

# Orbis
ORBIS_INDEX = 'BvD ID number'
ORBIS_COMPANY_NAME, ORBIS_COMPANY_NAME_STANDARDIZED, ORBIS_COMPANY_NAME_DICT_CLEANED = 'NAME', 'NAME_standard', 'NAME_dict_clean'
ORBIS_COMPANY_CITY, ORBIS_COMPANY_ZIP, ORBIS_COMPANY_STATE = 'City (native)', 'Postcode', 'Region in country'

# Files for the partial results (one file per matching pass)
COMPANY_ZIP_NAME_EXACT = f"linked_matches_blocking_company_zip-name_exact_{TYPE}_{NOTE}.csv"
COMPANY_ZIP_NAME_EX_SIMILAR = f"linked_matches_blocking_company_zip-name_similar_name_{TYPE}_{NOTE}.csv"
COMPANY_ZIP_NAME_SIMILAR = f"linked_matches_blocking_company_zip-name_similar_{TYPE}_{NOTE}.csv"
JOB_ZIP_NAME_SIMILAR = f"linked_matches_blocking_job_zip-name_similar_{TYPE}_{NOTE}.csv"

COMPANY_ZIP_SIMILAR = f"linked_matches_blocking_company_zipcode_similar_{TYPE}_{NOTE}.csv"
JOB_ZIP_SIMILAR = f"linked_matches_blocking_job_zipcode_similar_{TYPE}_{NOTE}.csv"

COMPANY_CITY_NAME_SIMILAR = f"linked_matches_blocking_company_city-name_similar_{TYPE}_{NOTE}.csv"
JOB_CITY_NAME_SIMILAR = f"linked_matches_blocking_job_city-name_similar_{TYPE}_{NOTE}.csv"

COMPANY_CITY_SIMILAR = f"linked_matches_blocking_company_city_similar_{TYPE}_{NOTE}.csv"
JOB_CITY_SIMILAR = f"linked_matches_blocking_job_city_similar_{TYPE}_{NOTE}.csv"

COMPANY_PART_ZIP_SIMILAR = f"linked_matches_blocking_partial_company_zipcode_similar_{TYPE}_{NOTE}.csv"
JOB_PART_ZIP_SIMILAR = f"linked_matches_blocking_partial_job_zipcode_similar_{TYPE}_{NOTE}.csv"

COMPANY_STATE_SIMILAR = f"linked_matches_blocking_company_state_similar_{TYPE}_{NOTE}.csv"
JOB_STATE_SIMILAR = f"linked_matches_blocking_job_state_similar_{TYPE}_{NOTE}.csv"

SORTED_NN_MATCHING_COMPANY = f"linked_matches_sorted_neighbourhood_index_company_{TYPE}_{NOTE}.csv"
SORTED_NN_MATCHING_JOB = f"linked_matches_sorted_neighbourhood_index_job_{TYPE}_{NOTE}.csv"
ECM_MATCHING = f"linked_matches_ecm_{TYPE}_{NOTE}.csv"

# File listing the company names that stayed unmatched
NOT_MATCHED = "not_matched_blocking.txt"

## 1. Upload parts of JobPostings dataset

The preprocessed JobPostings dataset is stored at the path:
```text
../data/processed/jobpostings/
```

The data is read into a pandas **DataFrame**.



In [None]:
df_jp = read_dataframe(PROCESSED_JP_DIR, JP_FILE, JP_INDEX)
df_jp.head()

## 2. Upload parts of Orbis dataset

The preprocessed Orbis dataset is stored at the path (one sub-directory per `TYPE`):
```text
../data/processed/orbis/<TYPE>/
```

The data is read into pandas **DataFrames**.



### Read the company name dataframe

We read the file containing Orbis company names.

In [None]:
df_orbis_name = read_dataframe(PROCESSED_ORBIS_DIR, ORBIS_NAME_FILE)
df_orbis_name.head()

### Read the company addresses dataframe

We read the file containing Orbis company addresses.

In [None]:
df_orbis_addresses = read_dataframe(PROCESSED_ORBIS_DIR, ORBIS_ADDR_FILE)
df_orbis_addresses.head()

### Join the Orbis dataframes

We join Orbis parts to create one dataframe.

Note: the BvD ID number is not unique in the addresses dataframe, so the inner join can produce several rows for the same company.

In [None]:
df_orbis = df_orbis_name.merge(df_orbis_addresses, on=ORBIS_INDEX, how='inner')
df_orbis.head()

### Check the dataframe

We check some values of the dataframes.

In [None]:
print_dataframe_length(df_orbis)

In [None]:
# TODO: do in orbis-name notebook
df_orbis.rename(columns={"company_standard": "NAME_standard", "company_dict_clean": "NAME_dict_clean"}, inplace=True)

In [None]:
# Check the states in Orbis
df_orbis[ORBIS_COMPANY_STATE].unique()

In [None]:
# Check the states in JobPostings
df_jp[JP_COMPANY_STATE].unique()

In [None]:
#df_orbis = df_orbis.head(100)

### Orbis index

Name the Orbis row index `orbis_index`. The _BvD ID_ is not used as the index because it is not unique.

In [None]:
# Name the index for joining
# JP dataset has unique index, therefore is set during the .csv reading
df_orbis.index.name = 'orbis_index'

## 3. Records to match

Print the number of unmatched records and initialize a linking class.

In [None]:
print_unmatched_counts(df_jp, JP_COMPANY_NAME_STANDARDIZED)

In [None]:
# Define class containing methods for record linkage
linking = Linking(JP_INDEX, JP_COMPANY_NAME, JP_COMPANY_NAME_STANDARDIZED, JP_COMPANY_NAME_DICT_CLEANED,
                  JP_COMPANY_CITY, JP_COMPANY_ZIP, JP_COMPANY_STATE,
                  JP_JOB_CITY, JP_JOB_ZIP, JP_JOB_STATE,
                  ORBIS_INDEX, ORBIS_COMPANY_NAME, ORBIS_COMPANY_NAME_STANDARDIZED, ORBIS_COMPANY_NAME_DICT_CLEANED,
                  ORBIS_COMPANY_CITY, ORBIS_COMPANY_ZIP, ORBIS_COMPANY_STATE)


## 4. Blocking on ZIP code and Name

#### Orbis - Add a column for blocking

In [None]:
# Orbis blocking key: first character of the postcode followed by the first
# three characters of the standardized name with spaces removed.
df_orbis['zip_and_name'] = (
    df_orbis[ORBIS_COMPANY_ZIP].str.slice(stop=1)
    + df_orbis[ORBIS_COMPANY_NAME_STANDARDIZED].str.replace(' ', '', regex=False).str.slice(stop=3)
)
df_orbis.head()

#### JobPostings - Add column for blocking 

In [None]:
# JobPostings blocking key: first character of the company ZIP code followed by
# the first three characters of the standardized name with spaces removed.
df_jp['zip_and_name'] = (
    df_jp[JP_COMPANY_ZIP].str.slice(stop=1)
    + df_jp[JP_COMPANY_NAME_STANDARDIZED].str.replace(' ', '', regex=False).str.slice(stop=3)
)
df_jp.head()

###  Blocking on company ZIP code and Name - Exact matching

In [None]:
# Create index: pair only records whose 'zip_and_name' key is identical in both frames
indexer = recordlinkage.Index()
indexer.block('zip_and_name') # exact match on specified columns

# Make record pairs (JobPostings on the left, Orbis on the right)
candidate_links = indexer.index(df_jp, df_orbis)

# Size of the candidate set that is compared in the next cell
print(f'Num of candidates: {len(candidate_links)}\n')

In [None]:
# Compare every candidate pair field by field. Each exact comparison yields
# 1 when the two values are equal and 0 otherwise.
compare_names = recordlinkage.Compare()

compare_names.exact(JP_COMPANY_NAME_STANDARDIZED, ORBIS_COMPANY_NAME_STANDARDIZED, label='company_name_exact')
compare_names.exact(JP_COMPANY_ZIP, ORBIS_COMPANY_ZIP, label='company_zipcode_exact')
compare_names.exact(JP_COMPANY_CITY, ORBIS_COMPANY_CITY, label='company_city_exact')
compare_names.exact(JP_COMPANY_STATE, ORBIS_COMPANY_STATE, label='company_state_exact')

# One row per candidate pair, one column per comparison label
features_name = compare_names.compute(candidate_links, df_jp, df_orbis)

# Sum the comparison results: how many pairs agree on 4, 3, 2, ... fields.
features_name.sum(axis=1).value_counts().sort_index(ascending=False)

In [None]:
# Potential matches: candidate pairs on which all four comparisons agree
potential_matches_name = features_name.loc[features_name.sum(axis=1).eq(4)]

print(f"Num. of potential matches by name: {len(potential_matches_name)}")

# Presumably keeps the best Orbis candidate per job posting - see Linking.get_best_match
potential_matches_name = linking.get_best_match(potential_matches_name, JP_INDEX, 'orbis_index')

# Attach the JobPostings and Orbis records to every remaining pair and
# keep each (job posting, BvD ID) combination only once
df_merge_name = (
    potential_matches_name
    .merge(df_jp, how='left', on=JP_INDEX)
    .merge(df_orbis, how='left', on='orbis_index')
    .drop_duplicates(subset=[JP_INDEX, ORBIS_INDEX])
)
print(f"Num. of best matches by name: {len(df_merge_name)}")

# Side-by-side view of the compared fields of both sources
df_merge_name_result = df_merge_name.loc[:, [
    JP_INDEX, ORBIS_INDEX,
    JP_COMPANY_NAME_STANDARDIZED, ORBIS_COMPANY_NAME_STANDARDIZED,
    JP_COMPANY_CITY, ORBIS_COMPANY_CITY,
    JP_COMPANY_ZIP, ORBIS_COMPANY_ZIP,
    JP_COMPANY_STATE, ORBIS_COMPANY_STATE,
]].copy()

df_merge_name_result.head()

#### Save the result

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, COMPANY_ZIP_NAME_EXACT)

#### Process

In [None]:
# Add matches to a new df: matched_df starts with the exact-match result and is
# passed to linking.process_matched after each of the later passes.
# It is indexed by the (JobPostings id, BvD ID) pair.
matched_df = df_merge_name_result.copy()
matched_df.set_index([JP_INDEX, ORBIS_INDEX], inplace=True)

# Remove matches from old JobPostings dataframe so that the following passes
# only see records that are still unmatched (df_jp is indexed by JP_INDEX).
df_jp.drop(df_merge_name_result[JP_INDEX], axis=0, inplace=True)

# Print the counts of matched and still unmatched records
print_matched_counts(matched_df, JP_COMPANY_NAME_STANDARDIZED)
print_unmatched_counts(df_jp, JP_COMPANY_NAME_STANDARDIZED)

###  Blocking on company ZIP code and Name - Similarity on name, exact on address

Jaro-Winkler similarity (threshold 0.95) on the company names; exact match on ZIP code, city and state.

In [None]:
# Create index: same 'zip_and_name' blocking as in the exact pass, but df_jp now
# only contains the records that were not matched there
indexer = recordlinkage.Index()
indexer.block('zip_and_name') # exact match on specified columns

# Make record pairs (JobPostings on the left, Orbis on the right)
candidate_links = indexer.index(df_jp, df_orbis)

# Size of the candidate set that is compared in the next cell
print(f'Num of candidates: {len(candidate_links)}\n')

In [None]:
# Compare every candidate pair: fuzzy comparison on the name, exact on the address fields
compare_names = recordlinkage.Compare()

# Jaro-Winkler similarity of the names, binarised: 1 when the similarity
# reaches the 0.95 threshold, 0 otherwise
compare_names.string(JP_COMPANY_NAME_STANDARDIZED, ORBIS_COMPANY_NAME_STANDARDIZED, 
                     threshold=0.95, method='jarowinkler', label='company_name_similar')
compare_names.exact(JP_COMPANY_ZIP, ORBIS_COMPANY_ZIP, label='company_zipcode_exact')
compare_names.exact(JP_COMPANY_CITY, ORBIS_COMPANY_CITY, label='company_city_exact')
compare_names.exact(JP_COMPANY_STATE, ORBIS_COMPANY_STATE, label='company_state_exact')
# One row per candidate pair, one column per comparison label
features_name = compare_names.compute(candidate_links, df_jp, df_orbis)

# Sum the comparison results: how many pairs agree on 4, 3, 2, ... fields.
features_name.sum(axis=1).value_counts().sort_index(ascending=False)

In [None]:
# Potential matches: candidate pairs on which all four comparisons agree
potential_matches_name = features_name.loc[features_name.sum(axis=1).eq(4)]

print(f"Num. of potential matches by name: {len(potential_matches_name)}")

# Presumably keeps the best Orbis candidate per job posting - see Linking.get_best_match
potential_matches_name = linking.get_best_match(potential_matches_name, JP_INDEX, 'orbis_index')

# Attach the JobPostings and Orbis records to every remaining pair and
# keep each (job posting, BvD ID) combination only once
df_merge_name = (
    potential_matches_name
    .merge(df_jp, how='left', on=JP_INDEX)
    .merge(df_orbis, how='left', on='orbis_index')
    .drop_duplicates(subset=[JP_INDEX, ORBIS_INDEX])
)
print(f"Num. of best matches by name: {len(df_merge_name)}")

# Side-by-side view of the compared fields of both sources
df_merge_name_result = df_merge_name.loc[:, [
    JP_INDEX, ORBIS_INDEX,
    JP_COMPANY_NAME_STANDARDIZED, ORBIS_COMPANY_NAME_STANDARDIZED,
    JP_COMPANY_CITY, ORBIS_COMPANY_CITY,
    JP_COMPANY_ZIP, ORBIS_COMPANY_ZIP,
    JP_COMPANY_STATE, ORBIS_COMPANY_STATE,
]].copy()
df_merge_name_result.head()

#### Save the result

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, COMPANY_ZIP_NAME_EX_SIMILAR)

#### Process

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

###  Blocking on company ZIP code and Name - All similar

In [None]:
# Create candidate pairs by blocking on the shared 'zip_and_name' key
# (presumably the same recordlinkage block index as built by hand in the
# previous passes - confirm in Linking.blocking)
candidate_links = linking.blocking(df_jp, df_orbis, 'zip_and_name')

# Compare fields of candidate pairs; addr_type='company' presumably selects the
# company_* address columns of the job posting - confirm in Linking.compare_similar_records
features_name = linking.compare_similar_records(df_jp, df_orbis, candidate_links, addr_type='company')

# Filter candidate pairs and merge the kept ones with their JobPostings / Orbis records
df_merge_name_result = linking.merge_dataframes_on_linkage_result(features_name, df_jp, df_orbis, addr_type='company')

In [None]:
df_merge_name_result.head()

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, COMPANY_ZIP_NAME_SIMILAR)

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

In [None]:
# Drop auxiliary columns
df_jp.drop(['zip_and_name'], axis=1, inplace=True)

### Blocking on job ZIP code and Name

In [None]:
# JobPostings blocking key, this time built from the job ZIP code: its first
# character followed by the first three characters of the space-free standardized name.
df_jp['zip_and_name'] = (
    df_jp[JP_JOB_ZIP].str.slice(stop=1)
    + df_jp[JP_COMPANY_NAME_STANDARDIZED].str.replace(' ', '', regex=False).str.slice(stop=3)
)
df_jp.head()

In [None]:
# Create candidate pairs by blocking on 'zip_and_name'; on the JobPostings side
# the key was rebuilt from the job ZIP code in the previous cell
candidate_links = linking.blocking(df_jp, df_orbis, 'zip_and_name')

# Compare fields of candidate pairs; addr_type='job' presumably selects the
# job_* address columns of the job posting - confirm in Linking.compare_similar_records
features_name = linking.compare_similar_records(df_jp, df_orbis, candidate_links, addr_type='job')

# Filter candidate pairs and merge the kept ones with their JobPostings / Orbis records
df_merge_name_result = linking.merge_dataframes_on_linkage_result(features_name, df_jp, df_orbis, addr_type='job')

In [None]:
df_merge_name_result.head()

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, JOB_ZIP_NAME_SIMILAR)

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

In [None]:
# Drop auxiliary columns
df_jp.drop(['zip_and_name'], axis=1, inplace=True)
df_orbis.drop(['zip_and_name'], axis=1, inplace=True)

## 5. Blocking on ZIP code

### Blocking on company ZIP code

In [None]:
# Create candidate pairs by blocking the JobPostings company ZIP code
# ('company_zipcode') against the Orbis postcode ('Postcode')
candidate_links = linking.blocking(df_jp, df_orbis, JP_COMPANY_ZIP, ORBIS_COMPANY_ZIP)

# Compare fields of candidate pairs (company address of the job posting)
features_name = linking.compare_similar_records(df_jp, df_orbis, candidate_links, addr_type='company')

# Filter candidate pairs and merge the kept ones with their JobPostings / Orbis records
df_merge_name_result = linking.merge_dataframes_on_linkage_result(features_name, df_jp, df_orbis, addr_type='company')

In [None]:
df_merge_name_result.head()

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, COMPANY_ZIP_SIMILAR)

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

### Blocking on job ZIP code

In [None]:
# Create candidate pairs by blocking the JobPostings job ZIP code ('job_zipcode')
# against the Orbis postcode ('Postcode'); Orbis has a single address per row
candidate_links = linking.blocking(df_jp, df_orbis, JP_JOB_ZIP, ORBIS_COMPANY_ZIP)

# Compare fields of candidate pairs (job address of the job posting)
features_name = linking.compare_similar_records(df_jp, df_orbis, candidate_links, addr_type='job')

# Filter candidate pairs and merge the kept ones with their JobPostings / Orbis records
df_merge_name_result = linking.merge_dataframes_on_linkage_result(features_name, df_jp, df_orbis, addr_type='job')

In [None]:
df_merge_name_result.head()

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, JOB_ZIP_SIMILAR)

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

## 6. Blocking on City and Name

In [None]:
# Orbis blocking key: first two characters of the city followed by the first
# three characters of the standardized name with spaces removed.
df_orbis['city_and_name'] = (
    df_orbis[ORBIS_COMPANY_CITY].str.slice(stop=2)
    + df_orbis[ORBIS_COMPANY_NAME_STANDARDIZED].str.replace(' ', '', regex=False).str.slice(stop=3)
)
df_orbis.head()

### Blocking on company City and Name

In [None]:
# JobPostings blocking key: first two characters of the company city followed by
# the first three characters of the standardized name with spaces removed.
df_jp['city_and_name'] = (
    df_jp[JP_COMPANY_CITY].str.slice(stop=2)
    + df_jp[JP_COMPANY_NAME_STANDARDIZED].str.replace(' ', '', regex=False).str.slice(stop=3)
)
df_jp.head()

In [None]:
# Create candidate pairs by blocking on the shared 'city_and_name' key
# (built from the company city on the JobPostings side)
candidate_links = linking.blocking(df_jp, df_orbis, 'city_and_name')

# Compare fields of candidate pairs (company address of the job posting)
features_name = linking.compare_similar_records(df_jp, df_orbis, candidate_links, addr_type='company')

# Filter candidate pairs and merge the kept ones with their JobPostings / Orbis records
df_merge_name_result = linking.merge_dataframes_on_linkage_result(features_name, df_jp, df_orbis, addr_type='company')

In [None]:
df_merge_name_result.head()

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, COMPANY_CITY_NAME_SIMILAR)

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

In [None]:
# Drop auxiliary columns
df_jp.drop(['city_and_name'], axis=1, inplace=True)

### Blocking on job City and Name

In [None]:
# JobPostings blocking key, this time built from the job city: its first two
# characters followed by the first three characters of the space-free standardized name.
df_jp['city_and_name'] = (
    df_jp[JP_JOB_CITY].str.slice(stop=2)
    + df_jp[JP_COMPANY_NAME_STANDARDIZED].str.replace(' ', '', regex=False).str.slice(stop=3)
)
df_jp.head()

In [None]:
# Create candidate pairs by blocking on 'city_and_name'; on the JobPostings side
# the key was rebuilt from the job city in the previous cell
candidate_links = linking.blocking(df_jp, df_orbis, 'city_and_name')

# Compare fields of candidate pairs (job address of the job posting)
features_name = linking.compare_similar_records(df_jp, df_orbis, candidate_links, addr_type='job')

# Filter candidate pairs and merge the kept ones with their JobPostings / Orbis records
df_merge_name_result = linking.merge_dataframes_on_linkage_result(features_name, df_jp, df_orbis, addr_type='job')

In [None]:
df_merge_name_result.head()

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, JOB_CITY_NAME_SIMILAR)

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

In [None]:
# Drop auxiliary columns
df_jp.drop(['city_and_name'], axis=1, inplace=True)
df_orbis.drop(['city_and_name'], axis=1, inplace=True)

## 7. Blocking on partial ZIP codes

In [None]:
# Number of leading ZIP-code characters used as the blocking key
partial_num = 4

# Orbis blocking key: the first `partial_num` characters of the postcode
df_orbis['partial_zip'] = df_orbis[ORBIS_COMPANY_ZIP].str.slice(stop=partial_num)
df_orbis.head()

### Blocking on partial company ZIP code

In [None]:
# JobPostings blocking key: the first `partial_num` characters of the company ZIP code
df_jp['partial_company_zip'] = df_jp[JP_COMPANY_ZIP].str.slice(stop=partial_num)
df_jp.head()

In [None]:
# Create candidate pairs by blocking the truncated company ZIP code of the
# job posting against the truncated Orbis postcode
candidate_links = linking.blocking(df_jp, df_orbis, 'partial_company_zip', 'partial_zip')

# Compare fields of candidate pairs (company address of the job posting)
features_name = linking.compare_similar_records(df_jp, df_orbis, candidate_links, addr_type='company')

# Filter candidate pairs and merge the kept ones with their JobPostings / Orbis records
df_merge_name_result = linking.merge_dataframes_on_linkage_result(features_name, df_jp, df_orbis, addr_type='company')

In [None]:
df_merge_name_result.head()

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, COMPANY_PART_ZIP_SIMILAR)

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

In [None]:
# Drop auxiliary columns
df_jp.drop(['partial_company_zip'], axis=1, inplace=True)

### Blocking on partial job ZIP code

In [None]:
# JobPostings blocking key: the first `partial_num` characters of the job ZIP code
df_jp['partial_job_zip'] = df_jp[JP_JOB_ZIP].str.slice(stop=partial_num)
df_jp.head()

In [None]:
# Create candidate pairs by blocking the truncated job ZIP code of the
# job posting against the truncated Orbis postcode
candidate_links = linking.blocking(df_jp, df_orbis, 'partial_job_zip', 'partial_zip')

# Compare fields of candidate pairs (job address of the job posting)
features_name = linking.compare_similar_records(df_jp, df_orbis, candidate_links, addr_type='job')

# Filter candidate pairs and merge the kept ones with their JobPostings / Orbis records
df_merge_name_result = linking.merge_dataframes_on_linkage_result(features_name, df_jp, df_orbis, addr_type='job')

In [None]:
df_merge_name_result.head()

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, JOB_PART_ZIP_SIMILAR)

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

In [None]:
# Drop auxiliary columns
df_jp.drop(['partial_job_zip'], axis=1, inplace=True)
df_orbis.drop(['partial_zip'], axis=1, inplace=True)

## 8. Blocking on City

### Blocking on company City

In [None]:
# Create candidate pairs by blocking the JobPostings company city ('company_city')
# against the Orbis city ('City (native)')
candidate_links = linking.blocking(df_jp, df_orbis, JP_COMPANY_CITY, ORBIS_COMPANY_CITY)

# Compare fields of candidate pairs (company address of the job posting)
features_name = linking.compare_similar_records(df_jp, df_orbis, candidate_links, addr_type='company')

# Filter candidate pairs and merge the kept ones with their JobPostings / Orbis records
df_merge_name_result = linking.merge_dataframes_on_linkage_result(features_name, df_jp, df_orbis, addr_type='company')

In [None]:
df_merge_name_result.head()

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, COMPANY_CITY_SIMILAR)

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

### Blocking on job City

In [None]:
# Create candidate pairs by blocking the JobPostings job city ('job_city')
# against the Orbis city ('City (native)')
candidate_links = linking.blocking(df_jp, df_orbis, JP_JOB_CITY, ORBIS_COMPANY_CITY)

# Compare fields of candidate pairs (job address of the job posting)
features_name = linking.compare_similar_records(df_jp, df_orbis, candidate_links, addr_type='job')

# Filter candidate pairs and merge the kept ones with their JobPostings / Orbis records
df_merge_name_result = linking.merge_dataframes_on_linkage_result(features_name, df_jp, df_orbis, addr_type='job')

In [None]:
df_merge_name_result.head()

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, JOB_CITY_SIMILAR)

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

## 9. Blocking on Bundeslands

### Blocking on company state

In [None]:
# Create candidate pairs by blocking the JobPostings company state ('company_state')
# against the Orbis region ('Region in country').
# NOTE(review): if this is exact blocking, a state is a very coarse key, so the
# candidate set can become very large, and both sources must spell the state
# names identically (compare the unique() outputs printed in section 2) - confirm.
candidate_links = linking.blocking(df_jp, df_orbis, JP_COMPANY_STATE, ORBIS_COMPANY_STATE)

# Compare fields of candidate pairs (company address of the job posting)
features_name = linking.compare_similar_records(df_jp, df_orbis, candidate_links, addr_type='company')

# Filter candidate pairs and merge the kept ones with their JobPostings / Orbis records
df_merge_name_result = linking.merge_dataframes_on_linkage_result(features_name, df_jp, df_orbis, addr_type='company')

In [None]:
df_merge_name_result.head()

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, COMPANY_STATE_SIMILAR)

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

### Blocking on job state

In [None]:
# Create candidate pairs by blocking the JobPostings job state ('job_state')
# against the Orbis region ('Region in country').
# NOTE(review): if this is exact blocking, a state is a very coarse key, so the
# candidate set can become very large - confirm the run time is acceptable.
candidate_links = linking.blocking(df_jp, df_orbis, JP_JOB_STATE, ORBIS_COMPANY_STATE)

# Compare fields of candidate pairs (job address of the job posting)
features_name = linking.compare_similar_records(df_jp, df_orbis, candidate_links, addr_type='job')

# Filter candidate pairs and merge the kept ones with their JobPostings / Orbis records
df_merge_name_result = linking.merge_dataframes_on_linkage_result(features_name, df_jp, df_orbis, addr_type='job')

In [None]:
df_merge_name_result.head()

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, JOB_STATE_SIMILAR)

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

## 10. SortedNeighbourhoodIndex on name

### Company Addresses

In [None]:
# Create index: sorted-neighbourhood indexing on the standardized company names
# with a window of 7 (instead of requiring identical blocking keys).
# recordlinkage.SortedNeighbourhoodIndex is a deprecated alias; the supported
# class is recordlinkage.index.SortedNeighbourhood, which takes the same arguments.
indexer = recordlinkage.index.SortedNeighbourhood(
    JP_COMPANY_NAME_STANDARDIZED, ORBIS_COMPANY_NAME_STANDARDIZED, window=7
)  # NN match on specified columns

# Make record pairs (JobPostings on the left, Orbis on the right)
candidate_links = indexer.index(df_jp, df_orbis)

print(f'Num of candidates: {len(candidate_links)}\n')

In [None]:
# Compare fields of candidate pairs
features_name = linking.compare_similar_records(df_jp, df_orbis, candidate_links, addr_type='company')

# Filter candidate pairs
df_merge_name_result = linking.merge_dataframes_on_linkage_result(features_name, df_jp, df_orbis, addr_type='company')

In [None]:
df_merge_name_result.head()

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, SORTED_NN_MATCHING_COMPANY)

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

### Job Addresses

In [None]:
# Create index: the same sorted-neighbourhood index on the standardized company
# names as for the company addresses; df_jp now only holds the records that are
# still unmatched, and the following comparison uses the job address.
# recordlinkage.SortedNeighbourhoodIndex is a deprecated alias; the supported
# class is recordlinkage.index.SortedNeighbourhood, which takes the same arguments.
indexer = recordlinkage.index.SortedNeighbourhood(
    JP_COMPANY_NAME_STANDARDIZED, ORBIS_COMPANY_NAME_STANDARDIZED, window=7
)  # NN match on specified columns

# Make record pairs (JobPostings on the left, Orbis on the right)
candidate_links = indexer.index(df_jp, df_orbis)

print(f'Num of candidates: {len(candidate_links)}\n')

In [None]:
# Compare fields of candidate pairs
features_name = linking.compare_similar_records(df_jp, df_orbis, candidate_links, addr_type='job')

# Filter candidate pairs
df_merge_name_result = linking.merge_dataframes_on_linkage_result(features_name, df_jp, df_orbis, addr_type='job')

In [None]:
df_merge_name_result.head()

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, SORTED_NN_MATCHING_JOB)

In [None]:
# Process matched and not matched records
linking.process_matched(df_jp, matched_df, df_merge_name_result, JP_COMPANY_NAME_STANDARDIZED)

## 11. ECM 

Expectation Conditional Maximization (ECM) is an unsupervised classification method.

In [None]:
# Copy of the Orbis frame whose columns carry the JobPostings column names, so
# that both frames can be indexed and compared on identically named columns.
# The mapping is built from the column constants instead of repeating the
# literal names; rename() already returns a new frame, so no extra copy is needed.
# TODO: change to company name std. and dict. clean
df_orbis_ecm = df_orbis.rename(columns={
    ORBIS_COMPANY_NAME: JP_COMPANY_NAME,
    ORBIS_COMPANY_NAME_STANDARDIZED: JP_COMPANY_NAME_STANDARDIZED,
    ORBIS_COMPANY_NAME_DICT_CLEANED: JP_COMPANY_NAME_DICT_CLEANED,
    ORBIS_COMPANY_CITY: JP_COMPANY_CITY,
    ORBIS_COMPANY_STATE: JP_COMPANY_STATE,
    ORBIS_COMPANY_ZIP: JP_COMPANY_ZIP,
})

df_orbis_ecm

In [None]:
# Create index: sorted-neighbourhood indexing on the standardized company name.
# Only one column name is given because df_orbis_ecm uses the JobPostings column names.
# recordlinkage.SortedNeighbourhoodIndex is a deprecated alias; the supported
# class is recordlinkage.index.SortedNeighbourhood, which takes the same arguments.
indexer = recordlinkage.index.SortedNeighbourhood(JP_COMPANY_NAME_STANDARDIZED, window=7)  # NN match on specified columns

# Make record pairs (JobPostings on the left, renamed Orbis frame on the right)
candidate_links = indexer.index(df_jp, df_orbis_ecm)

print(f'Num of candidates: {len(candidate_links)}\n')

In [None]:
# Build the binary comparison features for the ECM classifier. The same column
# name is passed for both sides because df_orbis_ecm was renamed to the
# JobPostings column names.
compare_names = recordlinkage.Compare()

# Jaro-Winkler similarities, binarised at the given thresholds (1 = at least the threshold)
compare_names.string(JP_COMPANY_NAME_STANDARDIZED, JP_COMPANY_NAME_STANDARDIZED, threshold=0.8, method='jarowinkler', label='company_name_similar')
# The ZIP-code comparison is currently disabled:
#compare_names.add(CompareZipCodes(JP_COMPANY_ZIP, ORBIS_COMPANY_ZIP, label='company_zipcode_similar'))
compare_names.string(JP_COMPANY_CITY, JP_COMPANY_CITY,  threshold=0.9, method='jarowinkler', label='company_city_similar')
compare_names.string(JP_COMPANY_STATE, JP_COMPANY_STATE, threshold=0.9, method='jarowinkler', label='company_state_similar')

# One row per candidate pair, one column per comparison label
features_name = compare_names.compute(candidate_links, df_jp, df_orbis_ecm)


In [None]:
# Fit the unsupervised ECM classifier on the comparison vectors and predict in
# one step; the result is the index of the candidate pairs classified as matches
# (it is converted with .to_frame() further below).
ecm = recordlinkage.ECMClassifier()
potential_matches_name = ecm.fit_predict(features_name)

In [None]:
# Peek at the JobPostings record of the second predicted pair (position 1 of the
# match index; element 0 of a pair is the JobPostings id).
# Raises IndexError when fewer than two pairs were predicted.
df_jp.loc[potential_matches_name[1][0]]

In [None]:
# Peek at the Orbis record of the second predicted pair (position 1 of the
# match index; element 1 of a pair is the Orbis row index).
# Raises IndexError when fewer than two pairs were predicted.
df_orbis_ecm.loc[potential_matches_name[1][1]] 

In [None]:
potential_matches_name

In [None]:
# Turn the MultiIndex of predicted matches into a plain frame with one column
# per index level ('jobposting_id', 'orbis_index') so that it can be merged with
# the source frames. index=False avoids keeping the MultiIndex as the row index,
# so no renaming to placeholder columns and no reset_index() is needed.
potential_matches_name = potential_matches_name.to_frame(index=False)

potential_matches_name

In [None]:
# NOTE(review): df_merge_name has not been rebuilt for the ECM step yet - it is
# only reassigned in the next cell. When the notebook runs top to bottom, this
# displays the frame left over from the "Similarity on name, exact on address"
# pass of section 4. Consider removing this cell or moving it below the merge.
df_merge_name

In [None]:
print(f"Num. of potential matches by name: {len(potential_matches_name)}")

# Attach the JobPostings and Orbis records to every predicted pair.
# The keys use JP_INDEX / 'orbis_index' exactly like the merge cells of section 4
# instead of a hard-coded 'jobposting_id'.
df_merge_name = potential_matches_name.merge(df_jp, how='left', on=JP_INDEX)
df_merge_name = df_merge_name.merge(df_orbis, how='left', on='orbis_index')

# Side-by-side view of the compared fields of both sources
df_merge_name_result = df_merge_name[[JP_INDEX, ORBIS_INDEX,
                                      JP_COMPANY_NAME_STANDARDIZED, ORBIS_COMPANY_NAME_STANDARDIZED,
                                      JP_COMPANY_CITY, ORBIS_COMPANY_CITY,
                                      JP_COMPANY_ZIP, ORBIS_COMPANY_ZIP,
                                      JP_COMPANY_STATE, ORBIS_COMPANY_STATE]].copy()
df_merge_name_result

In [None]:
# Save dataframe to a csv file
save_dataframe(df_merge_name_result, PROCESSED_DATA_DIR, ECM_MATCHING)

## 12. Save processed data

The processed data is stored in csv files under the path (one sub-directory per `TYPE`):
```text
../data/processed/linkage/<TYPE>/
```

### Save matched

In [None]:
save_dataframe(matched_df, PROCESSED_DATA_DIR, LINKED_DF)

### Save not-matched

In [None]:
save_dataframe(pd.DataFrame(df_jp[JP_COMPANY_NAME].unique()), PROCESSED_DATA_DIR, NOT_MATCHED)

## 13. NaN values

Check records containing NaN values

### JobPostings

In [None]:
# Missing values per matching-relevant column of the JobPostings frame
company_name_nan = int(df_jp[JP_COMPANY_NAME_STANDARDIZED].isna().sum())
print(f"Company names NaN: {company_name_nan}")

company_city_nan = int(df_jp[JP_COMPANY_CITY].isna().sum())
print(f"Company city NaN: {company_city_nan}")

company_state_nan = int(df_jp[JP_COMPANY_STATE].isna().sum())
print(f"Company state NaN: {company_state_nan}")

company_zipcode_nan = int(df_jp[JP_COMPANY_ZIP].isna().sum())
print(f"Company zipcode NaN: {company_zipcode_nan}")

In [None]:
# Any row that contains NaN
is_NaN = df_jp.isnull()
rows_with_NaN = len(df_jp[is_NaN.any(axis=1)])
print(f"Records with NaN: {rows_with_NaN}")

rows_all_NaN = len(df_jp[is_NaN.all(axis=1)])
print(f"Records only NaN: {rows_all_NaN}")

### Orbis

In [None]:
# Missing values per matching-relevant column of the Orbis frame
company_name_nan = int(df_orbis[ORBIS_COMPANY_NAME_STANDARDIZED].isna().sum())
print(f"Company names NaN: {company_name_nan}")

company_city_nan = int(df_orbis[ORBIS_COMPANY_CITY].isna().sum())
print(f"Company city NaN: {company_city_nan}")

company_state_nan = int(df_orbis[ORBIS_COMPANY_STATE].isna().sum())
print(f"Company state NaN: {company_state_nan}")

company_zipcode_nan = int(df_orbis[ORBIS_COMPANY_ZIP].isna().sum())
print(f"Company zipcode NaN: {company_zipcode_nan}")

In [None]:
# Any row that contains NaN
is_NaN = df_orbis.isnull()
rows_with_NaN = len(df_orbis[is_NaN.any(axis=1)])
print(f"Records with NaN: {rows_with_NaN}")

rows_all_NaN = len(df_orbis[is_NaN.all(axis=1)])
print(f"Records only NaN: {rows_all_NaN}")