In [1]:
import json
import itertools
import numpy as np

import sys
sys.path.append("..")
from utils.experiment import loadDatabase
from utils.utils import getHashFromDict

# Seed reused throughout the notebook: stored as the "randomSeed" entry of every
# commonFixedParams dict and passed to np.random.seed before mappings are shuffled.
randomSeed = 11

In [2]:
# Base directory (relative to this notebook) from which every dataset path below
# is built, e.g. f"{DATA_PATH}/imdb".
DATA_PATH = "../data/preprocessed"

# **Cross-Validation for Transfer Setting**

The transfer setting assumes that there is not enough target data for learning and resorts to a related domain (source) to augment the training data. To simulate low target data availability, each iteration of our cross-validation for transfer settings selects one fold for training and the remaining folds for testing. This is the opposite of traditional cross-validation.

In our transfer experiments, we also consider learning from scratch. In this case, the learning only relies on the limited target data, as simulated by our cross validation procedure.

In [3]:
# Stored as the "path" entry of every experiment generated in this section.
# NOTE: later sections rebind this name, so re-run this cell before regenerating
# the transfer cross-validation experiments.
EXPERIMENTS_BASE_PATH = "./experiments/crossValidation"
# Model names iterated when building the experiment grid; each one is handed to
# getNextModelParams, which rejects any name it does not know.
MODELS = ["OriginalRDNBoost", "TransferLearning", "TreeBoostler", "AnalogousToOriginalRDNBoost"]

In [4]:
def getExperimentID(experimentDict):
    """Return the identifier of an experiment.

    The identifier is whatever `getHashFromDict` computes for `experimentDict`;
    callers invoke this before adding the "id" key to the dict.
    """
    return getHashFromDict(experimentDict)

In [5]:
# Settings shared by every experiment of the transfer cross-validation section.
# This dict is unpacked first when an experiment is assembled, so dataset- and
# model-specific entries come after these keys (and would override duplicates).
commonFixedParams = {
    "path": EXPERIMENTS_BASE_PATH,
    "randomSeed": randomSeed,
    "numberOfClauses": 8,
    "numberOfCycles": 100,
    "maxTreeDepth": 3,
    "nEstimators": 10,
    "nodeSize": 2,
    "negPosRatio": 2,
    "maxFailedNegSamplingRetries": 50,
    "ignoreSTDOUT": True,
}

In [30]:
def _transferDatabaseEntry(datasetName):
    """Build the description of one database located at f"{DATA_PATH}/{datasetName}".

    "targetPredicate" is left as None, i.e. the dataset default is kept:
        imdb          -> workedunder/2
        cora          -> samevenue/2
        twitter       -> accounttype/2
        yeast         -> proteinclass/2
        uwcse         -> advisedby/2
        nell_finances -> companyeconomicsector/2
        nell_sports   -> teamplayssport/2
    """
    return {
        "path": f"{DATA_PATH}/{datasetName}",
        "targetPredicate": None,
        "resetTargetPredicate": False
    }


def _transferPair(sourceName, targetName, useRecursion):
    """Build one source -> target transfer scenario (key order is kept stable)."""
    return {
        "sourceDatabase": _transferDatabaseEntry(sourceName),
        "targetDatabase": _transferDatabaseEntry(targetName),
        "useRecursion": useRecursion
    }


# One entry per (source, target) pair; every pair is listed in both directions.
transferDatasetParams = [
    _transferPair("imdb", "cora", False),                  # IMDB to Cora
    _transferPair("cora", "imdb", False),                  # Cora to IMDB
    _transferPair("twitter", "yeast", True),               # Twitter to Yeast
    _transferPair("yeast", "twitter", True),               # Yeast to Twitter
    _transferPair("imdb", "uwcse", False),                 # IMDB to UWCSE
    _transferPair("uwcse", "imdb", False),                 # UWCSE to IMDB
    _transferPair("nell_finances", "nell_sports", True),   # NELL Finances to NELL Sports
    _transferPair("nell_sports", "nell_finances", True),   # NELL Sports to NELL Finances
]

In [31]:
def getNextModelParams(model: str, mappings = None):
    """Yield, one at a time, the model-specific parameter dicts to run for `model`.

    Parameters
    ----------
    model : str
        One of "OriginalRDNBoost", "AnalogousToOriginalRDNBoost",
        "TransferLearning" or "TreeBoostler".
    mappings : iterable, optional
        Only read when `model` is "TransferLearning". Each item is indexed as
        `mapping[0]` (stored under "relationMapping") and `mapping[1]` (stored
        under "termTypeMapping"). An empty iterable yields no parameters.

    Yields
    ------
    dict
        A parameter dict holding the `run<Model>` flag plus, for
        "TransferLearning", one point of the grid
        (utility alphas x utilityAlphaSetIter x weight x mapping) and, for
        "TreeBoostler", one value of "refine".

    Raises
    ------
    ValueError
        If `model` is not a supported name, or if `model` is "TransferLearning"
        and `mappings` is None. This is a generator function, so the error is
        raised when iteration starts, not when the function is called.
    """
    if model == "OriginalRDNBoost":
        modelParams = [{"runOriginalRDNBoost": True}]

    elif model == "AnalogousToOriginalRDNBoost":
        modelParams = [{"runAnalogousToOriginalRDNBoost": True}]

    elif model == "TransferLearning":
        # Fail with an explicit message instead of "'NoneType' object is not iterable".
        if mappings is None:
            raise ValueError('"TransferLearning" requires `mappings`, but None was given.')

        utilityAlphaValues = [0, 0.3, 0.6, 1, 1.3]
        utilityAlphaList = [
            {
                "sourceUtilityAlpha": sourceAlpha,
                "targetUtilityAlpha": targetAlpha
            } for sourceAlpha, targetAlpha in itertools.product(utilityAlphaValues, utilityAlphaValues)
        ]

        utilityAlphaSetIterList = [{"utilityAlphaSetIter": iteration} for iteration in [1,3,5,7]]

        weightList = [
            {
                "weight": {
                    "strategy": "scalar",
                    "parameters": {
                        "weight": 1
                    }
                }
            }
        ]

        mappingList = [
            {
                "mapping": {
                    "relationMapping": mapping[0],
                    "termTypeMapping": mapping[1]
                }
            } for mapping in mappings
        ]

        # Full grid; the order of the factors fixes both the order of the yielded
        # dicts and the order of the keys inside each dict.
        modelParams = [
            {
                "runTransferLearning": True,
                **utilityParams,
                **setIterParams,
                **weightParams,
                **mappingParams,
            } for utilityParams, setIterParams, weightParams, mappingParams in itertools.product(
                utilityAlphaList,
                utilityAlphaSetIterList,
                weightList,
                mappingList
            )
        ]

    elif model == "TreeBoostler":
        modelParams = [
            {
                "runTreeBoostler": True,
                "maxRevisionIterations": 2,
                "searchArgPermutation": True,
                "allowSameTargetMap": False,
                "refine": refine,
            } for refine in [True, False]
        ]

    else:
        raise ValueError(f"{model} is not a valid model.")

    yield from modelParams

In [None]:
# Seed NumPy's global RNG so the shuffle of candidate mappings below is repeatable.
np.random.seed(randomSeed)
maxMappings = 5
experiments = []


def _loadTransferDatabase(databaseParams, useRecursion):
    """Load one database described by a "sourceDatabase"/"targetDatabase" entry."""
    return loadDatabase(
        databaseParams["path"],
        targetPredicate = databaseParams["targetPredicate"],
        resetTargetPredicate = databaseParams["resetTargetPredicate"],
        useRecursion = useRecursion
    )


for datasetParams in transferDatasetParams:
    useRecursion = datasetParams["useRecursion"]
    sourceDB = _loadTransferDatabase(datasetParams["sourceDatabase"], useRecursion)
    targetDB = _loadTransferDatabase(datasetParams["targetDatabase"], useRecursion)

    # Shuffle in place whatever findAllValidMappings returns, then keep at most
    # maxMappings of the candidates.
    allMappings = sourceDB.findAllValidMappings(targetDB)
    np.random.shuffle(allMappings)
    mappings = allMappings[:maxMappings]

    for model in MODELS:
        for modelParams in getNextModelParams(model = model, mappings = mappings):
            experimentDict = dict(commonFixedParams)
            experimentDict.update(datasetParams)
            experimentDict.update(modelParams)

            # The ID is computed before the "id" key is added to the dict.
            experimentDict["id"] = getExperimentID(experimentDict)
            experiments.append(experimentDict)

In [33]:
len(experiments)

4032

In [34]:
# Serialize the whole experiment list first, then write it out in one go.
serializedExperiments = json.dumps(experiments)
with open("experiments-transferCrossValidation.json", "w") as outputFile:
    outputFile.write(serializedExperiments)

# **Traditional Cross-Validation (no transfer)**

It consists of performing traditional k-fold cross validation on the target data. In other words, we consider learning from scratch with enough target data for learning.

In [27]:
# Rebinds EXPERIMENTS_BASE_PATH for the no-transfer section; it becomes the
# "path" entry of the commonFixedParams dict defined below.
EXPERIMENTS_BASE_PATH = "./experiments/noTransferCrossValidation"

In [28]:
def getExperimentID(experimentDict):
    """Return the identifier `getHashFromDict` computes for `experimentDict`.

    Same definition as the one in the transfer cross-validation section; it is
    repeated here so this section can be run on its own.
    """
    return getHashFromDict(experimentDict)

In [29]:
# Settings shared by every no-transfer experiment. Unpacked first when each
# experiment dict is assembled, so the per-dataset entries follow these keys.
# "path" picks up the EXPERIMENTS_BASE_PATH value bound for this section.
commonFixedParams = {
    "path": EXPERIMENTS_BASE_PATH,
    "randomSeed": randomSeed,
    "numberOfClauses": 8,
    "numberOfCycles": 100,
    "maxTreeDepth": 3,
    "nEstimators": 10,
    "nodeSize": 2,
    "negPosRatio": 2,
    "maxFailedNegSamplingRetries": 50,
    "ignoreSTDOUT": True,
}

In [30]:
# (dataset folder under DATA_PATH, useRecursion). "targetPredicate" stays None,
# so each dataset keeps the default target noted next to it.
_noTransferDatasets = [
    ("cora", False),            # Default: samevenue/2
    ("imdb", False),            # Default: workedunder/2
    ("yeast", True),            # Default: proteinclass/2
    ("twitter", True),          # Default: accounttype/2
    ("uwcse", False),           # Default: advisedby/2
    ("nell_sports", True),      # Default: teamplayssport/2
    ("nell_finances", True),    # Default: companyeconomicsector/2
]

datasetParams = [
    {
        "databasePath": f"{DATA_PATH}/{datasetName}",
        "targetPredicate": None,
        "resetTargetPredicate": False,
        "useRecursion": useRecursion
    } for datasetName, useRecursion in _noTransferDatasets
]

In [33]:
# One experiment per dataset: shared settings first, dataset entries on top.
experiments = []
for singleDatasetParams in datasetParams:
    experimentDict = dict(commonFixedParams)
    experimentDict.update(singleDatasetParams)

    # The ID is computed before the "id" key is added to the dict.
    experimentDict["id"] = getExperimentID(experimentDict)
    experiments.append(experimentDict)

In [18]:
len(experiments)

7

In [35]:
# Serialize the whole experiment list first, then write it out in one go.
serializedExperiments = json.dumps(experiments)
with open("experiments-noTransferCrossValidation.json", "w") as outputFile:
    outputFile.write(serializedExperiments)

# **Transfer with Noisy Source**

In this experiment, we perform transfer learning from a noisy source to a target domain. To control the noise intensity, we build both target and source sets from the same dataset. This allows us to bypass the challenge of finding a good mapping. Before combining the source and target data, we randomly add, remove or change the types of the relations on the source.

In [4]:
# Rebinds EXPERIMENTS_BASE_PATH for the noisy-source section; it becomes the
# "path" entry of the commonFixedParams dict defined below.
EXPERIMENTS_BASE_PATH = "./experiments/noisyTransferLearning"

In [5]:
def getExperimentID(experimentDict):
    """Return the identifier `getHashFromDict` computes for `experimentDict`.

    Same definition as in the previous sections; it is repeated here so this
    section can be run on its own.
    """
    return getHashFromDict(experimentDict)

In [6]:
# Settings shared by every noisy-transfer experiment. Unpacked first when each
# experiment dict is assembled, so dataset and grid entries follow these keys.
# Compared with the dicts of the previous sections, this one also carries
# "trainNSplits" and "trainSourceSplits".
# NOTE(review): presumably these control how the single dataset is split into
# source and target parts; confirm against the experiment runner.
commonFixedParams = {
    "path": EXPERIMENTS_BASE_PATH,
    "randomSeed": randomSeed,
    "numberOfClauses": 8,
    "numberOfCycles": 100,
    "maxTreeDepth": 3,
    "nEstimators": 10,
    "nodeSize": 2,
    "negPosRatio": 2,
    "maxFailedNegSamplingRetries": 50,
    "ignoreSTDOUT": True,
    "trainNSplits": 5,
    "trainSourceSplits": 4
}

In [27]:
# (dataset folder under DATA_PATH, useRecursion). "targetPredicate" stays None,
# so each dataset keeps the default target noted next to it.
_noisyTransferDatasets = [
    ("nell_finances", True),    # Default: companyeconomicsector/2
    ("yeast", True),            # Default: proteinclass/2
    ("nell_sports", True),      # Default: teamplayssport/2
    ("cora", False),            # Default: samevenue/2
    ("uwcse", False),           # Default: advisedby/2
    ("twitter", True),          # Default: accounttype/2
    ("imdb", False),            # Default: workedunder/2
]

datasetParams = [
    {
        "databasePath": f"{DATA_PATH}/{datasetName}",
        "targetPredicate": None,
        "resetTargetPredicate": False,
        "useRecursion": useRecursion
    } for datasetName, useRecursion in _noisyTransferDatasets
]

In [28]:
def getNextModelParams():
    """Yield every point of the noisy-transfer parameter grid, one dict at a time.

    The grid is (source alpha, target alpha) x utilityAlphaSetIter x weight x
    noiseStrength, iterated with the noise strength varying fastest. Each dict
    has the keys "sourceUtilityAlpha", "targetUtilityAlpha",
    "utilityAlphaSetIter", "weight" and "noiseStrength", in that order.

    NOTE: this takes no arguments and rebinds the name used by the transfer
    cross-validation section for a function with a different signature.
    """
    alphaGrid = [0, 0.3, 0.6, 1, 1.3]
    setIterations = [1]
    # A single weighting strategy; the same dict object is referenced by every grid point.
    scalarWeight = {
        "strategy": "scalar",
        "parameters": {
            "weight": 1
        }
    }
    # Fifteen strengths, doubling from 1e-5.
    noiseStrengths = [(1e-5)*(2**exponent) for exponent in range(0, 15)]

    for sourceAlpha, targetAlpha in itertools.product(alphaGrid, repeat=2):
        for iteration in setIterations:
            for strength in noiseStrengths:
                yield {
                    "sourceUtilityAlpha": sourceAlpha,
                    "targetUtilityAlpha": targetAlpha,
                    "utilityAlphaSetIter": iteration,
                    "weight": scalarWeight,
                    "noiseStrength": strength,
                }

In [29]:
# Every dataset is combined with every point of the noisy-transfer grid.
experiments = []
for singleDatasetParams in datasetParams:
    for gridPoint in getNextModelParams():
        experimentDict = dict(commonFixedParams)
        experimentDict.update(singleDatasetParams)
        experimentDict.update(gridPoint)

        # The ID is computed before the "id" key is added to the dict.
        experimentDict["id"] = getExperimentID(experimentDict)
        experiments.append(experimentDict)

In [22]:
len(experiments)

2625

In [31]:
# Serialize the whole experiment list first, then write it out in one go.
serializedExperiments = json.dumps(experiments)
with open("experiments-noisyTransferLearning.json", "w") as outputFile:
    outputFile.write(serializedExperiments)

# **Progressive Target Data Availability**

It evaluates how target data availability impacts the performance of our instance-based transfer learning model. We only consider the best settings for each pair of source and target domains, according to the results from the cross validation for transfer settings. In particular, we define the best setting based on the AUC PR.

In [6]:
# Rebinds EXPERIMENTS_BASE_PATH for the learning-curve section; the cells below
# write it into the "path" entry of every setting they load.
EXPERIMENTS_BASE_PATH = "./experiments/learningCurve"

In [7]:
def getExperimentID(experimentDict):
    """Return the identifier `getHashFromDict` computes for `experimentDict`.

    Same definition as in the previous sections; it is repeated here so this
    section can be run on its own.
    """
    return getHashFromDict(experimentDict)

In [18]:
# IDs of previously generated cross-validation experiments, one per target
# dataset (see the label on each line). The following cell reads
# ./experiments/crossValidation/<id>/setting.json for each of them, so these
# folders must already exist on disk.
learningFromScratchOriginalRDNBoostExpIDs = [
    "0726cac0b41d9aa866dbf57bd36b115ed1c50e11afedc0232f4809c47533219a", # NELL Sports
    "51d101f2e9cfac1aa4edb1f463accf3f6b5d41f573f52ae1e90a28db8ad7b022", # IMDB
    "6a9f022b5fc34609ec98a9898cafb1845f9e40acb61a13255e18e0fdd935f419", # Yeast
    "816f88e322ad9332974b410b99743c591cfb28291770844b23d5ab2e50cf7967", # Cora
    "b66e86ba8b85a4ad471123e9d4010515430a0c49c6fa23f7496e373ed78183cf", # Twitter
    "bf61e2b7f32e7599b7a4c2e147a01836132f2ca951dcf02c9642f97c877c9e1a", # Finances
    "c0b61053859b77b0901fecdeaad7db8405a209cd76add3a212044068dae41802", # UW-CSE
]

In [19]:
# Load the stored setting of each learning-from-scratch experiment and point its
# "path" at the learning-curve folder; all other entries are kept as stored.
learningFromScratchOriginalRDNBoostSettings = []
for scratchExpID in learningFromScratchOriginalRDNBoostExpIDs:
    settingFile = f"./experiments/crossValidation/{scratchExpID}/setting.json"
    with open(settingFile) as settingHandle:
        scratchSetting = json.load(settingHandle)
    scratchSetting["path"] = EXPERIMENTS_BASE_PATH
    learningFromScratchOriginalRDNBoostSettings.append(scratchSetting)

In [20]:
# IDs of the cross-validation experiments selected as best (by AUC PR) for each
# source -> target pair labelled on the right. The following cell reads
# ./experiments/crossValidation/<id>/setting.json for each of them.
# NOTE(review): seven pairs are listed; the labels include no "UWCSE to IMDB"
# entry although transferDatasetParams defines that pair. Confirm this is intended.
bestAUCPRCrossValidationExpIDs = [
    "bd52831309ada286d62e64cf24e949319f73c548fd60103f5c595d2a3b80e1c2", # IMDB to Cora
    "396a63c068864d171cb3eeb5c9340cea81e4e0ce00592bd492485edfba59c5ff", # Cora to IMDB
    "f3ae43b9215c2493380521cec736ff40fe7b9e98a9e093af156ec3ca880807cc", # IMDB to UW-CSE
    "6686ac04ddaf3e507af9c6b94b8f868e37c2d1a12fdccecaa8770b8892934d76", # Twitter to Yeast
    "b45ead65a1670ade43e08d3f429838d56e21dd723ebd05a0db46df3323738b3e", # Yeast to Twitter
    "9d42911e8115f7e1f34b8333583d28b0bef5ccc46bcff5a7ba664af957adef29", # NELL Sports to NELL Finances
    "54ab7a686188f35592556347bf5251921a8ade4adb0ab22b7bcc078c9800b684", # NELL Finances to NELL Sports
]

In [21]:
# Load the stored setting of each best-AUC-PR experiment and point its "path" at
# the learning-curve folder; all other entries are kept as stored.
bestAUCPRCrossValidationSettings = []
for bestExpID in bestAUCPRCrossValidationExpIDs:
    settingFile = f"./experiments/crossValidation/{bestExpID}/setting.json"
    with open(settingFile) as settingHandle:
        bestSetting = json.load(settingHandle)
    bestSetting["path"] = EXPERIMENTS_BASE_PATH
    bestAUCPRCrossValidationSettings.append(bestSetting)

In [22]:
# IDs of the cross-validation experiments selected as worst (by AUC PR) for each
# source -> target pair labelled on the right. The following cell reads
# ./experiments/crossValidation/<id>/setting.json for each of them.
# NOTE(review): seven pairs are listed; the labels include no "UWCSE to IMDB"
# entry although transferDatasetParams defines that pair. Confirm this is intended.
worstAUCPRCrossValidationExpIDs = [
    "8f1f2f42ac13df76c11fdd4fc2012f1a02d64609e64fa0258d1e34c50a28301a", # IMDB to Cora
    "51f933537100240bacb4f7a6341c4f77a9b233a4e493d5938c556dd990ddb4f1", # Cora to IMDB
    "ed84223eca8dea790d61b5ebed802d1bb652cf449866cd8d00740f4e6423c7fb", # IMDB to UW-CSE
    "1083b5b58bc7c476d69917dc5a42cc31fdbc6910b4a51102867b0c71be10da74", # Twitter to Yeast
    "82685ff50ecd1e70faeec5f45a5feb4702b3d939876d71d7f9de55684df757ed", # Yeast to Twitter
    "81cb6ed097c877693916a86800636138255b05b772fdcd024958faa184cd04e5", # NELL Sports to NELL Finances
    "a375d163f0e4c861940f9cbcceb0c5254eaa606a9d38144abce4a493d700af92", # NELL Finances to NELL Sports
]

In [23]:
# Load the stored setting of each worst-AUC-PR experiment and point its "path"
# at the learning-curve folder; all other entries are kept as stored.
worstAUCPRCrossValidationSettings = []
for worstExpID in worstAUCPRCrossValidationExpIDs:
    settingFile = f"./experiments/crossValidation/{worstExpID}/setting.json"
    with open(settingFile) as settingHandle:
        worstSetting = json.load(settingHandle)
    worstSetting["path"] = EXPERIMENTS_BASE_PATH
    worstAUCPRCrossValidationSettings.append(worstSetting)

In [24]:
# Learning-curve experiments: best settings, then worst settings, then the
# learning-from-scratch baselines.
experiments = [
    *bestAUCPRCrossValidationSettings,
    *worstAUCPRCrossValidationSettings,
    *learningFromScratchOriginalRDNBoostSettings,
]
len(experiments)

21

In [26]:
# Serialize the whole experiment list first, then write it out in one go.
serializedExperiments = json.dumps(experiments)
with open("experiments-learningCurve.json", "w") as outputFile:
    outputFile.write(serializedExperiments)