In [1]:
import sys

# Locations (relative to the notebook's working directory) that are added to
# the module search path so the `srlearn` and `utils` imports below resolve.
SRLEARN_PATH = "../../srlearn"
PROJECT_PATH = ".."

# Guarded append: re-running this cell must not stack duplicate entries on
# sys.path, so each location is only added when it is not already present.
for _searchPath in (SRLEARN_PATH, PROJECT_PATH):
    if _searchPath not in sys.path:
        sys.path.append(_searchPath)

from srlearn.database import Database

from utils.experiment import loadDatabase, getLogger

from pprint import pprint

In [2]:
# Base path for the datasets: every load cell in this notebook passes
# f"{DATA_PATH}/<dataset>" as the `path` argument of `loadDatabase`.
DATA_PATH = "./preprocessed"

In [3]:
# Shared logger handed to every `loadDatabase` call; its records show up in the
# cell outputs tagged with the name "Data Analysis".
logger = getLogger("Data Analysis")

In [6]:
def getStatistics(database: Database):
    """Print a summary of ``database`` followed by its predicate schema.

    The figures are read from the mapping returned by
    ``database.getStatistics()``. The report has two sections: the
    "Database Statistics" counters (constants, types, predicates, positive
    examples, facts, target relation) and the "Database Schema", with one
    ``predicate(type1,type2,...)`` line per entry of the ``"schema"`` mapping.

    Args:
        database: Object exposing ``getStatistics()``; the returned mapping
            must provide every key read below, and ``"schema"`` must map each
            predicate name to an iterable of strings.

    Returns:
        None. The report is written to standard output only.

    Raises:
        KeyError: If the statistics mapping lacks one of the expected keys.
    """
    stats = database.getStatistics()

    # (label, statistics key) pairs, in the order they appear in the report.
    summaryRows = (
        ("Number of Constants:", "totalConstants"),
        ("Number of Types:", "totalTermTypes"),
        ("Number of Predicates:", "totalPredicates"),
        ("Number of Positive Examples:", "totalPositiveExamples"),
        ("Number of Facts:", "totalGroundLiterals"),
        ("Target Relation:", "targetRelation"),
    )

    print("Database Statistics")
    print("===================\n")
    for label, key in summaryRows:
        print(label, stats[key])
    print()

    print("Database Schema")
    print("===============\n")
    # Build every signature first so the schema is emitted with a single print.
    signatures = []
    for predicate, argumentTypes in stats["schema"].items():
        signatures.append(f"{predicate}({','.join(argumentTypes)})")
    print("\n".join(signatures))

# **Data Statistics**

| Dataset | # Folds | # Constants | # Types | # Predicates | # Facts | # Pos Examples | Target Relation |
| - | - | - | - | - | - | - | - |
| IMDB | 5 | 297 | 3 | 6 | 696 | 382 | workedunder |
| Cora | 5 | 2457 | 5 | 10 | 38336 | 2640 | samevenue |
| UW-CSE | 5 | 914 | 9 | 14 | 2274 | 113 | advisedby |
| Twitter | 2 | 273 | 3 | 3 | 2312 | 221 | accounttype |
| Yeast | 4 | 2470 | 7 | 7 | 15015 | 369 | proteinclass |
| NELL Finances | 3 | 3340 | 5 | 10 | 4579 | 762 | companyeconomicsector |
| NELL Sports | 3 | 4538 | 4 | 8 | 9236 | 392 | teamplayssport |

## **IMDB**

In [8]:
# Load the IMDB dataset. With `folds=None` the loader reads every available
# fold, and with `targetPredicate=None` it falls back to its default target
# relation (both are reported by the loader's own warnings).
imdb = loadDatabase(
    path=f"{DATA_PATH}/imdb",
    folds=None,
    useRecursion=False,
    targetPredicate=None,
    resetTargetPredicate=False,
    negPosRatio=1,
    maxFailedNegSamplingRetries=50,
    logger=logger,
)

  logger.warn("All available database folds will be loaded. Set `folds` if it is desired to load only a few fold.")


2024-05-14 19:15:37,593 - Data Analysis - DEBUG - 1/5 folds loaded with success.
2024-05-14 19:15:37,677 - Data Analysis - DEBUG - 2/5 folds loaded with success.
2024-05-14 19:15:37,772 - Data Analysis - DEBUG - 3/5 folds loaded with success.
2024-05-14 19:15:37,871 - Data Analysis - DEBUG - 4/5 folds loaded with success.
2024-05-14 19:15:37,938 - Data Analysis - DEBUG - 5/5 folds loaded with success.
  logger.warn(f"No target predicate was given. It will be set as `{defaultTargetPredicate}` by default. Set `targetPredicate` if it is desired to set another relation as the target relation.")


In [9]:
# Dataset banner first, then the statistics summary and schema for IMDB.
print("Database: IMDB\n")
getStatistics(imdb)

Database: IMDB

Database Statistics

Number of Constants: 297
Number of Types: 3
Number of Predicates: 6
Number of Positive Examples: 382
Number of Facts: 696
Target Relation: workedunder

Database Schema

workedunder(person,person)
female(person)
actor(person)
director(person)
movie(movie,person)
genre(person,genre)


## **Cora**

In [22]:
# Load the Cora dataset. With `folds=None` the loader reads every available
# fold, and with `targetPredicate=None` it falls back to its default target
# relation (both are reported by the loader's own warnings).
cora = loadDatabase(
    path=f"{DATA_PATH}/cora",
    folds=None,
    useRecursion=False,
    targetPredicate=None,
    resetTargetPredicate=False,
    negPosRatio=1,
    maxFailedNegSamplingRetries=50,
    logger=logger,
)

2024-05-14 19:18:56,207 - Data Analysis - DEBUG - 1/5 folds loaded with success.
2024-05-14 19:18:56,316 - Data Analysis - DEBUG - 2/5 folds loaded with success.
2024-05-14 19:18:56,434 - Data Analysis - DEBUG - 3/5 folds loaded with success.
2024-05-14 19:18:56,584 - Data Analysis - DEBUG - 4/5 folds loaded with success.
2024-05-14 19:18:56,753 - Data Analysis - DEBUG - 5/5 folds loaded with success.


In [11]:
# Dataset banner first, then the statistics summary and schema for Cora.
print("Database: Cora\n")
getStatistics(cora)

Database: Cora

Database Statistics

Number of Constants: 2457
Number of Types: 5
Number of Predicates: 10
Number of Positive Examples: 2640
Number of Facts: 38336
Target Relation: samevenue

Database Schema

sameauthor(author,author)
samebib(class,class)
sametitle(title,title)
samevenue(venue,venue)
author(class,author)
title(class,title)
venue(class,venue)
haswordauthor(author,word)
haswordtitle(title,word)
haswordvenue(venue,word)


## **UW-CSE**

In [12]:
# Load the UW-CSE dataset. With `folds=None` the loader reads every available
# fold, and with `targetPredicate=None` it falls back to its default target
# relation (both are reported by the loader's own warnings).
uwcse = loadDatabase(
    path=f"{DATA_PATH}/uwcse",
    folds=None,
    useRecursion=False,
    targetPredicate=None,
    resetTargetPredicate=False,
    negPosRatio=1,
    maxFailedNegSamplingRetries=50,
    logger=logger,
)

2024-05-14 19:17:16,328 - Data Analysis - DEBUG - 1/5 folds loaded with success.
2024-05-14 19:17:16,461 - Data Analysis - DEBUG - 2/5 folds loaded with success.
2024-05-14 19:17:16,585 - Data Analysis - DEBUG - 3/5 folds loaded with success.
2024-05-14 19:17:16,729 - Data Analysis - DEBUG - 4/5 folds loaded with success.
2024-05-14 19:17:16,865 - Data Analysis - DEBUG - 5/5 folds loaded with success.


In [13]:
# Dataset banner first, then the statistics summary and schema for UW-CSE.
print("Database: UW-CSE\n")
getStatistics(uwcse)

Database: UW-CSE

Database Statistics

Number of Constants: 914
Number of Types: 9
Number of Predicates: 14
Number of Positive Examples: 113
Number of Facts: 2274
Target Relation: advisedby

Database Schema

professor(person)
student(person)
advisedby(person,person)
tempadvisedby(person,person)
ta(course,person,quarter)
hasposition(person,faculty)
publication(title,person)
inphase(person,prequals)
courselevel(course,level)
yearsinprogram(person,year)
projectmember(project,person)
sameproject(project,project)
samecourse(course,course)
sameperson(person,person)


## **Twitter**

In [14]:
# Load the Twitter dataset. With `folds=None` the loader reads every available
# fold, and with `targetPredicate=None` it falls back to its default target
# relation (both are reported by the loader's own warnings).
twitter = loadDatabase(
    path=f"{DATA_PATH}/twitter",
    folds=None,
    useRecursion=False,
    targetPredicate=None,
    resetTargetPredicate=False,
    negPosRatio=1,
    maxFailedNegSamplingRetries=50,
    logger=logger,
)

2024-05-14 19:17:27,756 - Data Analysis - DEBUG - 1/2 folds loaded with success.
2024-05-14 19:17:27,855 - Data Analysis - DEBUG - 2/2 folds loaded with success.


In [15]:
# Dataset banner first, then the statistics summary and schema for Twitter.
print("Database: Twitter\n")
getStatistics(twitter)

Database: Twitter

Database Statistics

Number of Constants: 273
Number of Types: 3
Number of Predicates: 3
Number of Positive Examples: 221
Number of Facts: 2312
Target Relation: accounttype

Database Schema

accounttype(account,type)
tweets(account,word)
follows(account,account)


## **Yeast**

In [16]:
# Load the Yeast dataset. With `folds=None` the loader reads every available
# fold, and with `targetPredicate=None` it falls back to its default target
# relation (both are reported by the loader's own warnings).
yeast = loadDatabase(
    path=f"{DATA_PATH}/yeast",
    folds=None,
    useRecursion=False,
    targetPredicate=None,
    resetTargetPredicate=False,
    negPosRatio=1,
    maxFailedNegSamplingRetries=50,
    logger=logger,
)

2024-05-14 19:17:37,708 - Data Analysis - DEBUG - 1/4 folds loaded with success.
2024-05-14 19:17:37,828 - Data Analysis - DEBUG - 2/4 folds loaded with success.
2024-05-14 19:17:37,958 - Data Analysis - DEBUG - 3/4 folds loaded with success.
2024-05-14 19:17:38,093 - Data Analysis - DEBUG - 4/4 folds loaded with success.


In [17]:
# Dataset banner first, then the statistics summary and schema for Yeast.
print("Database: Yeast\n")
getStatistics(yeast)

Database: Yeast

Database Statistics

Number of Constants: 2470
Number of Types: 7
Number of Predicates: 7
Number of Positive Examples: 369
Number of Facts: 15015
Target Relation: proteinclass

Database Schema

location(protein,loc)
interaction(protein,protein)
proteinclass(protein,class)
enzyme(protein,enz)
function(protein,fun)
complex(protein,com)
phenotype(protein,phe)


## **NELL Finances**

In [18]:
# Load the NELL Finances dataset. With `folds=None` the loader reads every
# available fold, and with `targetPredicate=None` it falls back to its default
# target relation (both are reported by the loader's own warnings).
nellFinances = loadDatabase(
    path=f"{DATA_PATH}/nell_finances",
    folds=None,
    useRecursion=False,
    targetPredicate=None,
    resetTargetPredicate=False,
    negPosRatio=1,
    maxFailedNegSamplingRetries=50,
    logger=logger,
)

2024-05-14 19:17:51,490 - Data Analysis - DEBUG - 1/3 folds loaded with success.
2024-05-14 19:17:51,583 - Data Analysis - DEBUG - 2/3 folds loaded with success.
2024-05-14 19:17:51,679 - Data Analysis - DEBUG - 3/3 folds loaded with success.


In [19]:
# Dataset banner first, then the statistics summary and schema for NELL Finances.
print("Database: NELL Finances\n")
getStatistics(nellFinances)

Database: NELL Finances

Database Statistics

Number of Constants: 3340
Number of Types: 5
Number of Predicates: 10
Number of Positive Examples: 762
Number of Facts: 4579
Target Relation: companyeconomicsector

Database Schema

countryhascompanyoffice(country,company)
companyeconomicsector(company,sector)
economicsectorcompany(sector,company)
companyceo(company,person)
companyalsoknownas(company,company)
cityhascompanyoffice(city,company)
acquired(company,company)
bankbankincountry(person,country)
bankboughtbank(company,company)
bankchiefexecutiveceo(company,person)


## **NELL Sports**

In [20]:
# Load the NELL Sports dataset. With `folds=None` the loader reads every
# available fold, and with `targetPredicate=None` it falls back to its default
# target relation (both are reported by the loader's own warnings).
nellSports = loadDatabase(
    path=f"{DATA_PATH}/nell_sports",
    folds=None,
    useRecursion=False,
    targetPredicate=None,
    resetTargetPredicate=False,
    negPosRatio=1,
    maxFailedNegSamplingRetries=50,
    logger=logger,
)

2024-05-14 19:18:00,759 - Data Analysis - DEBUG - 1/3 folds loaded with success.
2024-05-14 19:18:00,940 - Data Analysis - DEBUG - 2/3 folds loaded with success.
2024-05-14 19:18:01,123 - Data Analysis - DEBUG - 3/3 folds loaded with success.


In [21]:
# Dataset banner first, then the statistics summary and schema for NELL Sports.
print("Database: NELL Sports\n")
getStatistics(nellSports)

Database: NELL Sports

Database Statistics

Number of Constants: 4538
Number of Types: 4
Number of Predicates: 8
Number of Positive Examples: 392
Number of Facts: 9236
Target Relation: teamplayssport

Database Schema

athleteledsportsteam(athlete,sportsteam)
athleteplaysforteam(athlete,sportsteam)
athleteplaysinleague(athlete,sportsleague)
athleteplayssport(athlete,sport)
teamalsoknownas(sportsteam,sportsteam)
teamplaysagainstteam(sportsteam,sportsteam)
teamplaysinleague(sportsteam,sportsleague)
teamplayssport(sportsteam,sport)
