In [1]:
import pandas as pd
from pathlib import Path


def load_all_csvs(directory='.', pattern='*.csv', recursive=False, source_col='source_file'):
    p = Path(directory)
    files = sorted(p.rglob(pattern) if recursive else p.glob(pattern))
    dfs = []
    errors = {}
    for f in files:
        try:
            df = pd.read_csv(f, low_memory=False)
        except UnicodeDecodeError:
            try:
                df = pd.read_csv(f, encoding='latin1', low_memory=False)
            except Exception as e:
                errors[str(f)] = str(e)
                continue
        except Exception as e:
            errors[str(f)] = str(e)
            continue
        df[source_col] = str(f)  # full path
        dfs.append(df)
    if not dfs:
        combined = pd.DataFrame()
    else:
        combined = pd.concat(dfs, ignore_index=True, sort=False)
    return combined, files, errors

combined_df, csv_files, load_errors = load_all_csvs(directory='.', recursive=False)
print(f"Found {len(csv_files)} csv files, loaded {len(combined_df)} rows, errors: {len(load_errors)}")

Found 14 csv files, loaded 476 rows, errors: 0


In [2]:
combined_df.head()

Unnamed: 0,Predicate,Frequency,Uniqueness,Entropy,Entity Quality,Edge Rank,Score,is_good_predicate,source_file
0,<http://schema.org/ratingcount>,0.753135,0.20906,0.860268,0.692534,0.006998,12.588596,True,_http___schema.org_aggregaterating_.csv
1,<http://schema.org/ratingvalue>,0.992617,0.024826,0.422303,1.0,0.009195,12.497784,True,_http___schema.org_aggregaterating_.csv
2,<http://schema.org/reviewcount>,0.280096,0.085644,0.52642,0.325808,0.002848,12.487683,True,_http___schema.org_aggregaterating_.csv
3,<http://schema.org/itemreviewed>,0.083455,0.953862,1.0,0.053134,0.001302,12.487263,True,_http___schema.org_aggregaterating_.csv
4,<http://schema.org/bestrating>,0.7191,0.000861,0.132134,0.593892,0.007324,12.484704,True,_http___schema.org_aggregaterating_.csv


In [3]:
import torch.nn as nn

import torch
from torch.utils.data import TensorDataset, DataLoader

model = nn.Sequential(
    nn.Linear(5, 16),
    nn.Linear(16, 8),
    nn.Linear(8, 2),
    nn.Softmax(dim=1)
)


opt = torch.optim.Adam(model.parameters(), lr=1e-3)

features = ["Frequency", "Uniqueness", "Entropy", "Entity Quality", "Edge Rank"]

X = combined_df[features]
Y = combined_df["is_good_predicate"]
counts = Y.value_counts()
pos = int(counts.get(True, 0))
neg = int(counts.get(False, 0))

if pos == 0:
    loss_fn = nn.CrossEntropyLoss()
else:
    # inverse-frequency base weights
    total = pos + neg
    w0 = total / (2 * neg) if neg > 0 else 1.0
    w1 = total / (2 * pos)
    recall_bias = 2.0
    w1 *= recall_bias
    class_weights = torch.tensor([w0, w1], dtype=torch.float32)
    print(f"class counts: neg={neg}, pos={pos} -> class_weights={class_weights.tolist()}")
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(Y.astype(int).values, dtype=torch.long)

# 50/50 split to verify that we do not overfit
n_samples = len(X_tensor)
perm = torch.randperm(n_samples)
split = int(0.5 * n_samples)
train_idx, val_idx = perm[:split], perm[split:]

train_ds = TensorDataset(X_tensor[train_idx], y_tensor[train_idx])
val_ds = TensorDataset(X_tensor[val_idx], y_tensor[val_idx])

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False)

# training loop
epochs = 30
for epoch in range(1, epochs + 1):
    # training
    model.train()
    running_loss = 0.0
    running_correct = 0
    running_total = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)

        opt.zero_grad()
        logits = model[:-1](xb) 
        loss = loss_fn(logits, yb)
        loss.backward()
        opt.step()

        running_loss += loss.item() * xb.size(0)
        preds = logits.argmax(dim=1)
        running_correct += (preds == yb).sum().item()
        running_total += xb.size(0)

    train_loss = running_loss / running_total
    train_acc = running_correct / running_total

    # validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model[:-1](xb)
            loss = loss_fn(logits, yb)

            val_loss += loss.item() * xb.size(0)
            preds = logits.argmax(dim=1)
            val_correct += (preds == yb).sum().item()
            val_total += xb.size(0)

    val_loss /= val_total
    val_acc = val_correct / val_total


    print(f"Epoch {epoch:02d} | train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}")

class counts: neg=390, pos=86 -> class_weights=[0.6102564334869385, 5.534883499145508]
Epoch 01 | train_loss: 0.6534 train_acc: 0.5546 | val_loss: 0.6346 val_acc: 0.1807
Epoch 02 | train_loss: 0.6322 train_acc: 0.1807 | val_loss: 0.6094 val_acc: 0.1807
Epoch 03 | train_loss: 0.6101 train_acc: 0.1807 | val_loss: 0.5898 val_acc: 0.1807
Epoch 04 | train_loss: 0.5986 train_acc: 0.1807 | val_loss: 0.5725 val_acc: 0.1807
Epoch 05 | train_loss: 0.5767 train_acc: 0.1807 | val_loss: 0.5581 val_acc: 0.1807
Epoch 06 | train_loss: 0.5729 train_acc: 0.1807 | val_loss: 0.5457 val_acc: 0.1807
Epoch 07 | train_loss: 0.5568 train_acc: 0.1807 | val_loss: 0.5358 val_acc: 0.1807
Epoch 08 | train_loss: 0.5530 train_acc: 0.1807 | val_loss: 0.5259 val_acc: 0.1807
Epoch 09 | train_loss: 0.5375 train_acc: 0.1807 | val_loss: 0.5166 val_acc: 0.1807
Epoch 10 | train_loss: 0.5325 train_acc: 0.1807 | val_loss: 0.5072 val_acc: 0.1807
Epoch 11 | train_loss: 0.5370 train_acc: 0.1807 | val_loss: 0.4978 val_acc: 0.1807


In [None]:
dummy_input = torch.randn(1, 5)

torch.onnx.export(
    model,                      
    dummy_input,                    
    "model.onnx",               
    input_names = ['input'],       
    output_names = ['output'],     
    opset_version=11             
)