# HOW TO RUN THIS FILE
1. Edit the `ROOT_PATH` variable so that it points to the eva shared folder inside your Google Drive.
2. Make sure that the runtime type is set to GPU. The code uses CUDA by default; it can be switched to CPU-only mode, but that is about 10x slower.

In [1]:
!nvidia-smi

Sun Dec  5 22:03:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    40W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import sys
# EDIT THIS VARIABLE:
# Path of the shared "eva" folder inside the mounted Google Drive.
# The commented-out line below is an alternative location kept for reference.
#ROOT_PATH = "/content/drive/MyDrive/CS245 Project Shared/eva"
ROOT_PATH = "/content/drive/MyDrive/CS245 Project/CS245 Project Shared/eva"
# "src" sub-folder of the project.
SRC_PATH = os.path.join(ROOT_PATH, "src")
# Make modules stored in SRC_PATH importable (presumably where `run_dbp15k` lives -- confirm).
sys.path.append(SRC_PATH)

# Copy from GCP bucket

In [None]:
from google.colab import auth
# Authenticate this Colab session so the gcloud/gsutil commands below can run.
auth.authenticate_user()
# GCP project selected for the gcloud configuration.
project_id = "personal-243018"
!gcloud config set project {project_id}
# List the buckets visible to the authenticated account (sanity check).
!gsutil ls

Updated property [core/project].


To take a quick anonymous survey, run:
  $ gcloud survey

gs://benl_eva_bucket/


In [None]:
bucket_name = "benl_eva_bucket"
#!gsutil -m cp -r "/content/drive/MyDrive/CS245 Project Shared/eva/data/en(fr)_dbp15k_link_img_dict_full.pkl" gs://{bucket_name}/
for i in range(15):
    !gsutil -m cp -r gs://{bucket_name}/{i}.pkl.gz "/content/drive/MyDrive/CS245 Project/CS245 Project Shared/eva/data/img_dict_pkls/"

Copying gs://benl_eva_bucket/0.pkl.gz...
\ [1/1 files][399.1 MiB/399.1 MiB] 100% Done                                    
Operation completed over 1 objects/399.1 MiB.                                    
Copying gs://benl_eva_bucket/1.pkl.gz...
- [1/1 files][646.4 MiB/646.4 MiB] 100% Done  50.7 MiB/s ETA 00:00:00           
Operation completed over 1 objects/646.4 MiB.                                    
Copying gs://benl_eva_bucket/2.pkl.gz...
\ [1/1 files][762.2 MiB/762.2 MiB] 100% Done  52.0 MiB/s ETA 00:00:00           
Operation completed over 1 objects/762.2 MiB.                                    
Copying gs://benl_eva_bucket/3.pkl.gz...
- [1/1 files][852.8 MiB/852.8 MiB] 100% Done  55.4 MiB/s ETA 00:00:00           
Operation completed over 1 objects/852.8 MiB.                                    
Copying gs://benl_eva_bucket/4.pkl.gz...
- [1/1 files][615.2 MiB/615.2 MiB] 100% Done  47.9 MiB/s ETA 00:00:00           
Operation completed over 1 objects/615.2 MiB.                 

# EVA Code

In [4]:
from run_dbp15k import *
def run_eva(emb_sizes=(100,100,200), gcn_emb_size=200, verbose=True, save_emb=False):
    """Build the argument namespace for the fr_en DBP15K data and call ``main``.

    The values in the pre-populated namespace below take precedence; the
    parser only fills in defaults for options that the namespace does not
    already define (``parse_args`` is called with an empty argument list).

    Args:
        emb_sizes: forwarded unchanged to ``main`` as ``emb_sizes``.
        gcn_emb_size: used as the last entry of the ``hidden_units`` string
            (``"400,400,<gcn_emb_size>"``).
        verbose: forwarded unchanged to ``main``.
        save_emb: forwarded unchanged to ``main``.

    Returns:
        Whatever ``main`` returns. The result used to be discarded, which made
        ``results = run_eva()`` always ``None``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_dir", type=str, default="data/DBP15K/zh_en", required=False, help="input dataset file directory, ('data/DBP15K/zh_en', 'data/DWY100K/dbp_wd')")
    parser.add_argument("--rate", type=float, default=0.3, help="training set rate")

    parser.add_argument("--cuda", action="store_true", default=True, help="whether to use cuda or not")
    parser.add_argument("--seed", type=int, default=2021, help="random seed")
    parser.add_argument("--epochs", type=int, default=1000, help="number of epochs to train")
    parser.add_argument("--check_point", type=int, default=100, help="check point")
    parser.add_argument("--hidden_units", type=str, default="128,128,128", help="hidden units in each hidden layer(including in_dim and out_dim), splitted with comma")
    parser.add_argument("--heads", type=str, default="2,2", help="heads in each gat layer, splitted with comma")
    parser.add_argument("--instance_normalization", action="store_true", default=False, help="enable instance normalization")
    parser.add_argument("--lr", type=float, default=0.005, help="initial learning rate")
    parser.add_argument("--weight_decay", type=float, default=0, help="weight decay (L2 loss on parameters)")
    parser.add_argument("--dropout", type=float, default=0.0, help="dropout rate for layers")
    parser.add_argument("--attn_dropout", type=float, default=0.0, help="dropout rate for gat layers")
    parser.add_argument("--dist", type=int, default=2, help="L1 distance or L2 distance. ('1', '2')")
    parser.add_argument("--csls", action="store_true", default=False, help="use CSLS for inference")
    parser.add_argument("--csls_k", type=int, default=10, help="top k for csls")
    parser.add_argument("--il", action="store_true", default=False, help="Iterative learning?")
    parser.add_argument("--semi_learn_step", type=int, default=10, help="If IL, what's the update step?")
    parser.add_argument("--il_start", type=int, default=500, help="If Il, when to start?")
    parser.add_argument("--bsize", type=int, default=7500, help="batch size")
    parser.add_argument("--unsup", action="store_true", default=False)
    parser.add_argument("--unsup_k", type=int, default=1000, help="|visual seed|")
    #parser.add_argument("--long_tail_analysis", action="store_true", default=False)
    parser.add_argument("--lta_split", type=int, default=0, help="split in {0,1,2,3,|splits|-1}")
    # Notebook-specific settings; these win over the parser defaults above.
    args = argparse.Namespace(
        file_dir=os.path.join(ROOT_PATH, "data/DBP15K/fr_en"),
        rate=0.3,
        lr=0.0005,
        epochs=50, #1000 originally
        hidden_units=f"400,400,{gcn_emb_size}",
        check_point=50,
        bsize=7500,
        il=False, # True originally
        il_start=500,
        semi_learn_step=5,
        csls=True,
        csls_k=3,
        seed=0,
    )
    # args=[] keeps argparse from reading the notebook kernel's own sys.argv.
    args = parser.parse_args(args=[], namespace=args)
    return main(
        args,
        emb_sizes=emb_sizes,
        save_emb=save_emb,
        root_path=ROOT_PATH,
        verbose=verbose
    )

In [None]:
# default run:
results = run_eva()

# EfficientNet Encodings

First we investigate what format the image features need to be in.

In [5]:
import pickle

In [None]:
# load image dictionary
# Path of the pickled image-feature dictionary shipped with the project data.
ori_img_features_path = os.path.join(ROOT_PATH,"data/pkls/fr_en_GA_id_img_feature_dict.pkl")
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open(ori_img_features_path, 'rb') as f:
    img_dict = pickle.load(f)
# Show one (key, value) pair and the number of entries to learn the format.
print("example of an item:")
print(list(img_dict.items())[0])
print("number of embeddings:")
print(len(img_dict.keys()))

example of an item:
(7656, array([1.0869268 , 0.12827268, 0.02897369, ..., 0.3845102 , 0.0988225 ,
       0.15103804], dtype=float32))
number of embeddings:
28032


It appears to be a dictionary with keys being the entity id and the value being the image feature vector.

We can recover the mapping between the IDs and the link names like so:

In [6]:
# Directory of the fr_en DBP15K data under ROOT_PATH.
FILE_DIR = os.path.join(ROOT_PATH, "data/DBP15K/fr_en")
# read_raw_data returns six values; only the first one is kept here.
# NOTE(review): the meaning of the [1,2] argument is defined by read_raw_data -- confirm.
ent2id_dict, _, _, _, _, _ = read_raw_data(FILE_DIR, [1,2])
# Display one entry of the mapping as a sanity check.
list(ent2id_dict.items())[0]

loading raw data...


('http://fr.dbpedia.org/resource/Saint-Joseph-de-Coleraine', 0)

In [7]:
import gzip
import time

Now we make a list of the English and French labels we have:

In [None]:
# check if we already extracted the labels
en_labels_path = os.path.join(ROOT_PATH, "data", "en_labels.pkl")
fr_labels_path = os.path.join(ROOT_PATH, "data", "fr_labels.pkl")
img_dict_path = os.path.join(ROOT_PATH, "data", "img_dict_pkls")
def extract_labels(path):
    """Gather the keys of every gzip-compressed pickle found directly in ``path``.

    Each directory entry is opened with ``gzip``, unpickled, and its keys are
    appended to the result in the order the entries are scanned. A progress
    line is printed per file and a total at the end.

    NOTE: ``pickle.load`` can execute arbitrary code; use on trusted files only.

    Args:
        path: directory whose entries are all gzip-compressed pickles.

    Returns:
        A list with the keys of all unpickled objects.
    """
    collected = []
    for entry in os.scandir(path):
        started = time.time()
        with gzip.open(entry.path) as shard_file:
            shard = pickle.load(shard_file)
            collected.extend(shard.keys())
            # Drop the shard right away; only the keys are needed.
            del shard
        elapsed = time.time() - started
        print(f"Loaded {entry.name} in {elapsed:.2f}s, num_labels = {len(collected)}")
    print(f"{len(collected)} labels loaded")
    return collected

def _load_or_extract_labels(language, cache_path, img_subdir):
    """Return the labels cached at ``cache_path``, or extract them when the cache is absent.

    ``language`` is only used in the progress message; ``img_subdir`` is the
    folder below ``img_dict_path`` that ``extract_labels`` scans on a cache miss.
    """
    if not os.path.isfile(cache_path):
        print(f"{language} labels not found; extracting...")
        return extract_labels(os.path.join(img_dict_path, img_subdir))
    print(f"{language} labels found; loading...")
    with open(cache_path, "rb") as cache_file:
        return pickle.load(cache_file)

en_labels = _load_or_extract_labels("English", en_labels_path, "en_fr_english_imgs")
fr_labels = _load_or_extract_labels("French", fr_labels_path, "en_fr_french_imgs")

English labels not found; extracting...
Loaded 0.pkl.gz in 11.31s, num_labels = 1000
Loaded 1.pkl.gz in 13.31s, num_labels = 2000
Loaded 2.pkl.gz in 15.81s, num_labels = 3000
Loaded 3.pkl.gz in 17.40s, num_labels = 4000
Loaded 4.pkl.gz in 12.71s, num_labels = 5000
Loaded 5.pkl.gz in 19.89s, num_labels = 6000
Loaded 6.pkl.gz in 13.97s, num_labels = 7000
Loaded 7.pkl.gz in 19.46s, num_labels = 8000
Loaded 8.pkl.gz in 18.08s, num_labels = 9000
Loaded 9.pkl.gz in 16.57s, num_labels = 10000
Loaded 10.pkl.gz in 20.86s, num_labels = 11000
Loaded 11.pkl.gz in 13.44s, num_labels = 12000
Loaded 12.pkl.gz in 12.08s, num_labels = 13000
Loaded 13.pkl.gz in 11.69s, num_labels = 14000
Loaded 14.pkl.gz in 3.02s, num_labels = 14174
14174 labels loaded
French labels not found; extracting...
Loaded 0.pkl.gz in 9.29s, num_labels = 1000
Loaded 1.pkl.gz in 13.53s, num_labels = 2000
Loaded 10.pkl.gz in 8.03s, num_labels = 3000
Loaded 11.pkl.gz in 4.05s, num_labels = 4000
Loaded 12.pkl.gz in 3.92s, num_labels

In [None]:
# Cache both label lists so later runs can load them instead of re-extracting.
with open(en_labels_path, "wb") as f:
    pickle.dump(en_labels, f)
with open(fr_labels_path, "wb") as f:
    pickle.dump(fr_labels, f)

In [None]:
print(len(en_labels), len(fr_labels), len(en_labels) + len(fr_labels))
print(en_labels[0])
print(fr_labels[0])

14174 13858 28032
http://dbpedia.org/resource/United_States
http://fr.dbpedia.org/resource/États-Unis


Now we translate these labels into entity IDs:

In [None]:
# Map every label to its entity id, counting English and French labels on the way.
labels = en_labels + fr_labels
enfr_ids = []
num_en_labels, num_fr_labels = 0, 0
for label in labels:
    if label in ent2id_dict:
        # English URIs contain "//dbpedia", French ones "//fr.dbpedia".
        if "//dbpedia" in label:
            num_en_labels += 1
        elif "//fr.dbpedia" in label:
            num_fr_labels += 1
        enfr_ids.append(ent2id_dict[label])
    else:
        # Unknown labels are reported and skipped.
        print(f"Could not find {label}")
print(f"en labels: {num_en_labels}; fr labels: {num_fr_labels}")
print(f"total labels: {len(enfr_ids)}")

en labels: 14174; fr labels: 13858
total labels: 28032


## Testing EffNet Pretrained
Let's load one of the images to see how we can run inference on it.

In [None]:
# Load the first shard of the image dictionary.
filepath = os.path.join(ROOT_PATH, "data", "img_dict_pkls", "0.pkl.gz")
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with gzip.open(filepath) as f:
    data = pickle.load(f)
# Pick the image stored under the first label.
img = data[labels[0]]

In [None]:
transforms.ToTensor()(img.convert("RGB")).shape

torch.Size([3, 158, 300])

In [None]:
import PIL
from torchvision import transforms, models, datasets

In [None]:
efficientnet_b0 = models.efficientnet_b0(pretrained=True)

Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-3dd342df.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-3dd342df.pth


  0%|          | 0.00/20.5M [00:00<?, ?B/s]

In [None]:
# Per-channel normalisation applied after the image is converted to a tensor.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
# Resize, centre-crop to 224x224, convert to tensor, then normalise.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])
# Preprocess the first five images, each with a leading batch dimension of 1.
sample_tensors = [
    torch.unsqueeze(preprocess(data[labels[idx]].convert("RGB")), 0)
    for idx in range(5)
]
img_t = sample_tensors[-1]
# Join the single-image tensors along the batch dimension.
batch_t = torch.cat(sample_tensors)
print(img_t.shape)
print(batch_t.shape)

torch.Size([1, 3, 224, 224])
torch.Size([5, 3, 224, 224])


In [None]:
# Filled in by the forward hook below each time the model runs.
imgfeatures = None
def featurehook(model, input, output):
    """Forward hook: keep the detached output of the hooked module in the global ``imgfeatures``."""
    global imgfeatures
    imgfeatures = output.detach()
# Attach the hook to the model's `avgpool` module; the returned handle is displayed as the cell output.
efficientnet_b0.avgpool.register_forward_hook(featurehook)

<torch.utils.hooks.RemovableHandle at 0x7f59e3acb5d0>

In [None]:
# Switch the model to evaluation mode before the forward pass.
efficientnet_b0.eval()
output = efficientnet_b0(batch_t)
# Shape of the model output for the batch.
print(output.shape)
# Shape of the hooked avgpool output once everything after the batch dimension is flattened.
print(torch.flatten(imgfeatures, 1).shape)

torch.Size([5, 1000])
torch.Size([5, 1280])


## Extracting image features for EffNet

In [8]:
import PIL
from torchvision import transforms, models, datasets

In [15]:
# Per-channel normalisation applied after the image is converted to a tensor.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225])
# Default preprocessing used by extract_effnet_features: resize, centre-crop to 224,
# convert to tensor, normalise.
preprocess = transforms.Compose(
    [transforms.Resize(256),
     transforms.CenterCrop(224),
     transforms.ToTensor(),
     normalize])
def extract_effnet_features(model, 
                            ent2id_dict, 
                            preprocess=preprocess, 
                            batch_size=185, 
                            use_cuda=True,
                            verbose=True):
    """Run ``model`` over every image shard and collect its ``avgpool`` outputs.

    The gzip-pickled shards below ``ROOT_PATH/data/img_dict_pkls`` are read
    (English folder first, then French, in directory-scan order). Every image
    is preprocessed and batched; the flattened output of ``model.avgpool`` is
    captured with a forward hook and returned as one NumPy array.

    Changes compared with the previous version:
      * the hooked output is kept in a closure-local dict instead of the
        module-level global ``imgfeatures``, which the old code overwrote;
      * the forward hook is removed on exit, so repeated calls on the same
        model no longer stack hooks;
      * inference runs under ``torch.no_grad()``;
      * batches and feature chunks are gathered in lists and joined once
        instead of being re-concatenated on every step.

    NOTE: ``pickle.load`` can execute arbitrary code; the shards must be trusted.

    Args:
        model: network exposing an ``avgpool`` sub-module. The caller must
            already have moved it to the GPU when ``use_cuda`` is true.
        ent2id_dict: mapping from image label to entity id.
        preprocess: callable turning an RGB image into a tensor.
        batch_size: number of images per forward pass. Batches never span two
            shard files; a shard's remainder is run as a smaller batch.
        use_cuda: move each batch to the GPU before the forward pass.
        verbose: print per-batch timings and feature shapes.

    Returns:
        ``(id_list, all_img_features)``: the entity ids in processing order and
        the matching feature rows (``None`` when no image was processed).

    Raises:
        KeyError: if a shard contains a label that is not in ``ent2id_dict``.
    """
    captured = {}

    def featurehook(module, inputs, output):
        # Keep the pooled activations of the latest forward pass.
        captured["features"] = output.detach()

    def run_model(batch_t):
        """Forward one batch and return the flattened hooked features as NumPy."""
        tic = time.time()
        if use_cuda:
            torch.cuda.empty_cache()
            batch_t = batch_t.cuda()
        # No gradients are needed for feature extraction.
        with torch.no_grad():
            model(batch_t)
        features = torch.flatten(captured["features"], 1)
        if verbose:
            print(f"Inference in {time.time()-tic:.2f}s")
            print(features.shape)
        return features.cpu().numpy()

    en_dict_path = os.path.join(ROOT_PATH, "data", "img_dict_pkls", "en_fr_english_imgs")
    fr_dict_path = os.path.join(ROOT_PATH, "data", "img_dict_pkls", "en_fr_french_imgs")
    id_list = []
    feature_chunks = []

    hook_handle = model.avgpool.register_forward_hook(featurehook)
    model.eval()
    try:
        for dirent in list(os.scandir(en_dict_path)) + list(os.scandir(fr_dict_path)):
            tic = time.time()
            with gzip.open(dirent.path) as f:
                data = pickle.load(f)
            language = "english" if "english" in dirent.path else "french"
            print(f"Loaded {language}/{dirent.name} in {time.time()-tic:.2f}s")

            tic = time.time()
            pending = []
            for label, img in data.items():
                id_list.append(ent2id_dict[label])
                pending.append(preprocess(img.convert("RGB")))
                if len(pending) == batch_size:
                    batch_t = torch.stack(pending)
                    if verbose:
                        print(f"Prepared batch ({batch_size}) in {time.time()-tic:.2f}s")
                    feature_chunks.append(run_model(batch_t))
                    pending = []
                    tic = time.time()
            # run on remainder
            if pending:
                feature_chunks.append(run_model(torch.stack(pending)))
    finally:
        # Always detach the hook so the model is left as it was found.
        hook_handle.remove()

    all_img_features = np.concatenate(feature_chunks) if feature_chunks else None
    return id_list, all_img_features

In [19]:
efficientnet_b0 = models.efficientnet_b0(pretrained=True)
#efficientnet_b0.cuda()

Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-3dd342df.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-3dd342df.pth


  0%|          | 0.00/20.5M [00:00<?, ?B/s]

In [21]:
id_list, all_img_features = extract_effnet_features(efficientnet_b0, ent2id_dict, use_cuda=False)

Loaded english/0.pkl.gz in 8.12s


  "Palette images with Transparency expressed in bytes should be "


Prepared batch (185) in 2.33s
Inference in 20.57s
torch.Size([185, 1280])
Prepared batch (185) in 2.77s
Inference in 8.78s
torch.Size([185, 1280])
Prepared batch (185) in 2.34s
Inference in 8.70s
torch.Size([185, 1280])
Prepared batch (185) in 2.45s
Inference in 8.76s
torch.Size([185, 1280])
Prepared batch (185) in 2.50s
Inference in 8.60s
torch.Size([185, 1280])
Inference in 4.06s
torch.Size([75, 1280])
Loaded english/1.pkl.gz in 14.13s
Prepared batch (185) in 2.47s
Inference in 8.83s
torch.Size([185, 1280])
Prepared batch (185) in 2.71s
Inference in 8.14s
torch.Size([185, 1280])
Prepared batch (185) in 2.62s
Inference in 8.15s
torch.Size([185, 1280])
Prepared batch (185) in 2.76s
Inference in 8.12s
torch.Size([185, 1280])
Prepared batch (185) in 2.72s
Inference in 8.08s
torch.Size([185, 1280])
Inference in 3.40s
torch.Size([75, 1280])
Loaded english/2.pkl.gz in 15.18s
Prepared batch (185) in 2.46s
Inference in 8.52s
torch.Size([185, 1280])
Prepared batch (185) in 2.55s
Inference in 8

In [23]:
print(len(id_list))
print(all_img_features.shape)

28032
(28032, 1280)


In [24]:
all_img_features[0:2]

array([[ 0.5572252 , -0.2117167 , -0.1850568 , ..., -0.15802604,
        -0.20168622,  0.23016961],
       [-0.00828076, -0.08807116,  0.0238584 , ..., -0.08425538,
        -0.10938895, -0.10650282]], dtype=float32)

In [25]:
np.save(os.path.join(ROOT_PATH, "data", "effnet_b0_imgfeatures.npy"), all_img_features)
# Also persist the entity ids that belong to the feature rows (the B7 run below
# saves its ids the same way). Without them the rows cannot be mapped back to
# entities after a kernel restart, because the row order follows the directory scan.
with open(os.path.join(ROOT_PATH, "data", "effnet_b0_img_ids.pkl"), "wb") as f:
    pickle.dump(id_list, f)

Now we try EfficientNet-B7:

In [16]:
# the input size is different, so we need to change the preprocessing step
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225])
# Same pipeline shape as `preprocess`, but resizing to 633 and centre-cropping to 600.
preprocess_b7 = transforms.Compose(
    [transforms.Resize(633),
     transforms.CenterCrop(600),
     transforms.ToTensor(),
     normalize])
efficientnet_b7 = models.efficientnet_b7(pretrained=True)
# The model is moved to the GPU here; extract_effnet_features only moves the batches.
efficientnet_b7.cuda()
# One image per forward pass (batch_size=1), with per-batch logging turned off.
id_list_b7, all_img_features_b7 = extract_effnet_features(
    efficientnet_b7,
    ent2id_dict,
    preprocess=preprocess_b7,
    batch_size=1,
    use_cuda=True,
    verbose=False)

Loaded english/0.pkl.gz in 7.44s


  "Palette images with Transparency expressed in bytes should be "


Loaded english/1.pkl.gz in 12.89s
Loaded english/2.pkl.gz in 15.64s
Loaded english/3.pkl.gz in 16.64s
Loaded english/4.pkl.gz in 18.51s
Loaded english/5.pkl.gz in 22.97s
Loaded english/6.pkl.gz in 15.20s
Loaded english/7.pkl.gz in 16.36s
Loaded english/8.pkl.gz in 16.65s
Loaded english/9.pkl.gz in 20.98s
Loaded english/10.pkl.gz in 21.65s
Loaded english/11.pkl.gz in 12.97s
Loaded english/12.pkl.gz in 11.98s
Loaded english/13.pkl.gz in 10.76s
Loaded english/14.pkl.gz in 2.89s
Loaded french/0.pkl.gz in 8.47s
Loaded french/1.pkl.gz in 13.05s
Loaded french/10.pkl.gz in 7.38s
Loaded french/11.pkl.gz in 3.44s
Loaded french/12.pkl.gz in 3.74s
Loaded french/13.pkl.gz in 43.05s
Loaded french/2.pkl.gz in 15.56s
Loaded french/3.pkl.gz in 20.87s
Loaded french/4.pkl.gz in 20.15s
Loaded french/5.pkl.gz in 19.68s
Loaded french/6.pkl.gz in 11.73s
Loaded french/7.pkl.gz in 15.75s
Loaded french/8.pkl.gz in 17.24s
Loaded french/9.pkl.gz in 12.43s


total runtime: 2h 30m

In [17]:
np.save(os.path.join(ROOT_PATH, "data", "effnet_b7_imgfeatures.npy"), all_img_features_b7)

In [18]:
with open(os.path.join(ROOT_PATH, "data", "effnet_b7_img_ids.pkl"), "wb") as f:
    pickle.dump(id_list_b7, f)

# Failed attempt to open super large pkl file

In [None]:
import PIL

They provide the original images as a dictionary with keys being the entity label and the value being the image:

In [None]:
fr_en_images_path = os.path.join(ROOT_PATH, "data", "en(fr)_dbp15k_link_img_dict_full.pkl")
# Read only the first 1e9 bytes of the file; the context manager closes the
# handle afterwards instead of leaking it.
with open(fr_en_images_path, 'rb') as f:
    data = f.read(int(1e9))
# Peek at a slice near the start to see how the first entry is encoded.
print(data[30:90])

b'resource/United_Statesq\x01cPIL.PngImagePlugin\nPngImageFile\nq\x02)'


In [None]:
fr_en_images_path = os.path.join(ROOT_PATH, "data", "en(fr)_dbp15k_link_img_dict_full.pkl")
# NOTE(review): this handle is not closed anywhere in the visible cells.
f = open(fr_en_images_path, 'rb')
# Streaming load attempt, left disabled.
#elems = sPickle.s_load(f)

In [None]:
def s_load(file_obj):
    """Lazily load pickled elements from ``file_obj``, one element at a time.

    The stream is expected to hold pickles that are each followed by an empty
    line: lines are accumulated until a line consisting solely of a newline is
    seen, then the accumulated bytes are unpickled and yielded.

    Fixes over the previous version, which was Python 2 code:
      * lines read from a binary file are ``bytes``; comparing them with the
        ``str`` ``'\\n'`` was never true, so nothing was ever yielded;
      * the chunks are joined as bytes instead of with a ``str`` separator;
      * the undefined name ``loads`` is replaced by ``pickle.loads``.

    NOTE: ``pickle.loads`` can execute arbitrary code; use on trusted data only.

    Args:
        file_obj: iterable of lines, normally a file opened in binary mode.
            ``str`` lines are accepted too and encoded as latin-1
            (NOTE(review): assumes a text stream that was decoded as latin-1).

    Yields:
        Each unpickled element, in stream order.
    """
    cur_elt = []
    for line in file_obj:
        if isinstance(line, str):
            line = line.encode("latin-1")
        cur_elt.append(line)

        # An empty line marks the end of the current element.
        if line == b"\n":
            elt = pickle.loads(b"".join(cur_elt))
            cur_elt = []
            yield elt

In [None]:
elem1 = next(elems)

UnpicklingError: ignored

In [None]:
data[1860:2000]

b'K\xfdK\xfeK\xfeK\xfeK\xffK\xffK\xffeB(\xb9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'

In [None]:
import pickletools

In [None]:
pickletools.dis(data[:1875])

    0: \x80 PROTO      3
    2: }    EMPTY_DICT
    3: q    BINPUT     0
    5: (    MARK
    6: X        BINUNICODE 'http://dbpedia.org/resource/United_States'
   52: q        BINPUT     1
   54: c        GLOBAL     'PIL.PngImagePlugin PngImageFile'
   87: q        BINPUT     2
   89: )        EMPTY_TUPLE
   90: \x81     NEWOBJ
   91: q        BINPUT     3
   93: ]        EMPTY_LIST
   94: q        BINPUT     4
   96: (        MARK
   97: }            EMPTY_DICT
   98: q            BINPUT     5
  100: (            MARK
  101: X                BINUNICODE 'gamma'
  111: q                BINPUT     6
  113: G                BINFLOAT   0.45455
  122: X                BINUNICODE 'chromaticity'
  139: q                BINPUT     7
  141: (                MARK
  142: G                    BINFLOAT   0.3127
  151: G                    BINFLOAT   0.329
  160: G                    BINFLOAT   0.64
  169: G                    BINFLOAT   0.33
  178: G                    BINFLOAT   0.3
  187: G     

ValueError: ignored

In [None]:
# Bytes from the first "http" up to the first "cPIL" marker, i.e. the region holding the first key.
entryname = data[data.find(b"http"):data.find(b"cPIL")]
# Everything from the first b"q\x01cPIL" marker onwards.
remainder_data = data[data.find(b"q\x01cPIL"):]
# Show the last 10 bytes of the slice that ends 6 bytes into the next "http" occurrence.
print(remainder_data[:remainder_data.find(b"http")+6][-10:])
# Try to unpickle the bytes before the next "http" on their own, with b"q." appended;
# per the recorded output this raises UnpicklingError.
pickle.loads(remainder_data[:remainder_data.find(b"http")] + b"q.")
print(entryname)
# Disabled attempt to read and unpickle the whole file in one go.
#data = f.read(int(5.369e9))
#fr_en_images = pickle.loads(data)

b'%\x00\x00\x00http:/'


UnpicklingError: ignored

In [None]:
# Check whether every entry of the images pickle has an id in ent2id_dict.
# NOTE(review): `fr_en_images` is only assigned in a commented-out line of the
# visible notebook, so this cell needs that load to have succeeded.
num_missing = sum(
    1 for linkname in fr_en_images.keys() if linkname not in ent2id_dict
)
print(f"Missing {num_missing} entries")