### Data preprocessing

In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
def load_dataset(path):
    """Parse a photo dataset text file into a list of photo records.

    The first line of the file is read as the number of photos ``n``. Each
    of the following ``n`` lines is split on whitespace: the first field is
    the orientation, the second field is skipped (presumably the declared
    tag count -- confirm against the input format), and every remaining
    field is a tag.

    Args:
        path: Path to the dataset text file.

    Returns:
        A list of dicts, one per photo, with keys ``"id"`` (0-based position
        of the photo in the file), ``"orientation"`` (first field of the
        line) and ``"tags"`` (set of tag strings).

    Raises:
        ValueError: If the first line cannot be parsed as an integer.
        IndexError: If the file holds fewer photo lines than declared, or a
            photo line is blank.
    """
    # Pin the encoding so parsing does not depend on the platform's locale
    # default (UTF-8 is a superset of plain-ASCII input files).
    with open(path, "r", encoding="utf-8") as f:
        lines = f.read().strip().split("\n")
    n = int(lines[0])
    photos = []
    for i in range(1, n + 1):
        parts = lines[i].split()
        orientation = parts[0]  # 'H' or 'V'
        tags = set(parts[2:])  # parts[1] is skipped on purpose
        photos.append({
            "id": i - 1,  # line 1 of the file becomes photo 0
            "orientation": orientation,
            "tags": tags
        })
    return photos

In [6]:
# Parse the "pet pictures" input, then sanity-check the result by printing
# the photo count and the first 20 parsed records.
photos = load_dataset("data/d_pet_pictures.txt")
print(f"Liczba zdjęć: {len(photos)}")
print(f"Przykładowe zdjęcie: {photos[:20]}")

Liczba zdjęć: 90000
Przykładowe zdjęcie: [{'id': 0, 'orientation': 'H', 'tags': {'t86', 'th5', 'tf1', 't6', 'tl5', 't82', 'tw3', 'tc4', 'tm2', 't47', 'ts2', 't52', 'tl2', 't44', 'tc5', 'td4'}}, {'id': 1, 'orientation': 'V', 'tags': {'t56', 't87', 'tx4', 'th5', 't85', 't42', 'tw6', 'tw3', 't82', 'tn6', 'tp6', 'tm2', 't47', 't52', 'tz2'}}, {'id': 2, 'orientation': 'H', 'tags': {'tc4', 'tb5', 't61', 't51'}}, {'id': 3, 'orientation': 'V', 'tags': {'tm5', 'tq1', 'tr5', 'tw3', 'tv3', 'tn6', 'tb7', 'th4', 't72', 't52', 'tx1', 'tc1', 't86'}}, {'id': 4, 'orientation': 'H', 'tags': {'tz6', 'tw5', 'tq2', 't53', 't32', 't85', 'tw3', 'tn6', 't8', 't47', 'th2', 't01', 'tl5'}}, {'id': 5, 'orientation': 'V', 'tags': {'td', 't02', 'th5', 't37', 'tq1', 't44', 'th4', 'tv2', 'tp6', 'tn3', 'tn6', 'ts2', 't52', 'tr2', 'tl5'}}, {'id': 6, 'orientation': 'V', 'tags': {'t87', 'tq6', 't05', 't22', 'tv4', 'tn6', 'th4', 't8', 'tz'}}, {'id': 7, 'orientation': 'V', 'tags': {'t37', 'tq6', 'tr5', 'th4', 'tc4', 'tb7', 

In [14]:
def split_vertical_photos(photos):
    """Partition photo records by orientation in a single pass.

    Args:
        photos: Iterable of photo dicts, each with an ``"orientation"`` key.
            Any iterable works, including one-shot generators, because the
            input is traversed only once.

    Returns:
        A ``(horizontal_photos, vertical_photos)`` tuple of lists. Each list
        keeps the input order. Records whose orientation is neither ``"H"``
        nor ``"V"`` end up in neither list.
    """
    horizontal_photos = []
    vertical_photos = []
    for photo in photos:
        orientation = photo["orientation"]
        if orientation == "H":
            horizontal_photos.append(photo)
        elif orientation == "V":
            vertical_photos.append(photo)
    return horizontal_photos, vertical_photos

In [22]:
def analyze_dataset(photos):
    """Print summary statistics about the tags of a photo collection.

    Prints the number of photos, the mean and median number of tags per
    photo, the number of distinct tags, and the 20 most frequent tags with
    their occurrence counts.

    Args:
        photos: Sized collection (e.g. a list) of photo dicts, each with a
            ``"tags"`` entry holding a collection of tag strings.

    Returns:
        None. All results are written to stdout.
    """
    n_total = len(photos)
    print(f"Number of Photos: {n_total}")
    if n_total == 0:
        # np.mean / np.median on an empty list emit a RuntimeWarning and
        # yield nan; this happens when a dataset has no photos of one
        # orientation, so report it plainly instead.
        print("No photos to analyze.")
        return

    tag_counts = [len(p["tags"]) for p in photos]
    # Count occurrences straight from a generator; no need to materialise
    # one big list with every tag of every photo first.
    tag_counter = Counter(tag for p in photos for tag in p["tags"])
    top_tags = tag_counter.most_common(20)

    print(f"Avg number of tags: {np.mean(tag_counts):.2f}")
    print(f"Median number of tags: {np.median(tag_counts)}")
    print(f"Number of unic tags: {len(tag_counter)}")
    print("Top 20 tags:")
    for tag, count in top_tags:
        print(f"   {tag:20s} {count}")

In [23]:
# Split once by orientation, then print the same tag statistics for the full
# collection, the horizontal subset and the vertical subset, in that order.
h_photos, v_photos = split_vertical_photos(photos)
for subset in (photos, h_photos, v_photos):
    analyze_dataset(subset)

Number of Photos: 90000
Avg number of tags: 10.03
Median number of tags: 10.0
Number of unic tags: 220
Top 20 tags:
   tx1                  17846
   tq1                  17817
   t82                  17812
   t85                  17805
   tq2                  17796
   tp2                  17796
   t05                  17795
   ts2                  17790
   tz6                  17787
   t02                  17776
   tq6                  17771
   t47                  17770
   tm2                  17763
   tb5                  17763
   tc4                  17762
   tn6                  17761
   t86                  17749
   th5                  17749
   tz4                  17745
   t34                  17743
Number of Photos: 30000
Avg number of tags: 10.04
Median number of tags: 10.0
Number of unic tags: 220
Top 20 tags:
   tn6                  6017
   tx1                  6013
   t05                  5986
   tm2                  5981
   t85                  5973
   t86                 