Skip to content

Commit

Permalink
Merge pull request #3299 from Minnozz/absorb
Browse files Browse the repository at this point in the history
Track which Author/Work/Edition a duplicate has been merged into
  • Loading branch information
mouse-reeve committed Apr 9, 2024
2 parents ca6dbcb + 4a690e6 commit 7363033
Show file tree
Hide file tree
Showing 19 changed files with 405 additions and 116 deletions.
37 changes: 25 additions & 12 deletions bookwyrm/management/commands/deduplicate_book_data.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
""" PROCEED WITH CAUTION: uses deduplication fields to permanently
merge book data objects """

from django.core.management.base import BaseCommand
from django.db.models import Count
from bookwyrm import models
from bookwyrm.management.merge import merge_objects


def dedupe_model(model):
def dedupe_model(model, dry_run=False):
"""combine duplicate editions and update related models"""
print(f"deduplicating {model.__name__}:")
fields = model._meta.get_fields()
dedupe_fields = [
f for f in fields if hasattr(f, "deduplication_field") and f.deduplication_field
Expand All @@ -16,30 +17,42 @@ def dedupe_model(model):
dupes = (
model.objects.values(field.name)
.annotate(Count(field.name))
.filter(**{"%s__count__gt" % field.name: 1})
.filter(**{f"{field.name}__count__gt": 1})
.exclude(**{field.name: ""})
.exclude(**{f"{field.name}__isnull": True})
)

for dupe in dupes:
value = dupe[field.name]
if not value or value == "":
continue
print("----------")
print(dupe)
objs = model.objects.filter(**{field.name: value}).order_by("id")
canonical = objs.first()
print("keeping", canonical.remote_id)
action = "would merge" if dry_run else "merging"
print(
f"{action} into {model.__name__} {canonical.remote_id} based on {field.name} {value}:"
)
for obj in objs[1:]:
print(obj.remote_id)
merge_objects(canonical, obj)
print(f"- {obj.remote_id}")
absorbed_fields = obj.merge_into(canonical, dry_run=dry_run)
print(f" absorbed fields: {absorbed_fields}")


class Command(BaseCommand):
"""deduplicate allllll the book data models"""

help = "merges duplicate book data"

def add_arguments(self, parser):
"""add the arguments for this command"""
parser.add_argument(
"--dry_run",
action="store_true",
help="don't actually merge, only print what would happen",
)

# pylint: disable=no-self-use,unused-argument
def handle(self, *args, **options):
"""run deduplications"""
dedupe_model(models.Edition)
dedupe_model(models.Work)
dedupe_model(models.Author)
dedupe_model(models.Edition, dry_run=options["dry_run"])
dedupe_model(models.Work, dry_run=options["dry_run"])
dedupe_model(models.Author, dry_run=options["dry_run"])
50 changes: 0 additions & 50 deletions bookwyrm/management/merge.py

This file was deleted.

12 changes: 10 additions & 2 deletions bookwyrm/management/merge_command.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from bookwyrm.management.merge import merge_objects
from django.core.management.base import BaseCommand


Expand All @@ -9,6 +8,11 @@ def add_arguments(self, parser):
"""add the arguments for this command"""
parser.add_argument("--canonical", type=int, required=True)
parser.add_argument("--other", type=int, required=True)
parser.add_argument(
"--dry_run",
action="store_true",
help="don't actually merge, only print what would happen",
)

# pylint: disable=no-self-use,unused-argument
def handle(self, *args, **options):
Expand All @@ -26,4 +30,8 @@ def handle(self, *args, **options):
print("other book doesn’t exist!")
return

merge_objects(canonical, other)
absorbed_fields = other.merge_into(canonical, dry_run=options["dry_run"])

action = "would be" if options["dry_run"] else "has been"
print(f"{other.remote_id} {action} merged into {canonical.remote_id}")
print(f"absorbed fields: {absorbed_fields}")
48 changes: 48 additions & 0 deletions bookwyrm/migrations/0197_mergedauthor_mergedbook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Generated by Django 3.2.24 on 2024-02-28 21:30

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

dependencies = [
("bookwyrm", "0196_merge_pr3134_into_main"),
]

operations = [
migrations.CreateModel(
name="MergedBook",
fields=[
("deleted_id", models.IntegerField(primary_key=True, serialize=False)),
(
"merged_into",
models.ForeignKey(
on_delete=django.db.models.deletion.PROTECT,
related_name="absorbed",
to="bookwyrm.book",
),
),
],
options={
"abstract": False,
},
),
migrations.CreateModel(
name="MergedAuthor",
fields=[
("deleted_id", models.IntegerField(primary_key=True, serialize=False)),
(
"merged_into",
models.ForeignKey(
on_delete=django.db.models.deletion.PROTECT,
related_name="absorbed",
to="bookwyrm.author",
),
),
],
options={
"abstract": False,
},
),
]
5 changes: 4 additions & 1 deletion bookwyrm/models/author.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
""" database schema for info about authors """

import re
from typing import Tuple, Any

Expand All @@ -10,13 +11,15 @@
from bookwyrm.settings import DOMAIN
from bookwyrm.utils.db import format_trigger

from .book import BookDataModel
from .book import BookDataModel, MergedAuthor
from . import fields


class Author(BookDataModel):
"""basic biographic info"""

merged_model = MergedAuthor

wikipedia_link = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True
)
Expand Down
121 changes: 116 additions & 5 deletions bookwyrm/models/book.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
""" database schema for books and shelves """

from itertools import chain
import re
from typing import Any
from typing import Any, Dict
from typing_extensions import Self

from django.contrib.postgres.search import SearchVectorField
from django.contrib.postgres.indexes import GinIndex
from django.core.cache import cache
from django.db import models, transaction
from django.db.models import Prefetch
from django.db.models import Prefetch, ManyToManyField
from django.dispatch import receiver
from django.utils.translation import gettext_lazy as _
from model_utils import FieldTracker
Expand Down Expand Up @@ -108,10 +110,115 @@ def broadcast(self, activity, sender, software="bookwyrm", **kwargs):
"""only send book data updates to other bookwyrm instances"""
super().broadcast(activity, sender, software=software, **kwargs)

def merge_into(self, canonical: Self, dry_run=False) -> Dict[str, Any]:
"""merge this entity into another entity"""
if canonical.id == self.id:
raise ValueError(f"Cannot merge {self} into itself")

absorbed_fields = canonical.absorb_data_from(self, dry_run=dry_run)

if dry_run:
return absorbed_fields

canonical.save()

self.merged_model.objects.create(deleted_id=self.id, merged_into=canonical)

# move related models to canonical
related_models = [
(r.remote_field.name, r.related_model) for r in self._meta.related_objects
]
# pylint: disable=protected-access
for related_field, related_model in related_models:
# Skip the ManyToMany fields that aren’t auto-created. These
# should have a corresponding OneToMany field in the model for
# the linking table anyway. If we update it through that model
# instead then we won’t lose the extra fields in the linking
# table.
# pylint: disable=protected-access
related_field_obj = related_model._meta.get_field(related_field)
if isinstance(related_field_obj, ManyToManyField):
through = related_field_obj.remote_field.through
if not through._meta.auto_created:
continue
related_objs = related_model.objects.filter(**{related_field: self})
for related_obj in related_objs:
try:
setattr(related_obj, related_field, canonical)
related_obj.save()
except TypeError:
getattr(related_obj, related_field).add(canonical)
getattr(related_obj, related_field).remove(self)

self.delete()
return absorbed_fields

def absorb_data_from(self, other: Self, dry_run=False) -> Dict[str, Any]:
"""fill empty fields with values from another entity"""
absorbed_fields = {}
for data_field in self._meta.get_fields():
if not hasattr(data_field, "activitypub_field"):
continue
canonical_value = getattr(self, data_field.name)
other_value = getattr(other, data_field.name)
if not other_value:
continue
if isinstance(data_field, fields.ArrayField):
if new_values := list(set(other_value) - set(canonical_value)):
# append at the end (in no particular order)
if not dry_run:
setattr(self, data_field.name, canonical_value + new_values)
absorbed_fields[data_field.name] = new_values
elif isinstance(data_field, fields.PartialDateField):
if (
(not canonical_value)
or (other_value.has_day and not canonical_value.has_day)
or (other_value.has_month and not canonical_value.has_month)
):
if not dry_run:
setattr(self, data_field.name, other_value)
absorbed_fields[data_field.name] = other_value
else:
if not canonical_value:
if not dry_run:
setattr(self, data_field.name, other_value)
absorbed_fields[data_field.name] = other_value
return absorbed_fields


class MergedBookDataModel(models.Model):
"""a BookDataModel instance that has been merged into another instance. kept
to be able to redirect old URLs"""

deleted_id = models.IntegerField(primary_key=True)

class Meta:
"""abstract just like BookDataModel"""

abstract = True


class MergedBook(MergedBookDataModel):
"""an Book that has been merged into another one"""

merged_into = models.ForeignKey(
"Book", on_delete=models.PROTECT, related_name="absorbed"
)


class MergedAuthor(MergedBookDataModel):
"""an Author that has been merged into another one"""

merged_into = models.ForeignKey(
"Author", on_delete=models.PROTECT, related_name="absorbed"
)


class Book(BookDataModel):
"""a generic book, which can mean either an edition or a work"""

merged_model = MergedBook

connector = models.ForeignKey("Connector", on_delete=models.PROTECT, null=True)

# book/work metadata
Expand Down Expand Up @@ -192,9 +299,13 @@ def edition_info(self):
"""properties of this edition, as a string"""
items = [
self.physical_format if hasattr(self, "physical_format") else None,
f"{self.languages[0]} language"
if self.languages and self.languages[0] and self.languages[0] != "English"
else None,
(
f"{self.languages[0]} language"
if self.languages
and self.languages[0]
and self.languages[0] != "English"
else None
),
str(self.published_date.year) if self.published_date else None,
", ".join(self.publishers) if hasattr(self, "publishers") else None,
]
Expand Down
Loading

0 comments on commit 7363033

Please sign in to comment.