  • 2 commits
  • 8 files changed
  • 0 comments
  • 1 contributor
Jun 04, 2012
Anssi Kääriäinen: Fixed #17788 -- Added a batch_size argument for qs.bulk_create()
The qs.bulk_create() method did not work with large batches on SQLite3.
This commit adds a way to split the bulk into smaller batches. The
default batch size is unlimited, except for SQLite3, where each batch is
limited to 1000 SQL parameters.

Thanks to everybody who participated in the discussion at Trac.
Special thanks to alpar for the report.
d4eacac
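
A minimal usage sketch of the new argument (the Entry model and its headline field are hypothetical, used only for illustration):

    # Hypothetical model with a single CharField 'headline'.
    from myapp.models import Entry

    entries = [Entry(headline='Entry %d' % i) for i in range(3000)]

    # Default: the backend picks the batch size (one query on most
    # databases, at most 999 parameters per query on SQLite).
    Entry.objects.bulk_create(entries)

    # Explicit batch_size: one INSERT per 500 objects, six queries here.
    Entry.objects.bulk_create(entries, batch_size=500)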
Anssi Kääriäinen: Fixed review issues noted by claudep. Refs #17788.
The issues fixed were some comment problems and the use of the '//'
operator instead of the '/' operator. In addition, added some defensive
programming enhancements for division by zero and zero-sized batches.
332fb93
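
The '//' versus '/' review point comes down to true division returning a float, which is not usable as a batch size, whereas floor division keeps an integer; a small illustration (modern Python division semantics assumed):

    # True division returns a float, floor division an int.
    assert 999 / 8 == 124.875
    assert 999 // 8 == 124

    # Guarding against zero avoids ZeroDivisionError when there are no
    # fields to insert (the "defensive programming" part of the commit).
    fields = []
    objs = ['a', 'b']
    batch_size = (999 // len(fields)) if len(fields) > 0 else len(objs)
    assert batch_size == 2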
django/db/backends/__init__.py  (30 lines changed)

@@ -474,6 +474,14 @@ def autoinc_sql(self, table, column):
         This SQL is executed when a table is created.
         """
         return None
+
+    def bulk_batch_size(self, fields, objs):
+        """
+        Returns the maximum allowed batch size for the backend. The fields
+        are the fields going to be inserted in the batch, the objs contains
+        all the objects to be inserted.
+        """
+        return len(objs)
 
     def date_extract_sql(self, lookup_type, field_name):
         """
@@ -511,6 +519,17 @@ def deferrable_sql(self):
         during a CREATE TABLE statement.
         """
         return ''
+
+    def distinct_sql(self, fields):
+        """
+        Returns an SQL DISTINCT clause which removes duplicate rows from the
+        result set. If any fields are given, only the given fields are being
+        checked for duplicates.
+        """
+        if fields:
+            raise NotImplementedError('DISTINCT ON fields is not supported by this database backend')
+        else:
+            return 'DISTINCT'
 
     def drop_foreignkey_sql(self):
         """
@@ -567,17 +586,6 @@ def fulltext_search_sql(self, field_name):
         """
         raise NotImplementedError('Full-text search is not implemented for this database backend')
 
-    def distinct_sql(self, fields):
-        """
-        Returns an SQL DISTINCT clause which removes duplicate rows from the
-        result set. If any fields are given, only the given fields are being
-        checked for duplicates.
-        """
-        if fields:
-            raise NotImplementedError('DISTINCT ON fields is not supported by this database backend')
-        else:
-            return 'DISTINCT'
-
     def last_executed_query(self, cursor, sql, params):
         """
         Returns a string of the query last executed by the given cursor, with
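
The base implementation of bulk_batch_size() places no limit (it simply returns len(objs)); a backend with a hard cap on bind parameters would override it. A rough sketch of such an override, with a made-up MAX_QUERY_PARAMS value rather than any real backend's limit:

    # Import path as of the Django version in this commit.
    from django.db.backends import BaseDatabaseOperations

    class DatabaseOperations(BaseDatabaseOperations):
        # Illustrative cap on bind parameters per statement (not a real
        # backend constant).
        MAX_QUERY_PARAMS = 2000

        def bulk_batch_size(self, fields, objs):
            # Without fields there is nothing to bind, so no limit applies.
            if not fields:
                return len(objs)
            # Each inserted row consumes one parameter per field.
            return max(self.MAX_QUERY_PARAMS // len(fields), 1)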
django/db/backends/sqlite3/base.py  (9 lines changed)

@@ -83,7 +83,7 @@ class DatabaseFeatures(BaseDatabaseFeatures):
     supports_1000_query_parameters = False
     supports_mixed_date_datetime_comparisons = False
     has_bulk_insert = True
-    can_combine_inserts_with_and_without_auto_increment_pk = True
+    can_combine_inserts_with_and_without_auto_increment_pk = False
 
     def _supports_stddev(self):
         """Confirm support for STDDEV and related stats functions
@@ -104,6 +104,13 @@ def _supports_stddev(self):
         return has_support
 
 class DatabaseOperations(BaseDatabaseOperations):
+    def bulk_batch_size(self, fields, objs):
+        """
+        SQLite has a compile-time default (SQLITE_LIMIT_VARIABLE_NUMBER) of
+        999 variables per query.
+        """
+        return (999 // len(fields)) if len(fields) > 0 else len(objs)
+
     def date_extract_sql(self, lookup_type, field_name):
        # sqlite doesn't support extract, so we fake it with the user-defined
        # function django_extract that's registered in connect(). Note that
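
Worked through for a hypothetical model with 8 concrete fields (primary key included), the 999-variable cap translates into batches of 124 objects:

    # Each row in a bulk INSERT binds one parameter per field.
    fields_per_row = 8
    batch_size = 999 // fields_per_row            # 124 objects per query
    objects_to_insert = 1001
    # Ceiling division: number of INSERT statements needed.
    queries = -(-objects_to_insert // batch_size)  # 9 queries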
django/db/models/query.py  (29 lines changed)

@@ -388,7 +388,7 @@ def create(self, **kwargs):
         obj.save(force_insert=True, using=self.db)
         return obj
 
-    def bulk_create(self, objs):
+    def bulk_create(self, objs, batch_size=None):
         """
         Inserts each of the instances into the database. This does *not* call
         save() on each of the instances, does not send any pre/post save
@@ -401,8 +401,10 @@ def bulk_create(self, objs):
         # this could be implemented if you didn't have an autoincrement pk,
         # and 2) you could do it by doing O(n) normal inserts into the parent
         # tables to get the primary keys back, and then doing a single bulk
-        # insert into the childmost table. We're punting on these for now
-        # because they are relatively rare cases.
+        # insert into the childmost table. Some databases might allow doing
+        # this by using RETURNING clause for the insert query. We're punting
+        # on these for now because they are relatively rare cases.
+        assert batch_size is None or batch_size > 0
         if self.model._meta.parents:
             raise ValueError("Can't bulk create an inherited model")
         if not objs:
@@ -418,13 +420,14 @@ def bulk_create(self, objs):
         try:
             if (connection.features.can_combine_inserts_with_and_without_auto_increment_pk
                 and self.model._meta.has_auto_field):
-                self.model._base_manager._insert(objs, fields=fields, using=self.db)
+                self._batched_insert(objs, fields, batch_size)
             else:
                 objs_with_pk, objs_without_pk = partition(lambda o: o.pk is None, objs)
                 if objs_with_pk:
-                    self.model._base_manager._insert(objs_with_pk, fields=fields, using=self.db)
+                    self._batched_insert(objs_with_pk, fields, batch_size)
                 if objs_without_pk:
-                    self.model._base_manager._insert(objs_without_pk, fields=[f for f in fields if not isinstance(f, AutoField)], using=self.db)
+                    fields = [f for f in fields if not isinstance(f, AutoField)]
+                    self._batched_insert(objs_without_pk, fields, batch_size)
             if forced_managed:
                 transaction.commit(using=self.db)
             else:
@@ -860,6 +863,20 @@ def db(self):
     ###################
     # PRIVATE METHODS #
     ###################
+    def _batched_insert(self, objs, fields, batch_size):
+        """
+        A little helper method for bulk_insert to insert the bulk one batch
+        at a time. Inserts recursively a batch from the front of the bulk and
+        then _batched_insert() the remaining objects again.
+        """
+        if not objs:
+            return
+        ops = connections[self.db].ops
+        batch_size = (batch_size or max(ops.bulk_batch_size(fields, objs), 1))
+        for batch in [objs[i:i+batch_size]
+                      for i in range(0, len(objs), batch_size)]:
+            self.model._base_manager._insert(batch, fields=fields,
+                                             using=self.db)
 
     def _clone(self, klass=None, setup=False, **kwargs):
         if klass is None:
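
The helper above splits the bulk into contiguous slices and issues one _insert() per slice. The slicing pattern on its own, as a standalone sketch without the Django plumbing:

    def batches(objs, batch_size):
        # Yield consecutive slices of at most batch_size objects,
        # preserving the original order.
        for i in range(0, len(objs), batch_size):
            yield objs[i:i + batch_size]

    # 1001 objects at batch_size 124 -> 8 full batches plus a final
    # batch of 9 objects, i.e. 9 queries in total.
    sizes = [len(b) for b in batches(list(range(1001)), 124)]
    assert len(sizes) == 9 and sizes[-1] == 9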
docs/ref/models/querysets.txt  (20 lines changed)

@@ -1349,7 +1349,7 @@ has a side effect on your data. For more, see `Safe methods`_ in the HTTP spec.
 bulk_create
 ~~~~~~~~~~~
 
-.. method:: bulk_create(objs)
+.. method:: bulk_create(objs, batch_size=None)
 
 .. versionadded:: 1.4
 
@@ -1371,20 +1371,12 @@ This has a number of caveats though:
 * If the model's primary key is an :class:`~django.db.models.AutoField` it
   does not retrieve and set the primary key attribute, as ``save()`` does.
 
-.. admonition:: Limits of SQLite
+The ``batch_size`` parameter controls how many objects are created in a
+single query. The default is to create all objects in one batch, except for
+SQLite where the default is such that at most 999 variables per query are used.
 
-    SQLite sets a limit on the number of parameters per SQL statement. The
-    maximum is defined by the SQLITE_MAX_VARIABLE_NUMBER_ compilation option,
-    which defaults to 999. For instance, if your model has 8 fields (including
-    the primary key), you cannot create more than 999 // 8 = 124 instances at
-    a time. If you exceed this limit, you'll get an exception::
-
-        django.db.utils.DatabaseError: too many SQL variables
-
-    If your application's performance requirements exceed SQLite's limits, you
-    should switch to another database engine, such as PostgreSQL.
-
-.. _SQLITE_MAX_VARIABLE_NUMBER: http://sqlite.org/limits.html#max_variable_number
+.. versionadded:: 1.5
+    The ``batch_size`` parameter was added in version 1.5.
 
 count
 ~~~~~
docs/releases/1.5.txt  (5 lines changed)

@@ -85,6 +85,11 @@ Django 1.5 also includes several smaller improvements worth noting:
 * In the localflavor for Canada, "pq" was added to the acceptable codes for
   Quebec. It's an old abbreviation.
 
+* :meth:`QuerySet.bulk_create()
+  <django.db.models.query.QuerySet.bulk_create>` now has a batch_size
+  argument. By default the batch_size is unlimited except for SQLite, where
+  a single batch is limited so that 1000 parameters per query are not exceeded.
+
 Backwards incompatible changes in 1.5
 =====================================
 
tests/regressiontests/bulk_create/models.py  (6 lines changed)

@@ -18,4 +18,8 @@ class Pizzeria(Restaurant):
     pass
 
 class State(models.Model):
-    two_letter_code = models.CharField(max_length=2, primary_key=True)
+    two_letter_code = models.CharField(max_length=2, primary_key=True)
+
+class TwoFields(models.Model):
+    f1 = models.IntegerField(unique=True)
+    f2 = models.IntegerField(unique=True)
tests/regressiontests/bulk_create/tests.py  (45 lines changed)

@@ -2,9 +2,11 @@
 
 from operator import attrgetter
 
-from django.test import TestCase, skipIfDBFeature, skipUnlessDBFeature
+from django.db import connection
+from django.test import TestCase, skipIfDBFeature
+from django.test.utils import override_settings
 
-from .models import Country, Restaurant, Pizzeria, State
+from .models import Country, Restaurant, Pizzeria, State, TwoFields
 
 
 class BulkCreateTests(TestCase):
@@ -27,7 +29,6 @@ def test_simple(self):
         self.assertEqual(created, [])
         self.assertEqual(Country.objects.count(), 4)
 
-    @skipUnlessDBFeature("has_bulk_insert")
     def test_efficiency(self):
         with self.assertNumQueries(1):
             Country.objects.bulk_create(self.data)
@@ -69,3 +70,41 @@ def test_zero_as_autoval(self):
         invalid_country = Country(id=0, name='Poland', iso_two_letter='PL')
         with self.assertRaises(ValueError):
             Country.objects.bulk_create([valid_country, invalid_country])
+
+    def test_large_batch(self):
+        with override_settings(DEBUG=True):
+            connection.queries = []
+            TwoFields.objects.bulk_create([
+                   TwoFields(f1=i, f2=i+1) for i in range(0, 1001)
+                ])
+            self.assertTrue(len(connection.queries) < 10)
+        self.assertEqual(TwoFields.objects.count(), 1001)
+        self.assertEqual(
+            TwoFields.objects.filter(f1__gte=450, f1__lte=550).count(),
+            101)
+        self.assertEqual(TwoFields.objects.filter(f2__gte=901).count(), 101)
+
+    def test_large_batch_mixed(self):
+        """
+        Test inserting a large batch with objects having primary key set
+        mixed together with objects without PK set.
+        """
+        with override_settings(DEBUG=True):
+            connection.queries = []
+            TwoFields.objects.bulk_create([
+                TwoFields(id=i if i % 2 == 0 else None, f1=i, f2=i+1)
+                for i in range(100000, 101000)])
+            self.assertTrue(len(connection.queries) < 10)
+        self.assertEqual(TwoFields.objects.count(), 1000)
+        # We can't assume much about the ID's created, except that the above
+        # created IDs must exists.
+        id_range = range(100000, 101000, 2)
+        self.assertEqual(TwoFields.objects.filter(id__in=id_range).count(), 500)
+
+    def test_explicit_batch_size(self):
+        objs = [TwoFields(f1=i, f2=i) for i in range(0, 100)]
+        with self.assertNumQueries(2):
+            TwoFields.objects.bulk_create(objs, 50)
+        TwoFields.objects.all().delete()
+        with self.assertNumQueries(1):
+            TwoFields.objects.bulk_create(objs, len(objs))
tests/regressiontests/queries/tests.py  (3 lines changed)

@@ -1863,8 +1863,7 @@ def test_ticket14244(self):
         # Test that the "in" lookup works with lists of 1000 items or more.
         Number.objects.all().delete()
         numbers = range(2500)
-        for num in numbers:
-            _ = Number.objects.create(num=num)
+        Number.objects.bulk_create(Number(num=num) for num in numbers)
         self.assertEqual(
             Number.objects.filter(num__in=numbers[:1000]).count(),
             1000
