Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reactivate Google's PickLevelForMemTableOutput() #163

Merged
merged 4 commits into from Aug 9, 2015
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
22 changes: 11 additions & 11 deletions db/corruption_test.cc
Expand Up @@ -222,8 +222,8 @@ TEST(CorruptionTest, NewFileErrorDuringWrite) {
const int num = 3 + (Options().write_buffer_size / kValueSize);
std::string value_storage;
Status s;
for (int i = 0;
s.ok() && i < num && 0==env_.num_writable_file_errors_;
for (int i = 0;
s.ok() && i < num && 0==env_.num_writable_file_errors_;
i++) {
WriteBatch batch;
batch.Put("a", Value(100, &value_storage));
Expand All @@ -242,16 +242,16 @@ TEST(CorruptionTest, TableFile) {
dbi->TEST_CompactRange(0, NULL, NULL);
dbi->TEST_CompactRange(1, NULL, NULL);

Corrupt(kTableFile, 100, 1, 2);
Corrupt(kTableFile, 100, 1, config::kMaxMemCompactLevel);
Check(95, 99);
}

TEST(CorruptionTest, TableFileIndexData) {
Build(100000); // Enough to build multiple Tables
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
// was level-2, now level-0
Corrupt(kTableFile, -2000, 500, 0);

Corrupt(kTableFile, -2000, 500, config::kMaxMemCompactLevel);
Reopen();
Check(50000, 99999);
}
Expand Down Expand Up @@ -305,10 +305,10 @@ TEST(CorruptionTest, CompactionInputError) {
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
//const int last = config::kMaxMemCompactLevel; // Riak does not "move" files
//ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));
const int last = config::kMaxMemCompactLevel; // Riak does not "move" files
ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));

Corrupt(kTableFile, 100, 1, 0);
Corrupt(kTableFile, 100, 1, last);
Check(5, 9);

// Force compactions by writing lots of values
Expand All @@ -328,10 +328,10 @@ TEST(CorruptionTest, CompactionInputErrorParanoid) {
// Fill levels >= 1 so memtable compaction outputs to level 1
// matthewv 1/10/14 - what does "levels" have to do with this,
// switching to compaction trigger.
// 7/10/14 - compaction starts between 4 and 6 files ... assume 4
// 7/10/14 - compaction starts between 4 and 6 files ... assume 4 and 1 move
// (will make a new, descriptive constant for 4)
for (int level = Property("leveldb.num-files-at-level0")+1;
level < (config::kL0_GroomingTrigger -1); level++) {
level < config::kL0_GroomingTrigger; level++) {
dbi->Put(WriteOptions(), "", "begin");
dbi->Put(WriteOptions(), "~", "end");
dbi->TEST_CompactMemTable();
Expand Down Expand Up @@ -360,7 +360,7 @@ TEST(CorruptionTest, UnrelatedKeys) {
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
Corrupt(kTableFile, 100, 1, 0);
Corrupt(kTableFile, 100, 1, config::kMaxMemCompactLevel);

std::string tmp1, tmp2;
ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
Expand Down
55 changes: 40 additions & 15 deletions db/db_impl.cc
Expand Up @@ -715,14 +715,51 @@ Status DBImpl::WriteLevel0Table(volatile MemTable* mem, VersionEdit* edit,
const Slice min_user_key = meta.smallest.user_key();
const Slice max_user_key = meta.largest.user_key();
if (base != NULL) {
level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
int level_limit;
if (0!=options_.tiered_slow_level && (options_.tiered_slow_level-1)<config::kMaxMemCompactLevel)
level_limit=options_.tiered_slow_level-1;
else
level_limit=config::kMaxMemCompactLevel;

// remember, mutex is held so safe to push file into a non-compacting level
level = base->PickLevelForMemTableOutput(min_user_key, max_user_key, level_limit);
if (versions_->IsCompactionSubmitted(level) || !versions_->NeighborCompactionsQuiet(level))
level=0;

if (0!=level)
{
Status move_s;
std::string old_name, new_name;

old_name=TableFileName(options_, meta.number, 0);
new_name=TableFileName(options_, meta.number, level);
s=env_->RenameFile(old_name, new_name);
move_s=env_->RenameFile(old_name, new_name);

if (move_s.ok())
{
// builder already added file to table_cache with 2 references and
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The builder added the file to table_cache with mutex_ unlocked; is there any possibility that another thread could have referenced the file via the table_cache before we get to this point?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new file is not added to the manifest until after this routine returns. The “edit” parameter gets set at line 767 with the new file number and its level. No thread can access this file in the table cache because the file number is not known outside this routine. The “edit” object subsequently gets posted to the manifest. That is when the file number becomes visible.

On Aug 8, 2015, at 4:06 PM, Paul A. Place <notifications@github.com mailto:notifications@github.com> wrote:

In db/db_impl.cc #163 (comment):

         std::string old_name, new_name;

         old_name=TableFileName(options_, meta.number, 0);
         new_name=TableFileName(options_, meta.number, level);
  •        s=env_->RenameFile(old_name, new_name);
    
  •        move_s=env_->RenameFile(old_name, new_name);
    
  •        if (move_s.ok())
    
  •        {
    
  •            // builder already added file to table_cache with 2 references and
    
    The builder added the file to table_cache with mutex_ unlocked; is there any possibility that another thread could have referenced the file via the table_cache before we get to this point?


Reply to this email directly or view it on GitHub https://github.com/basho/leveldb/pull/163/files#r36581285.

// marked as level 0 (used by cache warming) ... going to remove from cache
// and add again correctly
table_cache_->Evict(meta.number, true);
meta.level=level;

// sadly, we must hold the mutex during this file open
// since operating in non-overlapped level
Iterator* it=table_cache_->NewIterator(ReadOptions(),
meta.number,
meta.file_size,
meta.level);
delete it;

// argh! logging while holding mutex ... cannot release
Log(options_.info_log, "Level-0 table #%llu: moved to level %d",
(unsigned long long) meta.number,
level);
} // if
else
{
level=0;
} // else
} // if
}

Expand All @@ -736,24 +773,12 @@ Status DBImpl::WriteLevel0Table(volatile MemTable* mem, VersionEdit* edit,
stats.bytes_written = meta.file_size;
stats_[level].Add(stats);

if (0!=meta.num_entries && s.ok())
{
// This SetWriteRate() call removed because this thread
// has priority (others blocked on mutex) and thus created
// misleading estimates of disk write speed
// 2x since mem to disk, not disk to disk
// env_->SetWriteRate(2*stats.micros/meta.num_entries);
// 2x since mem to disk, not disk to disk
// env_->SetWriteRate(2*stats.micros/meta.num_entries);
} // if

// Riak adds extra reference to file, must remove it
// in this race condition upon close
if (s.ok() && shutting_down_.Acquire_Load()) {
versions_->GetTableCache()->Evict(meta.number, true);
table_cache_->Evict(meta.number, versions_->IsLevelOverlapped(level));
}


return s;
}

Expand Down
56 changes: 27 additions & 29 deletions db/db_test.cc
Expand Up @@ -1199,26 +1199,27 @@ TEST(DBTest, DeletionMarkers1) {
Put("foo", "v1");
ASSERT_OK(dbfull()->TEST_CompactMemTable());
const int last = config::kMaxMemCompactLevel;
//ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level
ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level

// Place a table at level last-1 to prevent merging with preceding mutation
Put("a", "begin");
Put("z", "end");
dbfull()->TEST_CompactMemTable();
//ASSERT_EQ(NumTableFilesAtLevel(last), 1);
//ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);
ASSERT_EQ(NumTableFilesAtLevel(last), 1);
ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);

Delete("foo");
Put("foo", "v2");
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // stays at level 0
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); // riak 1.3, DEL merged out by BuildTable
Slice z("z");
dbfull()->TEST_CompactRange(last-2, NULL, &z);
dbfull()->TEST_CompactRange(0, NULL, &z);
dbfull()->TEST_CompactRange(1, NULL, &z);
// DEL eliminated, but v1 remains because we aren't compacting that level
// (DEL can be eliminated because v2 hides v1).
//ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); Riak 1.4 has merged to level 1
//dbfull()->TEST_CompactRange(last-1, NULL, NULL);
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); // Riak 1.4 has merged to level 1
dbfull()->TEST_CompactRange(last-1, NULL, NULL);
// Merging last-1 w/ last, so we are the base level for "foo", so
// DEL is removed. (as is v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
Expand All @@ -1228,36 +1229,36 @@ TEST(DBTest, DeletionMarkers2) {
Put("foo", "v1");
ASSERT_OK(dbfull()->TEST_CompactMemTable());
const int last = config::kMaxMemCompactLevel;
ASSERT_EQ(NumTableFilesAtLevel(0), 1); // foo => v1 is now in last level
ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level
dbfull()->TEST_CompactRange(0, NULL, NULL);
ASSERT_EQ(NumTableFilesAtLevel(1), 1); // foo => v1 is now in last level
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level
ASSERT_EQ(NumTableFilesAtLevel(last-1), 0);

// Place a table at level last-1 to prevent merging with preceding mutation
Put("a", "begin");
Put("z", "end");
dbfull()->TEST_CompactMemTable();
ASSERT_EQ(NumTableFilesAtLevel(0), 1);
dbfull()->TEST_CompactMemTable(); // goes to last-1
ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);

Delete("foo");
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level 0
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
dbfull()->TEST_CompactRange(0, NULL, NULL); // Riak overlaps level 1
// DEL kept: "last" file overlaps
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
// Merging last-1 w/ last, so we are the base level for "foo", so
// DEL is removed. (as is v1).
dbfull()->TEST_CompactRange(1, NULL, NULL);
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL ]");
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");

dbfull()->TEST_CompactRange(2, NULL, NULL);
ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
}

TEST(DBTest, OverlapInLevel0) {
do {
ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config";
ASSERT_EQ(config::kMaxMemCompactLevel, 3) << "Fix test to match config";

// Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
ASSERT_OK(Put("100", "v100"));
Expand All @@ -1269,7 +1270,7 @@ TEST(DBTest, OverlapInLevel0) {
ASSERT_OK(Delete("999"));
dbfull()->TEST_CompactMemTable();
dbfull()->TEST_CompactRange(0, NULL, NULL);
ASSERT_EQ("0,1,1", FilesPerLevel());
ASSERT_EQ("0,0,1,1", FilesPerLevel());

// Make files spanning the following ranges in level-0:
// files[0] 200 .. 900
Expand All @@ -1282,7 +1283,7 @@ TEST(DBTest, OverlapInLevel0) {
ASSERT_OK(Put("600", "v600"));
ASSERT_OK(Put("900", "v900"));
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("2,1,1", FilesPerLevel());
ASSERT_EQ("2,0,1,1", FilesPerLevel());

// Compact away the placeholder files we created initially
dbfull()->TEST_CompactRange(1, NULL, NULL);
Expand Down Expand Up @@ -1421,40 +1422,37 @@ TEST(DBTest, CustomComparator) {
}

TEST(DBTest, ManualCompaction) {
ASSERT_EQ(config::kMaxMemCompactLevel, 2)
ASSERT_EQ(config::kMaxMemCompactLevel, 3)
<< "Need to update this test to match kMaxMemCompactLevel";

MakeTables(3, "p", "q");
ASSERT_EQ("3", FilesPerLevel());
ASSERT_EQ("1,0,1,1", FilesPerLevel());

// Compaction range falls before files
// (Basho compacts all at zero no matter what)
Compact("", "c");
//ASSERT_EQ("3", FilesPerLevel());
ASSERT_EQ("0,1", FilesPerLevel());
ASSERT_EQ("0,1,1,1", FilesPerLevel());

// Compaction range falls after files
Compact("r", "z");
//ASSERT_EQ("3", FilesPerLevel());
ASSERT_EQ("0,1", FilesPerLevel());
ASSERT_EQ("0,1,1,1", FilesPerLevel());

// Compaction range overlaps files
Compact("p1", "p9");
ASSERT_EQ("0,1", FilesPerLevel());
ASSERT_EQ("0,0,0,1", FilesPerLevel());

// Populate a different range
MakeTables(3, "c", "e");
ASSERT_EQ("3,1", FilesPerLevel());
ASSERT_EQ("1,0,1,2", FilesPerLevel());

// Compact just the new range
Compact("b", "f");
ASSERT_EQ("0,2", FilesPerLevel());
ASSERT_EQ("0,0,0,2", FilesPerLevel());

// Compact all
MakeTables(1, "a", "z");
ASSERT_EQ("1,2", FilesPerLevel());
ASSERT_EQ("0,0,1,2", FilesPerLevel());
db_->CompactRange(NULL, NULL);
ASSERT_EQ("0,3", FilesPerLevel());
ASSERT_EQ("0,0,0,1", FilesPerLevel());
}

TEST(DBTest, DBOpen_Options) {
Expand Down
3 changes: 2 additions & 1 deletion db/dbformat.h
Expand Up @@ -44,7 +44,8 @@ static const size_t kL0_StopWritesTrigger = 12;
// expensive manifest file operations. We do not push all the way to
// the largest level since that can generate a lot of wasted disk
// space if the same key space is being repeatedly overwritten.
static const int kMaxMemCompactLevel = 2;
// Basho: push to kNumOverlapLevels +1 ... beyond "landing level"
static const int kMaxMemCompactLevel = 3;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment states that kMaxMemCompactLevel is kNumOverlapLevels +1, so why not just actually define it to be that value?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

changed as recommended.

On Aug 8, 2015, at 4:09 PM, Paul A. Place <notifications@github.com mailto:notifications@github.com> wrote:

In db/dbformat.h #163 (comment):

@@ -44,7 +44,8 @@ static const size_t kL0_StopWritesTrigger = 12;
// expensive manifest file operations. We do not push all the way to
// the largest level since that can generate a lot of wasted disk
// space if the same key space is being repeatedly overwritten.
-static const int kMaxMemCompactLevel = 2;
+// Basho: push to kNumOverlapLevels +1 ... beyond "landing level"
+static const int kMaxMemCompactLevel = 3;
The comment states that kMaxMemCompactLevel is kNumOverlapLevels +1, so why not just actually define it to be that value?


Reply to this email directly or view it on GitHub https://github.com/basho/leveldb/pull/163/files#r36581315.


} // namespace config

Expand Down
24 changes: 19 additions & 5 deletions db/version_set.cc
Expand Up @@ -475,17 +475,18 @@ bool Version::OverlapInLevel(int level,

int Version::PickLevelForMemTableOutput(
const Slice& smallest_user_key,
const Slice& largest_user_key) {
const Slice& largest_user_key,
const int level_limit) {
int level = 0;
#if 0
#if 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this code is being put back into production, should remove the "#if 1/#endif" conditional compilation statements.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

corrected as recommended

On Aug 8, 2015, at 4:11 PM, Paul A. Place notifications@github.com wrote:

In db/version_set.cc #163 (comment):

int level = 0;
-#if 0
+#if 1
Since this code is being put back into production, should remove the "#if 1/#endif" conditional compilation statements.


Reply to this email directly or view it on GitHub https://github.com/basho/leveldb/pull/163/files#r36581331.

// test if level 1 m_OverlappedFiles is false, proceed only then
if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
// Push to next level if there is no overlap in next level,
// and the #bytes overlapping in the level after that are limited.
InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
std::vector<FileMetaData*> overlaps;
while (level < config::kMaxMemCompactLevel) {
while (level < level_limit) {
if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
break;
}
Expand All @@ -496,6 +497,10 @@ int Version::PickLevelForMemTableOutput(
}
level++;
}
// do not waste a move into an overlapped level, breaks
// different performance improvement
if (gLevelTraits[level].m_OverlappedFiles)
level=0;
}
#endif
return level;
Expand Down Expand Up @@ -1057,6 +1062,15 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) {
}
}


bool
VersionSet::NeighborCompactionsQuiet(int level)
{
return((0==level || !m_CompactionStatus[level-1].m_Submitted)
&& !m_CompactionStatus[level+1].m_Submitted);
} // VersionSet::NeighborCompactionsQuiet


bool
VersionSet::Finalize(Version* v)
{
Expand Down Expand Up @@ -1095,7 +1109,7 @@ VersionSet::Finalize(Version* v)
else
{
// must not have compactions scheduled on either the level below or the level above
compact_ok=(!m_CompactionStatus[level-1].m_Submitted && !m_CompactionStatus[level+1].m_Submitted);
compact_ok=NeighborCompactionsQuiet(level);
} // else
} // if

Expand Down Expand Up @@ -1265,7 +1279,7 @@ VersionSet::UpdatePenalty(
penalty_score = penalty_score * penalty_score * penalty_score;

// if no penalty so far, set a minor penalty to the landing
// level to help it flush. because first sorted layer needs to
// level to help it flush. because first sorted layer needs to
// clear before next dump of overlapped files.
if (penalty_score<1.0 && config::kNumOverlapLevels==level)
{
Expand Down
5 changes: 4 additions & 1 deletion db/version_set.h
Expand Up @@ -102,7 +102,8 @@ class Version {
// Return the level at which we should place a new memtable compaction
// result that covers the range [smallest_user_key,largest_user_key].
int PickLevelForMemTableOutput(const Slice& smallest_user_key,
const Slice& largest_user_key);
const Slice& largest_user_key,
const int level_limit);

size_t NumFiles(int level) const { return files_[level].size(); }

Expand Down Expand Up @@ -317,6 +318,8 @@ class VersionSet {
{ m_CompactionStatus[level].m_Running=false;
m_CompactionStatus[level].m_Submitted=false;}

bool NeighborCompactionsQuiet(int level);

private:
class Builder;

Expand Down