diff --git a/.classpath b/.classpath
index 9daadee954..009f6148a2 100644
--- a/.classpath
+++ b/.classpath
@@ -9,17 +9,20 @@ + + - + - + +
@@ -40,7 +43,7 @@ - +
@@ -53,9 +56,13 @@ - + + + + +
diff --git a/.settings/org.eclipse.jdt.ui.prefs b/.settings/org.eclipse.jdt.ui.prefs
index 038d4674f0..b5553d65f5 100644
--- a/.settings/org.eclipse.jdt.ui.prefs
+++ b/.settings/org.eclipse.jdt.ui.prefs
@@ -1,4 +1,4 @@
-#Tue Jan 13 14:27:58 PST 2009
+#Sat Sep 22 05:05:45 PDT 2012
 cleanup.add_default_serial_version_id=true
 cleanup.add_generated_serial_version_id=false
 cleanup.add_missing_annotations=true
diff --git a/META-INF/MANIFEST.MF b/META-INF/MANIFEST.MF
index 86ad47e2ea..08a19d78e9 100644
--- a/META-INF/MANIFEST.MF
+++ b/META-INF/MANIFEST.MF
@@ -1,7 +1,8 @@
 Manifest-Version: 1.0
 Ant-Version: Apache Ant 1.7.1
 Created-By: 20.2-b06 (Sun Microsystems Inc.)
+Voldemort-Implementation-Version: 1.3.0
 Implementation-Title: Voldemort
-Implementation-Version: 0.96
+Implementation-Version: 1.3.0
 Implementation-Vendor: LinkedIn
diff --git a/bin/PREUPGRADE_FOR_1_1_X_README b/bin/PREUPGRADE_FOR_1_1_X_README
new file mode 100644
index 0000000000..2cde462b4e
--- /dev/null
+++ b/bin/PREUPGRADE_FOR_1_1_X_README
@@ -0,0 +1,110 @@
+This directory contains a utility to convert BDB JE data between different versions of Voldemort.
+
+Need for Conversion
+-------------------
+Voldemort has been using the "sorted duplicates" feature of BDB JE to handle
+conflicting writes to the same key. At the very minimum, the conversion gets
+rid of BDB sorted-duplicates support and handles duplicates in the Voldemort
+storage layer itself. The decision was made after months of working closely
+with the Oracle JE team to understand the factors affecting performance.
+
+Data Formats
+------------
+This section describes the data formats themselves.
+
+1) Base Format (Base)
+---------------------
+This is the format used by Voldemort up until 1.1.x, relying on BDB JE for
+duplicate handling.
+
+Disadvantages:
+-- The manner in which BDB JE handles duplicates is not suitable for an
+   application with a small percentage (2-3%) of duplicates, i.e. Voldemort.
+-- A data bloat issue prevented us from migrating to any higher 4.x version,
+   which we needed in order to control cache eviction.
+-- Incompatible with how duplicates are handled in JE5.
+-- May incur additional locking costs for the "duplicates" subtree.
+
+2) New duplicate format (NewDup)
+--------------------------------
+This format is supported from release 1.1.x onwards; the Voldemort storage layer
+handles duplicates and the BDB JE version is bumped up to JE 4.1.17.
+
+Advantages:
+-- Ability to move data off disk. This is very GC friendly, relying on the OS
+   page cache for the data and using the JVM heap only for the index. This is
+   achieved by setting the "bdb.cache.evictln" server parameter to "true".
+-- Ability to evict data brought into the cache during scans, minimizing the
+   impact on online traffic (Restore, Rebalance, Retention). This is achieved
+   by setting "bdb.minimize.scan.impact" to "true".
+-- Thinner storage layer, e.g. BdbStorageEngine.put() does not incur the cost
+   of an additional delete().
+-- General speed-up due to the elimination of duplicates.
+
+This format is the minimum requirement to be able to upgrade to 1.1.x and higher.
+
+3) Partition Scan format (PidScan)
+----------------------------------
+This is a superset of the 'NewDup' format, supported from 1.1.x upwards. In
+addition to eliminating duplicates and upgrading to JE 4.1.17, it adds a 2-byte
+prefix representing the partition id to each key.
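
Aside (not part of the patch above): to make the PidScan layout concrete, here is a minimal Java sketch of the idea, a 2-byte partition id prepended to the serialized key. The class and method names are illustrative only, and the big-endian byte order is an assumption rather than something this patch specifies.

    import java.nio.ByteBuffer;

    // Illustrative only: a fixed 2-byte partition-id prefix in front of the
    // original serialized key, as described for the PidScan format.
    public class PartitionPrefixSketch {

        // Prepend the partition id (0..65535) as two bytes before the key.
        static byte[] addPartitionPrefix(int partitionId, byte[] key) {
            return ByteBuffer.allocate(2 + key.length)
                             .putShort((short) partitionId)
                             .put(key)
                             .array();
        }

        // Drop the first two bytes to recover the original key.
        static byte[] stripPartitionPrefix(byte[] prefixedKey) {
            byte[] key = new byte[prefixedKey.length - 2];
            System.arraycopy(prefixedKey, 2, key, 0, key.length);
            return key;
        }

        // Read the partition id back out of a prefixed key.
        static int readPartitionId(byte[] prefixedKey) {
            return ByteBuffer.wrap(prefixedKey).getShort() & 0xFFFF;
        }
    }
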
+
+Advantages:
+-- Speeds up Restore and Rebalancing linearly with the number of partitions
+   actually fetched (which means much less time spent in degraded mode).
+
+This is an optional format. You can turn it off by setting
+bdb.prefix.keys.with.partitionid=false if you do not want it for some reason.
+
+Note: We have not seen the extra 2 bytes cause any overhead to online
+performance.
+
+IMPORTANT: IT IS REQUIRED TO CONVERT TO EITHER 'NewDup' OR 'PidScan' TO RUN
+VOLDEMORT WITH BDB, STARTING WITH RELEASE 1.1.x
+
+Running the Conversion Utility
+------------------------------
+The tool converts one database from a source environment to a destination
+environment. You need to run the tool for each of the databases or Voldemort
+stores you have. You can bring down one Voldemort server at a time, perform
+the conversion, and bring it back up on the appropriate release.
+
+Note: For users running with "bdb.one.env.per.store=false", this means you
+will have to run the tool with the same --src and --dest options for each
+database contained.
+
+In addition to the BDB environment locations, the tool needs the cluster.xml
+to generate the partition prefix.
+
+$./voldemort-convert-bdb.sh --src
+                            --dest
+                            --store
+                            --cluster-xml
+                            --from-format
+                            --to-format
+                            --je-log-size
+                            --btree-nodemax
+
+We recommend you run the following to move to release 1.1.x and up.
+
+$./voldemort-convert-bdb.sh --src /path/to/src/env
+                            --dest /path/to/dest/env
+                            --store teststore
+                            --cluster-xml /path/to/cluster/xml
+                            --from-format Base
+                            --to-format PidScan
+
+
+
+
+
+
+
+
+
+
+
diff --git a/bin/generate_cluster_xml.py b/bin/generate_cluster_xml.py
index 1811fdd2fc..8160988580 100644
--- a/bin/generate_cluster_xml.py
+++ b/bin/generate_cluster_xml.py
@@ -10,6 +10,8 @@
 # Setup and argument parser
 parser = argparse.ArgumentParser(description='Build a voldemort cluster.xml.')
 # Add supported arguments
+parser.add_argument('-f', '--file', type=str, dest='file',
+                    help='the file of the list of hosts(one per line)')
 parser.add_argument('-N', '--name', type=str, default='voldemort', dest='name',
                     help='the name you want to give the cluster')
 parser.add_argument('-n', '--nodes', type=int, default=2, dest='nodes',
@@ -44,7 +46,11 @@
   sys.exit(1)
 
 # Store arguments
-nodes = args.nodes
+if args.file:
+  hostList = open(args.file).readlines()
+  nodes = len(hostList)
+else:
+  nodes = args.nodes
 partitions = args.partitions
 name = args.name
 http_port = args.http_port
@@ -73,7 +79,10 @@
   print "  <server>"
   print "    <id>%d</id>" % i
-  print "    <host>host%d</host>" % i
+  if args.file:
+    print "    <host>%s</host>" % hostList[i].strip()
+  else:
+    print "    <host>host%d</host>" % i
   print "    <http-port>%d</http-port>" % http_port
   print "    <socket-port>%d</socket-port>" % sock_port
   print "    <admin-port>%d</admin-port>" % admin_port
diff --git a/bin/run-class.bat b/bin/run-class.bat
new file mode 100644
index 0000000000..b0e3618df0
--- /dev/null
+++ b/bin/run-class.bat
@@ -0,0 +1,46 @@
+@echo off
+
+REM
+REM Copyright 2013 Carlos Tasada
+REM
+REM Licensed under the Apache License, Version 2.0 (the "License");
+REM you may not use this file except in compliance with the License.
+REM You may obtain a copy of the License at
+REM
+REM http://www.apache.org/licenses/LICENSE-2.0
+REM
+REM Unless required by applicable law or agreed to in writing, software
+REM distributed under the License is distributed on an "AS IS" BASIS,
+REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+REM See the License for the specific language governing permissions and
+REM limitations under the License.
+REM
+REM ** This Windows BAT file is not tested with each Voldemort release.
** + +set argC=0 +for %%a in (%*) do set /a argC+=1 +if %argC% geq 1 goto :continue +echo %0 java-class-name [options] +goto :eof +:continue + +SET BASE_DIR=%~dp0.. +SET CLASSPATH=. + +set VOLDEMORT_CONFIG_DIR=%1%/config + +for %%j in ("%BASE_DIR%\dist\*.jar") do (call :append_classpath "%%j") +for %%j in ("%BASE_DIR%\lib\*.jar") do (call :append_classpath "%%j") +set CLASSPATH=%CLASSPATH%;"%BASE_DIR%\dist\resources" +goto :run + +:append_classpath +set CLASSPATH=%CLASSPATH%;%1 +goto :eof + +:run +if "%VOLD_OPTS%" == "" set "VOLD_OPTS=-Xmx2G -server -Dcom.sun.management.jmxremote" +java -Dlog4j.configuration=%VOLDEMORT_CONFIG_DIR%\log4j.properties %VOLD_OPTS% -cp %CLASSPATH% %* + +endlocal +:eof \ No newline at end of file diff --git a/bin/run-class.sh b/bin/run-class.sh index f0d1fbe391..465637dd4a 100755 --- a/bin/run-class.sh +++ b/bin/run-class.sh @@ -1,7 +1,7 @@ #!/bin/bash # -# Copyright 2008-2009 LinkedIn, Inc +# Copyright 2008-2013 LinkedIn, Inc # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,7 +21,10 @@ if [ $# -lt 1 ]; then exit 1 fi -base_dir=$(dirname $0)/.. +script_path=$(readlink -f "$0") +script_dir=`dirname "$script_path"` + +base_dir=`dirname "$script_dir"` for file in $base_dir/lib/*.jar; do @@ -43,5 +46,8 @@ if [ -z "$VOLD_OPTS" ]; then VOLD_OPTS="-Xmx2G -server -Dcom.sun.management.jmxremote " fi +# add '-Dlog4j.debug ' to debug log4j issues. +LOG4JPROPERTIES="-Dlog4j.configuration=file:///${base_dir}/src/java/log4j.properties" + export CLASSPATH -java -Dlog4j.configuration=$base_dir/src/java/log4j.properties $VOLD_OPTS -cp $CLASSPATH $@ +java $LOG4JPROPERTIES $VOLD_OPTS -cp $CLASSPATH $@ diff --git a/bin/voldemort-admin-tool.bat b/bin/voldemort-admin-tool.bat new file mode 100644 index 0000000000..9629277257 --- /dev/null +++ b/bin/voldemort-admin-tool.bat @@ -0,0 +1,22 @@ +@echo off + +REM +REM Copyright 2013 Carlos Tasada +REM +REM Licensed under the Apache License, Version 2.0 (the "License"); +REM you may not use this file except in compliance with the License. +REM You may obtain a copy of the License at +REM +REM http://www.apache.org/licenses/LICENSE-2.0 +REM +REM Unless required by applicable law or agreed to in writing, software +REM distributed under the License is distributed on an "AS IS" BASIS, +REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +REM See the License for the specific language governing permissions and +REM limitations under the License. +REM +REM ** This Windows BAT file is not tested with each Voldemort release. ** + +SET BASE_DIR=%~dp0.. + +call "%BASE_DIR%/bin/run-class.bat" voldemort.VoldemortAdminTool %* \ No newline at end of file diff --git a/bin/voldemort-convert-bdb.sh b/bin/voldemort-convert-bdb.sh new file mode 100755 index 0000000000..52af1efd09 --- /dev/null +++ b/bin/voldemort-convert-bdb.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# +# Copyright 2008-2012 LinkedIn, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +base_dir=$(dirname $0)/.. + +for file in $base_dir/dist/*.jar; +do + CLASSPATH=$CLASSPATH:$file +done + +for file in $base_dir/lib/*.jar; +do + CLASSPATH=$CLASSPATH:$file +done + +for file in $base_dir/contrib/*/lib/*.jar; +do + CLASSPATH=$CLASSPATH:$file +done + +CLASSPATH=$CLASSPATH:$base_dir/dist/resources + +JVM_OPTS="-server -Xms5g -Xmx5g -XX:NewSize=1024m -XX:MaxNewSize=1024m -XX:+AlwaysPreTouch -XX:+UseCompressedOops -XX:+UseConcMarkSweepGC -XX:+UseParNewGC -XX:CMSInitiatingOccupancyFraction=70 -XX:SurvivorRatio=2" + +java -Dlog4j.configuration=src/java/log4j.properties $JVM_OPTS -cp $CLASSPATH voldemort.store.bdb.dataconversion.BdbConvertData $@ diff --git a/bin/voldemort-performance-tool.bat b/bin/voldemort-performance-tool.bat new file mode 100644 index 0000000000..bb401c1509 --- /dev/null +++ b/bin/voldemort-performance-tool.bat @@ -0,0 +1,22 @@ +@echo off + +REM +REM Copyright 2013 Carlos Tasada +REM +REM Licensed under the Apache License, Version 2.0 (the "License"); +REM you may not use this file except in compliance with the License. +REM You may obtain a copy of the License at +REM +REM http://www.apache.org/licenses/LICENSE-2.0 +REM +REM Unless required by applicable law or agreed to in writing, software +REM distributed under the License is distributed on an "AS IS" BASIS, +REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +REM See the License for the specific language governing permissions and +REM limitations under the License. +REM +REM ** This Windows BAT file is not tested with each Voldemort release. ** + +SET BASE_DIR=%~dp0.. + +call "%BASE_DIR%/bin/run-class.bat" voldemort.performance.benchmark.Benchmark %* \ No newline at end of file diff --git a/bin/voldemort-rebalance.bat b/bin/voldemort-rebalance.bat new file mode 100644 index 0000000000..7c918de15c --- /dev/null +++ b/bin/voldemort-rebalance.bat @@ -0,0 +1,22 @@ +@echo off + +REM +REM Copyright 2013 Carlos Tasada +REM +REM Licensed under the Apache License, Version 2.0 (the "License"); +REM you may not use this file except in compliance with the License. +REM You may obtain a copy of the License at +REM +REM http://www.apache.org/licenses/LICENSE-2.0 +REM +REM Unless required by applicable law or agreed to in writing, software +REM distributed under the License is distributed on an "AS IS" BASIS, +REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +REM See the License for the specific language governing permissions and +REM limitations under the License. +REM +REM ** This Windows BAT file is not tested with each Voldemort release. ** + +SET BASE_DIR=%~dp0.. + +call "%BASE_DIR%/bin/run-class.bat" voldemort.client.rebalance.RebalanceCLI %* \ No newline at end of file diff --git a/bin/voldemort-server.bat b/bin/voldemort-server.bat new file mode 100644 index 0000000000..e4dd896057 --- /dev/null +++ b/bin/voldemort-server.bat @@ -0,0 +1,49 @@ +@echo off + +REM +REM Copyright 2013 Carlos Tasada +REM +REM Licensed under the Apache License, Version 2.0 (the "License"); +REM you may not use this file except in compliance with the License. +REM You may obtain a copy of the License at +REM +REM http://www.apache.org/licenses/LICENSE-2.0 +REM +REM Unless required by applicable law or agreed to in writing, software +REM distributed under the License is distributed on an "AS IS" BASIS, +REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+REM See the License for the specific language governing permissions and +REM limitations under the License. +REM +REM ** This Windows BAT file is not tested with each Voldemort release. ** + +set argC=0 +for %%a in (%*) do set /a argC+=1 +if %argC% leq 2 goto :continue +echo USAGE: bin/voldemort-server.bat [voldemort_home_dir] [voldemort_config_dir] +goto :eof +:continue + +setlocal + +SET BASE_DIR=%~dp0.. +SET CLASSPATH=. + +set VOLDEMORT_CONFIG_DIR=%1%/config +rem call %VOLDEMORT_CONFIG_DIR%/voldemort-env.bat + +for %%j in ("%BASE_DIR%\dist\*.jar") do (call :append_classpath "%%j") +for %%j in ("%BASE_DIR%\lib\*.jar") do (call :append_classpath "%%j") +set CLASSPATH=%CLASSPATH%:"%BASE_DIR%\dist\resources" +goto :run + +:append_classpath +set CLASSPATH=%CLASSPATH%;%1 +goto :eof + +:run +if "%VOLD_OPTS%" == "" set "VOLD_OPTS=-Xmx2G -server -Dcom.sun.management.jmxremote" +java %VOLD_OPTS% -cp %CLASSPATH% voldemort.server.VoldemortServer %* + +endlocal +:eof \ No newline at end of file diff --git a/bin/voldemort-shell.bat b/bin/voldemort-shell.bat new file mode 100644 index 0000000000..3300cef41d --- /dev/null +++ b/bin/voldemort-shell.bat @@ -0,0 +1,34 @@ +@echo off + +REM +REM Copyright 2013 Carlos Tasada +REM +REM Licensed under the Apache License, Version 2.0 (the "License"); +REM you may not use this file except in compliance with the License. +REM You may obtain a copy of the License at +REM +REM http://www.apache.org/licenses/LICENSE-2.0 +REM +REM Unless required by applicable law or agreed to in writing, software +REM distributed under the License is distributed on an "AS IS" BASIS, +REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +REM See the License for the specific language governing permissions and +REM limitations under the License. +REM +REM ** This Windows BAT file is not tested with each Voldemort release. ** + +set argC=0 +for %%a in (%*) do set /a argC+=1 +if %argC% geq 2 goto :continue +echo "USAGE: bin/voldemort-shell.bat store_name bootstrap_url [command_file] [--client-zone-id ]" +goto :eof +:continue + +setlocal +SET BASE_DIR=%~dp0.. 
+ +call "%BASE_DIR%\bin\run-class.bat" jline.ConsoleRunner voldemort.VoldemortClientShell %* + +endlocal + +:eof \ No newline at end of file diff --git a/build.properties b/build.properties index 2bc92bcf07..d851cb378b 100644 --- a/build.properties +++ b/build.properties @@ -42,4 +42,5 @@ tomcat.context=/voldemort javac.version=1.5 ## Release -curr.release=1.0.0 +curr.release=1.3.1 + diff --git a/build.xml b/build.xml index daefcc5334..88158310c2 100644 --- a/build.xml +++ b/build.xml @@ -79,7 +79,9 @@ - + + + @@ -122,6 +124,12 @@ + - + @@ -149,8 +158,9 @@ + - + @@ -422,7 +432,7 @@ - + diff --git a/clients/python/voldemort/protocol/voldemort_admin_pb2.py b/clients/python/voldemort/protocol/voldemort_admin_pb2.py index 008ac0cda0..08937e79ba 100644 --- a/clients/python/voldemort/protocol/voldemort_admin_pb2.py +++ b/clients/python/voldemort/protocol/voldemort_admin_pb2.py @@ -10,7 +10,7 @@ DESCRIPTOR = descriptor.FileDescriptor( name='voldemort-admin.proto', package='voldemort', - serialized_pb='\n\x15voldemort-admin.proto\x12\tvoldemort\x1a\x16voldemort-client.proto\"!\n\x12GetMetadataRequest\x12\x0b\n\x03key\x18\x01 \x02(\x0c\"]\n\x13GetMetadataResponse\x12%\n\x07version\x18\x01 \x01(\x0b\x32\x14.voldemort.Versioned\x12\x1f\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x10.voldemort.Error\"M\n\x15UpdateMetadataRequest\x12\x0b\n\x03key\x18\x01 \x02(\x0c\x12\'\n\tversioned\x18\x02 \x02(\x0b\x32\x14.voldemort.Versioned\"9\n\x16UpdateMetadataResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"7\n\tFileEntry\x12\x11\n\tfile_name\x18\x01 \x02(\t\x12\x17\n\x0f\x66ile_size_bytes\x18\x02 \x02(\x03\"F\n\x0ePartitionEntry\x12\x0b\n\x03key\x18\x01 \x02(\x0c\x12\'\n\tversioned\x18\x02 \x02(\x0b\x32\x14.voldemort.Versioned\"\x8e\x01\n\x1dUpdatePartitionEntriesRequest\x12\r\n\x05store\x18\x01 \x02(\t\x12\x32\n\x0fpartition_entry\x18\x02 \x02(\x0b\x32\x19.voldemort.PartitionEntry\x12*\n\x06\x66ilter\x18\x03 \x01(\x0b\x32\x1a.voldemort.VoldemortFilter\"A\n\x1eUpdatePartitionEntriesResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"-\n\x0fVoldemortFilter\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x0c\n\x04\x64\x61ta\x18\x02 \x02(\x0c\"\xaf\x01\n\x18UpdateSlopEntriesRequest\x12\r\n\x05store\x18\x01 \x02(\t\x12\x0b\n\x03key\x18\x02 \x02(\x0c\x12\'\n\x07version\x18\x03 \x02(\x0b\x32\x16.voldemort.VectorClock\x12,\n\x0crequest_type\x18\x04 \x02(\x0e\x32\x16.voldemort.RequestType\x12\r\n\x05value\x18\x05 \x01(\x0c\x12\x11\n\ttransform\x18\x06 \x01(\x0c\"<\n\x19UpdateSlopEntriesResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"d\n\x1a\x46\x65tchPartitionFilesRequest\x12\r\n\x05store\x18\x01 \x02(\t\x12\x37\n\x14replica_to_partition\x18\x02 \x03(\x0b\x32\x19.voldemort.PartitionTuple\"\xd7\x01\n\x1c\x46\x65tchPartitionEntriesRequest\x12\x37\n\x14replica_to_partition\x18\x01 \x03(\x0b\x32\x19.voldemort.PartitionTuple\x12\r\n\x05store\x18\x02 \x02(\t\x12*\n\x06\x66ilter\x18\x03 \x01(\x0b\x32\x1a.voldemort.VoldemortFilter\x12\x14\n\x0c\x66\x65tch_values\x18\x04 \x01(\x08\x12\x14\n\x0cskip_records\x18\x05 \x01(\x03\x12\x17\n\x0finitial_cluster\x18\x06 \x01(\t\"\x81\x01\n\x1d\x46\x65tchPartitionEntriesResponse\x12\x32\n\x0fpartition_entry\x18\x01 \x01(\x0b\x32\x19.voldemort.PartitionEntry\x12\x0b\n\x03key\x18\x02 \x01(\x0c\x12\x1f\n\x05\x65rror\x18\x03 \x01(\x0b\x32\x10.voldemort.Error\"\xac\x01\n\x1d\x44\x65letePartitionEntriesRequest\x12\r\n\x05store\x18\x01 \x02(\t\x12\x37\n\x14replica_to_partition\x18\x02 
\x03(\x0b\x32\x19.voldemort.PartitionTuple\x12*\n\x06\x66ilter\x18\x03 \x01(\x0b\x32\x1a.voldemort.VoldemortFilter\x12\x17\n\x0finitial_cluster\x18\x04 \x01(\t\"P\n\x1e\x44\x65letePartitionEntriesResponse\x12\r\n\x05\x63ount\x18\x01 \x01(\x03\x12\x1f\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x10.voldemort.Error\"\xcf\x01\n\x1dInitiateFetchAndUpdateRequest\x12\x0f\n\x07node_id\x18\x01 \x02(\x05\x12\r\n\x05store\x18\x02 \x02(\t\x12*\n\x06\x66ilter\x18\x03 \x01(\x0b\x32\x1a.voldemort.VoldemortFilter\x12\x37\n\x14replica_to_partition\x18\x04 \x03(\x0b\x32\x19.voldemort.PartitionTuple\x12\x17\n\x0finitial_cluster\x18\x05 \x01(\t\x12\x10\n\x08optimize\x18\x06 \x01(\x08\"1\n\x1b\x41syncOperationStatusRequest\x12\x12\n\nrequest_id\x18\x01 \x02(\x05\"/\n\x19\x41syncOperationStopRequest\x12\x12\n\nrequest_id\x18\x01 \x02(\x05\"=\n\x1a\x41syncOperationStopResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"2\n\x19\x41syncOperationListRequest\x12\x15\n\rshow_complete\x18\x02 \x02(\x08\"R\n\x1a\x41syncOperationListResponse\x12\x13\n\x0brequest_ids\x18\x01 \x03(\x05\x12\x1f\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x10.voldemort.Error\":\n\x0ePartitionTuple\x12\x14\n\x0creplica_type\x18\x01 \x02(\x05\x12\x12\n\npartitions\x18\x02 \x03(\x05\"e\n\x16PerStorePartitionTuple\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x37\n\x14replica_to_partition\x18\x02 \x03(\x0b\x32\x19.voldemort.PartitionTuple\"\xf8\x01\n\x19RebalancePartitionInfoMap\x12\x12\n\nstealer_id\x18\x01 \x02(\x05\x12\x10\n\x08\x64onor_id\x18\x02 \x02(\x05\x12\x0f\n\x07\x61ttempt\x18\x03 \x02(\x05\x12\x43\n\x18replica_to_add_partition\x18\x04 \x03(\x0b\x32!.voldemort.PerStorePartitionTuple\x12\x46\n\x1breplica_to_delete_partition\x18\x05 \x03(\x0b\x32!.voldemort.PerStorePartitionTuple\x12\x17\n\x0finitial_cluster\x18\x06 \x02(\t\"f\n\x1cInitiateRebalanceNodeRequest\x12\x46\n\x18rebalance_partition_info\x18\x01 \x02(\x0b\x32$.voldemort.RebalancePartitionInfoMap\"m\n#InitiateRebalanceNodeOnDonorRequest\x12\x46\n\x18rebalance_partition_info\x18\x01 \x03(\x0b\x32$.voldemort.RebalancePartitionInfoMap\"\x8a\x01\n\x1c\x41syncOperationStatusResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12\x0e\n\x06status\x18\x03 \x01(\t\x12\x10\n\x08\x63omplete\x18\x04 \x01(\x08\x12\x1f\n\x05\x65rror\x18\x05 \x01(\x0b\x32\x10.voldemort.Error\"\'\n\x16TruncateEntriesRequest\x12\r\n\x05store\x18\x01 \x02(\t\":\n\x17TruncateEntriesResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"*\n\x0f\x41\x64\x64StoreRequest\x12\x17\n\x0fstoreDefinition\x18\x01 \x02(\t\"3\n\x10\x41\x64\x64StoreResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"\'\n\x12\x44\x65leteStoreRequest\x12\x11\n\tstoreName\x18\x01 \x02(\t\"6\n\x13\x44\x65leteStoreResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"P\n\x11\x46\x65tchStoreRequest\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x11\n\tstore_dir\x18\x02 \x02(\t\x12\x14\n\x0cpush_version\x18\x03 \x01(\x03\"9\n\x10SwapStoreRequest\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x11\n\tstore_dir\x18\x02 \x02(\t\"P\n\x11SwapStoreResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\x12\x1a\n\x12previous_store_dir\x18\x02 \x01(\t\"@\n\x14RollbackStoreRequest\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x14\n\x0cpush_version\x18\x02 \x02(\x03\"8\n\x15RollbackStoreResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"&\n\x10RepairJobRequest\x12\x12\n\nstore_name\x18\x01 
\x01(\t\"4\n\x11RepairJobResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"=\n\x14ROStoreVersionDirMap\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x11\n\tstore_dir\x18\x02 \x02(\t\"/\n\x19GetROMaxVersionDirRequest\x12\x12\n\nstore_name\x18\x01 \x03(\t\"y\n\x1aGetROMaxVersionDirResponse\x12:\n\x11ro_store_versions\x18\x01 \x03(\x0b\x32\x1f.voldemort.ROStoreVersionDirMap\x12\x1f\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x10.voldemort.Error\"3\n\x1dGetROCurrentVersionDirRequest\x12\x12\n\nstore_name\x18\x01 \x03(\t\"}\n\x1eGetROCurrentVersionDirResponse\x12:\n\x11ro_store_versions\x18\x01 \x03(\x0b\x32\x1f.voldemort.ROStoreVersionDirMap\x12\x1f\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x10.voldemort.Error\"/\n\x19GetROStorageFormatRequest\x12\x12\n\nstore_name\x18\x01 \x03(\t\"y\n\x1aGetROStorageFormatResponse\x12:\n\x11ro_store_versions\x18\x01 \x03(\x0b\x32\x1f.voldemort.ROStoreVersionDirMap\x12\x1f\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x10.voldemort.Error\"@\n\x17\x46\x61iledFetchStoreRequest\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x11\n\tstore_dir\x18\x02 \x02(\t\";\n\x18\x46\x61iledFetchStoreResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"\xe6\x01\n\x1bRebalanceStateChangeRequest\x12K\n\x1drebalance_partition_info_list\x18\x01 \x03(\x0b\x32$.voldemort.RebalancePartitionInfoMap\x12\x16\n\x0e\x63luster_string\x18\x02 \x02(\t\x12\x0f\n\x07swap_ro\x18\x03 \x02(\x08\x12\x1f\n\x17\x63hange_cluster_metadata\x18\x04 \x02(\x08\x12\x1e\n\x16\x63hange_rebalance_state\x18\x05 \x02(\x08\x12\x10\n\x08rollback\x18\x06 \x02(\x08\"?\n\x1cRebalanceStateChangeResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"G\n DeleteStoreRebalanceStateRequest\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x0f\n\x07node_id\x18\x02 \x02(\x05\"D\n!DeleteStoreRebalanceStateResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"h\n\x13NativeBackupRequest\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x12\n\nbackup_dir\x18\x02 \x02(\t\x12\x14\n\x0cverify_files\x18\x03 \x02(\x08\x12\x13\n\x0bincremental\x18\x04 \x02(\x08\">\n\x14ReserveMemoryRequest\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x12\n\nsize_in_mb\x18\x02 \x02(\x03\"8\n\x15ReserveMemoryResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"\xf0\x0e\n\x15VoldemortAdminRequest\x12)\n\x04type\x18\x01 \x02(\x0e\x32\x1b.voldemort.AdminRequestType\x12\x33\n\x0cget_metadata\x18\x02 \x01(\x0b\x32\x1d.voldemort.GetMetadataRequest\x12\x39\n\x0fupdate_metadata\x18\x03 \x01(\x0b\x32 .voldemort.UpdateMetadataRequest\x12J\n\x18update_partition_entries\x18\x04 \x01(\x0b\x32(.voldemort.UpdatePartitionEntriesRequest\x12H\n\x17\x66\x65tch_partition_entries\x18\x05 \x01(\x0b\x32\'.voldemort.FetchPartitionEntriesRequest\x12J\n\x18\x64\x65lete_partition_entries\x18\x06 \x01(\x0b\x32(.voldemort.DeletePartitionEntriesRequest\x12K\n\x19initiate_fetch_and_update\x18\x07 \x01(\x0b\x32(.voldemort.InitiateFetchAndUpdateRequest\x12\x46\n\x16\x61sync_operation_status\x18\x08 \x01(\x0b\x32&.voldemort.AsyncOperationStatusRequest\x12H\n\x17initiate_rebalance_node\x18\t \x01(\x0b\x32\'.voldemort.InitiateRebalanceNodeRequest\x12\x42\n\x14\x61sync_operation_stop\x18\n \x01(\x0b\x32$.voldemort.AsyncOperationStopRequest\x12\x42\n\x14\x61sync_operation_list\x18\x0b \x01(\x0b\x32$.voldemort.AsyncOperationListRequest\x12;\n\x10truncate_entries\x18\x0c \x01(\x0b\x32!.voldemort.TruncateEntriesRequest\x12-\n\tadd_store\x18\r \x01(\x0b\x32\x1a.voldemort.AddStoreRequest\x12\x33\n\x0c\x64\x65lete_store\x18\x0e 
\x01(\x0b\x32\x1d.voldemort.DeleteStoreRequest\x12\x31\n\x0b\x66\x65tch_store\x18\x0f \x01(\x0b\x32\x1c.voldemort.FetchStoreRequest\x12/\n\nswap_store\x18\x10 \x01(\x0b\x32\x1b.voldemort.SwapStoreRequest\x12\x37\n\x0erollback_store\x18\x11 \x01(\x0b\x32\x1f.voldemort.RollbackStoreRequest\x12\x44\n\x16get_ro_max_version_dir\x18\x12 \x01(\x0b\x32$.voldemort.GetROMaxVersionDirRequest\x12L\n\x1aget_ro_current_version_dir\x18\x13 \x01(\x0b\x32(.voldemort.GetROCurrentVersionDirRequest\x12\x44\n\x15\x66\x65tch_partition_files\x18\x14 \x01(\x0b\x32%.voldemort.FetchPartitionFilesRequest\x12@\n\x13update_slop_entries\x18\x16 \x01(\x0b\x32#.voldemort.UpdateSlopEntriesRequest\x12>\n\x12\x66\x61iled_fetch_store\x18\x18 \x01(\x0b\x32\".voldemort.FailedFetchStoreRequest\x12\x43\n\x15get_ro_storage_format\x18\x19 \x01(\x0b\x32$.voldemort.GetROStorageFormatRequest\x12\x46\n\x16rebalance_state_change\x18\x1a \x01(\x0b\x32&.voldemort.RebalanceStateChangeRequest\x12/\n\nrepair_job\x18\x1b \x01(\x0b\x32\x1b.voldemort.RepairJobRequest\x12X\n initiate_rebalance_node_on_donor\x18\x1c \x01(\x0b\x32..voldemort.InitiateRebalanceNodeOnDonorRequest\x12Q\n\x1c\x64\x65lete_store_rebalance_state\x18\x1d \x01(\x0b\x32+.voldemort.DeleteStoreRebalanceStateRequest\x12\x35\n\rnative_backup\x18\x1e \x01(\x0b\x32\x1e.voldemort.NativeBackupRequest\x12\x37\n\x0ereserve_memory\x18\x1f \x01(\x0b\x32\x1f.voldemort.ReserveMemoryRequest*\xc8\x05\n\x10\x41\x64minRequestType\x12\x10\n\x0cGET_METADATA\x10\x00\x12\x13\n\x0fUPDATE_METADATA\x10\x01\x12\x1c\n\x18UPDATE_PARTITION_ENTRIES\x10\x02\x12\x1b\n\x17\x46\x45TCH_PARTITION_ENTRIES\x10\x03\x12\x1c\n\x18\x44\x45LETE_PARTITION_ENTRIES\x10\x04\x12\x1d\n\x19INITIATE_FETCH_AND_UPDATE\x10\x05\x12\x1a\n\x16\x41SYNC_OPERATION_STATUS\x10\x06\x12\x1b\n\x17INITIATE_REBALANCE_NODE\x10\x07\x12\x18\n\x14\x41SYNC_OPERATION_STOP\x10\x08\x12\x18\n\x14\x41SYNC_OPERATION_LIST\x10\t\x12\x14\n\x10TRUNCATE_ENTRIES\x10\n\x12\r\n\tADD_STORE\x10\x0b\x12\x10\n\x0c\x44\x45LETE_STORE\x10\x0c\x12\x0f\n\x0b\x46\x45TCH_STORE\x10\r\x12\x0e\n\nSWAP_STORE\x10\x0e\x12\x12\n\x0eROLLBACK_STORE\x10\x0f\x12\x1a\n\x16GET_RO_MAX_VERSION_DIR\x10\x10\x12\x1e\n\x1aGET_RO_CURRENT_VERSION_DIR\x10\x11\x12\x19\n\x15\x46\x45TCH_PARTITION_FILES\x10\x12\x12\x17\n\x13UPDATE_SLOP_ENTRIES\x10\x14\x12\x16\n\x12\x46\x41ILED_FETCH_STORE\x10\x16\x12\x19\n\x15GET_RO_STORAGE_FORMAT\x10\x17\x12\x1a\n\x16REBALANCE_STATE_CHANGE\x10\x18\x12\x0e\n\nREPAIR_JOB\x10\x19\x12$\n INITIATE_REBALANCE_NODE_ON_DONOR\x10\x1a\x12 \n\x1c\x44\x45LETE_STORE_REBALANCE_STATE\x10\x1b\x12\x11\n\rNATIVE_BACKUP\x10\x1c\x12\x12\n\x0eRESERVE_MEMORY\x10\x1d\x42-\n\x1cvoldemort.client.protocol.pbB\x0bVAdminProtoH\x01') + serialized_pb='\n\x15voldemort-admin.proto\x12\tvoldemort\x1a\x16voldemort-client.proto\"!\n\x12GetMetadataRequest\x12\x0b\n\x03key\x18\x01 \x02(\x0c\"]\n\x13GetMetadataResponse\x12%\n\x07version\x18\x01 \x01(\x0b\x32\x14.voldemort.Versioned\x12\x1f\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x10.voldemort.Error\"M\n\x15UpdateMetadataRequest\x12\x0b\n\x03key\x18\x01 \x02(\x0c\x12\'\n\tversioned\x18\x02 \x02(\x0b\x32\x14.voldemort.Versioned\"9\n\x16UpdateMetadataResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"7\n\tFileEntry\x12\x11\n\tfile_name\x18\x01 \x02(\t\x12\x17\n\x0f\x66ile_size_bytes\x18\x02 \x02(\x03\"F\n\x0ePartitionEntry\x12\x0b\n\x03key\x18\x01 \x02(\x0c\x12\'\n\tversioned\x18\x02 \x02(\x0b\x32\x14.voldemort.Versioned\"\x8e\x01\n\x1dUpdatePartitionEntriesRequest\x12\r\n\x05store\x18\x01 
\x02(\t\x12\x32\n\x0fpartition_entry\x18\x02 \x02(\x0b\x32\x19.voldemort.PartitionEntry\x12*\n\x06\x66ilter\x18\x03 \x01(\x0b\x32\x1a.voldemort.VoldemortFilter\"A\n\x1eUpdatePartitionEntriesResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"-\n\x0fVoldemortFilter\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x0c\n\x04\x64\x61ta\x18\x02 \x02(\x0c\"\xaf\x01\n\x18UpdateSlopEntriesRequest\x12\r\n\x05store\x18\x01 \x02(\t\x12\x0b\n\x03key\x18\x02 \x02(\x0c\x12\'\n\x07version\x18\x03 \x02(\x0b\x32\x16.voldemort.VectorClock\x12,\n\x0crequest_type\x18\x04 \x02(\x0e\x32\x16.voldemort.RequestType\x12\r\n\x05value\x18\x05 \x01(\x0c\x12\x11\n\ttransform\x18\x06 \x01(\x0c\"<\n\x19UpdateSlopEntriesResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"d\n\x1a\x46\x65tchPartitionFilesRequest\x12\r\n\x05store\x18\x01 \x02(\t\x12\x37\n\x14replica_to_partition\x18\x02 \x03(\x0b\x32\x19.voldemort.PartitionTuple\"\xef\x01\n\x1c\x46\x65tchPartitionEntriesRequest\x12\x37\n\x14replica_to_partition\x18\x01 \x03(\x0b\x32\x19.voldemort.PartitionTuple\x12\r\n\x05store\x18\x02 \x02(\t\x12*\n\x06\x66ilter\x18\x03 \x01(\x0b\x32\x1a.voldemort.VoldemortFilter\x12\x14\n\x0c\x66\x65tch_values\x18\x04 \x01(\x08\x12\x14\n\x0cskip_records\x18\x05 \x01(\x03\x12\x17\n\x0finitial_cluster\x18\x06 \x01(\t\x12\x16\n\x0e\x66\x65tch_orphaned\x18\x07 \x01(\x08\"\x81\x01\n\x1d\x46\x65tchPartitionEntriesResponse\x12\x32\n\x0fpartition_entry\x18\x01 \x01(\x0b\x32\x19.voldemort.PartitionEntry\x12\x0b\n\x03key\x18\x02 \x01(\x0c\x12\x1f\n\x05\x65rror\x18\x03 \x01(\x0b\x32\x10.voldemort.Error\"\xac\x01\n\x1d\x44\x65letePartitionEntriesRequest\x12\r\n\x05store\x18\x01 \x02(\t\x12\x37\n\x14replica_to_partition\x18\x02 \x03(\x0b\x32\x19.voldemort.PartitionTuple\x12*\n\x06\x66ilter\x18\x03 \x01(\x0b\x32\x1a.voldemort.VoldemortFilter\x12\x17\n\x0finitial_cluster\x18\x04 \x01(\t\"P\n\x1e\x44\x65letePartitionEntriesResponse\x12\r\n\x05\x63ount\x18\x01 \x01(\x03\x12\x1f\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x10.voldemort.Error\"\xcf\x01\n\x1dInitiateFetchAndUpdateRequest\x12\x0f\n\x07node_id\x18\x01 \x02(\x05\x12\r\n\x05store\x18\x02 \x02(\t\x12*\n\x06\x66ilter\x18\x03 \x01(\x0b\x32\x1a.voldemort.VoldemortFilter\x12\x37\n\x14replica_to_partition\x18\x04 \x03(\x0b\x32\x19.voldemort.PartitionTuple\x12\x17\n\x0finitial_cluster\x18\x05 \x01(\t\x12\x10\n\x08optimize\x18\x06 \x01(\x08\"1\n\x1b\x41syncOperationStatusRequest\x12\x12\n\nrequest_id\x18\x01 \x02(\x05\"/\n\x19\x41syncOperationStopRequest\x12\x12\n\nrequest_id\x18\x01 \x02(\x05\"=\n\x1a\x41syncOperationStopResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"2\n\x19\x41syncOperationListRequest\x12\x15\n\rshow_complete\x18\x02 \x02(\x08\"R\n\x1a\x41syncOperationListResponse\x12\x13\n\x0brequest_ids\x18\x01 \x03(\x05\x12\x1f\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x10.voldemort.Error\":\n\x0ePartitionTuple\x12\x14\n\x0creplica_type\x18\x01 \x02(\x05\x12\x12\n\npartitions\x18\x02 \x03(\x05\"e\n\x16PerStorePartitionTuple\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x37\n\x14replica_to_partition\x18\x02 \x03(\x0b\x32\x19.voldemort.PartitionTuple\"\xf8\x01\n\x19RebalancePartitionInfoMap\x12\x12\n\nstealer_id\x18\x01 \x02(\x05\x12\x10\n\x08\x64onor_id\x18\x02 \x02(\x05\x12\x0f\n\x07\x61ttempt\x18\x03 \x02(\x05\x12\x43\n\x18replica_to_add_partition\x18\x04 \x03(\x0b\x32!.voldemort.PerStorePartitionTuple\x12\x46\n\x1breplica_to_delete_partition\x18\x05 \x03(\x0b\x32!.voldemort.PerStorePartitionTuple\x12\x17\n\x0finitial_cluster\x18\x06 
\x02(\t\"f\n\x1cInitiateRebalanceNodeRequest\x12\x46\n\x18rebalance_partition_info\x18\x01 \x02(\x0b\x32$.voldemort.RebalancePartitionInfoMap\"m\n#InitiateRebalanceNodeOnDonorRequest\x12\x46\n\x18rebalance_partition_info\x18\x01 \x03(\x0b\x32$.voldemort.RebalancePartitionInfoMap\"\x8a\x01\n\x1c\x41syncOperationStatusResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12\x0e\n\x06status\x18\x03 \x01(\t\x12\x10\n\x08\x63omplete\x18\x04 \x01(\x08\x12\x1f\n\x05\x65rror\x18\x05 \x01(\x0b\x32\x10.voldemort.Error\"\'\n\x16TruncateEntriesRequest\x12\r\n\x05store\x18\x01 \x02(\t\":\n\x17TruncateEntriesResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"*\n\x0f\x41\x64\x64StoreRequest\x12\x17\n\x0fstoreDefinition\x18\x01 \x02(\t\"3\n\x10\x41\x64\x64StoreResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"\'\n\x12\x44\x65leteStoreRequest\x12\x11\n\tstoreName\x18\x01 \x02(\t\"6\n\x13\x44\x65leteStoreResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"P\n\x11\x46\x65tchStoreRequest\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x11\n\tstore_dir\x18\x02 \x02(\t\x12\x14\n\x0cpush_version\x18\x03 \x01(\x03\"9\n\x10SwapStoreRequest\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x11\n\tstore_dir\x18\x02 \x02(\t\"P\n\x11SwapStoreResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\x12\x1a\n\x12previous_store_dir\x18\x02 \x01(\t\"@\n\x14RollbackStoreRequest\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x14\n\x0cpush_version\x18\x02 \x02(\x03\"8\n\x15RollbackStoreResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"&\n\x10RepairJobRequest\x12\x12\n\nstore_name\x18\x01 \x01(\t\"4\n\x11RepairJobResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"=\n\x14ROStoreVersionDirMap\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x11\n\tstore_dir\x18\x02 \x02(\t\"/\n\x19GetROMaxVersionDirRequest\x12\x12\n\nstore_name\x18\x01 \x03(\t\"y\n\x1aGetROMaxVersionDirResponse\x12:\n\x11ro_store_versions\x18\x01 \x03(\x0b\x32\x1f.voldemort.ROStoreVersionDirMap\x12\x1f\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x10.voldemort.Error\"3\n\x1dGetROCurrentVersionDirRequest\x12\x12\n\nstore_name\x18\x01 \x03(\t\"}\n\x1eGetROCurrentVersionDirResponse\x12:\n\x11ro_store_versions\x18\x01 \x03(\x0b\x32\x1f.voldemort.ROStoreVersionDirMap\x12\x1f\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x10.voldemort.Error\"/\n\x19GetROStorageFormatRequest\x12\x12\n\nstore_name\x18\x01 \x03(\t\"y\n\x1aGetROStorageFormatResponse\x12:\n\x11ro_store_versions\x18\x01 \x03(\x0b\x32\x1f.voldemort.ROStoreVersionDirMap\x12\x1f\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x10.voldemort.Error\"@\n\x17\x46\x61iledFetchStoreRequest\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x11\n\tstore_dir\x18\x02 \x02(\t\";\n\x18\x46\x61iledFetchStoreResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"\xe6\x01\n\x1bRebalanceStateChangeRequest\x12K\n\x1drebalance_partition_info_list\x18\x01 \x03(\x0b\x32$.voldemort.RebalancePartitionInfoMap\x12\x16\n\x0e\x63luster_string\x18\x02 \x02(\t\x12\x0f\n\x07swap_ro\x18\x03 \x02(\x08\x12\x1f\n\x17\x63hange_cluster_metadata\x18\x04 \x02(\x08\x12\x1e\n\x16\x63hange_rebalance_state\x18\x05 \x02(\x08\x12\x10\n\x08rollback\x18\x06 \x02(\x08\"?\n\x1cRebalanceStateChangeResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"G\n DeleteStoreRebalanceStateRequest\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x0f\n\x07node_id\x18\x02 
\x02(\x05\"D\n!DeleteStoreRebalanceStateResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"h\n\x13NativeBackupRequest\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x12\n\nbackup_dir\x18\x02 \x02(\t\x12\x14\n\x0cverify_files\x18\x03 \x02(\x08\x12\x13\n\x0bincremental\x18\x04 \x02(\x08\">\n\x14ReserveMemoryRequest\x12\x12\n\nstore_name\x18\x01 \x02(\t\x12\x12\n\nsize_in_mb\x18\x02 \x02(\x03\"8\n\x15ReserveMemoryResponse\x12\x1f\n\x05\x65rror\x18\x01 \x01(\x0b\x32\x10.voldemort.Error\"\xf0\x0e\n\x15VoldemortAdminRequest\x12)\n\x04type\x18\x01 \x02(\x0e\x32\x1b.voldemort.AdminRequestType\x12\x33\n\x0cget_metadata\x18\x02 \x01(\x0b\x32\x1d.voldemort.GetMetadataRequest\x12\x39\n\x0fupdate_metadata\x18\x03 \x01(\x0b\x32 .voldemort.UpdateMetadataRequest\x12J\n\x18update_partition_entries\x18\x04 \x01(\x0b\x32(.voldemort.UpdatePartitionEntriesRequest\x12H\n\x17\x66\x65tch_partition_entries\x18\x05 \x01(\x0b\x32\'.voldemort.FetchPartitionEntriesRequest\x12J\n\x18\x64\x65lete_partition_entries\x18\x06 \x01(\x0b\x32(.voldemort.DeletePartitionEntriesRequest\x12K\n\x19initiate_fetch_and_update\x18\x07 \x01(\x0b\x32(.voldemort.InitiateFetchAndUpdateRequest\x12\x46\n\x16\x61sync_operation_status\x18\x08 \x01(\x0b\x32&.voldemort.AsyncOperationStatusRequest\x12H\n\x17initiate_rebalance_node\x18\t \x01(\x0b\x32\'.voldemort.InitiateRebalanceNodeRequest\x12\x42\n\x14\x61sync_operation_stop\x18\n \x01(\x0b\x32$.voldemort.AsyncOperationStopRequest\x12\x42\n\x14\x61sync_operation_list\x18\x0b \x01(\x0b\x32$.voldemort.AsyncOperationListRequest\x12;\n\x10truncate_entries\x18\x0c \x01(\x0b\x32!.voldemort.TruncateEntriesRequest\x12-\n\tadd_store\x18\r \x01(\x0b\x32\x1a.voldemort.AddStoreRequest\x12\x33\n\x0c\x64\x65lete_store\x18\x0e \x01(\x0b\x32\x1d.voldemort.DeleteStoreRequest\x12\x31\n\x0b\x66\x65tch_store\x18\x0f \x01(\x0b\x32\x1c.voldemort.FetchStoreRequest\x12/\n\nswap_store\x18\x10 \x01(\x0b\x32\x1b.voldemort.SwapStoreRequest\x12\x37\n\x0erollback_store\x18\x11 \x01(\x0b\x32\x1f.voldemort.RollbackStoreRequest\x12\x44\n\x16get_ro_max_version_dir\x18\x12 \x01(\x0b\x32$.voldemort.GetROMaxVersionDirRequest\x12L\n\x1aget_ro_current_version_dir\x18\x13 \x01(\x0b\x32(.voldemort.GetROCurrentVersionDirRequest\x12\x44\n\x15\x66\x65tch_partition_files\x18\x14 \x01(\x0b\x32%.voldemort.FetchPartitionFilesRequest\x12@\n\x13update_slop_entries\x18\x16 \x01(\x0b\x32#.voldemort.UpdateSlopEntriesRequest\x12>\n\x12\x66\x61iled_fetch_store\x18\x18 \x01(\x0b\x32\".voldemort.FailedFetchStoreRequest\x12\x43\n\x15get_ro_storage_format\x18\x19 \x01(\x0b\x32$.voldemort.GetROStorageFormatRequest\x12\x46\n\x16rebalance_state_change\x18\x1a \x01(\x0b\x32&.voldemort.RebalanceStateChangeRequest\x12/\n\nrepair_job\x18\x1b \x01(\x0b\x32\x1b.voldemort.RepairJobRequest\x12X\n initiate_rebalance_node_on_donor\x18\x1c \x01(\x0b\x32..voldemort.InitiateRebalanceNodeOnDonorRequest\x12Q\n\x1c\x64\x65lete_store_rebalance_state\x18\x1d \x01(\x0b\x32+.voldemort.DeleteStoreRebalanceStateRequest\x12\x35\n\rnative_backup\x18\x1e \x01(\x0b\x32\x1e.voldemort.NativeBackupRequest\x12\x37\n\x0ereserve_memory\x18\x1f 
\x01(\x0b\x32\x1f.voldemort.ReserveMemoryRequest*\xc8\x05\n\x10\x41\x64minRequestType\x12\x10\n\x0cGET_METADATA\x10\x00\x12\x13\n\x0fUPDATE_METADATA\x10\x01\x12\x1c\n\x18UPDATE_PARTITION_ENTRIES\x10\x02\x12\x1b\n\x17\x46\x45TCH_PARTITION_ENTRIES\x10\x03\x12\x1c\n\x18\x44\x45LETE_PARTITION_ENTRIES\x10\x04\x12\x1d\n\x19INITIATE_FETCH_AND_UPDATE\x10\x05\x12\x1a\n\x16\x41SYNC_OPERATION_STATUS\x10\x06\x12\x1b\n\x17INITIATE_REBALANCE_NODE\x10\x07\x12\x18\n\x14\x41SYNC_OPERATION_STOP\x10\x08\x12\x18\n\x14\x41SYNC_OPERATION_LIST\x10\t\x12\x14\n\x10TRUNCATE_ENTRIES\x10\n\x12\r\n\tADD_STORE\x10\x0b\x12\x10\n\x0c\x44\x45LETE_STORE\x10\x0c\x12\x0f\n\x0b\x46\x45TCH_STORE\x10\r\x12\x0e\n\nSWAP_STORE\x10\x0e\x12\x12\n\x0eROLLBACK_STORE\x10\x0f\x12\x1a\n\x16GET_RO_MAX_VERSION_DIR\x10\x10\x12\x1e\n\x1aGET_RO_CURRENT_VERSION_DIR\x10\x11\x12\x19\n\x15\x46\x45TCH_PARTITION_FILES\x10\x12\x12\x17\n\x13UPDATE_SLOP_ENTRIES\x10\x14\x12\x16\n\x12\x46\x41ILED_FETCH_STORE\x10\x16\x12\x19\n\x15GET_RO_STORAGE_FORMAT\x10\x17\x12\x1a\n\x16REBALANCE_STATE_CHANGE\x10\x18\x12\x0e\n\nREPAIR_JOB\x10\x19\x12$\n INITIATE_REBALANCE_NODE_ON_DONOR\x10\x1a\x12 \n\x1c\x44\x45LETE_STORE_REBALANCE_STATE\x10\x1b\x12\x11\n\rNATIVE_BACKUP\x10\x1c\x12\x12\n\x0eRESERVE_MEMORY\x10\x1d\x42-\n\x1cvoldemort.client.protocol.pbB\x0bVAdminProtoH\x01') _ADMINREQUESTTYPE = descriptor.EnumDescriptor( name='AdminRequestType', @@ -133,8 +133,8 @@ ], containing_type=None, options=None, - serialized_start=6971, - serialized_end=7683, + serialized_start=6995, + serialized_end=7707, ) @@ -645,6 +645,13 @@ message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), + descriptor.FieldDescriptor( + name='fetch_orphaned', full_name='voldemort.FetchPartitionEntriesRequest.fetch_orphaned', index=6, + number=7, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), ], extensions=[ ], @@ -655,7 +662,7 @@ is_extendable=False, extension_ranges=[], serialized_start=1059, - serialized_end=1274, + serialized_end=1298, ) @@ -696,8 +703,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=1277, - serialized_end=1406, + serialized_start=1301, + serialized_end=1430, ) @@ -745,8 +752,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=1409, - serialized_end=1581, + serialized_start=1433, + serialized_end=1605, ) @@ -780,8 +787,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=1583, - serialized_end=1663, + serialized_start=1607, + serialized_end=1687, ) @@ -843,8 +850,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=1666, - serialized_end=1873, + serialized_start=1690, + serialized_end=1897, ) @@ -871,8 +878,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=1875, - serialized_end=1924, + serialized_start=1899, + serialized_end=1948, ) @@ -899,8 +906,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=1926, - serialized_end=1973, + serialized_start=1950, + serialized_end=1997, ) @@ -927,8 +934,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=1975, - serialized_end=2036, + serialized_start=1999, + serialized_end=2060, ) @@ -955,8 +962,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=2038, - serialized_end=2088, + serialized_start=2062, + 
serialized_end=2112, ) @@ -990,8 +997,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=2090, - serialized_end=2172, + serialized_start=2114, + serialized_end=2196, ) @@ -1025,8 +1032,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=2174, - serialized_end=2232, + serialized_start=2198, + serialized_end=2256, ) @@ -1060,8 +1067,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=2234, - serialized_end=2335, + serialized_start=2258, + serialized_end=2359, ) @@ -1123,8 +1130,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=2338, - serialized_end=2586, + serialized_start=2362, + serialized_end=2610, ) @@ -1151,8 +1158,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=2588, - serialized_end=2690, + serialized_start=2612, + serialized_end=2714, ) @@ -1179,8 +1186,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=2692, - serialized_end=2801, + serialized_start=2716, + serialized_end=2825, ) @@ -1235,8 +1242,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=2804, - serialized_end=2942, + serialized_start=2828, + serialized_end=2966, ) @@ -1263,8 +1270,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=2944, - serialized_end=2983, + serialized_start=2968, + serialized_end=3007, ) @@ -1291,8 +1298,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=2985, - serialized_end=3043, + serialized_start=3009, + serialized_end=3067, ) @@ -1319,8 +1326,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3045, - serialized_end=3087, + serialized_start=3069, + serialized_end=3111, ) @@ -1347,8 +1354,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3089, - serialized_end=3140, + serialized_start=3113, + serialized_end=3164, ) @@ -1375,8 +1382,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3142, - serialized_end=3181, + serialized_start=3166, + serialized_end=3205, ) @@ -1403,8 +1410,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3183, - serialized_end=3237, + serialized_start=3207, + serialized_end=3261, ) @@ -1445,8 +1452,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3239, - serialized_end=3319, + serialized_start=3263, + serialized_end=3343, ) @@ -1480,8 +1487,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3321, - serialized_end=3378, + serialized_start=3345, + serialized_end=3402, ) @@ -1515,8 +1522,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3380, - serialized_end=3460, + serialized_start=3404, + serialized_end=3484, ) @@ -1550,8 +1557,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3462, - serialized_end=3526, + serialized_start=3486, + serialized_end=3550, ) @@ -1578,8 +1585,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3528, - serialized_end=3584, + serialized_start=3552, + serialized_end=3608, ) @@ -1606,8 +1613,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3586, - serialized_end=3624, + serialized_start=3610, + serialized_end=3648, ) @@ -1634,8 +1641,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3626, - serialized_end=3678, + serialized_start=3650, + serialized_end=3702, ) @@ -1669,8 
+1676,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3680, - serialized_end=3741, + serialized_start=3704, + serialized_end=3765, ) @@ -1697,8 +1704,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3743, - serialized_end=3790, + serialized_start=3767, + serialized_end=3814, ) @@ -1732,8 +1739,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3792, - serialized_end=3913, + serialized_start=3816, + serialized_end=3937, ) @@ -1760,8 +1767,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3915, - serialized_end=3966, + serialized_start=3939, + serialized_end=3990, ) @@ -1795,8 +1802,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=3968, - serialized_end=4093, + serialized_start=3992, + serialized_end=4117, ) @@ -1823,8 +1830,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=4095, - serialized_end=4142, + serialized_start=4119, + serialized_end=4166, ) @@ -1858,8 +1865,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=4144, - serialized_end=4265, + serialized_start=4168, + serialized_end=4289, ) @@ -1893,8 +1900,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=4267, - serialized_end=4331, + serialized_start=4291, + serialized_end=4355, ) @@ -1921,8 +1928,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=4333, - serialized_end=4392, + serialized_start=4357, + serialized_end=4416, ) @@ -1984,8 +1991,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=4395, - serialized_end=4625, + serialized_start=4419, + serialized_end=4649, ) @@ -2012,8 +2019,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=4627, - serialized_end=4690, + serialized_start=4651, + serialized_end=4714, ) @@ -2047,8 +2054,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=4692, - serialized_end=4763, + serialized_start=4716, + serialized_end=4787, ) @@ -2075,8 +2082,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=4765, - serialized_end=4833, + serialized_start=4789, + serialized_end=4857, ) @@ -2124,8 +2131,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=4835, - serialized_end=4939, + serialized_start=4859, + serialized_end=4963, ) @@ -2159,8 +2166,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=4941, - serialized_end=5003, + serialized_start=4965, + serialized_end=5027, ) @@ -2187,8 +2194,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=5005, - serialized_end=5061, + serialized_start=5029, + serialized_end=5085, ) @@ -2411,8 +2418,8 @@ options=None, is_extendable=False, extension_ranges=[], - serialized_start=5064, - serialized_end=6968, + serialized_start=5088, + serialized_end=6992, ) import voldemort_client_pb2 diff --git a/config/single_node_cluster/config/stores.xml b/config/single_node_cluster/config/stores.xml index d488d2b62d..082b7064ea 100644 --- a/config/single_node_cluster/config/stores.xml +++ b/config/single_node_cluster/config/stores.xml @@ -14,26 +14,6 @@ string - - - - test-evolution - bdb - Test store - harry@hogwarts.edu, hermoine@hogwarts.edu - consistent-routing - client - 1 - 1 - 1 - - string - - - avro-generic-versioned - {"type": "record", "name": "myrec","fields": [{ "name": "original", "type": "string" }]} - {"type": 
"record", "name": "myrec","fields": [{ "name": "original", "type": "string" }, { "name": "new-field", "type": "string", "default":"" }]} - - - + + \ No newline at end of file diff --git a/contrib/ec2-testing/test/voldemort/utils/Ec2GossipTest.java b/contrib/ec2-testing/test/voldemort/utils/Ec2GossipTest.java index 6fc0ddd049..b3fc661011 100644 --- a/contrib/ec2-testing/test/voldemort/utils/Ec2GossipTest.java +++ b/contrib/ec2-testing/test/voldemort/utils/Ec2GossipTest.java @@ -1,3 +1,18 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ package voldemort.utils; import static org.junit.Assert.assertEquals; @@ -31,6 +46,7 @@ import voldemort.Attempt; import voldemort.VoldemortException; +import voldemort.client.ClientConfig; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; import voldemort.cluster.Cluster; @@ -154,20 +170,23 @@ public boolean apply(Integer input) { for(String hostname: newHostnames) { int nodeId = nodeIds.get(hostname); AdminClient adminClient = new AdminClient("tcp://" + hostname + ":6666", - new AdminClientConfig()); + new AdminClientConfig(), + new ClientConfig()); - Versioned versioned = adminClient.getRemoteMetadata(nodeId, - MetadataStore.CLUSTER_KEY); + Versioned versioned = adminClient.metadataMgmtOps.getRemoteMetadata(nodeId, + MetadataStore.CLUSTER_KEY); Version version = versioned.getVersion(); VectorClock vectorClock = (VectorClock) version; vectorClock.incrementVersion(nodeId, System.currentTimeMillis()); try { - adminClient.updateRemoteMetadata(peerNodeId, - MetadataStore.CLUSTER_KEY, - versioned); - adminClient.updateRemoteMetadata(nodeId, MetadataStore.CLUSTER_KEY, versioned); + adminClient.metadataMgmtOps.updateRemoteMetadata(peerNodeId, + MetadataStore.CLUSTER_KEY, + versioned); + adminClient.metadataMgmtOps.updateRemoteMetadata(nodeId, + MetadataStore.CLUSTER_KEY, + versioned); } catch(VoldemortException e) { logger.error(e); } @@ -181,7 +200,8 @@ public boolean apply(Integer input) { private int count = 1; private AdminClient adminClient = new AdminClient("tcp://" + hostNames.get(0) + ":6666", - new AdminClientConfig()); + new AdminClientConfig(), + new ClientConfig()); public void checkCondition() throws Exception, AssertionError { logger.info("Attempt " + count++); @@ -189,7 +209,8 @@ public void checkCondition() throws Exception, AssertionError { for(int testNodeId: oldNodeIdSet) { logger.info("Testing node " + testNodeId); try { - Cluster cluster = adminClient.getRemoteCluster(testNodeId).getValue(); + Cluster cluster = adminClient.metadataMgmtOps.getRemoteCluster(testNodeId) + .getValue(); Set allNodeIds = new HashSet(); for(Node node: cluster.getNodes()) { allNodeIds.add(node.getId()); diff --git a/contrib/ec2-testing/test/voldemort/utils/Ec2RebalanceTest.java b/contrib/ec2-testing/test/voldemort/utils/Ec2RebalanceTest.java index 3e402ffc5a..448a733c68 100644 --- a/contrib/ec2-testing/test/voldemort/utils/Ec2RebalanceTest.java +++ 
b/contrib/ec2-testing/test/voldemort/utils/Ec2RebalanceTest.java @@ -1,3 +1,18 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ package voldemort.utils; import static voldemort.utils.Ec2RemoteTestUtils.createInstances; @@ -25,6 +40,7 @@ import org.junit.BeforeClass; import voldemort.VoldemortException; +import voldemort.client.ClientConfig; import voldemort.client.protocol.RequestFormatType; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; @@ -41,6 +57,8 @@ */ public class Ec2RebalanceTest extends AbstractRebalanceTest { + private static int NUM_KEYS; + private static final Logger logger = Logger.getLogger(Ec2RebalanceTest.class); private static Ec2RebalanceTestConfig ec2RebalanceTestConfig; private static List hostNamePairs; @@ -68,13 +86,20 @@ public static void ec2TearDown() throws Exception { destroyInstances(hostNames, ec2RebalanceTestConfig); } + @Override + protected int getNumKeys() { + return NUM_KEYS; + } + @Override protected Cluster getCurrentCluster(int nodeId) { String hostName = nodeIdsInv.get(nodeId); if(hostName == null) { throw new VoldemortException("Node id " + nodeId + " does not exist"); } else { - AdminClient adminClient = new AdminClient(hostName, new AdminClientConfig()); + AdminClient adminClient = new AdminClient(hostName, + new AdminClientConfig(), + new ClientConfig()); return adminClient.getAdminClientCluster(); } } @@ -85,8 +110,10 @@ protected VoldemortState getCurrentState(int nodeId) { if(hostName == null) { throw new VoldemortException("Node id " + nodeId + " does not exist"); } else { - AdminClient adminClient = new AdminClient(hostName, new AdminClientConfig()); - return adminClient.getRemoteServerState(nodeId).getValue(); + AdminClient adminClient = new AdminClient(hostName, + new AdminClientConfig(), + new ClientConfig()); + return adminClient.rebalanceOps.getRemoteServerState(nodeId).getValue(); } } diff --git a/contrib/hadoop-store-builder/lib/commons-configuration-1.6.jar b/contrib/hadoop-store-builder/lib/commons-configuration-1.6.jar new file mode 100644 index 0000000000..2d4689a1b8 Binary files /dev/null and b/contrib/hadoop-store-builder/lib/commons-configuration-1.6.jar differ diff --git a/contrib/hadoop-store-builder/lib/hadoop-0.20.2-core.jar b/contrib/hadoop-store-builder/lib/hadoop-0.20.2-core.jar deleted file mode 100644 index 32ae0a1c9c..0000000000 Binary files a/contrib/hadoop-store-builder/lib/hadoop-0.20.2-core.jar and /dev/null differ diff --git a/contrib/hadoop-store-builder/lib/hadoop-core-1.0.4-p2.jar b/contrib/hadoop-store-builder/lib/hadoop-core-1.0.4-p2.jar new file mode 100644 index 0000000000..c7a9027b1a Binary files /dev/null and b/contrib/hadoop-store-builder/lib/hadoop-core-1.0.4-p2.jar differ diff --git a/contrib/hadoop-store-builder/perf/voldemort/contrib/batchindexer/performance/BdbBuildPerformanceTest.java 
b/contrib/hadoop-store-builder/perf/voldemort/contrib/batchindexer/performance/BdbBuildPerformanceTest.java index b2c67df2ee..45aadaddde 100644 --- a/contrib/hadoop-store-builder/perf/voldemort/contrib/batchindexer/performance/BdbBuildPerformanceTest.java +++ b/contrib/hadoop-store-builder/perf/voldemort/contrib/batchindexer/performance/BdbBuildPerformanceTest.java @@ -52,7 +52,7 @@ public static void main(String[] args) throws FileNotFoundException, IOException String storeName = args[1]; String jsonDataFile = args[2]; - final Store store = new BdbStorageConfiguration(new VoldemortConfig(new Props(new File(serverPropsFile)))).getStore(TestUtils.makeStoreDefinition(storeName)); + final Store store = new BdbStorageConfiguration(new VoldemortConfig(new Props(new File(serverPropsFile)))).getStore(TestUtils.makeStoreDefinition(storeName), TestUtils.makeSingleNodeRoutingStrategy()); final AtomicInteger obsoletes = new AtomicInteger(0); diff --git a/contrib/hadoop-store-builder/perf/voldemort/contrib/batchindexer/performance/MysqlBuildPerformanceTest.java b/contrib/hadoop-store-builder/perf/voldemort/contrib/batchindexer/performance/MysqlBuildPerformanceTest.java index d7fe084ea1..3d7fd3df4f 100644 --- a/contrib/hadoop-store-builder/perf/voldemort/contrib/batchindexer/performance/MysqlBuildPerformanceTest.java +++ b/contrib/hadoop-store-builder/perf/voldemort/contrib/batchindexer/performance/MysqlBuildPerformanceTest.java @@ -52,7 +52,7 @@ public static void main(String[] args) throws FileNotFoundException, IOException String storeName = args[1]; String jsonDataFile = args[2]; - final Store store = new MysqlStorageConfiguration(new VoldemortConfig(new Props(new File(serverPropsFile)))).getStore(TestUtils.makeStoreDefinition(storeName)); + final Store store = new MysqlStorageConfiguration(new VoldemortConfig(new Props(new File(serverPropsFile)))).getStore(TestUtils.makeStoreDefinition(storeName), TestUtils.makeSingleNodeRoutingStrategy()); final AtomicInteger obsoletes = new AtomicInteger(0); diff --git a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/disk/HadoopStoreWriter.java b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/disk/HadoopStoreWriter.java index 87bebe74d8..2506cf1c51 100644 --- a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/disk/HadoopStoreWriter.java +++ b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/disk/HadoopStoreWriter.java @@ -26,6 +26,7 @@ import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobConf; @@ -38,6 +39,7 @@ import voldemort.store.readonly.ReadOnlyUtils; import voldemort.store.readonly.checksum.CheckSum; import voldemort.store.readonly.checksum.CheckSum.CheckSumType; +import voldemort.store.readonly.mr.HadoopStoreBuilder; import voldemort.utils.ByteUtils; import voldemort.xml.ClusterMapper; import voldemort.xml.StoreDefinitionsMapper; @@ -146,7 +148,14 @@ public void conf(JobConf job) { this.fs = this.taskIndexFileName.getFileSystem(job); this.indexFileStream = fs.create(this.taskIndexFileName); + fs.setPermission(this.taskIndexFileName, + new FsPermission(HadoopStoreBuilder.HADOOP_FILE_PERMISSION)); + logger.info("Setting permission to 755 for " + this.taskIndexFileName); + this.valueFileStream = fs.create(this.taskValueFileName); + 
fs.setPermission(this.taskValueFileName, + new FsPermission(HadoopStoreBuilder.HADOOP_FILE_PERMISSION)); + logger.info("Setting permission to 755 for " + this.taskValueFileName); logger.info("Opening " + this.taskIndexFileName + " and " + this.taskValueFileName + " for writing."); @@ -304,6 +313,8 @@ public void close() throws IOException { // Create output directory, if it doesn't exist FileSystem outputFs = nodeDir.getFileSystem(this.conf); outputFs.mkdirs(nodeDir); + outputFs.setPermission(nodeDir, new FsPermission(HadoopStoreBuilder.HADOOP_FILE_PERMISSION)); + logger.info("Setting permission to 755 for " + nodeDir); // Write the checksum and output files if(this.checkSumType != CheckSumType.NONE) { @@ -312,11 +323,21 @@ public void close() throws IOException { Path checkSumIndexFile = new Path(nodeDir, fileNamePrefix + ".index.checksum"); Path checkSumValueFile = new Path(nodeDir, fileNamePrefix + ".data.checksum"); + if(outputFs.exists(checkSumIndexFile)) { + outputFs.delete(checkSumIndexFile); + } FSDataOutputStream output = outputFs.create(checkSumIndexFile); + outputFs.setPermission(checkSumIndexFile, + new FsPermission(HadoopStoreBuilder.HADOOP_FILE_PERMISSION)); output.write(this.checkSumDigestIndex.getCheckSum()); output.close(); + if(outputFs.exists(checkSumValueFile)) { + outputFs.delete(checkSumValueFile); + } output = outputFs.create(checkSumValueFile); + outputFs.setPermission(checkSumValueFile, + new FsPermission(HadoopStoreBuilder.HADOOP_FILE_PERMISSION)); output.write(this.checkSumDigestValue.getCheckSum()); output.close(); } else { @@ -331,8 +352,15 @@ public void close() throws IOException { Path valueFile = new Path(nodeDir, fileNamePrefix + ".data"); logger.info("Moving " + this.taskIndexFileName + " to " + indexFile); + if(outputFs.exists(indexFile)) { + outputFs.delete(indexFile); + } outputFs.rename(taskIndexFileName, indexFile); + logger.info("Moving " + this.taskValueFileName + " to " + valueFile); + if(outputFs.exists(valueFile)) { + outputFs.delete(valueFile); + } outputFs.rename(this.taskValueFileName, valueFile); } diff --git a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/disk/HadoopStoreWriterPerBucket.java b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/disk/HadoopStoreWriterPerBucket.java index 6fdf34f910..ddc50857b2 100644 --- a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/disk/HadoopStoreWriterPerBucket.java +++ b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/disk/HadoopStoreWriterPerBucket.java @@ -26,6 +26,7 @@ import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobConf; @@ -38,6 +39,7 @@ import voldemort.store.readonly.ReadOnlyUtils; import voldemort.store.readonly.checksum.CheckSum; import voldemort.store.readonly.checksum.CheckSum.CheckSumType; +import voldemort.store.readonly.mr.HadoopStoreBuilder; import voldemort.utils.ByteUtils; import voldemort.xml.ClusterMapper; import voldemort.xml.StoreDefinitionsMapper; @@ -117,7 +119,14 @@ public void conf(JobConf job) { this.fs = this.taskIndexFileName[chunkId].getFileSystem(job); this.indexFileStream[chunkId] = fs.create(this.taskIndexFileName[chunkId]); + fs.setPermission(this.taskIndexFileName[chunkId], + new FsPermission(HadoopStoreBuilder.HADOOP_FILE_PERMISSION)); + logger.info("Setting 
permission to 755 for " + this.taskIndexFileName[chunkId]); + this.valueFileStream[chunkId] = fs.create(this.taskValueFileName[chunkId]); + fs.setPermission(this.taskValueFileName[chunkId], + new FsPermission(HadoopStoreBuilder.HADOOP_FILE_PERMISSION)); + logger.info("Setting permission to 755 for " + this.taskValueFileName[chunkId]); logger.info("Opening " + this.taskIndexFileName[chunkId] + " and " + this.taskValueFileName[chunkId] + " for writing."); @@ -278,6 +287,8 @@ public void close() throws IOException { // Create output directory, if it doesn't exist FileSystem outputFs = nodeDir.getFileSystem(this.conf); outputFs.mkdirs(nodeDir); + outputFs.setPermission(nodeDir, new FsPermission(HadoopStoreBuilder.HADOOP_FILE_PERMISSION)); + logger.info("Setting permission to 755 for " + nodeDir); // Write the checksum and output files for(int chunkId = 0; chunkId < getNumChunks(); chunkId++) { @@ -290,11 +301,21 @@ public void close() throws IOException { Path checkSumIndexFile = new Path(nodeDir, chunkFileName + ".index.checksum"); Path checkSumValueFile = new Path(nodeDir, chunkFileName + ".data.checksum"); + if(outputFs.exists(checkSumIndexFile)) { + outputFs.delete(checkSumIndexFile); + } FSDataOutputStream output = outputFs.create(checkSumIndexFile); + outputFs.setPermission(checkSumIndexFile, + new FsPermission(HadoopStoreBuilder.HADOOP_FILE_PERMISSION)); output.write(this.checkSumDigestIndex[chunkId].getCheckSum()); output.close(); + if(outputFs.exists(checkSumValueFile)) { + outputFs.delete(checkSumValueFile); + } output = outputFs.create(checkSumValueFile); + outputFs.setPermission(checkSumValueFile, + new FsPermission(HadoopStoreBuilder.HADOOP_FILE_PERMISSION)); output.write(this.checkSumDigestValue[chunkId].getCheckSum()); output.close(); } else { @@ -309,8 +330,15 @@ public void close() throws IOException { Path valueFile = new Path(nodeDir, chunkFileName + ".data"); logger.info("Moving " + this.taskIndexFileName[chunkId] + " to " + indexFile); + if(outputFs.exists(indexFile)) { + outputFs.delete(indexFile); + } fs.rename(taskIndexFileName[chunkId], indexFile); + logger.info("Moving " + this.taskValueFileName[chunkId] + " to " + valueFile); + if(outputFs.exists(valueFile)) { + outputFs.delete(valueFile); + } fs.rename(this.taskValueFileName[chunkId], valueFile); } diff --git a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/fetcher/HdfsFetcher.java b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/fetcher/HdfsFetcher.java index a5cbb371d9..8a441916ec 100644 --- a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/fetcher/HdfsFetcher.java +++ b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/fetcher/HdfsFetcher.java @@ -21,6 +21,8 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; +import java.net.URI; +import java.security.PrivilegedExceptionAction; import java.text.NumberFormat; import java.util.Arrays; import java.util.Comparator; @@ -32,10 +34,12 @@ import org.apache.commons.codec.binary.Hex; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeys; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.security.UserGroupInformation; import org.apache.log4j.Logger; import voldemort.VoldemortException; @@ -61,6 +65,9 @@ public class HdfsFetcher implements FileFetcher { 
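The HdfsFetcher hunks that follow add support for fetching read-only store data from a Kerberos-secured HDFS: the fetcher picks up a keytab path and principal from the server configuration, logs in through Hadoop's UserGroupInformation, and then resolves the FileSystem inside a privileged doAs block (with a retry loop, since that lookup has shown intermittent failures). Roughly, the login pattern looks like the sketch below; the class name, method signature, and any principal/keytab/path values a caller would pass are illustrative placeholders, not values taken from this patch.

    import java.security.PrivilegedExceptionAction;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.security.UserGroupInformation;

    // Minimal sketch of keytab-based HDFS access, assuming a Configuration that
    // already has the kerberos-enabled core-site.xml/hdfs-site.xml loaded.
    public class KerberosHdfsLoginSketch {

        public static FileSystem resolveFileSystem(final Configuration conf,
                                                   String principal,
                                                   String keytabPath,
                                                   final Path path) throws Exception {
            // Point UGI at the secured Hadoop configuration, then authenticate
            // with the service principal and its keytab file.
            UserGroupInformation.setConfiguration(conf);
            UserGroupInformation.loginUserFromKeytab(principal, keytabPath);

            // Obtain the FileSystem as the freshly logged-in user.
            return UserGroupInformation.getCurrentUser()
                                       .doAs(new PrivilegedExceptionAction<FileSystem>() {

                                           public FileSystem run() throws Exception {
                                               return path.getFileSystem(conf);
                                           }
                                       });
        }
    }

The patch itself additionally loads core-site.xml and hdfs-site.xml from a configurable Hadoop config directory, verifies that hadoop.security.authentication is set to kerberos, and bypasses this login path entirely for hftp-based fetch URLs.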
private static final Logger logger = Logger.getLogger(HdfsFetcher.class); + private static String keytabPath = ""; + private static String kerberosPrincipal = VoldemortConfig.DEFAULT_KERBEROS_PRINCIPAL; + private final Long maxBytesPerSecond, reportingIntervalBytes; private final int bufferSize; private static final AtomicInteger copyCount = new AtomicInteger(0); @@ -69,22 +76,36 @@ public class HdfsFetcher implements FileFetcher { private long minBytesPerSecond = 0; private DynamicThrottleLimit globalThrottleLimit = null; private static final int NUM_RETRIES = 3; + private VoldemortConfig voldemortConfig = null; + + public static final String FS_DEFAULT_NAME = "fs.default.name"; + /* Additional constructor invoked from ReadOnlyStoreManagementServlet */ public HdfsFetcher(VoldemortConfig config) { - this(config.getMaxBytesPerSecond(), - config.getReportingIntervalBytes(), - config.getFetcherBufferSize()); + this(null, + null, + config.getReadOnlyFetcherReportingIntervalBytes(), + config.getFetcherBufferSize(), + config.getReadOnlyFetcherMinBytesPerSecond(), + config.getReadOnlyKeytabPath(), + config.getReadOnlyKerberosUser()); - logger.info("Created hdfs fetcher with throttle rate " + maxBytesPerSecond - + ", buffer size " + bufferSize + ", reporting interval bytes " - + reportingIntervalBytes); + this.voldemortConfig = config; + + logger.info("Created hdfs fetcher with no dynamic throttler, buffer size " + bufferSize + + ", reporting interval bytes " + reportingIntervalBytes); } public HdfsFetcher(VoldemortConfig config, DynamicThrottleLimit dynThrottleLimit) { this(dynThrottleLimit, - config.getReportingIntervalBytes(), + null, + config.getReadOnlyFetcherReportingIntervalBytes(), config.getFetcherBufferSize(), - config.getMinBytesPerSecond()); + config.getReadOnlyFetcherMinBytesPerSecond(), + config.getReadOnlyKeytabPath(), + config.getReadOnlyKerberosUser()); + + this.voldemortConfig = config; logger.info("Created hdfs fetcher with throttle rate " + dynThrottleLimit.getRate() + ", buffer size " + bufferSize + ", reporting interval bytes " @@ -98,21 +119,16 @@ public HdfsFetcher() { } public HdfsFetcher(Long maxBytesPerSecond, Long reportingIntervalBytes, int bufferSize) { - this(null, maxBytesPerSecond, reportingIntervalBytes, bufferSize, 0); - } - - public HdfsFetcher(DynamicThrottleLimit dynThrottleLimit, - Long reportingIntervalBytes, - int bufferSize, - long minBytesPerSecond) { - this(dynThrottleLimit, null, reportingIntervalBytes, bufferSize, minBytesPerSecond); + this(null, maxBytesPerSecond, reportingIntervalBytes, bufferSize, 0, "", ""); } public HdfsFetcher(DynamicThrottleLimit dynThrottleLimit, Long maxBytesPerSecond, Long reportingIntervalBytes, int bufferSize, - long minBytesPerSecond) { + long minBytesPerSecond, + String keytabLocation, + String kerberosUser) { if(maxBytesPerSecond != null) { this.maxBytesPerSecond = maxBytesPerSecond; this.throttler = new EventThrottler(this.maxBytesPerSecond); @@ -128,9 +144,20 @@ public HdfsFetcher(DynamicThrottleLimit dynThrottleLimit, this.bufferSize = bufferSize; this.status = null; this.minBytesPerSecond = minBytesPerSecond; + HdfsFetcher.kerberosPrincipal = kerberosUser; + HdfsFetcher.keytabPath = keytabLocation; } public File fetch(String sourceFileUrl, String destinationFile) throws IOException { + String hadoopConfigPath = ""; + if(this.voldemortConfig != null) { + hadoopConfigPath = this.voldemortConfig.getHadoopConfigPath(); + } + return fetch(sourceFileUrl, destinationFile, hadoopConfigPath); + } + + public File fetch(String 
sourceFileUrl, String destinationFile, String hadoopConfigPath) + throws IOException { if(this.globalThrottleLimit != null) { if(this.globalThrottleLimit.getSpeculativeRate() < this.minBytesPerSecond) throw new VoldemortException("Too many push jobs."); @@ -140,12 +167,117 @@ public File fetch(String sourceFileUrl, String destinationFile) throws IOExcepti ObjectName jmxName = null; try { - Path path = new Path(sourceFileUrl); - Configuration config = new Configuration(); + final Configuration config = new Configuration(); + FileSystem fs = null; config.setInt("io.socket.receive.buffer", bufferSize); config.set("hadoop.rpc.socket.factory.class.ClientProtocol", ConfigurableSocketFactory.class.getName()); - FileSystem fs = path.getFileSystem(config); + config.set("hadoop.security.group.mapping", + "org.apache.hadoop.security.ShellBasedUnixGroupsMapping"); + + final Path path = new Path(sourceFileUrl); + + boolean isHftpBasedFetch = sourceFileUrl.length() > 4 + && sourceFileUrl.substring(0, 4).equals("hftp"); + logger.info("URL : " + sourceFileUrl + " and hftp protocol enabled = " + + isHftpBasedFetch); + logger.info("Hadoop path = " + hadoopConfigPath + " , keytab path = " + + HdfsFetcher.keytabPath + " , kerberos principal = " + + HdfsFetcher.kerberosPrincipal); + + if(hadoopConfigPath.length() > 0 && !isHftpBasedFetch) { + + config.addResource(new Path(hadoopConfigPath + "/core-site.xml")); + config.addResource(new Path(hadoopConfigPath + "/hdfs-site.xml")); + + String security = config.get(CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION); + + if(security == null || !security.equals("kerberos")) { + logger.error("Security isn't turned on in the conf: " + + CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION + + " = " + + config.get(CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION)); + logger.error("Please make sure that the Hadoop config directory path is valid."); + throw new VoldemortException("Error in getting Hadoop filesystem. Invalid Hadoop config directory path."); + } else { + logger.info("Security is turned on in the conf. Trying to authenticate ..."); + + } + } + + if(HdfsFetcher.keytabPath.length() > 0 && !isHftpBasedFetch) { + + /* + * We're seeing intermittent errors while trying to get the + * Hadoop filesystem in a privileged doAs block. This happens + * when we fetch the files over hdfs or webhdfs. This retry loop + * is inserted here as a temporary measure. + */ + for(int retryCount = 0; retryCount < NUM_RETRIES; retryCount++) { + boolean isValidFilesystem = false; + + if(!new File(HdfsFetcher.keytabPath).exists()) { + logger.error("Invalid keytab file path. Please provide a valid keytab path"); + throw new VoldemortException("Error in getting Hadoop filesystem. Invalid keytab file path."); + } + + /* + * The Hadoop path for getting a Filesystem object in a + * privileged doAs block is not thread safe. This might be + * causing intermittent NPE exceptions. Adding a + * synchronized block. + */ + synchronized(this) { + /* + * First login using the specified principal and keytab + * file + */ + UserGroupInformation.setConfiguration(config); + UserGroupInformation.loginUserFromKeytab(HdfsFetcher.kerberosPrincipal, + HdfsFetcher.keytabPath); + + /* + * If login is successful, get the filesystem object. + * NOTE: Ideally we do not need a doAs block for this. 
+ * Consider removing it in the future once the Hadoop + * jars have the corresponding patch (tracked in the + * Hadoop Apache project: HDFS-3367) + */ + try { + logger.info("I've logged in and am now Doasing as " + + UserGroupInformation.getCurrentUser().getUserName()); + fs = UserGroupInformation.getCurrentUser() + .doAs(new PrivilegedExceptionAction() { + + @Override + public FileSystem run() throws Exception { + FileSystem fs = path.getFileSystem(config); + return fs; + } + }); + isValidFilesystem = true; + } catch(InterruptedException e) { + logger.error(e.getMessage()); + } catch(Exception e) { + logger.error("Got an exception while getting the filesystem object: "); + logger.error("Exception class : " + e.getClass()); + e.printStackTrace(); + for(StackTraceElement et: e.getStackTrace()) { + logger.error(et.toString()); + } + } + } + + if(isValidFilesystem) { + break; + } else if(retryCount < NUM_RETRIES - 1) { + logger.error("Could not get a valid Filesystem object. Trying again."); + } + } + + } else { + fs = path.getFileSystem(config); + } CopyStats stats = new CopyStats(sourceFileUrl, sizeOfPath(fs, path)); jmxName = JmxUtils.registerMbean("hdfs-copy-" + copyCount.getAndIncrement(), stats); @@ -156,18 +288,33 @@ public File fetch(String sourceFileUrl, String destinationFile) throws IOExcepti + " already exists"); } + logger.info("Starting fetch for : " + sourceFileUrl); boolean result = fetch(fs, path, destination, stats); + logger.info("Completed fetch : " + sourceFileUrl); + + // Close the filesystem + fs.close(); if(result) { return destination; } else { return null; } + } catch(IOException e) { + e.printStackTrace(); + logger.error("Error while getting Hadoop filesystem : " + e); + throw new VoldemortException("Error while getting Hadoop filesystem : " + e); + } catch(Throwable te) { + te.printStackTrace(); + logger.error("Error thrown while trying to get Hadoop filesystem"); + throw new VoldemortException("Error thrown while trying to get Hadoop filesystem : " + + te); } finally { if(this.globalThrottleLimit != null) { this.globalThrottleLimit.decrementNumJobs(); } - JmxUtils.unregisterMbean(jmxName); + if(jmxName != null) + JmxUtils.unregisterMbean(jmxName); } } @@ -226,25 +373,28 @@ private boolean fetch(FileSystem fs, Path source, File dest, CopyStats stats) logger.debug("Checksum from .metadata " + new String(Hex.encodeHex(origCheckSum))); + + // Define the Global checksum generator checkSumType = CheckSum.fromString(checkSumTypeString); checkSumGenerator = CheckSum.getInstance(checkSumType); - fileCheckSumGenerator = CheckSum.getInstance(checkSumType); } } else if(!status.getPath().getName().startsWith(".")) { // Read other (.data , .index files) File copyLocation = new File(dest, status.getPath().getName()); - copyFileWithCheckSum(fs, - status.getPath(), - copyLocation, - stats, - fileCheckSumGenerator); + fileCheckSumGenerator = copyFileWithCheckSum(fs, + status.getPath(), + copyLocation, + stats, + checkSumType); if(fileCheckSumGenerator != null && checkSumGenerator != null) { byte[] checkSum = fileCheckSumGenerator.getCheckSum(); - logger.debug("Checksum for " + status.getPath() + " - " - + new String(Hex.encodeHex(checkSum))); + if(logger.isDebugEnabled()) { + logger.debug("Checksum for " + status.getPath() + " - " + + new String(Hex.encodeHex(checkSum))); + } checkSumGenerator.update(checkSum); } } @@ -275,18 +425,38 @@ private boolean fetch(FileSystem fs, Path source, File dest, CopyStats stats) } - private void copyFileWithCheckSum(FileSystem fs, - Path source, 
- File dest, - CopyStats stats, - CheckSum fileCheckSumGenerator) throws IOException { - logger.info("Starting copy of " + source + " to " + dest); + /** + * Function to copy a file from the given filesystem with a checksum of type + * 'checkSumType' computed and returned. In case an error occurs during such + * a copy, we do a retry for a maximum of NUM_RETRIES + * + * @param fs Filesystem used to copy the file + * @param source Source path of the file to copy + * @param dest Destination path of the file on the local machine + * @param stats Stats for measuring the transfer progress + * @param checkSumType Type of the Checksum to be computed for this file + * @return A Checksum (generator) of type checkSumType which contains the + * computed checksum of the copied file + * @throws IOException + */ + private CheckSum copyFileWithCheckSum(FileSystem fs, + Path source, + File dest, + CopyStats stats, + CheckSumType checkSumType) throws IOException { + CheckSum fileCheckSumGenerator = null; + logger.debug("Starting copy of " + source + " to " + dest); FSDataInputStream input = null; OutputStream output = null; for(int attempt = 0; attempt < NUM_RETRIES; attempt++) { boolean success = true; try { + // Create a per file checksum generator + if(checkSumType != null) { + fileCheckSumGenerator = CheckSum.getInstance(checkSumType); + } + input = fs.open(source); output = new BufferedOutputStream(new FileOutputStream(dest)); byte[] buffer = new byte[bufferSize]; @@ -298,10 +468,16 @@ private void copyFileWithCheckSum(FileSystem fs, output.write(buffer, 0, read); } - if(fileCheckSumGenerator != null) + // Update the per file checksum + if(fileCheckSumGenerator != null) { fileCheckSumGenerator.update(buffer, 0, read); - if(throttler != null) + } + + // Check if we need to throttle the fetch + if(throttler != null) { throttler.maybeThrottle(read); + } + stats.recordBytes(read); if(stats.getBytesSinceLastReport() > reportingIntervalBytes) { NumberFormat format = NumberFormat.getNumberInstance(); @@ -334,7 +510,19 @@ private void copyFileWithCheckSum(FileSystem fs, throw ioe; } - } finally { + } catch(Exception e) { + logger.error("Error during copying file ", e); + return null; + + } catch(Throwable te) { + logger.error("Error during copying file ", te); + return null; + + } + // the finally block _always_ executes even if we have + // return in the catch block + + finally { IOUtils.closeQuietly(output); IOUtils.closeQuietly(input); if(success) { @@ -342,8 +530,9 @@ private void copyFileWithCheckSum(FileSystem fs, } } - + logger.debug("Completed copy of " + source + " to " + dest); } + return fileCheckSumGenerator; } private long sizeOfPath(FileSystem fs, Path path) throws IOException { @@ -459,28 +648,120 @@ public void setAsyncOperationStatus(AsyncOperationStatus status) { * Main method for testing fetching */ public static void main(String[] args) throws Exception { - if(args.length != 1) - Utils.croak("USAGE: java " + HdfsFetcher.class.getName() + " url"); + if(args.length < 1) + Utils.croak("USAGE: java " + HdfsFetcher.class.getName() + + " url [keytab location] [kerberos username] [hadoop-config-path]"); String url = args[0]; + + String keytabLocation = ""; + String kerberosUser = ""; + String hadoopPath = ""; + if(args.length == 4) { + keytabLocation = args[1]; + kerberosUser = args[2]; + hadoopPath = args[3]; + } + long maxBytesPerSec = 1024 * 1024 * 1024; Path p = new Path(url); - Configuration config = new Configuration(); + + final Configuration config = new Configuration(); + final URI uri = 
new URI(url); config.setInt("io.file.buffer.size", VoldemortConfig.DEFAULT_BUFFER_SIZE); config.set("hadoop.rpc.socket.factory.class.ClientProtocol", ConfigurableSocketFactory.class.getName()); config.setInt("io.socket.receive.buffer", 1 * 1024 * 1024 - 10000); - FileStatus status = p.getFileSystem(config).getFileStatus(p); + + FileSystem fs = null; + p = new Path(url); + HdfsFetcher.keytabPath = keytabLocation; + HdfsFetcher.kerberosPrincipal = kerberosUser; + + boolean isHftpBasedFetch = url.length() > 4 && url.substring(0, 4).equals("hftp"); + logger.info("URL : " + url + " and hftp protocol enabled = " + isHftpBasedFetch); + + if(hadoopPath.length() > 0 && !isHftpBasedFetch) { + config.set("hadoop.security.group.mapping", + "org.apache.hadoop.security.ShellBasedUnixGroupsMapping"); + + config.addResource(new Path(hadoopPath + "/core-site.xml")); + config.addResource(new Path(hadoopPath + "/hdfs-site.xml")); + + String security = config.get(CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION); + + if(security == null || !security.equals("kerberos")) { + logger.info("Security isn't turned on in the conf: " + + CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION + " = " + + config.get(CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION)); + logger.info("Fix that. Exiting."); + return; + } else { + logger.info("Security is turned on in the conf. Trying to authenticate ..."); + } + } + + try { + + // Get the filesystem object + if(keytabLocation.length() > 0 && !isHftpBasedFetch) { + UserGroupInformation.setConfiguration(config); + UserGroupInformation.loginUserFromKeytab(kerberosUser, keytabLocation); + + final Path path = p; + try { + logger.debug("I've logged in and am now Doasing as " + + UserGroupInformation.getCurrentUser().getUserName()); + fs = UserGroupInformation.getCurrentUser() + .doAs(new PrivilegedExceptionAction() { + + public FileSystem run() throws Exception { + FileSystem fs = path.getFileSystem(config); + return fs; + } + }); + } catch(InterruptedException e) { + logger.error(e.getMessage()); + } catch(Exception e) { + logger.error("Got an exception while getting the filesystem object: "); + logger.error("Exception class : " + e.getClass()); + e.printStackTrace(); + for(StackTraceElement et: e.getStackTrace()) { + logger.error(et.toString()); + } + } + } else { + fs = p.getFileSystem(config); + } + + } catch(IOException e) { + e.printStackTrace(); + System.err.println("IOException in getting Hadoop filesystem object !!! 
Exiting !!!"); + System.exit(-1); + } catch(Throwable te) { + te.printStackTrace(); + logger.error("Error thrown while trying to get Hadoop filesystem"); + System.exit(-1); + } + + FileStatus status = fs.listStatus(p)[0]; long size = status.getLen(); - HdfsFetcher fetcher = new HdfsFetcher(maxBytesPerSec, + HdfsFetcher fetcher = new HdfsFetcher(null, + maxBytesPerSec, VoldemortConfig.REPORTING_INTERVAL_BYTES, - VoldemortConfig.DEFAULT_BUFFER_SIZE); + VoldemortConfig.DEFAULT_BUFFER_SIZE, + 0, + keytabLocation, + kerberosUser); long start = System.currentTimeMillis(); + File location = fetcher.fetch(url, System.getProperty("java.io.tmpdir") + File.separator - + start); + + start, hadoopPath); + double rate = size * Time.MS_PER_SECOND / (double) (System.currentTimeMillis() - start); NumberFormat nf = NumberFormat.getInstance(); nf.setMaximumFractionDigits(2); System.out.println("Fetch to " + location + " completed: " + nf.format(rate / (1024.0 * 1024.0)) + " MB/sec."); + fs.close(); } } diff --git a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/AbstractHadoopStoreBuilderMapper.java b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/AbstractHadoopStoreBuilderMapper.java index ea18558da6..cad736f861 100644 --- a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/AbstractHadoopStoreBuilderMapper.java +++ b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/AbstractHadoopStoreBuilderMapper.java @@ -26,6 +26,7 @@ import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; +import voldemort.cluster.Node; import voldemort.routing.ConsistentRoutingStrategy; import voldemort.serialization.DefaultSerializerFactory; import voldemort.serialization.Serializer; @@ -33,7 +34,6 @@ import voldemort.serialization.SerializerFactory; import voldemort.store.compress.CompressionStrategy; import voldemort.store.compress.CompressionStrategyFactory; -import voldemort.store.readonly.mr.utils.MapperKeyValueWriter; import voldemort.utils.ByteUtils; /** @@ -76,31 +76,100 @@ public void map(K key, V value, OutputCollector output, Reporter reporter) throws IOException { + byte[] keyBytes = keySerializer.toBytes(makeKey(key, value)); byte[] valBytes = valueSerializer.toBytes(makeValue(key, value)); - MapperKeyValueWriter mapWriter = new MapperKeyValueWriter(); - - List mapperList = mapWriter.map(routingStrategy, - keySerializer, - valueSerializer, - valueCompressor, - keyCompressor, - keySerializerDefinition, - valueSerializerDefinition, - keyBytes, - valBytes, - getSaveKeys(), - md5er); - - for(int i = 0; i < mapperList.size(); i++) { - voldemort.utils.Pair pair = (voldemort.utils.Pair) mapperList.get(i); - BytesWritable outputKey = pair.getFirst(); - BytesWritable outputVal = pair.getSecond(); + // Compress key and values if required + if(keySerializerDefinition.hasCompression()) { + keyBytes = keyCompressor.deflate(keyBytes); + } - output.collect(outputKey, outputVal); + if(valueSerializerDefinition.hasCompression()) { + valBytes = valueCompressor.deflate(valBytes); } + // Get the output byte arrays ready to populate + byte[] outputValue; + BytesWritable outputKey; + + // Leave initial offset for (a) node id (b) partition id + // since they are written later + int offsetTillNow = 2 * ByteUtils.SIZE_OF_INT; + + if(getSaveKeys()) { + + // In order - 4 ( for node id ) + 4 ( partition id ) + 1 ( replica + // type - primary | secondary | tertiary... 
] + 4 ( key size ) + // size ) + 4 ( value size ) + key + value + outputValue = new byte[valBytes.length + keyBytes.length + ByteUtils.SIZE_OF_BYTE + 4 + * ByteUtils.SIZE_OF_INT]; + + // Write key length - leave byte for replica type + offsetTillNow += ByteUtils.SIZE_OF_BYTE; + ByteUtils.writeInt(outputValue, keyBytes.length, offsetTillNow); + + // Write value length + offsetTillNow += ByteUtils.SIZE_OF_INT; + ByteUtils.writeInt(outputValue, valBytes.length, offsetTillNow); + + // Write key + offsetTillNow += ByteUtils.SIZE_OF_INT; + System.arraycopy(keyBytes, 0, outputValue, offsetTillNow, keyBytes.length); + + // Write value + offsetTillNow += keyBytes.length; + System.arraycopy(valBytes, 0, outputValue, offsetTillNow, valBytes.length); + + // Generate MR key - upper 8 bytes of 16 byte md5 + outputKey = new BytesWritable(ByteUtils.copy(md5er.digest(keyBytes), + 0, + 2 * ByteUtils.SIZE_OF_INT)); + + } else { + + // In order - 4 ( for node id ) + 4 ( partition id ) + value + outputValue = new byte[valBytes.length + 2 * ByteUtils.SIZE_OF_INT]; + + // Write value + System.arraycopy(valBytes, 0, outputValue, offsetTillNow, valBytes.length); + + // Generate MR key - 16 byte md5 + outputKey = new BytesWritable(md5er.digest(keyBytes)); + + } + + // Generate partition and node list this key is destined for + List partitionList = routingStrategy.getPartitionList(keyBytes); + Node[] partitionToNode = routingStrategy.getPartitionToNode(); + + for(int replicaType = 0; replicaType < partitionList.size(); replicaType++) { + + // Node id + ByteUtils.writeInt(outputValue, + partitionToNode[partitionList.get(replicaType)].getId(), + 0); + + if(getSaveKeys()) { + // Primary partition id + ByteUtils.writeInt(outputValue, partitionList.get(0), ByteUtils.SIZE_OF_INT); + + // Replica type + ByteUtils.writeBytes(outputValue, + replicaType, + 2 * ByteUtils.SIZE_OF_INT, + ByteUtils.SIZE_OF_BYTE); + } else { + // Partition id + ByteUtils.writeInt(outputValue, + partitionList.get(replicaType), + ByteUtils.SIZE_OF_INT); + } + BytesWritable outputVal = new BytesWritable(outputValue); + + output.collect(outputKey, outputVal); + + } md5er.reset(); } diff --git a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/AvroStoreBuilderMapper.java b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/AvroStoreBuilderMapper.java index 2318346387..4d52358dec 100644 --- a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/AvroStoreBuilderMapper.java +++ b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/AvroStoreBuilderMapper.java @@ -35,6 +35,7 @@ import voldemort.VoldemortException; import voldemort.cluster.Cluster; +import voldemort.cluster.Node; import voldemort.routing.ConsistentRoutingStrategy; import voldemort.serialization.DefaultSerializerFactory; import voldemort.serialization.Serializer; @@ -46,7 +47,6 @@ import voldemort.store.compress.CompressionStrategy; import voldemort.store.compress.CompressionStrategyFactory; import voldemort.store.readonly.mr.utils.HadoopUtils; -import voldemort.store.readonly.mr.utils.MapperKeyValueWriter; import voldemort.utils.ByteUtils; import voldemort.xml.ClusterMapper; import voldemort.xml.StoreDefinitionsMapper; @@ -92,24 +92,93 @@ public void map(GenericData.Record record, byte[] keyBytes = keySerializer.toBytes(record.get(keyField)); byte[] valBytes = valueSerializer.toBytes(record.get(valField)); - MapperKeyValueWriter mapWriter = new MapperKeyValueWriter(); - - List mapperList = mapWriter.map(routingStrategy, - 
keySerializer, - valueSerializer, - valueCompressor, - keyCompressor, - keySerializerDefinition, - valueSerializerDefinition, - keyBytes, - valBytes, - getSaveKeys(), - md5er); - - for(int i = 0; i < mapperList.size(); i++) { - voldemort.utils.Pair pair = (voldemort.utils.Pair) mapperList.get(i); - BytesWritable outputKey = pair.getFirst(); - BytesWritable outputVal = pair.getSecond(); + // Compress key and values if required + if(keySerializerDefinition.hasCompression()) { + keyBytes = keyCompressor.deflate(keyBytes); + } + + if(valueSerializerDefinition.hasCompression()) { + valBytes = valueCompressor.deflate(valBytes); + } + + // Get the output byte arrays ready to populate + byte[] outputValue; + BytesWritable outputKey; + + // Leave initial offset for (a) node id (b) partition id + // since they are written later + int offsetTillNow = 2 * ByteUtils.SIZE_OF_INT; + + if(getSaveKeys()) { + + // In order - 4 ( for node id ) + 4 ( partition id ) + 1 ( + // replica + // type - primary | secondary | tertiary... ] + 4 ( key size ) + // size ) + 4 ( value size ) + key + value + outputValue = new byte[valBytes.length + keyBytes.length + ByteUtils.SIZE_OF_BYTE + 4 + * ByteUtils.SIZE_OF_INT]; + + // Write key length - leave byte for replica type + offsetTillNow += ByteUtils.SIZE_OF_BYTE; + ByteUtils.writeInt(outputValue, keyBytes.length, offsetTillNow); + + // Write value length + offsetTillNow += ByteUtils.SIZE_OF_INT; + ByteUtils.writeInt(outputValue, valBytes.length, offsetTillNow); + + // Write key + offsetTillNow += ByteUtils.SIZE_OF_INT; + System.arraycopy(keyBytes, 0, outputValue, offsetTillNow, keyBytes.length); + + // Write value + offsetTillNow += keyBytes.length; + System.arraycopy(valBytes, 0, outputValue, offsetTillNow, valBytes.length); + + // Generate MR key - upper 8 bytes of 16 byte md5 + outputKey = new BytesWritable(ByteUtils.copy(md5er.digest(keyBytes), + 0, + 2 * ByteUtils.SIZE_OF_INT)); + + } else { + + // In order - 4 ( for node id ) + 4 ( partition id ) + value + outputValue = new byte[valBytes.length + 2 * ByteUtils.SIZE_OF_INT]; + + // Write value + System.arraycopy(valBytes, 0, outputValue, offsetTillNow, valBytes.length); + + // Generate MR key - 16 byte md5 + outputKey = new BytesWritable(md5er.digest(keyBytes)); + + } + + // Generate partition and node list this key is destined for + List partitionList = routingStrategy.getPartitionList(keyBytes); + Node[] partitionToNode = routingStrategy.getPartitionToNode(); + + for(int replicaType = 0; replicaType < partitionList.size(); replicaType++) { + + // Node id + ByteUtils.writeInt(outputValue, + partitionToNode[partitionList.get(replicaType)].getId(), + 0); + + if(getSaveKeys()) { + // Primary partition id + ByteUtils.writeInt(outputValue, partitionList.get(0), ByteUtils.SIZE_OF_INT); + + // Replica type + ByteUtils.writeBytes(outputValue, + replicaType, + 2 * ByteUtils.SIZE_OF_INT, + ByteUtils.SIZE_OF_BYTE); + } else { + // Partition id + ByteUtils.writeInt(outputValue, + partitionList.get(replicaType), + ByteUtils.SIZE_OF_INT); + } + BytesWritable outputVal = new BytesWritable(outputValue); ByteBuffer keyBuffer = null, valueBuffer = null; @@ -118,7 +187,6 @@ public void map(GenericData.Record record, keyBuffer.put(md5KeyBytes); keyBuffer.rewind(); - byte[] outputValue = outputVal.getBytes(); valueBuffer = ByteBuffer.allocate(outputValue.length); valueBuffer.put(outputValue); valueBuffer.rewind(); @@ -128,7 +196,6 @@ public void map(GenericData.Record record, collector.collect(p); } - md5er.reset(); } diff --git 
a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/HadoopStoreBuilder.java b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/HadoopStoreBuilder.java index c1e3c9c70f..529c8b30bd 100644 --- a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/HadoopStoreBuilder.java +++ b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/HadoopStoreBuilder.java @@ -34,6 +34,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.mapred.Counters; import org.apache.hadoop.mapred.FileInputFormat; @@ -69,6 +70,7 @@ public class HadoopStoreBuilder { public static final long MIN_CHUNK_SIZE = 1L; public static final long MAX_CHUNK_SIZE = (long) (1.9 * 1024 * 1024 * 1024); public static final int DEFAULT_BUFFER_SIZE = 64 * 1024; + public static final short HADOOP_FILE_PERMISSION = 493; private static final Logger logger = Logger.getLogger(HadoopStoreBuilder.class); @@ -470,6 +472,8 @@ public void build() { logger.info("No data generated for node " + node.getId() + ". Generating empty folder"); outputFs.mkdirs(nodePath); // Create empty folder + outputFs.setPermission(nodePath, new FsPermission(HADOOP_FILE_PERMISSION)); + logger.info("Setting permission to 755 for " + nodePath); } if(checkSumType != CheckSumType.NONE) { @@ -518,7 +522,10 @@ public boolean accept(Path arg0) { } // Write metadata - FSDataOutputStream metadataStream = outputFs.create(new Path(nodePath, ".metadata")); + Path metadataPath = new Path(nodePath, ".metadata"); + FSDataOutputStream metadataStream = outputFs.create(metadataPath); + outputFs.setPermission(metadataPath, new FsPermission(HADOOP_FILE_PERMISSION)); + logger.info("Setting permission to 755 for " + metadataPath); metadataStream.write(metadata.toJsonString().getBytes()); metadataStream.flush(); metadataStream.close(); diff --git a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/AbstractHadoopJob.java b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/AbstractHadoopJob.java index bcd0170e9f..53a126a627 100644 --- a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/AbstractHadoopJob.java +++ b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/AbstractHadoopJob.java @@ -59,6 +59,10 @@ public abstract class AbstractHadoopJob extends AbstractJob { private final Props _props; private RunningJob _runningJob; + private final static String voldemortLibPath = "voldemort.distributedcache"; + + private final static String hadoopLibPath = "hdfs.default.classpath.dir"; + public AbstractHadoopJob(String name, Props props) { super(name); this._props = props; @@ -216,41 +220,67 @@ public boolean accept(Path arg0) { } } - String hadoopCacheJarDir = _props.getString("hdfs.default.classpath.dir", null); - if(hadoopCacheJarDir != null) { + // this property can be set by azkaban to manage voldemort lib path on + // hdfs + addToDistributedCache(voldemortLibPath, conf); + + boolean isAddFiles = _props.getBoolean("hdfs.default.classpath.dir.enable", false); + if(isAddFiles) { + addToDistributedCache(hadoopLibPath, conf); + } + + // May want to add this to HadoopUtils, but will await refactoring + for(String key: getProps().keySet()) { + String lowerCase = key.toLowerCase(); + if(lowerCase.startsWith(HADOOP_PREFIX)) { + String newKey = 
key.substring(HADOOP_PREFIX.length()); + conf.set(newKey, getProps().get(key)); + } + } + + HadoopUtils.setPropsInJob(conf, getProps()); + + // http://hadoop.apache.org/docs/r1.1.1/mapred_tutorial.html#Job+Credentials + + // The MapReduce tokens are provided so that tasks can spawn jobs if + // they wish to. + // The tasks authenticate to the JobTracker via the MapReduce delegation + // tokens. + if(System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) { + conf.set("mapreduce.job.credentials.binary", + System.getenv("HADOOP_TOKEN_FILE_LOCATION")); + } + return conf; + } + + /* + * Loads jar files into distributed cache This way the mappers and reducers + * have the jars they need at run time + */ + private void addToDistributedCache(String propertyName, JobConf conf) throws IOException { + String jarDir = _props.getString(propertyName, null); + if(jarDir != null) { FileSystem fs = FileSystem.get(conf); if(fs != null) { - FileStatus[] status = fs.listStatus(new Path(hadoopCacheJarDir)); + FileStatus[] status = fs.listStatus(new Path(jarDir)); if(status != null) { for(int i = 0; i < status.length; ++i) { if(!status[i].isDir()) { - Path path = new Path(hadoopCacheJarDir, status[i].getPath().getName()); + Path path = new Path(jarDir, status[i].getPath().getName()); info("Adding Jar to Distributed Cache Archive File:" + path); DistributedCache.addFileToClassPath(path, conf); } } } else { - info("hdfs.default.classpath.dir " + hadoopCacheJarDir + " is empty."); + info(propertyName + jarDir + " is empty."); } } else { - info("hdfs.default.classpath.dir " + hadoopCacheJarDir - + " filesystem doesn't exist"); - } - } - - // May want to add this to HadoopUtils, but will await refactoring - for(String key: getProps().keySet()) { - String lowerCase = key.toLowerCase(); - if(lowerCase.startsWith(HADOOP_PREFIX)) { - String newKey = key.substring(HADOOP_PREFIX.length()); - conf.set(newKey, getProps().get(key)); + info(propertyName + jarDir + " filesystem doesn't exist"); } } - HadoopUtils.setPropsInJob(conf, getProps()); - return conf; } public Props getProps() { diff --git a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortBuildAndPushJob.java b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortBuildAndPushJob.java index eb87c77ac1..49128e0f2f 100644 --- a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortBuildAndPushJob.java +++ b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortBuildAndPushJob.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -40,6 +40,7 @@ import org.apache.hadoop.mapred.JobConf; import org.apache.log4j.Logger; +import voldemort.client.ClientConfig; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; import voldemort.cluster.Cluster; @@ -98,11 +99,19 @@ public class VoldemortBuildAndPushJob extends AbstractJob { private static final String AVRO_GENERIC_VERSIONED_TYPE_NAME = "avro-generic-versioned"; + // new properties for the push job + + private final String hdfsFetcherPort; + private final String hdfsFetcherProtocol; + /* Informed stuff */ private final String informedURL = "http://informed.corp.linkedin.com/_post"; private final List informedResults; private ExecutorService informedExecutor; + private String jsonKeyField; + private String jsonValueField; + public VoldemortBuildAndPushJob(String name, Props props) { super(name); this.props = props; @@ -134,6 +143,12 @@ public VoldemortBuildAndPushJob(String name, Props props) { this.informedResults = Lists.newArrayList(); this.informedExecutor = Executors.newFixedThreadPool(2); + this.hdfsFetcherProtocol = props.getString("voldemort.fetcher.protocol", "hftp"); + this.hdfsFetcherPort = props.getString("voldemort.fetcher.port", "50070"); + + log.info("voldemort.fetcher.protocol is set to : " + hdfsFetcherProtocol); + log.info("voldemort.fetcher.port is set to : " + hdfsFetcherPort); + isAvroJob = props.getBoolean("build.type.avro", false); // Set default to false @@ -161,6 +176,8 @@ public void run() throws Exception { boolean build = props.getBoolean("build", true); boolean push = props.getBoolean("push", true); + jsonKeyField = props.getString("key.selection", null); + jsonValueField = props.getString("value.selection", null); if(build && push && dataDirs.size() != 1) { // Should have only one data directory ( which acts like the parent // directory to all @@ -254,9 +271,16 @@ public void verifySchema(String url) throws Exception { String owners = props.getString("push.store.owners", ""); String keySchema = "\n\t\tjson\n\t\t" + schema.getKeyType() + "\n\t"; + if(jsonKeyField != null && jsonKeyField.length() > 0) + keySchema = "\n\t\tjson\n\t\t" + + schema.getKeyType().subtype(jsonKeyField) + "\n\t"; String valSchema = "\n\t\tjson\n\t\t" + schema.getValueType() + "\n\t"; + if(jsonValueField != null && jsonValueField.length() > 0) + valSchema = "\n\t\tjson\n\t\t" + + schema.getValueType().subtype(jsonValueField) + "\n\t"; + boolean hasCompression = false; if(props.containsKey("build.compress.value")) hasCompression = true; @@ -295,10 +319,10 @@ public void verifySchema(String url) throws Exception { // get store def from cluster log.info("Getting store definition from: " + url + " (node id " + this.nodeId + ")"); - AdminClient adminClient = new AdminClient(url, new AdminClientConfig()); + AdminClient adminClient = new AdminClient(url, new AdminClientConfig(), new ClientConfig()); try { - List remoteStoreDefs = adminClient.getRemoteStoreDefList(this.nodeId) - .getValue(); + List remoteStoreDefs = adminClient.metadataMgmtOps.getRemoteStoreDefList(this.nodeId) + .getValue(); boolean foundStore = false; // go over all store defs and see if one has the same name as the @@ -412,7 +436,7 @@ public void verifySchema(String url) throws Exception { log.info("Could not find store " + storeName + " on Voldemort. 
Adding it to all nodes "); - adminClient.addStore(newStoreDef); + adminClient.storeMgmtOps.addStore(newStoreDef); } // don't use newStoreDef because we want to ALWAYS use the JSON @@ -434,7 +458,7 @@ public void verifySchema(String url) throws Exception { valSchema))); cluster = adminClient.getAdminClientCluster(); } finally { - adminClient.stop(); + adminClient.close(); } } @@ -446,8 +470,8 @@ public String runBuildStore(Props props, String url) throws Exception { URI uri = new URI(url); Path outputDir = new Path(props.getString("build.output.dir"), uri.getHost()); Path inputPath = getInputPath(); - String keySelection = props.getString("build.key.selection", null); - String valSelection = props.getString("build.value.selection", null); + String keySelection = props.getString("key.selection", null); + String valSelection = props.getString("value.selection", null); CheckSumType checkSumType = CheckSum.fromString(props.getString("checksum.type", CheckSum.toString(CheckSumType.MD5))); boolean saveKeys = props.getBoolean("save.keys", true); @@ -632,10 +656,10 @@ public void verifyAvroSchemaAndVersions(String url, boolean isVersioned) throws // get store def from cluster log.info("Getting store definition from: " + url + " (node id " + this.nodeId + ")"); - AdminClient adminClient = new AdminClient(url, new AdminClientConfig()); + AdminClient adminClient = new AdminClient(url, new AdminClientConfig(), new ClientConfig()); try { - List remoteStoreDefs = adminClient.getRemoteStoreDefList(this.nodeId) - .getValue(); + List remoteStoreDefs = adminClient.metadataMgmtOps.getRemoteStoreDefList(this.nodeId) + .getValue(); boolean foundStore = false; // go over all store defs and see if one has the same name as the @@ -791,7 +815,7 @@ public void verifyAvroSchemaAndVersions(String url, boolean isVersioned) throws log.info("Could not find store " + storeName + " on Voldemort. Adding it to all nodes "); - adminClient.addStore(newStoreDef); + adminClient.storeMgmtOps.addStore(newStoreDef); } storeDefs = ImmutableList.of(VoldemortUtils.getStoreDef(VoldemortUtils.getStoreDefXml(storeName, @@ -806,7 +830,7 @@ public void verifyAvroSchemaAndVersions(String url, boolean isVersioned) throws valSchema))); cluster = adminClient.getAdminClientCluster(); } finally { - adminClient.stop(); + adminClient.close(); } } diff --git a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortMultiStoreBuildAndPushJob.java b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortMultiStoreBuildAndPushJob.java index 80280a5d53..8ed0d86c9b 100644 --- a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortMultiStoreBuildAndPushJob.java +++ b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortMultiStoreBuildAndPushJob.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -43,6 +43,7 @@ import org.apache.log4j.Logger; import voldemort.VoldemortException; +import voldemort.client.ClientConfig; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; import voldemort.cluster.Cluster; @@ -253,7 +254,8 @@ public List call() throws Exception { // verification of // schema + pushing adminClient = new AdminClient(url, - new AdminClientConfig()); + new AdminClientConfig(), + new ClientConfig()); // Verify the store // exists ( If not, @@ -309,7 +311,7 @@ public List call() throws Exception { + url + "'"); - Map pushVersions = adminClient.getROMaxVersion(Lists.newArrayList(storeName)); + Map pushVersions = adminClient.readonlyOps.getROMaxVersion(Lists.newArrayList(storeName)); if(pushVersions == null || !pushVersions.containsKey(storeName)) { @@ -388,7 +390,7 @@ public List call() throws Exception { TimeUnit.SECONDS); } if(adminClient != null) { - adminClient.stop(); + adminClient.close(); } } } @@ -431,7 +433,9 @@ public List call() throws Exception { AdminClient adminClient = null; try { - adminClient = new AdminClient(cluster, new AdminClientConfig()); + adminClient = new AdminClient(cluster, + new AdminClientConfig(), + new ClientConfig()); for(final String storeName: storeNames) { // Check if the [ cluster , store name ] succeeded. We // need to roll it back @@ -448,7 +452,9 @@ public List call() throws Exception { log.info("Deleting data ( " + nodeDir + " ) for successful pushes to '" + clusterUrl + "' and store '" + storeName + "' and node " + nodeId); - adminClient.failedFetchStore(nodeId, storeName, nodeDir); + adminClient.readonlyOps.failedFetchStore(nodeId, + storeName, + nodeDir); log.info("Successfully deleted data for successful pushes to '" + clusterUrl + "' and store '" + storeName + "' and node " + nodeId); @@ -464,7 +470,7 @@ public List call() throws Exception { } } finally { if(adminClient != null) { - adminClient.stop(); + adminClient.close(); } } } @@ -500,7 +506,9 @@ public List call() throws Exception { String url = clusterUrls.get(index); Cluster cluster = urlToCluster.get(url); - AdminClient adminClient = new AdminClient(cluster, new AdminClientConfig()); + AdminClient adminClient = new AdminClient(cluster, + new AdminClientConfig(), + new ClientConfig()); log.info("Swapping all stores on cluster " + url); try { @@ -520,10 +528,10 @@ public List call() throws Exception { previousNodeDirPerClusterStore.put(key, Pair.create(node.getId(), - adminClient.swapStore(node.getId(), - storeName, - nodeDirPerClusterStore.get(key) - .get(node.getId())))); + adminClient.readonlyOps.swapStore(node.getId(), + storeName, + nodeDirPerClusterStore.get(key) + .get(node.getId())))); log.info("Successfully swapped '" + storeName + "' store on cluster " + url + " and node " + node.getId()); @@ -532,7 +540,7 @@ public List call() throws Exception { } } finally { if(adminClient != null) { - adminClient.stop(); + adminClient.close(); } } } @@ -548,16 +556,18 @@ public List call() throws Exception { log.info("Rolling back for cluster " + url + " and store " + clusterStoreTuple.getSecond()); - AdminClient adminClient = new AdminClient(cluster, new AdminClientConfig()); + AdminClient adminClient = new AdminClient(cluster, + new AdminClientConfig(), + new ClientConfig()); try { for(Pair nodeToPreviousDir: nodeToPreviousDirs) { log.info("Rolling back for cluster " + url + " and store " + clusterStoreTuple.getSecond() + " and node " + nodeToPreviousDir.getFirst() + " to dir " + 
nodeToPreviousDir.getSecond()); - adminClient.rollbackStore(nodeToPreviousDir.getFirst(), - nodeToPreviousDir.getSecond(), - ReadOnlyUtils.getVersionId(new File(nodeToPreviousDir.getSecond()))); + adminClient.readonlyOps.rollbackStore(nodeToPreviousDir.getFirst(), + nodeToPreviousDir.getSecond(), + ReadOnlyUtils.getVersionId(new File(nodeToPreviousDir.getSecond()))); log.info("Successfully rolled back for cluster " + url + " and store " + clusterStoreTuple.getSecond() + " and node " + nodeToPreviousDir.getFirst() + " to dir " @@ -566,7 +576,7 @@ public List call() throws Exception { } } finally { if(adminClient != null) { - adminClient.stop(); + adminClient.close(); } } } @@ -667,8 +677,8 @@ public Pair verifySchema(String storeName, // get store def from cluster log.info("Getting store definition from: " + url + " ( node id " + this.nodeId + " )"); - List remoteStoreDefs = adminClient.getRemoteStoreDefList(this.nodeId) - .getValue(); + List remoteStoreDefs = adminClient.metadataMgmtOps.getRemoteStoreDefList(this.nodeId) + .getValue(); boolean foundStore = false; // go over all store defs and see if one has the same name as the store @@ -771,7 +781,7 @@ public Pair verifySchema(String storeName, log.info("Could not find store " + storeName + " on Voldemort. Adding it to all nodes for cluster " + url); - adminClient.addStore(newStoreDef); + adminClient.storeMgmtOps.addStore(newStoreDef); } // don't use newStoreDef because we want to ALWAYS use the JSON diff --git a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortRollbackJob.java b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortRollbackJob.java index bb22c1b1fd..696cb72ae4 100644 --- a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortRollbackJob.java +++ b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortRollbackJob.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -27,6 +27,7 @@ import org.apache.log4j.Logger; import voldemort.VoldemortException; +import voldemort.client.ClientConfig; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; import voldemort.cluster.Cluster; @@ -75,7 +76,9 @@ public void run() throws Exception { ExecutorService service = null; try { service = Executors.newCachedThreadPool(); - adminClient = new AdminClient(clusterUrl, new AdminClientConfig()); + adminClient = new AdminClient(clusterUrl, + new AdminClientConfig(), + new ClientConfig()); Cluster cluster = adminClient.getAdminClientCluster(); AdminStoreSwapper swapper = new AdminStoreSwapper(cluster, service, @@ -88,8 +91,8 @@ public void run() throws Exception { // Get the current version for all stores on all nodes Map> previousVersions = Maps.newHashMap(); for(Node node: cluster.getNodes()) { - Map currentVersion = adminClient.getROCurrentVersion(node.getId(), - storeNames); + Map currentVersion = adminClient.readonlyOps.getROCurrentVersion(node.getId(), + storeNames); log.info("Retrieving current version information on node " + node.getId()); Map previousVersion = Maps.newHashMap(); @@ -123,7 +126,7 @@ public void run() throws Exception { service = null; } if(adminClient != null) { - adminClient.stop(); + adminClient.close(); adminClient = null; } } diff --git a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortSwapJob.java b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortSwapJob.java index f4e51b5dd1..6da1767ee6 100644 --- a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortSwapJob.java +++ b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortSwapJob.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -28,6 +28,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.JobConf; +import voldemort.client.ClientConfig; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; import voldemort.cluster.Cluster; @@ -43,16 +44,23 @@ public class VoldemortSwapJob extends AbstractJob { private final Props _props; private VoldemortSwapConf swapConf; + private String hdfsFetcherProtocol; + private String hdfsFetcherPort; public VoldemortSwapJob(String id, Props props) throws IOException { super(id); _props = props; + + this.hdfsFetcherProtocol = props.getString("voldemort.fetcher.protocol", "hftp"); + this.hdfsFetcherPort = props.getString("voldemort.fetcher.port", "50070"); swapConf = new VoldemortSwapConf(_props); } public VoldemortSwapJob(String id, Props props, VoldemortSwapConf conf) throws IOException { super(id); _props = props; + this.hdfsFetcherProtocol = props.getString("voldemort.fetcher.protocol", "hftp"); + this.hdfsFetcherPort = props.getString("voldemort.fetcher.port", "50070"); swapConf = conf; } @@ -149,17 +157,6 @@ public void run() throws Exception { Path dataPath = new Path(dataDir); dataDir = dataPath.makeQualified(FileSystem.get(conf)).toString(); - /* - * Set the protocol according to config: webhdfs if its enabled - * Otherwise use hftp. 
- */ - Configuration hadoopConfig = new Configuration(); - String protocolName = hadoopConfig.get("dfs.webhdfs.enabled"); - String protocolPort = ""; - if(hadoopConfig.get("dfs.http.address").split(":").length >= 2) - protocolPort = hadoopConfig.get("dfs.http.address").split(":")[1]; - protocolName = (protocolName == null) ? "hftp" : "webhdfs"; - /* * Replace the default protocol and port with the one derived as above */ @@ -171,25 +168,24 @@ public void run() throws Exception { existingPort = pathComponents[2].split("/")[0]; } info("Existing protocol = " + existingProtocol + " and port = " + existingPort); - if(protocolName.length() > 0 && protocolPort.length() > 0) { - dataDir = dataDir.replaceFirst(existingProtocol, protocolName); - dataDir = dataDir.replaceFirst(existingPort, protocolPort); + if(hdfsFetcherProtocol.length() > 0 && hdfsFetcherPort.length() > 0) { + dataDir = dataDir.replaceFirst(existingProtocol, this.hdfsFetcherProtocol); + dataDir = dataDir.replaceFirst(existingPort, this.hdfsFetcherPort); } - info("dfs.webhdfs.enabled = " + hadoopConfig.get("dfs.webhdfs.enabled") - + " and new protocol = " + protocolName + " and port = " + protocolPort); // Create admin client AdminClient client = new AdminClient(cluster, new AdminClientConfig().setMaxConnectionsPerNode(cluster.getNumberOfNodes()) .setAdminConnectionTimeoutSec(httpTimeoutMs / 1000) - .setMaxBackoffDelayMs(swapConf.getMaxBackoffDelayMs())); + .setMaxBackoffDelayMs(swapConf.getMaxBackoffDelayMs()), + new ClientConfig()); if(pushVersion == -1L) { // Need to retrieve max version ArrayList stores = new ArrayList(); stores.add(storeName); - Map pushVersions = client.getROMaxVersion(stores); + Map pushVersions = client.readonlyOps.getROMaxVersion(stores); if(pushVersions == null || !pushVersions.containsKey(storeName)) { throw new RuntimeException("Push version could not be determined for store " diff --git a/contrib/hadoop-store-builder/test/voldemort/store/readonly/fetcher/HDFSFetcherAdvancedTest.java b/contrib/hadoop-store-builder/test/voldemort/store/readonly/fetcher/HDFSFetcherAdvancedTest.java new file mode 100644 index 0000000000..6157d58a20 --- /dev/null +++ b/contrib/hadoop-store-builder/test/voldemort/store/readonly/fetcher/HDFSFetcherAdvancedTest.java @@ -0,0 +1,366 @@ +/* + * Copyright 2008-2009 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
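The hunk above switches VoldemortSwapJob to take the fetch protocol and port from job props — voldemort.fetcher.protocol (default "hftp") and voldemort.fetcher.port (default "50070") — instead of probing dfs.webhdfs.enabled and dfs.http.address on the Hadoop Configuration. The following is a minimal standalone sketch of the URL rewrite the job now performs with those two values, using plain java.util.Properties as a stand-in for the job's Props; the class and method names are invented for illustration only.

    import java.util.Properties;

    public class FetchUrlRewriteSketch {

        // Rewrite the scheme and port of a qualified store URL the way the swap
        // job now does, using values taken from props rather than Hadoop config.
        static String rewrite(String dataDir, Properties props) {
            String fetcherProtocol = props.getProperty("voldemort.fetcher.protocol", "hftp");
            String fetcherPort = props.getProperty("voldemort.fetcher.port", "50070");

            // e.g. "hdfs://namenode:9000/store/v1" -> protocol "hdfs", port "9000"
            String[] pathComponents = dataDir.split(":");
            String existingProtocol = pathComponents[0];
            String existingPort = "";
            if(pathComponents.length >= 3)
                existingPort = pathComponents[2].split("/")[0];

            if(fetcherProtocol.length() > 0 && fetcherPort.length() > 0
               && existingPort.length() > 0) {
                dataDir = dataDir.replaceFirst(existingProtocol, fetcherProtocol);
                dataDir = dataDir.replaceFirst(existingPort, fetcherPort);
            }
            return dataDir;
        }

        public static void main(String[] args) {
            Properties props = new Properties();
            props.setProperty("voldemort.fetcher.protocol", "webhdfs");
            // Prints webhdfs://namenode:50070/store/v1
            System.out.println(rewrite("hdfs://namenode:9000/store/v1", props));
        }
    }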
+ */ + +package voldemort.store.readonly.fetcher; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +import java.io.File; +import java.io.IOException; +import java.lang.reflect.Method; +import java.util.Arrays; +import java.util.Random; + +import org.apache.commons.codec.binary.Hex; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.junit.Test; +import org.mockito.Mockito; +import org.mortbay.jetty.EofException; + +import voldemort.TestUtils; +import voldemort.server.VoldemortConfig; +import voldemort.store.readonly.ReadOnlyStorageFormat; +import voldemort.store.readonly.ReadOnlyStorageMetadata; +import voldemort.store.readonly.checksum.CheckSum; +import voldemort.store.readonly.checksum.CheckSum.CheckSumType; +import voldemort.store.readonly.checksum.CheckSumTests; +import voldemort.store.readonly.fetcher.HdfsFetcher.CopyStats; +import voldemort.utils.Utils; + +/* + * This test suite tests the HDFSFetcher We test the fetch from hadoop by + * simulating exceptions during fetches + */ +public class HDFSFetcherAdvancedTest { + + public static final Random UNSEEDED_RANDOM = new Random(); + + /* + * Tests that HdfsFetcher can correctly fetch a file in happy path + */ + @Test + public void testCheckSumMetadata() throws Exception { + + // Generate 0_0.[index | data] and their corresponding metadata + File testSourceDirectory = createTempDir(); + File testDestinationDirectory = testSourceDirectory; + + File indexFile = new File(testSourceDirectory, "0_0.index"); + FileUtils.writeByteArrayToFile(indexFile, TestUtils.randomBytes(100)); + + File dataFile = new File(testSourceDirectory, "0_0.data"); + FileUtils.writeByteArrayToFile(dataFile, TestUtils.randomBytes(400)); + + HdfsFetcher fetcher = new HdfsFetcher(); + + File metadataFile = new File(testSourceDirectory, ".metadata"); + + ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata(); + metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode()); + + metadata.add(ReadOnlyStorageMetadata.CHECKSUM_TYPE, CheckSum.toString(CheckSumType.MD5)); + // Correct metadata checksum - MD5 + metadata.add(ReadOnlyStorageMetadata.CHECKSUM, + new String(Hex.encodeHex(CheckSumTests.calculateCheckSum(testSourceDirectory.listFiles(), + CheckSumType.MD5)))); + FileUtils.writeStringToFile(metadataFile, metadata.toJsonString()); + + File tempDest = new File(testDestinationDirectory.getAbsolutePath() + "1"); + if(tempDest.exists()) { + + deleteDir(tempDest); + } + + File fetchedFile = fetcher.fetch(testSourceDirectory.getAbsolutePath(), + testDestinationDirectory.getAbsolutePath() + "1"); + + assertNotNull(fetchedFile); + assertEquals(fetchedFile.getAbsolutePath(), testDestinationDirectory.getAbsolutePath() + + "1"); + + tempDest = new File(testDestinationDirectory.getAbsolutePath() + "1"); + if(tempDest.exists()) { + + deleteDir(tempDest); + } + + } + + public static File createTempDir() { + return createTempDir(new File(System.getProperty("java.io.tmpdir"))); + } + + /** + * Create a temporary directory that is a child of the given directory + * + * @param parent The parent directory + * @return The temporary directory + */ + public static File createTempDir(File parent) { + File temp = new File(parent, "hdfsfetchertestadvanced"); + temp.delete(); + temp.mkdir(); + 
temp.deleteOnExit(); + return temp; + } + + /** + * Convenient method to execute private methods from other classes. + * + * @param test Instance of the class we want to test + * @param methodName Name of the method we want to test + * @param params Arguments we want to pass to the method + * @return Object with the result of the executed method + * @throws Exception + */ + private Object invokePrivateMethod(Object test, String methodName, Object params[]) + throws Exception { + Object ret = null; + + final Method[] methods = test.getClass().getDeclaredMethods(); + for(int i = 0; i < methods.length; ++i) { + if(methods[i].getName().equals(methodName)) { + methods[i].setAccessible(true); + ret = methods[i].invoke(test, params); + break; + } + } + + return ret; + } + + /* + * Tests that HdfsFetcher can correctly fetch a file when there is an + * IOException, specifically an EofException during the fetch + */ + @Test + public void testEofExceptionIntermittent() throws Exception { + + File testSourceDirectory = createTempDir(); + File testDestinationDirectory = testSourceDirectory; + + File indexFile = new File(testSourceDirectory, "0_0.index"); + byte[] indexBytes = TestUtils.randomBytes(100); + FileUtils.writeByteArrayToFile(indexFile, indexBytes); + + final Path source = new Path(indexFile.getAbsolutePath()); + CheckSum fileCheckSumGenerator = CheckSum.getInstance(CheckSumType.MD5); + + fileCheckSumGenerator.update(indexBytes); + byte[] checksumCalculated = calculateCheckSumForFile(source); + + HdfsFetcher fetcher = new HdfsFetcher(); + + Configuration config = new Configuration(); + + FileSystem fs = source.getFileSystem(config); + + FileSystem spyfs = Mockito.spy(fs); + CopyStats stats = new CopyStats(testSourceDirectory.getAbsolutePath(), sizeOfPath(fs, + source)); + + File destination = new File(testDestinationDirectory.getAbsolutePath() + "1"); + Utils.mkdirs(destination); + File copyLocation = new File(destination, "0_0.index"); + + Mockito.doThrow(new IOException()) + .doAnswer(Mockito.CALLS_REAL_METHODS) + .when(spyfs) + .open(source); + + Object[] params = { spyfs, source, copyLocation, stats, CheckSumType.MD5 }; + + CheckSum ckSum = (CheckSum) this.invokePrivateMethod(fetcher, + "copyFileWithCheckSum", + params); + + assertEquals(Arrays.equals(ckSum.getCheckSum(), checksumCalculated), true); + + } + + /* + * Tests that HdfsFetcher can correctly fetch a file when there is an + * IOException, specifically an EofException during the fetch this test case + * is different from the earlier one since it simulates an excpetion midway + * a fetch + */ + + @Test + public void testEofExceptionIntermittentDuringFetch() throws Exception { + + File testSourceDirectory = createTempDir(); + File testDestinationDirectory = testSourceDirectory; + + File indexFile = new File(testSourceDirectory, "0_0.index"); + byte[] indexBytes = TestUtils.randomBytes(VoldemortConfig.DEFAULT_BUFFER_SIZE * 3); + FileUtils.writeByteArrayToFile(indexFile, indexBytes); + + final Path source = new Path(indexFile.getAbsolutePath()); + CheckSum fileCheckSumGenerator = CheckSum.getInstance(CheckSumType.MD5); + + fileCheckSumGenerator.update(indexBytes); + byte[] checksumCalculated = calculateCheckSumForFile(source); + + HdfsFetcher fetcher = new HdfsFetcher(); + + Configuration config = new Configuration(); + + FileSystem fs = source.getFileSystem(config); + + FileSystem spyfs = Mockito.spy(fs); + CopyStats stats = new CopyStats(testSourceDirectory.getAbsolutePath(), sizeOfPath(fs, + source)); + + File destination = new 
File(testDestinationDirectory.getAbsolutePath() + "1"); + Utils.mkdirs(destination); + File copyLocation = new File(destination, "0_0.index"); + + FSDataInputStream input = null; + + input = fs.open(source); + FSDataInputStream spyinput = Mockito.spy(input); + + Mockito.doAnswer(Mockito.CALLS_REAL_METHODS) + .doThrow(new EofException()) + .when(spyinput) + .read(); + + Mockito.doReturn(spyinput).doReturn(input).when(spyfs).open(source); + + Object[] params = { spyfs, source, copyLocation, stats, CheckSumType.MD5 }; + + CheckSum ckSum = (CheckSum) this.invokePrivateMethod(fetcher, + "copyFileWithCheckSum", + params); + + assertEquals(Arrays.equals(ckSum.getCheckSum(), checksumCalculated), true); + + } + + /* + * Tests that HdfsFetcher can correctly handle when there is an + * RuntimeException + * + * Expected- the exception should be consumed without spilling it over + */ + + @Test + public void testIntermittentRuntimeExceptions() throws Exception { + + File testSourceDirectory = createTempDir(); + File testDestinationDirectory = createTempDir(); + + File indexFile = new File(testSourceDirectory, "0_0.index"); + byte[] indexBytes = TestUtils.randomBytes(100); + FileUtils.writeByteArrayToFile(indexFile, indexBytes); + + final Path source = new Path(indexFile.getAbsolutePath()); + CheckSum fileCheckSumGenerator = CheckSum.getInstance(CheckSumType.MD5); + + fileCheckSumGenerator.update(indexBytes); + + HdfsFetcher fetcher = new HdfsFetcher(); + + Configuration config = new Configuration(); + + FileSystem fs = source.getFileSystem(config); + + FileSystem spyfs = Mockito.spy(fs); + CopyStats stats = new CopyStats(testSourceDirectory.getAbsolutePath(), sizeOfPath(fs, + source)); + + File destination = new File(testDestinationDirectory.getAbsolutePath() + "1"); + Utils.mkdirs(destination); + File copyLocation = new File(destination, "0_0.index"); + + Mockito.doThrow(new RuntimeException()) + .doAnswer(Mockito.CALLS_REAL_METHODS) + .when(spyfs) + .open(source); + + Object[] params = { spyfs, source, copyLocation, stats, CheckSumType.MD5 }; + + CheckSum ckSum = (CheckSum) this.invokePrivateMethod(fetcher, + "copyFileWithCheckSum", + params); + + } + + private long sizeOfPath(FileSystem fs, Path path) throws IOException { + long size = 0; + FileStatus[] statuses = fs.listStatus(path); + if(statuses != null) { + for(FileStatus status: statuses) { + if(status.isDir()) + size += sizeOfPath(fs, status.getPath()); + else + size += status.getLen(); + } + } + return size; + } + + /* + * Helper method to delete a non empty directory + */ + public static boolean deleteDir(File dir) { + if(dir.isDirectory()) { + String[] children = dir.list(); + for(int i = 0; i < children.length; i++) { + boolean success = deleteDir(new File(dir, children[i])); + if(!success) { + return false; + } + } + } + return dir.delete(); + } + + /* + * Helper method to calculate checksum for a single file + */ + private byte[] calculateCheckSumForFile(Path source) throws Exception { + CheckSum fileCheckSumGenerator = CheckSum.getInstance(CheckSumType.MD5); + byte[] buffer = new byte[VoldemortConfig.DEFAULT_BUFFER_SIZE]; + + FSDataInputStream input = null; + + Configuration config = new Configuration(); + + FileSystem fs = source.getFileSystem(config); + input = fs.open(source); + + while(true) { + int read = input.read(buffer); + if(read < 0) { + break; + } + // Update the per file checksum + if(fileCheckSumGenerator != null) { + fileCheckSumGenerator.update(buffer, 0, read); + } + + } + + return fileCheckSumGenerator.getCheckSum(); + 
} +} diff --git a/contrib/krati/src/java/voldemort/store/krati/KratiStorageConfiguration.java b/contrib/krati/src/java/voldemort/store/krati/KratiStorageConfiguration.java index 5f3a41574c..f5d2590e62 100644 --- a/contrib/krati/src/java/voldemort/store/krati/KratiStorageConfiguration.java +++ b/contrib/krati/src/java/voldemort/store/krati/KratiStorageConfiguration.java @@ -8,6 +8,7 @@ import org.apache.log4j.Logger; import voldemort.VoldemortException; +import voldemort.routing.RoutingStrategy; import voldemort.server.VoldemortConfig; import voldemort.store.StorageConfiguration; import voldemort.store.StorageEngine; @@ -45,7 +46,8 @@ public KratiStorageConfiguration(VoldemortConfig config) { public void close() {} - public StorageEngine getStore(StoreDefinition storeDef) { + public StorageEngine getStore(StoreDefinition storeDef, + RoutingStrategy strategy) { synchronized(lock) { File storeDir = new File(dataDirectory, storeDef.getName()); if(!storeDir.exists()) { diff --git a/contrib/krati/src/java/voldemort/store/krati/KratiStorageEngine.java b/contrib/krati/src/java/voldemort/store/krati/KratiStorageEngine.java index 098c083df3..c30eea305f 100644 --- a/contrib/krati/src/java/voldemort/store/krati/KratiStorageEngine.java +++ b/contrib/krati/src/java/voldemort/store/krati/KratiStorageEngine.java @@ -20,25 +20,21 @@ import org.apache.log4j.Logger; import voldemort.VoldemortException; -import voldemort.store.NoSuchCapabilityException; -import voldemort.store.StorageEngine; -import voldemort.store.StoreCapabilityType; +import voldemort.store.AbstractStorageEngine; import voldemort.store.StoreUtils; import voldemort.utils.ByteArray; import voldemort.utils.ClosableIterator; import voldemort.utils.Pair; import voldemort.utils.StripedLock; -import voldemort.utils.Utils; import voldemort.versioning.ObsoleteVersionException; import voldemort.versioning.Occurred; import voldemort.versioning.VectorClock; import voldemort.versioning.Version; import voldemort.versioning.Versioned; -public class KratiStorageEngine implements StorageEngine { +public class KratiStorageEngine extends AbstractStorageEngine { private static final Logger logger = Logger.getLogger(KratiStorageEngine.class); - private final String name; private final DynamicDataStore datastore; private final StripedLock locks; @@ -49,7 +45,7 @@ public KratiStorageEngine(String name, double hashLoadFactor, int initLevel, File dataDirectory) { - this.name = Utils.notNull(name); + super(name); try { this.datastore = new DynamicDataStore(dataDirectory, initLevel, @@ -64,16 +60,7 @@ public KratiStorageEngine(String name, } - public Object getCapability(StoreCapabilityType capability) { - throw new NoSuchCapabilityException(capability, getName()); - } - - public String getName() { - return this.name; - } - - public void close() throws VoldemortException {} - + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { @@ -81,19 +68,22 @@ public Map>> getAll(Iterable keys, return StoreUtils.getAll(this, keys, null); } + @Override public List getVersions(ByteArray key) { return StoreUtils.getVersions(get(key, null)); } + @Override public void truncate() { try { datastore.clear(); } catch(Exception e) { - logger.error("Failed to truncate store '" + name + "': ", e); - throw new VoldemortException("Failed to truncate store '" + name + "'."); + logger.error("Failed to truncate store '" + getName() + "': ", e); + throw new VoldemortException("Failed to truncate store '" + getName() + "'."); } } + @Override public List> 
get(ByteArray key, byte[] transforms) throws VoldemortException { StoreUtils.assertValidKey(key); try { @@ -104,6 +94,7 @@ public List> get(ByteArray key, byte[] transforms) throws Vold } } + @Override public ClosableIterator>> entries() { List>> returnedList = new ArrayList>>(); DataArray array = datastore.getDataArray(); @@ -143,10 +134,22 @@ public ClosableIterator>> entries() { return new KratiClosableIterator(returnedList); } + @Override public ClosableIterator keys() { return StoreUtils.keys(entries()); } + @Override + public ClosableIterator>> entries(int partition) { + throw new UnsupportedOperationException("Partition based entries scan not supported for this storage type"); + } + + @Override + public ClosableIterator keys(int partition) { + throw new UnsupportedOperationException("Partition based key scan not supported for this storage type"); + } + + @Override public boolean delete(ByteArray key, Version maxVersion) throws VoldemortException { StoreUtils.assertValidKey(key); @@ -189,6 +192,7 @@ public boolean delete(ByteArray key, Version maxVersion) throws VoldemortExcepti } } + @Override public void put(ByteArray key, Versioned value, byte[] transforms) throws VoldemortException { StoreUtils.assertValidKey(key); @@ -290,26 +294,26 @@ public KratiClosableIterator(List>> list) { iter = list.iterator(); } + @Override public void close() { // Nothing to close here } + @Override public boolean hasNext() { return iter.hasNext(); } + @Override public Pair> next() { return iter.next(); } + @Override public void remove() { Pair> currentPair = next(); delete(currentPair.getFirst(), currentPair.getSecond().getVersion()); } } - - public boolean isPartitionAware() { - return false; - } } diff --git a/contrib/restclient/lib/data-1.5.10.jar b/contrib/restclient/lib/data-1.5.10.jar new file mode 100644 index 0000000000..d7d966bc4c Binary files /dev/null and b/contrib/restclient/lib/data-1.5.10.jar differ diff --git a/contrib/restclient/lib/pegasus-common-1.5.10.jar b/contrib/restclient/lib/pegasus-common-1.5.10.jar new file mode 100644 index 0000000000..6dbed4b622 Binary files /dev/null and b/contrib/restclient/lib/pegasus-common-1.5.10.jar differ diff --git a/contrib/restclient/lib/r2-1.5.10.jar b/contrib/restclient/lib/r2-1.5.10.jar new file mode 100644 index 0000000000..e9ab4d3a38 Binary files /dev/null and b/contrib/restclient/lib/r2-1.5.10.jar differ diff --git a/contrib/restclient/src/java/voldemort/restclient/R2Store.java b/contrib/restclient/src/java/voldemort/restclient/R2Store.java new file mode 100644 index 0000000000..4b2abc26f7 --- /dev/null +++ b/contrib/restclient/src/java/voldemort/restclient/R2Store.java @@ -0,0 +1,383 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
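The KratiStorageEngine hunk above also shows the partition-scoped scan methods, entries(int partition) and keys(int partition), that storage engines now override; Krati is not partition aware, so both throw UnsupportedOperationException. Below is a short, illustrative sketch of how a caller might use the partition scan and fall back to a full scan when it is unsupported. It assumes the methods are declared on the StorageEngine interface (as the @Override annotations suggest) and assumes the usual StorageEngine<ByteArray, byte[], byte[]> parameterization; the class and method names are invented.

    import voldemort.store.StorageEngine;
    import voldemort.utils.ByteArray;
    import voldemort.utils.ClosableIterator;

    public class PartitionScanSketch {

        // Count the keys in one partition, falling back to a full key scan for
        // engines (such as Krati above) that do not support partition scans.
        static long countKeys(StorageEngine<ByteArray, byte[], byte[]> engine, int partition) {
            ClosableIterator<ByteArray> it;
            try {
                it = engine.keys(partition);
            } catch(UnsupportedOperationException e) {
                it = engine.keys();
            }
            try {
                long count = 0;
                while(it.hasNext()) {
                    it.next();
                    count++;
                }
                return count;
            } finally {
                it.close();
            }
        }
    }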
+ */ + +package voldemort.restclient; + +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.URI; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; + +import javax.mail.MessagingException; +import javax.mail.internet.MimeBodyPart; +import javax.mail.internet.MimeMultipart; +import javax.mail.util.ByteArrayDataSource; + +import org.apache.commons.codec.binary.Base64; +import org.apache.log4j.Logger; +import org.codehaus.jackson.JsonParseException; +import org.codehaus.jackson.map.JsonMappingException; +import org.codehaus.jackson.map.ObjectMapper; + +import voldemort.VoldemortException; +import voldemort.coordinator.VectorClockWrapper; +import voldemort.store.AbstractStore; +import voldemort.utils.ByteArray; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Version; +import voldemort.versioning.Versioned; + +import com.linkedin.common.callback.FutureCallback; +import com.linkedin.common.util.None; +import com.linkedin.data.ByteString; +import com.linkedin.r2.message.rest.RestRequest; +import com.linkedin.r2.message.rest.RestRequestBuilder; +import com.linkedin.r2.message.rest.RestResponse; +import com.linkedin.r2.transport.common.Client; +import com.linkedin.r2.transport.common.bridge.client.TransportClient; +import com.linkedin.r2.transport.common.bridge.client.TransportClientAdapter; +import com.linkedin.r2.transport.http.client.HttpClientFactory; + +/** + * A class that implements the Store interface for interacting with the RESTful + * Coordinator. It leverages the R2 library for doing this. 
+ * + */ +public class R2Store extends AbstractStore { + + private static final String GET = "GET"; + private static final String POST = "POST"; + private static final String DELETE = "DELETE"; + private static final String ETAG = "ETag"; + public static final String X_VOLD_REQUEST_TIMEOUT_MS = "X-VOLD-Request-Timeout-ms"; + public static final String X_VOLD_INCONSISTENCY_RESOLVER = "X-VOLD-Inconsistency-Resolver"; + public static final String CUSTOM_RESOLVING_STRATEGY = "custom"; + public static final String DEFAULT_RESOLVING_STRATEGY = "timestamp"; + private static final String LAST_MODIFIED = "Last-Modified"; + private static final String MULTIPART_CONTENT_TYPE = "multipart/binary"; + private final Logger logger = Logger.getLogger(R2Store.class); + + HttpURLConnection conn = null; + private HttpClientFactory _clientFactory; + private Client client = null; + private String baseURL; + + public R2Store(String baseURL, String storeName) { + super(storeName); + try { + _clientFactory = new HttpClientFactory(); + final TransportClient transportClient = _clientFactory.getClient(new HashMap()); + client = new TransportClientAdapter(transportClient); + this.baseURL = baseURL; + } catch(Exception e) { + e.printStackTrace(); + } + } + + @Override + public void close() throws VoldemortException { + final FutureCallback callback = new FutureCallback(); + client.shutdown(callback); + try { + callback.get(); + } catch(InterruptedException e) { + e.printStackTrace(); + } catch(ExecutionException e) { + e.printStackTrace(); + } + } + + @Override + public boolean delete(ByteArray key, Version version) throws VoldemortException { + try { + + // Create the REST request with this byte array + String base64Key = new String(Base64.encodeBase64(key.get())); + RestRequestBuilder rb = new RestRequestBuilder(new URI(this.baseURL + "/" + getName() + + "/" + base64Key)); + + // Create a HTTP POST request + // TODO: Create a proper request based on client config + rb.setMethod(DELETE); + rb.setHeader("Content-Type", "application/json"); + rb.setHeader("Content-Length", "0"); + rb.setHeader(X_VOLD_REQUEST_TIMEOUT_MS, "1000"); + + RestRequest request = rb.build(); + Future f = client.restRequest(request); + + // This will block + RestResponse response = f.get(); + final ByteString entity = response.getEntity(); + if(entity == null) { + logger.error("Empty response !"); + } + } catch(VoldemortException ve) { + ve.printStackTrace(); + throw ve; + } catch(Exception e) { + e.printStackTrace(); + } + return false; + } + + @Override + public List> get(ByteArray key, byte[] transforms) throws VoldemortException { + + List> resultList = new ArrayList>(); + + try { + String base64Key = new String(Base64.encodeBase64(key.get())); + RestRequestBuilder rb = new RestRequestBuilder(new URI(this.baseURL + "/" + getName() + + "/" + base64Key)); + + // TODO: Form a proper request based on client config + rb.setMethod(GET); + rb.setHeader("Accept", "application/json"); + rb.setHeader(X_VOLD_REQUEST_TIMEOUT_MS, "1000"); + rb.setHeader(X_VOLD_INCONSISTENCY_RESOLVER, "custom"); + + RestRequest request = rb.build(); + Future f = client.restRequest(request); + + // This will block + RestResponse response = f.get(); + + // Parse the response + final ByteString entity = response.getEntity(); + String eTag = response.getHeader(ETAG); + String lastModified = response.getHeader(LAST_MODIFIED); + if(entity != null) { + resultList = readResults(entity, eTag, lastModified); + } else { + logger.error("Did not get any response!"); + } + + } 
catch(VoldemortException ve) { + ve.printStackTrace(); + throw ve; + } catch(Exception e) { + if(!e.getMessage().contains("status=404")) { + logger.error("ERROR: " + e); + } + } + + return resultList; + } + + @Override + public void put(ByteArray key, Versioned value, byte[] transform) + throws VoldemortException { + try { + + // Write the value in the payload + ByteArrayOutputStream outputBytes = new ByteArrayOutputStream(); + DataOutputStream outputStream = new DataOutputStream(outputBytes); + byte[] payload = value.getValue(); + outputStream.write(payload); + + // Create the REST request with this byte array + String base64Key = new String(Base64.encodeBase64(key.get())); + RestRequestBuilder rb = new RestRequestBuilder(new URI(this.baseURL + "/" + getName() + + "/" + base64Key)); + + // Create a HTTP POST request + // TODO: Create a proper request based on client config + rb.setMethod(POST); + rb.setEntity(outputBytes.toByteArray()); + rb.setHeader("Content-Type", "application/json"); + rb.setHeader("Content-Length", "" + payload.length); + rb.setHeader(X_VOLD_REQUEST_TIMEOUT_MS, "1000"); + rb.setHeader(X_VOLD_INCONSISTENCY_RESOLVER, "custom"); + + RestRequest request = rb.build(); + Future f = client.restRequest(request); + + // This will block + RestResponse response = f.get(); + final ByteString entity = response.getEntity(); + if(entity == null) { + logger.error("Empty response !"); + } + } catch(VoldemortException ve) { + ve.printStackTrace(); + throw ve; + } catch(Exception e) { + logger.error("ERROR: " + e); + } + } + + private List> readResults(ByteString entity, String eTag, String lastModified) + throws IOException { + + ObjectMapper mapper = new ObjectMapper(); + logger.debug("Received etag : " + eTag); + logger.debug("Received last modified date : " + lastModified); + VectorClockWrapper vcWrapper = mapper.readValue(eTag, VectorClockWrapper.class); + List> results = new ArrayList>(2); + + byte[] bytes = new byte[entity.length()]; + entity.copyBytes(bytes, 0); + VectorClock clock = new VectorClock(vcWrapper.getVersions(), vcWrapper.getTimestamp()); + results.add(new Versioned(bytes, clock)); + return results; + } + + @Override + public Map>> getAll(Iterable keys, + Map tranforms) + throws VoldemortException { + + Map>> resultMap = new HashMap>>(); + + try { + Iterator it = keys.iterator(); + String keyArgs = null; + + while(it.hasNext()) { + ByteArray key = it.next(); + String base64Key = new String(Base64.encodeBase64(key.get())); + if(keyArgs == null) { + keyArgs = base64Key; + } else { + keyArgs += "," + base64Key; + } + } + + RestRequestBuilder rb = new RestRequestBuilder(new URI(this.baseURL + "/" + getName() + + "/" + keyArgs)); + + // TODO: Form a proper request based on client config + rb.setMethod(GET); + rb.setHeader("Accept", MULTIPART_CONTENT_TYPE); + rb.setHeader(X_VOLD_REQUEST_TIMEOUT_MS, "1000"); + + RestRequest request = rb.build(); + Future f = client.restRequest(request); + + // This will block + RestResponse response = f.get(); + + // Parse the response + final ByteString entity = response.getEntity(); + String contentType = response.getHeader("Content-Type"); + // String eTag = response.getHeader(ETAG); + // String lastModified = response.getHeader(LAST_MODIFIED); + if(entity != null) { + if(contentType.equalsIgnoreCase(MULTIPART_CONTENT_TYPE)) { + resultMap = readResultsGetAll(entity); + } else { + logger.error("Did not receive a multipart response"); + } + + } else { + logger.error("Did not get any response!"); + } + + } catch(VoldemortException ve) { + 
ve.printStackTrace(); + throw ve; + } catch(Exception e) { + if(!e.getMessage().contains("status=404")) { + logger.error("ERROR: " + e); + } + } + + return resultMap; + } + + private Map>> readResultsGetAll(ByteString entity) { + Map>> results = new HashMap>>(); + + try { + ObjectMapper mapper = new ObjectMapper(); + // VectorClockWrapper vcWrapper = mapper.readValue(eTag, + // VectorClockWrapper.class); + + // Build the multipart object + byte[] bytes = new byte[entity.length()]; + entity.copyBytes(bytes, 0); + + ByteArrayDataSource ds = new ByteArrayDataSource(bytes, "multipart/mixed"); + // logger.info("received data = "); + // BufferedReader in = new BufferedReader(new + // InputStreamReader(ds.getInputStream())); + // String inputLine; + // while((inputLine = in.readLine()) != null) + // System.out.println(inputLine); + // in.close(); + + MimeMultipart mp = new MimeMultipart(ds); + for(int i = 0; i < mp.getCount(); i++) { + MimeBodyPart part = (MimeBodyPart) mp.getBodyPart(i); + String eTag = part.getHeader("ETag")[0]; + String contentLocation = part.getHeader("Content-Location")[0]; + + logger.debug("Received etag : " + eTag); + logger.debug("Content-Location : " + contentLocation); + + // Get the key + String base64Key = contentLocation.split("/")[2]; + + logger.debug("Base 64 key : " + base64Key); + ByteArray key = new ByteArray(Base64.decodeBase64(base64Key.getBytes())); + + VectorClockWrapper vcWrapper = mapper.readValue(eTag, VectorClockWrapper.class); + List> keyResultList = new ArrayList>(2); + + // get the value bytes + byte[] bodyPartBytes = ((String) part.getContent()).getBytes(); + VectorClock clock = new VectorClock(vcWrapper.getVersions(), + vcWrapper.getTimestamp()); + keyResultList.add(new Versioned(bodyPartBytes, clock)); + results.put(key, keyResultList); + + } + + // VectorClock clock = new VectorClock(vcWrapper.getVersions(), + // vcWrapper.getTimestamp()); + // results.add(new Versioned(bytes, clock)); + } catch(MessagingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch(JsonParseException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch(JsonMappingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch(IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return results; + + } + + @Override + public List getVersions(ByteArray arg0) { + // TODO Auto-generated method stub + return null; + } +} diff --git a/contrib/restclient/src/java/voldemort/restclient/RESTClient.java b/contrib/restclient/src/java/voldemort/restclient/RESTClient.java new file mode 100644 index 0000000000..531a35178b --- /dev/null +++ b/contrib/restclient/src/java/voldemort/restclient/RESTClient.java @@ -0,0 +1,241 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.restclient; + +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import voldemort.client.RoutingTier; +import voldemort.client.StoreClient; +import voldemort.client.UpdateAction; +import voldemort.cluster.Node; +import voldemort.routing.RoutingStrategyType; +import voldemort.serialization.DefaultSerializerFactory; +import voldemort.serialization.Serializer; +import voldemort.serialization.SerializerDefinition; +import voldemort.serialization.SerializerFactory; +import voldemort.store.Store; +import voldemort.store.StoreDefinition; +import voldemort.store.StoreDefinitionBuilder; +import voldemort.store.serialized.SerializingStore; +import voldemort.store.versioned.InconsistencyResolvingStore; +import voldemort.utils.ByteArray; +import voldemort.versioning.ChainedResolver; +import voldemort.versioning.InconsistencyResolver; +import voldemort.versioning.InconsistentDataException; +import voldemort.versioning.ObsoleteVersionException; +import voldemort.versioning.TimeBasedInconsistencyResolver; +import voldemort.versioning.VectorClock; +import voldemort.versioning.VectorClockInconsistencyResolver; +import voldemort.versioning.Version; +import voldemort.versioning.Versioned; + +import com.google.common.collect.Maps; + +public class RESTClient implements StoreClient { + + private Store clientStore = null; + private SerializerFactory serializerFactory = new DefaultSerializerFactory(); + private StoreDefinition storeDef; + private String storeName; + + /** + * A REST ful equivalent of the DefaultStoreClient. This uses the R2Store to + * interact with the RESTful Coordinator + * + * @param bootstrapURL The bootstrap URL of the Voldemort cluster + * @param storeName Name of the store to interact with + */ + public RESTClient(String bootstrapURL, String storeName) { + + String baseURL = "http://" + bootstrapURL.split(":")[1].substring(2) + ":8080"; + // The lowest layer : Transporting request to coordinator + Store store = new R2Store(baseURL, storeName); + + // TODO + // Get the store definition so that we can learn the Serialization + // and + // compression properties + + // TODO + // Add compression layer + + // Add Serialization layer + + // Set the following values although we don't need them + // TODO: Fix this, so that we only need to set the needed parameters + storeDef = new StoreDefinitionBuilder().setName(storeName) + .setType("bdb") + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) + .setReplicationFactor(1) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .build(); + Serializer keySerializer = (Serializer) serializerFactory.getSerializer(storeDef.getKeySerializer()); + Serializer valueSerializer = (Serializer) serializerFactory.getSerializer(storeDef.getValueSerializer()); + clientStore = SerializingStore.wrap(store, keySerializer, valueSerializer, null); + + // Add inconsistency Resolving layer + InconsistencyResolver> secondaryResolver = new TimeBasedInconsistencyResolver(); + clientStore = new InconsistencyResolvingStore(clientStore, + new ChainedResolver>(new VectorClockInconsistencyResolver(), + secondaryResolver)); + + this.storeName = storeName; + } + + @Override + public V getValue(K key) { + return getValue(key, null); + } + + @Override + public V getValue(K key, V defaultValue) { + Versioned retVal = 
get(key); + return retVal.getValue(); + } + + @Override + public Versioned get(K key) { + return get(key, null); + } + + @Override + public Versioned get(K key, Object transforms) { + return this.clientStore.get(key, null).get(0); + } + + protected Versioned getItemOrThrow(K key, Versioned defaultValue, List> items) { + if(items.size() == 0) + return defaultValue; + else if(items.size() == 1) + return items.get(0); + else + throw new InconsistentDataException("Unresolved versions returned from get(" + key + + ") = " + items, items); + } + + @Override + public Map> getAll(Iterable keys) { + Map>> items = null; + items = this.clientStore.getAll(keys, null); + Map> result = Maps.newHashMapWithExpectedSize(items.size()); + + for(Entry>> mapEntry: items.entrySet()) { + Versioned value = getItemOrThrow(mapEntry.getKey(), null, mapEntry.getValue()); + result.put(mapEntry.getKey(), value); + } + return result; + } + + @Override + public Map> getAll(Iterable keys, Map transforms) { + return null; + } + + @Override + public Versioned get(K key, Versioned defaultValue) { + List> resultList = this.clientStore.get(key, null); + if(resultList.size() == 0) { + return null; + } + return resultList.get(0); + } + + @Override + public Version put(K key, V value) { + clientStore.put(key, new Versioned(value), null); + return new VectorClock(); + } + + @Override + public Version put(K key, V value, Object transforms) { + return put(key, value); + } + + @Override + public Version put(K key, Versioned versioned) throws ObsoleteVersionException { + clientStore.put(key, versioned, null); + return new VectorClock(); + } + + @Override + public boolean putIfNotObsolete(K key, Versioned versioned) { + try { + put(key, versioned); + return true; + } catch(ObsoleteVersionException e) { + return false; + } + } + + @Override + public boolean applyUpdate(UpdateAction action) { + return applyUpdate(action, 3); + } + + @Override + public boolean applyUpdate(UpdateAction action, int maxTries) { + boolean success = false; + try { + for(int i = 0; i < maxTries; i++) { + try { + action.update(this); + success = true; + return success; + } catch(ObsoleteVersionException e) { + // ignore for now + } + } + } finally { + if(!success) + action.rollback(); + } + + // if we got here we have seen too many ObsoleteVersionExceptions + // and have rolled back the updates + return false; + } + + @Override + public boolean delete(K key) { + Versioned versioned = get(key); + if(versioned == null) + return false; + return this.clientStore.delete(key, versioned.getVersion()); + } + + @Override + public boolean delete(K key, Version version) { + return this.clientStore.delete(key, version); + } + + @Override + public List getResponsibleNodes(K key) { + return null; + } + + public void close() { + this.clientStore.close(); + } +} diff --git a/contrib/restclient/src/java/voldemort/restclient/SampleRESTClient.java b/contrib/restclient/src/java/voldemort/restclient/SampleRESTClient.java new file mode 100644 index 0000000000..595351117d --- /dev/null +++ b/contrib/restclient/src/java/voldemort/restclient/SampleRESTClient.java @@ -0,0 +1,43 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.restclient; + +import java.util.ArrayList; +import java.util.List; + +public class SampleRESTClient { + + public static void main(String[] args) { + + // Create the client + RESTClient clientStore = new RESTClient("http://localhost:8080", + "test"); + + // Sample put + clientStore.put("a", "Howdy!!!!"); + clientStore.put("b", "Partner!!!!"); + + // Do a sample operation: + System.out.println("Received response : " + clientStore.get("a")); + List keyList = new ArrayList(); + keyList.add("a"); + keyList.add("b"); + System.out.println("Received response : " + clientStore.getAll(keyList)); + + clientStore.close(); + } +} diff --git a/lib/commons-codec-1.3.jar b/lib/commons-codec-1.3.jar deleted file mode 100644 index 957b6752af..0000000000 Binary files a/lib/commons-codec-1.3.jar and /dev/null differ diff --git a/lib/commons-codec-1.4.jar b/lib/commons-codec-1.4.jar new file mode 100644 index 0000000000..458d432da8 Binary files /dev/null and b/lib/commons-codec-1.4.jar differ diff --git a/lib/je-4.0.92.jar b/lib/je-4.0.92.jar deleted file mode 100644 index fa9225dd07..0000000000 Binary files a/lib/je-4.0.92.jar and /dev/null differ diff --git a/lib/je-4.1.17.jar b/lib/je-4.1.17.jar new file mode 100644 index 0000000000..17cc9f11f6 Binary files /dev/null and b/lib/je-4.1.17.jar differ diff --git a/lib/netty-3.5.8.Final.jar b/lib/netty-3.5.8.Final.jar new file mode 100644 index 0000000000..e2e42a528e Binary files /dev/null and b/lib/netty-3.5.8.Final.jar differ diff --git a/release_notes.txt b/release_notes.txt index d26d880b3d..4288726d15 100644 --- a/release_notes.txt +++ b/release_notes.txt @@ -1,8 +1,223 @@ +Release 1.3.1 on 03/25/2013 +* HDFSFetcher + - Fixed the bug in calculating checksums when we enter a retry loop + - Refactored per-file checksums + - Added a JUnit test case to simulate intermittent IO exceptions +* voldemort.client.protocol.admin.AdminClient + - Added AdminStoreClient so that AdminClient can do store operations + against a specific store on a specific node. + - Added helper methods for doing put & get for a specific node/store + - Added voldemort.client.protocol.admin.QueryKeyResult type to + simplify the QueryKey interface +* Improved FetchStreamRequestHandler and sub-classes + - Renamed all sub-classes: 'FullScan' and 'PartitionScan' prefixes + for pertinent stream request handler implementations. + - Removed unused skipRecords parameter. + - Added recordsPerPartition parameter to fetch a limited portion of + each partition. + - All logic about how many keys to fetch (return to invoker) is + server side now. +* RebalanceCLI + - Added many options to help improve the balance of (zoned) clusters. + - Analysis of the balance of a cluster is significantly more detailed. + - Fixed a bug that reduced the balance of a cluster each time it was + expanded. + - Many different algorithms for improving cluster balance are + implemented in voldemort.utils.RebalanceClusterUtils +* ConsistencyCheck & ConsistencyFixCLI + - New tools for ensuring data durability. These tools are necessary + because slop creation can fail during put operations.
+ - ConsistencyCheck determines which keys, if any, lack + "consistency". I.e., are present on only a subset of the expected + partitions. + - ConsistencyFix takes a list of bad (inconsistent) keys and makes + sure they are present on all expected partitions. + - ConsistencyFix also has an interface for repairing "orphaned" keys + that could result from an aborted rebalance. +* KeySamplerCLI & KeyVersionFetcherCLI + - KeySamplerCLI is a new tool that reads some number of keys for + specified partitions/stores. + - KeyVersionFetcherCLI is a new tool that, given a key and a store, + fetches the version from all nodes that host a partition that + ought to store a replica of the key's value. + - Together, KeySamplerCLI and KeyVersionFetcherCLI correctly + implement the intended functionality of the Entropy tool (for + servers that implement either FullScan and PartitionScan fetches). + - Entropy tool had been used in the past to verify a sample of keys + before and after a rebalance. Entropy tool does not work as + intended/expected. This is exacerbated by the partition aware + layouts. Instead of trying to fix the Entropy tool, these two new + tools were developed. Entropy is deprecated and will eventually be + removed from the code base. +* Substantial refactoring of helper & util methods: + - voldemort.cluster.Cluster : added helper methods + - voldemort.utils.ClusterInstance : wraps up one Cluster & + List + - voldemort.utils.Cluster : utils for single Cluster object. + - voldemort.utils.NodeUtils : utils for Node object. + - voldemort.utils.RebalanceUtils : Many methods moved to more + appropriate helper classes + - voldemort.utils.StoreDefinitionUtils : utils for StoreDefinition + object. + - voldemort.utils.StoreInstance : wraps up one Cluster & one + StoreDefinition +* Et cetera + - ByteUtils toHexString & from HexString now rely on standard + libraries + - voldemort.client.AdminFetchTest now tests FullScan and + PartitionScan fetches + - voldemort.store.routed.ReadRepairerTest annotated all tests with + @Test + + +Release 1.3.0 on 03/08/2013 + +NOTE: This is an open source release! This release can be downloaded here: + http://github.com/voldemort/voldemort/downloads. + +Changes made since 1.2.3 +* VoldemortConfig and ClientConfig now contain detailed documentation +* BDB-JE defaults set to ones in prod@linkedin +* Bug fixes on kerberos support for Hadoop + + +Release 1.2.3 on 02/20/2013 + +Changes made since 1.2.2 +* Added a retry loop and synchronized block while getting Hadoop FS +* Code cleanup in HdfsFetcher to make it more readable. +* Throwing explicit exceptions in HdfsFetcher instead of + returning null to be more precise in the Azkaban logs. + + +Release 1.2.2 on 02/19/2013 + +Changes made since 1.2.1 +* Synchronized the streaming API +* Fixed some of the streaming API tests. + + +Release 1.2.1 on 0/30/2013 + +Changes made since 1.2.0 +* Added a Streaming API and related tests. +* Refactoring of the admin client apis into functional inner classes + + +Release 1.2.0 on 01/21/2013 + +Changes made since 1.1.9 +* Added an Admin API to fetch orphaned key / entries +* Improved some tests related to streaming API. 
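The 1.2.1 entry above ("Refactoring of the admin client apis into functional inner classes") is the change reflected throughout the hunks in this patch: operations are grouped under members such as readonlyOps, metadataMgmtOps, storeMgmtOps, storeMntOps, restoreOps, rpcOps and bulkFetchOps, the AdminClient constructor takes an explicit ClientConfig, and shutdown is close() rather than stop(). A minimal sketch of the new call style, with a placeholder bootstrap URL, node id, store name and push version:

    import voldemort.client.ClientConfig;
    import voldemort.client.protocol.admin.AdminClient;
    import voldemort.client.protocol.admin.AdminClientConfig;

    public class AdminClientUsageSketch {

        public static void main(String[] args) {
            // before: new AdminClient(url, new AdminClientConfig())
            AdminClient adminClient = new AdminClient("tcp://localhost:6666",
                                                      new AdminClientConfig(),
                                                      new ClientConfig());
            try {
                // before: adminClient.getRemoteStoreDefList(0)
                adminClient.metadataMgmtOps.getRemoteStoreDefList(0);

                // before: adminClient.rollbackStore(0, "test-store", 2L)
                adminClient.readonlyOps.rollbackStore(0, "test-store", 2L);
            } finally {
                // before: adminClient.stop()
                adminClient.close();
            }
        }
    }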
+* Correcting commons-codec version in ivy file (1.4) + + +Release 1.1.9 on 01/15/2013 + +Changes made since 1.1.8 +* Asynchronous socket checkout improvements + * Changed checkout behavior of KeyedResourcePool to only create new + connections when there are no resources available (rather than + creating new connections until the pool is full) + * Changed QueuedKeyedResourcePool.reset behavior to better match + KeyedResourcePool (i.e., to not cancel queued asynchronous + requests unnecessarily) + * Removed (unnecessary) synchronization primitives from keyed resource pool + * Reduce granularity of failure detector locking within ThresholdFailureDetector +* Minor features/improvements + * Less verbose logging in the face of expected exceptions and errors + * Refactored (Queued)KeyedResourcePoolTest +* Bug fixes + * Fixed possible race condition for resource creation in KeyedResourcePool + * More efficient (time & space) and simpler Histogram implementation + with improved tests + + +Release 1.1.8 on 01/14/2013 + +Changes made since release 1.1.7 +* Enhanced Server Monitoring + -- Server NIO layer + -- Streaming operations to the server + -- BDB storage exception counts +* Ability to turn off BDB Checkpointing during batch modifications +* Added ability to delete old checksum files in Build and Push reducer +* Upgrade Hadoop jar to 1.0.4-p2 + + +Release 1.1.7 on 01/03/2013 + +NOTE: This release is based off of release 1.1.4 + +Changes made since release 1.1.4 +* Upgrading Hadoop jar to 1.0.2 +* Added support for Kerberos authentication in HdfsFetcher +* Extra config parameters for Kerberos config and keytab file + + +NOTE: Release 1.1.5 and 1.1.6 are special client side releases +not based off of master. 1.1.5 was rolled back to to a weird bug. +1.1.6 is a special client side release including Auto- +bootstrapper and Versioned Avro support. + + +Release 1.1.4 on 11/29/2012 + +Changes made since release 1.1.3 +* Added BDB parameters to control LRU behavior in cache & proactive cleaner migration +* Added a mlock fix for pinning the indexes of RO stores in memory + + +Release 1.1.3 on 11/28/2012 + +Changes made since release 1.1.2 +* Fixed a bug in the build and push job, specifically the Mapper + that caused collisions +* Added retry mechanism with the HDFS fetcher for hftp + + +Release 1.1.2 on 10/31/2012 + +Changes made since release 1.1.1 +* Reverted a change to voldemort.versioning.Versioned.getVersion() so + that a Version is returned as our clients expect. + + +Release 1.1.1 on 10/30/2012 + +Changes made since release 1.1.0 +* Fixed connection leak in ClientRequestExecutorFactory +* Changed client to default to DefaultStoreClient + + +Release 1.1.0 on 10/19/2012 + +Changes made since release 1.0.0 + +IMPORTANT NOTE : This release has significant changes to the BDB storage layer. +Users are required to read the bin/PREUPGRADE_FOR_1_1_X_README file +thoroughly before attempting to upgrade to 1.1.0. The necessary data +conversion will be done through bin/voldemort-convert-bdb.sh + +* Upgrading to JE 4.1.17 +* New data format that handles conflicting updates in Voldemort more + efficiently +* Move data off heap and only use it for Index +* When scanning, evict whatever you bring in right away. +* Partition based scan api to dramatically speed up rebalancing & restore + using Partition aware scans (you exactly scan whatever you want to fetch) +* Flexible knobs to control scheduling of DataCleanupJob + + Release 1.0.0 on 10/17/2012 -NOTE: This is not a major release. This is a minor release. 
The large -version number jump from 0.96 is to standardize on a version number of -the sort MAJOR.MINOR.PATCH. +NOTE: The large version number jump from 0.96 to 1.0.0 is to +standardize on a version number of the sort MAJOR.MINOR.PATCH. This +change is part of our effort to treat internal and open source +releases in a much more similar manner. Along these lines, release +notes for internal releases (like this one) are committed on the +master branch. We hope this improves transparency as we work towards +the next open source release. Changes made since release 0.96 diff --git a/src/java/log4j.properties b/src/java/log4j.properties index 9a16ef00d2..3cb4b7c790 100755 --- a/src/java/log4j.properties +++ b/src/java/log4j.properties @@ -6,15 +6,18 @@ log4j.rootLogger=INFO, stdout log4j.appender.stdout=org.apache.log4j.ConsoleAppender log4j.appender.stdout.layout=org.apache.log4j.PatternLayout # log4j.appender.stdout.layout.ConversionPattern=[%d %c] %p %m %n -log4j.appender.stdout.layout.ConversionPattern=[%d{ABSOLUTE} %c] %p %m [%t]%n +log4j.appender.stdout.layout.ConversionPattern=[%d{ABSOLUTE} %c] %p %m [%t]%n # Turn on all our debugging info log4j.logger=INFO log4j.logger.httpclient.wire=INFO log4j.logger.org.mortbay.log=WARN +log4j.logger.voldemort.server=INFO log4j.logger.voldemort.store.routed=INFO log4j.logger.voldemort.server.niosocket=INFO log4j.logger.voldemort.utils=INFO log4j.logger.voldemort.client.rebalance=INFO log4j.logger.voldemort.server=INFO +log4j.logger.voldemort.routing=INFO +log4j.logger.voldemort.store.stats=INFO log4j.logger.krati=WARN diff --git a/src/java/voldemort/VoldemortAdminTool.java b/src/java/voldemort/VoldemortAdminTool.java index 04d8ab7828..c462150b65 100644 --- a/src/java/voldemort/VoldemortAdminTool.java +++ b/src/java/voldemort/VoldemortAdminTool.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2010 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -54,8 +54,10 @@ import org.codehaus.jackson.JsonGenerator; import org.codehaus.jackson.map.ObjectMapper; +import voldemort.client.ClientConfig; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; +import voldemort.client.protocol.admin.QueryKeyResult; import voldemort.cluster.Cluster; import voldemort.cluster.Node; import voldemort.serialization.DefaultSerializerFactory; @@ -252,6 +254,15 @@ public static void main(String[] args) throws Exception { .describedAs("query-keys") .withValuesSeparatedBy(',') .ofType(String.class); + parser.accepts("mirror-from-url", "Cluster url to mirror data from") + .withRequiredArg() + .describedAs("mirror-cluster-bootstrap-url") + .ofType(String.class); + parser.accepts("mirror-node", "Node id in the mirror cluster to mirror from") + .withRequiredArg() + .describedAs("id-of-mirror-node") + .ofType(Integer.class); + parser.accepts("fetch-orphaned", "Fetch any orphaned keys/entries in the node"); OptionSet options = parser.parse(args); @@ -263,6 +274,8 @@ public static void main(String[] args) throws Exception { Set missing = CmdUtils.missing(options, "url", "node"); if(missing.size() > 0) { // Not the most elegant way to do this + // basically check if only "node" is missing for these set of + // options; all these can live without explicit node ids if(!(missing.equals(ImmutableSet.of("node")) && (options.has("add-stores") || options.has("delete-store") || options.has("ro-metadata") || options.has("set-metadata") @@ -282,7 +295,10 @@ public static void main(String[] args) throws Exception { Integer zoneId = CmdUtils.valueOf(options, "zone", -1); int zone = zoneId == -1 ? 0 : zoneId; - AdminClient adminClient = new AdminClient(url, new AdminClientConfig(), zone); + AdminClient adminClient = new AdminClient(url, + new AdminClientConfig(), + new ClientConfig(), + zone); if(options.has("verify-metadata-version")) { checkMetadataVersion(adminClient); @@ -290,54 +306,51 @@ public static void main(String[] args) throws Exception { } String ops = ""; - if(options.has("delete-partitions")) { - ops += "d"; - } - if(options.has("fetch-keys")) { - ops += "k"; - } - if(options.has("fetch-entries")) { - ops += "v"; - } - if(options.has("restore")) { - ops += "r"; - } + // Honestly, the most insane code I have seen. 
Atleast sorting this for + // now so its easy to find a spare character if(options.has("add-stores")) { ops += "a"; } - if(options.has("update-entries")) { - ops += "u"; + if(options.has("async")) { + ops += "b"; } - if(options.has("delete-store")) { - ops += "s"; + if(options.has("check-metadata")) { + ops += "c"; } - if(options.has("get-metadata")) { - ops += "g"; + if(options.has("delete-partitions")) { + ops += "d"; } if(options.has("ro-metadata")) { ops += "e"; } - if(options.has("truncate")) { - ops += "t"; - } - if(options.has("set-metadata")) { - ops += "m"; + if(options.has("reserve-memory")) { + if(!options.has("stores")) { + Utils.croak("Specify the list of stores to reserve memory"); + } + ops += "f"; } - if(options.has("check-metadata")) { - ops += "c"; + if(options.has("get-metadata")) { + ops += "g"; } - if(options.has("key-distribution")) { - ops += "y"; + if(options.has("mirror-from-url")) { + if(!options.has("mirror-node")) { + Utils.croak("Specify the mirror node to fetch from"); + } + ops += "h"; } if(options.has("clear-rebalancing-metadata")) { ops += "i"; } - if(options.has("async")) { - ops += "b"; + if(options.has("fetch-keys")) { + ops += "k"; } + if(options.has("repair-job")) { ops += "l"; } + if(options.has("set-metadata")) { + ops += "m"; + } if(options.has("native-backup")) { if(!options.has("backup-dir")) { Utils.croak("A backup directory must be specified with backup-dir option"); @@ -350,24 +363,38 @@ public static void main(String[] args) throws Exception { } ops += "o"; } - if(options.has("synchronize-metadata-version")) { - ops += "z"; - } - if(options.has("reserve-memory")) { - if(!options.has("stores")) { - Utils.croak("Specify the list of stores to reserve memory"); - } - ops += "f"; - } if(options.has("query-keys")) { ops += "q"; } + if(options.has("restore")) { + ops += "r"; + } + if(options.has("delete-store")) { + ops += "s"; + } + if(options.has("truncate")) { + ops += "t"; + } + if(options.has("update-entries")) { + ops += "u"; + } + if(options.has("fetch-entries")) { + ops += "v"; + } + + if(options.has("key-distribution")) { + ops += "y"; + } + + if(options.has("synchronize-metadata-version")) { + ops += "z"; + } if(ops.length() < 1) { Utils.croak("At least one of (delete-partitions, restore, add-node, fetch-entries, " + "fetch-keys, add-stores, delete-store, update-entries, get-metadata, ro-metadata, " + "set-metadata, check-metadata, key-distribution, clear-rebalancing-metadata, async, " - + "repair-job, native-backup, rollback, reserve-memory, verify-metadata-version) must be specified"); + + "repair-job, native-backup, rollback, reserve-memory, mirror-url, verify-metadata-version) must be specified"); } List storeNames = null; @@ -395,7 +422,7 @@ public static void main(String[] args) throws Exception { System.exit(1); } System.out.println("Starting restore"); - adminClient.restoreDataFromReplications(nodeId, parallelism, zoneId); + adminClient.restoreOps.restoreDataFromReplications(nodeId, parallelism, zoneId); System.out.println("Finished restore"); } if(ops.contains("k")) { @@ -409,7 +436,8 @@ public static void main(String[] args) throws Exception { partitionIdList, outputDir, storeNames, - useAscii); + useAscii, + options.has("fetch-orphaned")); } if(ops.contains("v")) { boolean useAscii = options.has("ascii"); @@ -422,8 +450,10 @@ public static void main(String[] args) throws Exception { partitionIdList, outputDir, storeNames, - useAscii); + useAscii, + options.has("fetch-orphaned")); } + if(ops.contains("a")) { String storesXml = 
(String) options.valueOf("add-stores"); executeAddStores(adminClient, storesXml, nodeId); @@ -516,7 +546,7 @@ public static void main(String[] args) throws Exception { + storeNames); try { for(String name: storeNames) { - adminClient.updateMetadataversion(name); + adminClient.metadataMgmtOps.updateMetadataversion(name); } } catch(Exception e) { System.err.println("Error while updating metadata version for the specified store."); @@ -557,12 +587,12 @@ public static void main(String[] args) throws Exception { String backupDir = (String) options.valueOf("backup-dir"); String storeName = (String) options.valueOf("native-backup"); int timeout = CmdUtils.valueOf(options, "backup-timeout", 30); - adminClient.nativeBackup(nodeId, - storeName, - backupDir, - timeout, - options.has("backup-verify"), - options.has("backup-incremental")); + adminClient.storeMntOps.nativeBackup(nodeId, + storeName, + backupDir, + timeout, + options.has("backup-verify"), + options.has("backup-incremental")); } if(ops.contains("o")) { String storeName = (String) options.valueOf("rollback"); @@ -574,7 +604,7 @@ public static void main(String[] args) throws Exception { } if(ops.contains("f")) { long reserveMB = (Long) options.valueOf("reserve-memory"); - adminClient.reserveMemory(nodeId, storeNames, reserveMB); + adminClient.storeMntOps.reserveMemory(nodeId, storeNames, reserveMB); } if(ops.contains("q")) { List keyList = (List) options.valuesOf("query-keys"); @@ -583,6 +613,21 @@ public static void main(String[] args) throws Exception { } executeQueryKeys(nodeId, adminClient, storeNames, keyList); } + if(ops.contains("h")) { + if(nodeId == -1) { + System.err.println("Cannot run mirroring without node id"); + System.exit(1); + } + Integer mirrorNodeId = CmdUtils.valueOf(options, "mirror-node", -1); + if(mirrorNodeId == -1) { + System.err.println("Cannot run mirroring without mirror node id"); + System.exit(1); + } + adminClient.restoreOps.mirrorData(nodeId, + mirrorNodeId, + (String) options.valueOf("mirror-from-url"), + storeNames); + } } catch(Exception e) { e.printStackTrace(); Utils.croak(e.getMessage()); @@ -595,11 +640,11 @@ private static String getMetadataVersionsForNode(AdminClient adminClient, int no partitionIdList.addAll(node.getPartitionIds()); } - Iterator>> entriesIterator = adminClient.fetchEntries(nodeId, - SystemStoreConstants.SystemStoreName.voldsys$_metadata_version_persistence.name(), - partitionIdList, - null, - true); + Iterator>> entriesIterator = adminClient.bulkFetchOps.fetchEntries(nodeId, + SystemStoreConstants.SystemStoreName.voldsys$_metadata_version_persistence.name(), + partitionIdList, + null, + true); Serializer serializer = new StringSerializer("UTF8"); String keyObject = null; String valueObject = null; @@ -665,7 +710,7 @@ private static void synchronizeMetadataVersion(AdminClient adminClient, int base System.err.println("The specified node does not have any versions metadata ! 
Exiting ..."); System.exit(-1); } - adminClient.setMetadataversion(props); + adminClient.metadataMgmtOps.setMetadataversion(props); System.out.println("Metadata versions synchronized successfully."); } catch(IOException e) { System.err.println("Error while retrieving Metadata versions from node : " + baseNodeId @@ -682,20 +727,20 @@ private static void executeRollback(Integer nodeId, AdminClient adminClient) { if(nodeId < 0) { for(Node node: adminClient.getAdminClientCluster().getNodes()) { - adminClient.rollbackStore(node.getId(), storeName, pushVersion); + adminClient.readonlyOps.rollbackStore(node.getId(), storeName, pushVersion); } } else { - adminClient.rollbackStore(nodeId, storeName, pushVersion); + adminClient.readonlyOps.rollbackStore(nodeId, storeName, pushVersion); } } private static void executeRepairJob(Integer nodeId, AdminClient adminClient) { if(nodeId < 0) { for(Node node: adminClient.getAdminClientCluster().getNodes()) { - adminClient.repairJob(node.getId()); + adminClient.storeMntOps.repairJob(node.getId()); } } else { - adminClient.repairJob(nodeId); + adminClient.storeMntOps.repairJob(nodeId); } } @@ -775,6 +820,14 @@ public static void printHelp(PrintStream stream, OptionParser parser) throws IOE stream.println("\t\t./bin/voldemort-admin-tool.sh --update-entries [folder path from output of --fetch-entries --outdir] --url [url] --node [node-id] --stores [comma-separated list of store names]"); stream.println("\t10) Query stores for a set of keys on a specific node."); stream.println("\t\t./bin/voldemort-admin-tool.sh --query-keys [comma-separated list of keys] --url [url] --node [node-id] --stores [comma-separated list of store names]"); + stream.println("\t11) Mirror data from another voldemort server (possibly in another cluster) for specified stores"); + stream.println("\t\t./bin/voldemort-admin-tool.sh --mirror-from-url [bootstrap url to mirror from] --mirror-node [node to mirror from] --url [url] --node [node-id] --stores [comma-separated-list-of-store-names]"); + stream.println("\t12) Mirror data from another voldemort server (possibly in another cluster) for all stores in current cluster"); + stream.println("\t\t./bin/voldemort-admin-tool.sh --mirror-from-url [bootstrap url to mirror from] --mirror-node [node to mirror from] --url [url] --node [node-id]"); + stream.println("\t13) Fetch all orphaned keys on a particular node"); + stream.println("\t\t./bin/voldemort-admin-tool.sh --fetch-keys --url [url] --node [node-id] --fetch-orphaned"); + stream.println("\t14) Fetch all orphaned entries on a particular node"); + stream.println("\t\t./bin/voldemort-admin-tool.sh --fetch-entries --url [url] --node [node-id] --fetch-orphaned"); stream.println(); stream.println("READ-ONLY OPERATIONS"); stream.println("\t1) Retrieve metadata information of read-only data for a particular node and all stores"); @@ -826,11 +879,14 @@ private static void executeAsync(Integer nodeId, // Print the job information for(int currentNodeId: nodeIds) { System.out.println("Retrieving async jobs from node " + currentNodeId); - List asyncIds = adminClient.getAsyncRequestList(currentNodeId); + List asyncIds = adminClient.rpcOps.getAsyncRequestList(currentNodeId); System.out.println("Async Job Ids on node " + currentNodeId + " : " + asyncIds); for(int asyncId: asyncIds) { - System.out.println("Async Job Id " + asyncId + " ] " - + adminClient.getAsyncRequestStatus(currentNodeId, asyncId)); + System.out.println("Async Job Id " + + asyncId + + " ] " + + 
adminClient.rpcOps.getAsyncRequestStatus(currentNodeId, + asyncId)); System.out.println(); } } @@ -845,7 +901,7 @@ private static void executeAsync(Integer nodeId, for(int asyncId: asyncIdsToStop) { System.out.println("Stopping async id " + asyncId); - adminClient.stopAsyncRequest(nodeId, asyncId); + adminClient.rpcOps.stopAsyncRequest(nodeId, asyncId); System.out.println("Stopped async id " + asyncId); } } else { @@ -874,12 +930,12 @@ private static void executeClearRebalancing(int nodeId, AdminClient adminClient) private static void executeKeyDistribution(AdminClient adminClient) { List keys = KeyDistributionGenerator.generateKeys(KeyDistributionGenerator.DEFAULT_NUM_KEYS); System.out.println(KeyDistributionGenerator.printStoreWiseDistribution(adminClient.getAdminClientCluster(), - adminClient.getRemoteStoreDefList(0) - .getValue(), + adminClient.metadataMgmtOps.getRemoteStoreDefList(0) + .getValue(), keys)); System.out.println(KeyDistributionGenerator.printOverallDistribution(adminClient.getAdminClientCluster(), - adminClient.getRemoteStoreDefList(0) - .getValue(), + adminClient.metadataMgmtOps.getRemoteStoreDefList(0) + .getValue(), keys)); } @@ -888,7 +944,8 @@ private static void executeCheckMetadata(AdminClient adminClient, String metadat Set metadataValues = Sets.newHashSet(); for(Node node: adminClient.getAdminClientCluster().getNodes()) { System.out.println(node.getHost() + ":" + node.getId()); - Versioned versioned = adminClient.getRemoteMetadata(node.getId(), metadataKey); + Versioned versioned = adminClient.metadataMgmtOps.getRemoteMetadata(node.getId(), + metadataKey); if(versioned == null || versioned.getValue() == null) { throw new VoldemortException("Value returned from node " + node.getId() + " was null"); @@ -925,26 +982,29 @@ public static void executeSetMetadata(Integer nodeId, for(Node node: adminClient.getAdminClientCluster().getNodes()) { nodeIds.add(node.getId()); if(updatedVersion == null) { - updatedVersion = (VectorClock) adminClient.getRemoteMetadata(node.getId(), key) - .getVersion(); + updatedVersion = (VectorClock) adminClient.metadataMgmtOps.getRemoteMetadata(node.getId(), + key) + .getVersion(); } else { - updatedVersion = updatedVersion.merge((VectorClock) adminClient.getRemoteMetadata(node.getId(), - key) - .getVersion()); + updatedVersion = updatedVersion.merge((VectorClock) adminClient.metadataMgmtOps.getRemoteMetadata(node.getId(), + key) + .getVersion()); } } // Bump up version on node 0 updatedVersion = updatedVersion.incremented(0, System.currentTimeMillis()); } else { - Versioned currentValue = adminClient.getRemoteMetadata(nodeId, key); + Versioned currentValue = adminClient.metadataMgmtOps.getRemoteMetadata(nodeId, + key); updatedVersion = ((VectorClock) currentValue.getVersion()).incremented(nodeId, System.currentTimeMillis()); nodeIds.add(nodeId); } - adminClient.updateRemoteMetadata(nodeIds, - key, - Versioned.value(value.toString(), updatedVersion)); + adminClient.metadataMgmtOps.updateRemoteMetadata(nodeIds, + key, + Versioned.value(value.toString(), + updatedVersion)); } private static void executeROMetadata(Integer nodeId, @@ -956,8 +1016,9 @@ private static void executeROMetadata(Integer nodeId, if(storeNames == null) { // Retrieve list of read-only stores storeNames = Lists.newArrayList(); - for(StoreDefinition storeDef: adminClient.getRemoteStoreDefList(nodeId > 0 ? nodeId : 0) - .getValue()) { + for(StoreDefinition storeDef: adminClient.metadataMgmtOps.getRemoteStoreDefList(nodeId > 0 ? 
nodeId + : 0) + .getValue()) { if(storeDef.getType().compareTo(ReadOnlyStorageConfiguration.TYPE_NAME) == 0) { storeNames.add(storeDef.getName()); } @@ -982,12 +1043,13 @@ private static void executeROMetadata(Integer nodeId, .getNodeById(currentNodeId) .getId()); if(type.compareTo("max") == 0) { - storeToValue = adminClient.getROMaxVersion(currentNodeId, storeNames); + storeToValue = adminClient.readonlyOps.getROMaxVersion(currentNodeId, storeNames); } else if(type.compareTo("current") == 0) { - storeToValue = adminClient.getROCurrentVersion(currentNodeId, storeNames); + storeToValue = adminClient.readonlyOps.getROCurrentVersion(currentNodeId, + storeNames); } else if(type.compareTo("storage-format") == 0) { - Map storeToStorageFormat = adminClient.getROStorageFormat(currentNodeId, - storeNames); + Map storeToStorageFormat = adminClient.readonlyOps.getROStorageFormat(currentNodeId, + storeNames); for(String storeName: storeToStorageFormat.keySet()) { System.out.println(storeName + ":" + storeToStorageFormat.get(storeName)); } @@ -1045,7 +1107,7 @@ private static void executeGetMetadata(Integer nodeId, System.out.println("Key - " + key); Versioned versioned = null; try { - versioned = adminClient.getRemoteMetadata(currentNodeId, key); + versioned = adminClient.metadataMgmtOps.getRemoteMetadata(currentNodeId, key); } catch(Exception e) { System.out.println("Error in retrieving " + e.getMessage()); System.out.println(); @@ -1077,9 +1139,9 @@ private static void executeGetMetadata(Integer nodeId, private static void executeDeleteStore(AdminClient adminClient, String storeName, int nodeId) { System.out.println("Deleting " + storeName); if(nodeId == -1) { - adminClient.deleteStore(storeName); + adminClient.storeMgmtOps.deleteStore(storeName); } else { - adminClient.deleteStore(storeName, nodeId); + adminClient.storeMgmtOps.deleteStore(storeName, nodeId); } } @@ -1096,7 +1158,7 @@ private static void executeTruncateStore(int nodeId, AdminClient adminClient, St for(Integer currentNodeId: nodeIds) { System.out.println("Truncating " + storeName + " on node " + currentNodeId); - adminClient.truncate(currentNodeId, storeName); + adminClient.storeMntOps.truncate(currentNodeId, storeName); } } @@ -1106,9 +1168,9 @@ private static void executeAddStores(AdminClient adminClient, String storesXml, for(StoreDefinition storeDef: storeDefinitionList) { System.out.println("Adding " + storeDef.getName()); if(-1 != nodeId) - adminClient.addStore(storeDef, nodeId); + adminClient.storeMgmtOps.addStore(storeDef, nodeId); else - adminClient.addStore(storeDef); + adminClient.storeMgmtOps.addStore(storeDef); } } @@ -1117,10 +1179,11 @@ private static void executeFetchEntries(Integer nodeId, List partitionIdList, String outputDir, List storeNames, - boolean useAscii) throws IOException { + boolean useAscii, + boolean fetchOrphaned) throws IOException { - List storeDefinitionList = adminClient.getRemoteStoreDefList(nodeId) - .getValue(); + List storeDefinitionList = adminClient.metadataMgmtOps.getRemoteStoreDefList(nodeId) + .getValue(); HashMap storeDefinitionMap = Maps.newHashMap(); for(StoreDefinition storeDefinition: storeDefinitionList) { storeDefinitionMap.put(storeDefinition.getName(), storeDefinition); @@ -1161,16 +1224,23 @@ private static void executeFetchEntries(Integer nodeId, System.out.println("No store found under the name \'" + store + "\'"); continue; + } + + Iterator>> entriesIteratorRef = null; + if(fetchOrphaned) { + System.out.println("Fetching orphaned entries of " + store); + entriesIteratorRef = 
adminClient.bulkFetchOps.fetchOrphanedEntries(nodeId, store); } else { System.out.println("Fetching entries in partitions " + Joiner.on(", ").join(partitionIdList) + " of " + store); + entriesIteratorRef = adminClient.bulkFetchOps.fetchEntries(nodeId, + store, + partitionIdList, + null, + false); } - final Iterator>> entriesIterator = adminClient.fetchEntries(nodeId, - store, - partitionIdList, - null, - false); + final Iterator>> entriesIterator = entriesIteratorRef; File outputFile = null; if(directory != null) { outputFile = new File(directory, store + ".entries"); @@ -1271,8 +1341,8 @@ private static void executeUpdateEntries(Integer nodeId, AdminClient adminClient, List storeNames, String inputDirPath) throws IOException { - List storeDefinitionList = adminClient.getRemoteStoreDefList(nodeId) - .getValue(); + List storeDefinitionList = adminClient.metadataMgmtOps.getRemoteStoreDefList(nodeId) + .getValue(); Map storeDefinitionMap = Maps.newHashMap(); for(StoreDefinition storeDefinition: storeDefinitionList) { storeDefinitionMap.put(storeDefinition.getName(), storeDefinition); @@ -1297,7 +1367,7 @@ private static void executeUpdateEntries(Integer nodeId, for(String storeName: storeNames) { Iterator>> iterator = readEntriesBinary(inputDir, storeName); - adminClient.updateEntries(nodeId, storeName, iterator, null); + adminClient.streamingOps.updateEntries(nodeId, storeName, iterator, null); } } @@ -1356,9 +1426,10 @@ private static void executeFetchKeys(Integer nodeId, List partitionIdList, String outputDir, List storeNames, - boolean useAscii) throws IOException { - List storeDefinitionList = adminClient.getRemoteStoreDefList(nodeId) - .getValue(); + boolean useAscii, + boolean fetchOrphaned) throws IOException { + List storeDefinitionList = adminClient.metadataMgmtOps.getRemoteStoreDefList(nodeId) + .getValue(); Map storeDefinitionMap = Maps.newHashMap(); for(StoreDefinition storeDefinition: storeDefinitionList) { storeDefinitionMap.put(storeDefinition.getName(), storeDefinition); @@ -1397,21 +1468,26 @@ private static void executeFetchKeys(Integer nodeId, if(null == storeDefinition) { System.out.println("No store found under the name \'" + store + "\'"); continue; + } + + Iterator keyIteratorRef = null; + if(fetchOrphaned) { + System.out.println("Fetching orphaned keys of " + store); + keyIteratorRef = adminClient.bulkFetchOps.fetchOrphanedKeys(nodeId, store); } else { System.out.println("Fetching keys in partitions " + Joiner.on(", ").join(partitionIdList) + " of " + store); + keyIteratorRef = adminClient.bulkFetchOps.fetchKeys(nodeId, + store, + partitionIdList, + null, + false); } - - final Iterator keyIterator = adminClient.fetchKeys(nodeId, - store, - partitionIdList, - null, - false); File outputFile = null; if(directory != null) { outputFile = new File(directory, store + ".keys"); } - + final Iterator keyIterator = keyIteratorRef; if(useAscii) { final SerializerDefinition serializerDef = storeDefinition.getKeySerializer(); final SerializerFactory serializerFactory = new DefaultSerializerFactory(); @@ -1527,8 +1603,8 @@ private static void executeDeletePartitions(Integer nodeId, List stores = storeNames; if(stores == null) { stores = Lists.newArrayList(); - List storeDefinitionList = adminClient.getRemoteStoreDefList(nodeId) - .getValue(); + List storeDefinitionList = adminClient.metadataMgmtOps.getRemoteStoreDefList(nodeId) + .getValue(); for(StoreDefinition storeDefinition: storeDefinitionList) { stores.add(storeDefinition.getName()); } @@ -1537,7 +1613,7 @@ private static 
void executeDeletePartitions(Integer nodeId, for(String store: stores) { System.out.println("Deleting partitions " + Joiner.on(", ").join(partitionIdList) + " of " + store); - adminClient.deletePartitions(nodeId, store, partitionIdList, null); + adminClient.storeMntOps.deletePartitions(nodeId, store, partitionIdList, null); } } @@ -1551,11 +1627,11 @@ private static void executeQueryKeys(final Integer nodeId, listKeys.add(new ByteArray(serializer.toBytes(key))); } for(final String storeName: storeNames) { - final Iterator>, Exception>>> iterator = adminClient.queryKeys(nodeId.intValue(), - storeName, - listKeys.iterator()); - List storeDefinitionList = adminClient.getRemoteStoreDefList(nodeId) - .getValue(); + final Iterator iterator = adminClient.streamingOps.queryKeys(nodeId.intValue(), + storeName, + listKeys.iterator()); + List storeDefinitionList = adminClient.metadataMgmtOps.getRemoteStoreDefList(nodeId) + .getValue(); StoreDefinition storeDefinition = null; for(StoreDefinition storeDef: storeDefinitionList) { if(storeDef.getName().equals(storeName)) @@ -1596,15 +1672,15 @@ public void writeTo(BufferedWriter out) throws IOException { + "\n"); while(iterator.hasNext()) { - Pair>, Exception>> kvPair = iterator.next(); + QueryKeyResult queryKeyResult = iterator.next(); // unserialize and write key - byte[] keyBytes = kvPair.getFirst().get(); + byte[] keyBytes = queryKeyResult.getKey().get(); Object keyObject = keySerializer.toObject((null == keyCompressionStrategy) ? keyBytes : keyCompressionStrategy.inflate(keyBytes)); generator.writeObject(keyObject); // iterate through, unserialize and write values - List> values = kvPair.getSecond().getFirst(); + List> values = queryKeyResult.getValues(); if(values != null) { if(values.size() == 0) { stringWriter.write(", null"); @@ -1626,9 +1702,9 @@ public void writeTo(BufferedWriter out) throws IOException { stringWriter.write(", null"); } // write out exception - if(kvPair.getSecond().getSecond() != null) { + if(queryKeyResult.hasException()) { stringWriter.write(", "); - stringWriter.write(kvPair.getSecond().getSecond().toString()); + stringWriter.write(queryKeyResult.getException().toString()); } StringBuffer buf = stringWriter.getBuffer(); diff --git a/src/java/voldemort/VoldemortClientShell.java b/src/java/voldemort/VoldemortClientShell.java index 93bd7d6a29..57aff2644f 100644 --- a/src/java/voldemort/VoldemortClientShell.java +++ b/src/java/voldemort/VoldemortClientShell.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -116,7 +116,9 @@ public static void main(String[] args) throws Exception { try { factory = new SocketStoreClientFactory(clientConfig); client = (DefaultStoreClient) factory.getStoreClient(storeName); - adminClient = new AdminClient(bootstrapUrl, new AdminClientConfig()); + adminClient = new AdminClient(bootstrapUrl, + new AdminClientConfig(), + new ClientConfig()); } catch(Exception e) { Utils.croak("Could not connect to server: " + e.getMessage()); } @@ -130,7 +132,7 @@ public static void main(String[] args) throws Exception { processCommands(factory, adminClient, inputReader, false); } finally { if(adminClient != null) - adminClient.stop(); + adminClient.close(); if(factory != null) factory.close(); } @@ -177,7 +179,8 @@ private static void processCommands(StoreClientFactory factory, String[] args = line.substring("getmetadata".length() + 1).split("\\s+"); int remoteNodeId = Integer.valueOf(args[0]); String key = args[1]; - Versioned versioned = adminClient.getRemoteMetadata(remoteNodeId, key); + Versioned versioned = adminClient.metadataMgmtOps.getRemoteMetadata(remoteNodeId, + key); if(versioned == null) { System.out.println("null"); } else { @@ -205,11 +208,11 @@ private static void processCommands(StoreClientFactory factory, int remoteNodeId = Integer.valueOf(args[0]); String storeName = args[1]; List partititionList = parseCsv(args[2]); - Iterator partitionKeys = adminClient.fetchKeys(remoteNodeId, - storeName, - partititionList, - null, - false); + Iterator partitionKeys = adminClient.bulkFetchOps.fetchKeys(remoteNodeId, + storeName, + partititionList, + null, + false); BufferedWriter writer = null; try { @@ -236,11 +239,11 @@ private static void processCommands(StoreClientFactory factory, int remoteNodeId = Integer.valueOf(args[0]); String storeName = args[1]; List partititionList = parseCsv(args[2]); - Iterator>> partitionEntries = adminClient.fetchEntries(remoteNodeId, - storeName, - partititionList, - null, - false); + Iterator>> partitionEntries = adminClient.bulkFetchOps.fetchEntries(remoteNodeId, + storeName, + partititionList, + null, + false); BufferedWriter writer = null; try { if(args.length > 3) { @@ -270,21 +273,29 @@ private static void processCommands(StoreClientFactory factory, } else if(line.startsWith("help")) { System.out.println(); System.out.println("Commands:"); - System.out.println(PROMPT + "put key value --- Associate the given value with the key."); - System.out.println(PROMPT + "get key --- Retrieve the value associated with the key."); - System.out.println(PROMPT + "getall key1 [key2...] --- Retrieve the value(s) associated with the key(s)."); - System.out.println(PROMPT + "delete key --- Remove all values associated with the key."); - System.out.println(PROMPT + "preflist key --- Get node preference list for given key."); + System.out.println(PROMPT + + "put key value --- Associate the given value with the key."); + System.out.println(PROMPT + + "get key --- Retrieve the value associated with the key."); + System.out.println(PROMPT + + "getall key1 [key2...] 
--- Retrieve the value(s) associated with the key(s)."); + System.out.println(PROMPT + + "delete key --- Remove all values associated with the key."); + System.out.println(PROMPT + + "preflist key --- Get node preference list for given key."); String metaKeyValues = voldemort.store.metadata.MetadataStore.METADATA_KEYS.toString(); - System.out.println(PROMPT + "getmetadata node_id meta_key --- Get store metadata associated " + System.out.println(PROMPT + + "getmetadata node_id meta_key --- Get store metadata associated " + "with meta_key from node_id. meta_key may be one of " + metaKeyValues.substring(1, metaKeyValues.length() - 1) + "."); - System.out.println(PROMPT + "fetchkeys node_id store_name partitions --- Fetch all keys " + System.out.println(PROMPT + + "fetchkeys node_id store_name partitions --- Fetch all keys " + "from given partitions (a comma separated list) of store_name on " + "node_id. Optionally, write to file_name. " + "Use getmetadata to determine appropriate values for store_name and partitions"); - System.out.println(PROMPT + "fetch node_id store_name partitions --- Fetch all entries " + System.out.println(PROMPT + + "fetch node_id store_name partitions --- Fetch all entries " + "from given partitions (a comma separated list) of store_name on " + "node_id. Optionally, write to file_name. " + "Use getmetadata to determine appropriate values for store_name and partitions"); @@ -383,10 +394,10 @@ private static void printObject(Object o) { System.out.print('}'); } else if(o instanceof Object[]) { Object[] a = (Object[]) o; - System.out.print( Arrays.deepToString(a) ); + System.out.print(Arrays.deepToString(a)); } else if(o instanceof byte[]) { byte[] a = (byte[]) o; - System.out.print( Arrays.toString(a) ); + System.out.print(Arrays.toString(a)); } else { System.out.print(o); } diff --git a/src/java/voldemort/client/AbstractStoreClientFactory.java b/src/java/voldemort/client/AbstractStoreClientFactory.java index 08502196cb..63b9a5ce24 100644 --- a/src/java/voldemort/client/AbstractStoreClientFactory.java +++ b/src/java/voldemort/client/AbstractStoreClientFactory.java @@ -294,8 +294,11 @@ public Store getRawStore(String storeName, failureDetectorRef, isJmxEnabled, this.jmxId); + store = new LoggingStore(store); + Store finalStore = (Store) store; + if(isJmxEnabled) { StatTrackingStore statStore = new StatTrackingStore(store, this.stats); store = statStore; @@ -305,35 +308,42 @@ public Store getRawStore(String storeName, + JmxUtils.getJmxId(jmxId))); } - if(storeDef.getKeySerializer().hasCompression() - || storeDef.getValueSerializer().hasCompression()) { - store = new CompressingStore(store, - getCompressionStrategy(storeDef.getKeySerializer()), - getCompressionStrategy(storeDef.getValueSerializer())); + if(this.config.isEnableCompressionLayer()) { + if(storeDef.getKeySerializer().hasCompression() + || storeDef.getValueSerializer().hasCompression()) { + store = new CompressingStore(store, + getCompressionStrategy(storeDef.getKeySerializer()), + getCompressionStrategy(storeDef.getValueSerializer())); + } } - Serializer keySerializer = (Serializer) serializerFactory.getSerializer(storeDef.getKeySerializer()); - Serializer valueSerializer = (Serializer) serializerFactory.getSerializer(storeDef.getValueSerializer()); + if(this.config.isEnableSerializationLayer()) { + Serializer keySerializer = (Serializer) serializerFactory.getSerializer(storeDef.getKeySerializer()); + Serializer valueSerializer = (Serializer) serializerFactory.getSerializer(storeDef.getValueSerializer()); - 
if(storeDef.isView() && (storeDef.getTransformsSerializer() == null)) - throw new SerializationException("Transforms serializer must be specified with a view "); + if(storeDef.isView() && (storeDef.getTransformsSerializer() == null)) + throw new SerializationException("Transforms serializer must be specified with a view "); - Serializer transformsSerializer = (Serializer) serializerFactory.getSerializer(storeDef.getTransformsSerializer() != null ? storeDef.getTransformsSerializer() - : new SerializerDefinition("identity")); + Serializer transformsSerializer = (Serializer) serializerFactory.getSerializer(storeDef.getTransformsSerializer() != null ? storeDef.getTransformsSerializer() + : new SerializerDefinition("identity")); - Store serializedStore = SerializingStore.wrap(store, - keySerializer, - valueSerializer, - transformsSerializer); + finalStore = SerializingStore.wrap(store, + keySerializer, + valueSerializer, + transformsSerializer); + } // Add inconsistency resolving decorator, using their inconsistency // resolver (if they gave us one) - InconsistencyResolver> secondaryResolver = resolver == null ? new TimeBasedInconsistencyResolver() - : resolver; - serializedStore = new InconsistencyResolvingStore(serializedStore, - new ChainedResolver>(new VectorClockInconsistencyResolver(), - secondaryResolver)); - return serializedStore; + if(this.config.isEnableInconsistencyResolvingLayer()) { + InconsistencyResolver> secondaryResolver = resolver == null ? new TimeBasedInconsistencyResolver() + : resolver; + finalStore = new InconsistencyResolvingStore(finalStore, + new ChainedResolver>(new VectorClockInconsistencyResolver(), + secondaryResolver)); + } + + return finalStore; } protected ClientConfig getConfig() { diff --git a/src/java/voldemort/client/ClientConfig.java b/src/java/voldemort/client/ClientConfig.java index 66a3ec6d4d..eb62ccc74b 100644 --- a/src/java/voldemort/client/ClientConfig.java +++ b/src/java/voldemort/client/ClientConfig.java @@ -30,7 +30,9 @@ import voldemort.client.protocol.RequestFormatType; import voldemort.cluster.Zone; +import voldemort.cluster.failuredetector.BannagePeriodFailureDetector; import voldemort.cluster.failuredetector.FailureDetectorConfig; +import voldemort.cluster.failuredetector.ThresholdFailureDetector; import voldemort.common.VoldemortOpCode; import voldemort.serialization.DefaultSerializerFactory; import voldemort.serialization.SerializerFactory; @@ -47,36 +49,43 @@ public class ClientConfig { private volatile int maxConnectionsPerNode = 50; - private volatile int maxTotalConnections = 500; - private volatile int maxThreads = 5; - private volatile int maxQueuedRequests = 50; - private volatile long threadIdleMs = 100000; private volatile long connectionTimeoutMs = 500; private volatile long socketTimeoutMs = 5000; private volatile boolean socketKeepAlive = false; private volatile int selectors = 8; - private volatile long routingTimeoutMs = 15000; + private volatile long routingTimeoutMs = 5000; private volatile TimeoutConfig timeoutConfig = new TimeoutConfig(routingTimeoutMs, false); private volatile int socketBufferSize = 64 * 1024; private volatile SerializerFactory serializerFactory = new DefaultSerializerFactory(); private volatile List bootstrapUrls = null; private volatile RequestFormatType requestFormatType = RequestFormatType.VOLDEMORT_V1; private volatile RoutingTier routingTier = RoutingTier.CLIENT; - private volatile boolean enableJmx = true; private volatile boolean enableLazy = true; private volatile boolean 
enablePipelineRoutedStore = true; private volatile int clientZoneId = Zone.DEFAULT_ZONE_ID; - // Flag to control which store client to use. Default = Enhanced - private volatile boolean useDefaultClient = false; + /* + * The following are only used with a non pipe line routed, i.e non NIO + * based client + */ + @Deprecated + private volatile int maxTotalConnections = 500; + @Deprecated + private volatile int maxThreads = 5; + @Deprecated + private volatile int maxQueuedRequests = 50; + @Deprecated + private volatile long threadIdleMs = 100000; + + private volatile boolean useDefaultClient = true; private volatile String failureDetectorImplementation = FailureDetectorConfig.DEFAULT_IMPLEMENTATION_CLASS_NAME; private volatile long failureDetectorBannagePeriod = FailureDetectorConfig.DEFAULT_BANNAGE_PERIOD; private volatile int failureDetectorThreshold = FailureDetectorConfig.DEFAULT_THRESHOLD; private volatile int failureDetectorThresholdCountMinimum = FailureDetectorConfig.DEFAULT_THRESHOLD_COUNT_MINIMUM; - private volatile long failureDetectorThresholdInterval = FailureDetectorConfig.DEFAULT_THRESHOLD_INTERVAL; - private volatile long failureDetectorAsyncRecoveryInterval = FailureDetectorConfig.DEFAULT_ASYNC_RECOVERY_INTERVAL; + private volatile long failureDetectorThresholdIntervalMs = FailureDetectorConfig.DEFAULT_THRESHOLD_INTERVAL; + private volatile long failureDetectorAsyncRecoveryIntervalMs = FailureDetectorConfig.DEFAULT_ASYNC_RECOVERY_INTERVAL; private volatile List failureDetectorCatastrophicErrorTypes = FailureDetectorConfig.DEFAULT_CATASTROPHIC_ERROR_TYPES; private long failureDetectorRequestLengthThreshold = socketTimeoutMs; @@ -91,12 +100,18 @@ public class ClientConfig { /* SystemStore client config */ private volatile int sysMaxConnectionsPerNode = 2; - private volatile int sysRoutingTimeout = 5000; - private volatile int sysSocketTimeout = 5000; - private volatile int sysConnectionTimeout = 1500; + private volatile int sysRoutingTimeoutMs = 5000; + private volatile int sysSocketTimeoutMs = 5000; + private volatile int sysConnectionTimeoutMs = 1500; private volatile boolean sysEnableJmx = false; private volatile boolean sysEnablePipelineRoutedStore = true; + /* Voldemort client component config */ + private volatile boolean enableJmx = true; + private volatile boolean enableCompressionLayer = true; + private volatile boolean enableSerializationLayer = true; + private volatile boolean enableInconsistencyResolvingLayer = true; + public ClientConfig() {} /* Propery names for propery-based configuration */ @@ -147,6 +162,9 @@ public ClientConfig() {} public static final String SYS_SOCKET_TIMEOUT_MS = "sys_socket_timeout_ms"; public static final String SYS_ENABLE_JMX = "sys_enable_jmx"; public static final String SYS_ENABLE_PIPELINE_ROUTED_STORE = "sys_enable_pipeline_routed_store"; + public static final String ENABLE_COMPRESSION_LAYER = "enable_compression_layer"; + public static final String ENABLE_SERIALIZATION_LAYER = "enable_serialization_layer"; + public static final String ENABLE_INCONSISTENCY_RESOLVING_LAYER = "enable_inconsistency_resolving_layer"; /** * Instantiate the client config using a properties file @@ -350,8 +368,27 @@ private void setProperties(Properties properties) { this.setSysEnablePipelineRoutedStore(props.getBoolean(SYS_ENABLE_PIPELINE_ROUTED_STORE)); } + if(props.containsKey(ENABLE_COMPRESSION_LAYER)) { + this.setEnableCompressionLayer(props.getBoolean(ENABLE_COMPRESSION_LAYER)); + } + + if(props.containsKey(ENABLE_SERIALIZATION_LAYER)) { + 
this.setEnableSerializationLayer(props.getBoolean(ENABLE_SERIALIZATION_LAYER)); + } + + if(props.containsKey(ENABLE_INCONSISTENCY_RESOLVING_LAYER)) { + this.setEnableInconsistencyResolvingLayer(props.getBoolean(ENABLE_INCONSISTENCY_RESOLVING_LAYER)); + } + } + /** + * Sets the maximum number of connections a system store client will create + * to a voldemort server + * + * @param maxConnectionsPerNode + * @return + */ private ClientConfig setSysMaxConnectionsPerNode(int maxConnectionsPerNode) { if(maxConnectionsPerNode <= 0) throw new IllegalArgumentException("Value must be greater than zero."); @@ -363,63 +400,98 @@ public int getSysMaxConnectionsPerNode() { return this.sysMaxConnectionsPerNode; } - private ClientConfig setSysRoutingTimeout(int sysRoutingTimeout) { - if(sysRoutingTimeout <= 0) + /** + * Sets the routing layer timeout for the system store client. + * + * @param sysRoutingTimeout + * @return + */ + private ClientConfig setSysRoutingTimeout(int sysRoutingTimeoutMs) { + if(sysRoutingTimeoutMs <= 0) throw new IllegalArgumentException("Value must be greater than zero."); - this.sysRoutingTimeout = sysRoutingTimeout; + this.sysRoutingTimeoutMs = sysRoutingTimeoutMs; return this; } public int getSysRoutingTimeout() { - return this.sysRoutingTimeout; + return this.sysRoutingTimeoutMs; } - private ClientConfig setSysSocketTimeout(int sysSocketTimeout) { - if(sysSocketTimeout <= 0) + /** + * Sets the socket timeout (at the java.net layer) for the system store + * client + * + * @param sysSocketTimeout + * @return + */ + private ClientConfig setSysSocketTimeout(int sysSocketTimeoutMs) { + if(sysSocketTimeoutMs <= 0) throw new IllegalArgumentException("Value must be greater than zero."); - this.sysSocketTimeout = sysSocketTimeout; + this.sysSocketTimeoutMs = sysSocketTimeoutMs; return this; } public int getSysSocketTimeout() { - return this.sysSocketTimeout; + return this.sysSocketTimeoutMs; } - private ClientConfig setSysConnectionTimeout(int sysConnectionTimeout) { - if(sysConnectionTimeout <= 0) + /** + * Amount of time in ms spent trying to establish a connection to voldemort + * servers, from the system store client + * + * @param sysConnectionTimeoutMs + * @return + */ + private ClientConfig setSysConnectionTimeout(int sysConnectionTimeoutMs) { + if(sysConnectionTimeoutMs <= 0) throw new IllegalArgumentException("Value must be greater than zero."); - this.sysConnectionTimeout = sysConnectionTimeout; + this.sysConnectionTimeoutMs = sysConnectionTimeoutMs; return this; } public int getSysConnectionTimeout() { - return this.sysConnectionTimeout; - } - - public boolean getSysEnableJmx() { - return this.sysEnableJmx; + return this.sysConnectionTimeoutMs; } + /** + * Whether or not JMX monitoring is enabled for the system store + * + * @param sysEnableJmx + * @return + */ public ClientConfig setSysEnableJmx(boolean sysEnableJmx) { this.sysEnableJmx = sysEnableJmx; return this; } - public boolean getSysEnablePipelineRoutedStore() { - return this.sysEnablePipelineRoutedStore; + public boolean getSysEnableJmx() { + return this.sysEnableJmx; } + /** + * Should pipleline store be used by the system store client? 
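The new enable_compression_layer, enable_serialization_layer and enable_inconsistency_resolving_layer properties have matching fluent setters on ClientConfig. A minimal sketch (not part of the patch; the bootstrap URL is a placeholder) of a client that strips the optional layers and works with raw bytes:

    import voldemort.client.ClientConfig;

    public class RawLayersSketch {

        public static ClientConfig rawByteConfig() {
            return new ClientConfig().setBootstrapUrls("tcp://localhost:6666")
                                     // skip serialization, compression and conflict
                                     // resolution; the caller sees raw byte[] values
                                     .setEnableSerializationLayer(false)
                                     .setEnableCompressionLayer(false)
                                     .setEnableInconsistencyResolvingLayer(false);
        }
    }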
+ * + * @param sysEnablePipelineRoutedStore + * @return + */ public ClientConfig setSysEnablePipelineRoutedStore(boolean sysEnablePipelineRoutedStore) { this.sysEnablePipelineRoutedStore = sysEnablePipelineRoutedStore; return this; } + public boolean getSysEnablePipelineRoutedStore() { + return this.sysEnablePipelineRoutedStore; + } + public int getMaxConnectionsPerNode() { return maxConnectionsPerNode; } /** - * Set the maximum number of connection allowed to each voldemort node + * Set the maximum number of connections allowed to each voldemort node. + * Play with this value to determine how many connections are enough for + * your workload. Without high enough connections, you may not be able to + * throw enough traffic at the servers * * @param maxConnectionsPerNode The maximum number of connections */ @@ -430,22 +502,6 @@ public ClientConfig setMaxConnectionsPerNode(int maxConnectionsPerNode) { return this; } - public int getMaxTotalConnections() { - return maxTotalConnections; - } - - /** - * Set the maximum number of connections allowed to all voldemort nodes - * - * @param maxTotalConnections The maximum total number of connections - */ - public ClientConfig setMaxTotalConnections(int maxTotalConnections) { - if(maxTotalConnections <= 0) - throw new IllegalArgumentException("Value must be greater than zero."); - this.maxTotalConnections = maxTotalConnections; - return this; - } - public int getSocketTimeout(TimeUnit unit) { return toInt(unit.convert(socketTimeoutMs, TimeUnit.MILLISECONDS)); } @@ -467,6 +523,13 @@ public boolean getSocketKeepAlive() { return socketKeepAlive; } + /** + * Enabled/disable SO_KEEPALIVE on the connection created with the voldemort + * servers + * + * @param socketKeepAlive + * @return + */ public ClientConfig setSocketKeepAlive(boolean socketKeepAlive) { this.socketKeepAlive = socketKeepAlive; return this; @@ -476,6 +539,14 @@ public int getSelectors() { return selectors; } + /** + * Number of NIO selector threads to use, to handle communication with the + * server.Typically, this is same as the number of cores in the client + * machine + * + * @param selectors + * @return + */ public ClientConfig setSelectors(int selectors) { this.selectors = selectors; return this; @@ -490,6 +561,12 @@ public int getRoutingTimeout(TimeUnit unit) { * number of blocking operations can be configured using the preferred-reads * and preferred-writes configuration for the store. * + * See {@link #GETALL_ROUTING_TIMEOUT_MS_PROPERTY}, + * {@link #GET_ROUTING_TIMEOUT_MS_PROPERTY}, + * {@link #PUT_ROUTING_TIMEOUT_MS_PROPERTY}, + * {@link #DELETE_ROUTING_TIMEOUT_MS_PROPERTY} to override timeouts for + * specific operations + * * @param routingTimeout The timeout for all operations to complete. 
* @param unit The time unit of the timeout value */ @@ -556,35 +633,6 @@ public ClientConfig setConnectionTimeout(int connectionTimeout, TimeUnit unit) { return this; } - public int getThreadIdleTime(TimeUnit unit) { - return toInt(unit.convert(threadIdleMs, TimeUnit.MILLISECONDS)); - } - - /** - * The amount of time to keep an idle client thread alive - * - * @param threadIdleTime - */ - public ClientConfig setThreadIdleTime(long threadIdleTime, TimeUnit unit) { - this.threadIdleMs = unit.toMillis(threadIdleTime); - return this; - } - - public int getMaxQueuedRequests() { - return maxQueuedRequests; - } - - /** - * Set the maximum number of queued node operations before client actions - * will be blocked - * - * @param maxQueuedRequests The maximum number of queued requests - */ - public ClientConfig setMaxQueuedRequests(int maxQueuedRequests) { - this.maxQueuedRequests = maxQueuedRequests; - return this; - } - public int getSocketBufferSize() { return socketBufferSize; } @@ -663,8 +711,10 @@ public RoutingTier getRoutingTier() { /** * Set the tier at which routing occurs. Client-side routing occurs on the - * client, and server-side routing on the server. This is not yet used, as - * the java client only supports client-side routing. + * client, and server-side routing on the server. + * + * NOTE : Server side routing is not used, as yet. The java client only + * supports client-side routing. * * @param routingTier The routing tier to use for routing requests */ @@ -687,6 +737,56 @@ public ClientConfig setMaxThreads(int maxThreads) { return this; } + @Deprecated + public int getMaxTotalConnections() { + return maxTotalConnections; + } + + @Deprecated + public int getThreadIdleTime(TimeUnit unit) { + return toInt(unit.convert(threadIdleMs, TimeUnit.MILLISECONDS)); + } + + /** + * The amount of time to keep an idle client thread alive + * + * @param threadIdleTime + */ + @Deprecated + public ClientConfig setThreadIdleTime(long threadIdleTime, TimeUnit unit) { + this.threadIdleMs = unit.toMillis(threadIdleTime); + return this; + } + + public int getMaxQueuedRequests() { + return maxQueuedRequests; + } + + /** + * Set the maximum number of queued node operations before client actions + * will be blocked + * + * @param maxQueuedRequests The maximum number of queued requests + */ + public ClientConfig setMaxQueuedRequests(int maxQueuedRequests) { + this.maxQueuedRequests = maxQueuedRequests; + return this; + } + + /** + * Set the maximum number of connections allowed to all voldemort nodes. + * Note: This has no effect when using NIO based pipeline routing + * + * @param maxTotalConnections The maximum total number of connections + */ + @Deprecated + public ClientConfig setMaxTotalConnections(int maxTotalConnections) { + if(maxTotalConnections <= 0) + throw new IllegalArgumentException("Value must be greater than zero."); + this.maxTotalConnections = maxTotalConnections; + return this; + } + public int toInt(long l) { return (int) Math.min(l, Integer.MAX_VALUE); } @@ -719,6 +819,16 @@ public ClientConfig setEnableLazy(boolean enableLazy) { return this; } + /** + * Sets the zone the client belongs to. This is very important in zoned + * configurations since the client always has an "affinity" towards the + * servers in its zone. 
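Taken together, the javadoc above covers the main client tuning knobs. A minimal sketch of a tuned, zoned ClientConfig (not part of the patch; the URL and numbers are illustrative only):

    import java.util.concurrent.TimeUnit;

    import voldemort.client.ClientConfig;

    public class ZonedClientSketch {

        public static ClientConfig tunedConfig() {
            return new ClientConfig().setBootstrapUrls("tcp://localhost:6666")
                                     .setMaxConnectionsPerNode(50)
                                     .setSelectors(8) // roughly one per client core
                                     .setSocketTimeout(5000, TimeUnit.MILLISECONDS)
                                     .setRoutingTimeout(5000, TimeUnit.MILLISECONDS)
                                     .setClientZoneId(0); // zone this client runs in
        }
    }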
+ * + * Default : zone 0 + * + * @param clientZoneId + * @return + */ public ClientConfig setClientZoneId(int clientZoneId) { this.clientZoneId = clientZoneId; return this; @@ -728,6 +838,15 @@ public int getClientZoneId() { return this.clientZoneId; } + /** + * Whether or not a {@link ZenStoreClient} is created as opposed to a + * {@link DefaultStoreClient} + * + * true = DefaultStoreClient and false = ZenStoreClient + * + * @param enableDefault + * @return + */ public ClientConfig enableDefaultClient(boolean enableDefault) { this.useDefaultClient = enableDefault; return this; @@ -741,6 +860,13 @@ public boolean isPipelineRoutedStoreEnabled() { return enablePipelineRoutedStore; } + /** + * Whether or not to use the Pipeline routing which is much more resource + * efficient by employing Java NIO to handle communication with the server + * + * @param enablePipelineRoutedStore + * @return + */ public ClientConfig setEnablePipelineRoutedStore(boolean enablePipelineRoutedStore) { this.enablePipelineRoutedStore = enablePipelineRoutedStore; return this; @@ -750,6 +876,14 @@ public String getFailureDetectorImplementation() { return failureDetectorImplementation; } + /** + * FailureDetector to use. Its highly recommended to use + * {@link ThresholdFailureDetector} as opposed to using + * {@link BannagePeriodFailureDetector} + * + * @param failureDetectorImplementation + * @return + */ public ClientConfig setFailureDetectorImplementation(String failureDetectorImplementation) { this.failureDetectorImplementation = failureDetectorImplementation; return this; @@ -768,6 +902,14 @@ public int getFailureDetectorThreshold() { return failureDetectorThreshold; } + /** + * Set the percentage of exceptions that tolerated in a given failure + * detector window. If the client experiences more exceptions than this + * threshold, it will mark the erring server down + * + * @param failureDetectorThreshold + * @return + */ public ClientConfig setFailureDetectorThreshold(int failureDetectorThreshold) { this.failureDetectorThreshold = failureDetectorThreshold; return this; @@ -777,26 +919,49 @@ public int getFailureDetectorThresholdCountMinimum() { return failureDetectorThresholdCountMinimum; } + /** + * Sets the minimum number of failures (exceptions/slow responses) in a + * given failure detector window, for a server to be marked down. 
Guards + * against a very small number of exceptions tripping the Failure detector + * due to low activity + * + * @param failureDetectorThresholdCountMinimum + * @return + */ public ClientConfig setFailureDetectorThresholdCountMinimum(int failureDetectorThresholdCountMinimum) { this.failureDetectorThresholdCountMinimum = failureDetectorThresholdCountMinimum; return this; } public long getFailureDetectorThresholdInterval() { - return failureDetectorThresholdInterval; + return failureDetectorThresholdIntervalMs; } - public ClientConfig setFailureDetectorThresholdInterval(long failureDetectorThresholdInterval) { - this.failureDetectorThresholdInterval = failureDetectorThresholdInterval; + /** + * Time window in ms, over which the failure detector accounts the failures + * and successes + * + * @param failureDetectorThresholdIntervalMs + * @return + */ + public ClientConfig setFailureDetectorThresholdInterval(long failureDetectorThresholdIntervalMs) { + this.failureDetectorThresholdIntervalMs = failureDetectorThresholdIntervalMs; return this; } public long getFailureDetectorAsyncRecoveryInterval() { - return failureDetectorAsyncRecoveryInterval; + return failureDetectorAsyncRecoveryIntervalMs; } + /** + * Number of milliseconds, to try to check if a marked down server has come + * back up again + * + * @param failureDetectorAsyncRecoveryInterval + * @return + */ public ClientConfig setFailureDetectorAsyncRecoveryInterval(long failureDetectorAsyncRecoveryInterval) { - this.failureDetectorAsyncRecoveryInterval = failureDetectorAsyncRecoveryInterval; + this.failureDetectorAsyncRecoveryIntervalMs = failureDetectorAsyncRecoveryInterval; return this; } @@ -804,6 +969,15 @@ public List getFailureDetectorCatastrophicErrorTypes() { return failureDetectorCatastrophicErrorTypes; } + /** + * Sets the exception types that should be treated as catastrophic,by the + * failure detector, resulting in the server being immediately considered + * down. 
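A minimal sketch of wiring up the ThresholdFailureDetector knobs documented here (not part of the patch; the threshold values are illustrative, not recommendations):

    import java.net.ConnectException;
    import java.util.Arrays;

    import voldemort.client.ClientConfig;
    import voldemort.cluster.failuredetector.ThresholdFailureDetector;

    public class FailureDetectorSketch {

        public static ClientConfig failureDetectorConfig() {
            return new ClientConfig().setFailureDetectorImplementation(ThresholdFailureDetector.class.getName())
                                     .setFailureDetectorThreshold(95)                // threshold percentage for the window below
                                     .setFailureDetectorThresholdCountMinimum(30)    // ignore very small samples
                                     .setFailureDetectorThresholdInterval(10 * 1000) // 10s accounting window
                                     .setFailureDetectorRequestLengthThreshold(5000) // requests slower than 5s count against the node
                                     .setFailureDetectorCatastrophicErrorTypes(Arrays.asList(ConnectException.class.getName()));
        }
    }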
Input string list should be populated by something like + * ConnectException.class.getName() + * + * @param failureDetectorCatastrophicErrorTypes + * @return + */ public ClientConfig setFailureDetectorCatastrophicErrorTypes(List failureDetectorCatastrophicErrorTypes) { this.failureDetectorCatastrophicErrorTypes = failureDetectorCatastrophicErrorTypes; return this; @@ -813,6 +987,14 @@ public long getFailureDetectorRequestLengthThreshold() { return failureDetectorRequestLengthThreshold; } + /** + * Sets the maximum amount of time a request is allowed to take, to be not + * considered as a "slow" request and count against the server, in terms of + * failure detection + * + * @param failureDetectorRequestLengthThreshold + * @return + */ public ClientConfig setFailureDetectorRequestLengthThreshold(long failureDetectorRequestLengthThreshold) { this.failureDetectorRequestLengthThreshold = failureDetectorRequestLengthThreshold; return this; @@ -858,11 +1040,11 @@ public long getAsyncMetadataRefreshInMs() { /** * Set the interval on which client checks for metadata change on servers * - * @param asyncCheckMetadataInterval The metadata change interval + * @param asyncCheckMetadataIntervalMs The metadata change interval */ - public ClientConfig setAsyncMetadataRefreshInMs(long asyncCheckMetadataInterval) { + public ClientConfig setAsyncMetadataRefreshInMs(long asyncCheckMetadataIntervalMs) { - this.asyncCheckMetadataIntervalInMs = asyncCheckMetadataInterval; + this.asyncCheckMetadataIntervalInMs = asyncCheckMetadataIntervalMs; return this; } @@ -877,8 +1059,8 @@ public int getClientRegistryUpdateIntervalInSecs() { * @param clientRegistryRefreshIntervalInSecs The refresh interval in * seconds */ - public ClientConfig setClientRegistryUpdateIntervalInSecs(int clientRegistryRefrshInterval) { - this.clientRegistryRefreshIntervalInSecs = clientRegistryRefrshInterval; + public ClientConfig setClientRegistryUpdateIntervalInSecs(int clientRegistryRefrshIntervalInSecs) { + this.clientRegistryRefreshIntervalInSecs = clientRegistryRefrshIntervalInSecs; return this; } @@ -895,4 +1077,40 @@ public ClientConfig setAsyncJobThreadPoolSize(int asyncJobThreadPoolSize) { this.asyncJobThreadPoolSize = asyncJobThreadPoolSize; return this; } + + public boolean isEnableCompressionLayer() { + return enableCompressionLayer; + } + + public ClientConfig setEnableCompressionLayer(boolean enableCompressionLayer) { + this.enableCompressionLayer = enableCompressionLayer; + return this; + } + + public boolean isEnableSerializationLayer() { + return enableSerializationLayer; + } + + public ClientConfig setEnableSerializationLayer(boolean enableSerializationLayer) { + this.enableSerializationLayer = enableSerializationLayer; + return this; + } + + public boolean isEnableInconsistencyResolvingLayer() { + return enableInconsistencyResolvingLayer; + } + + public ClientConfig setEnableInconsistencyResolvingLayer(boolean enableInconsistencyResolvingLayer) { + this.enableInconsistencyResolvingLayer = enableInconsistencyResolvingLayer; + return this; + } + + public String toString() { + StringBuilder clientConfigInfo = new StringBuilder(); + clientConfigInfo.append("Max connections per node: " + this.maxConnectionsPerNode + "\n"); + clientConfigInfo.append("Connection timeout : " + this.connectionTimeoutMs + "\n"); + clientConfigInfo.append("Socket timeout : " + this.socketTimeoutMs + "\n"); + clientConfigInfo.append("Routing timeout : " + this.routingTimeoutMs + "\n"); + return clientConfigInfo.toString(); + } } diff --git 
a/src/java/voldemort/client/protocol/admin/AdminClient.java b/src/java/voldemort/client/protocol/admin/AdminClient.java index 5e368bac44..38691152bf 100644 --- a/src/java/voldemort/client/protocol/admin/AdminClient.java +++ b/src/java/voldemort/client/protocol/admin/AdminClient.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -35,6 +35,9 @@ import java.util.Map.Entry; import java.util.Properties; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ThreadFactory; @@ -67,6 +70,7 @@ import voldemort.store.ErrorCodeMapper; import voldemort.store.Store; import voldemort.store.StoreDefinition; +import voldemort.store.StoreUtils; import voldemort.store.metadata.MetadataStore; import voldemort.store.metadata.MetadataStore.VoldemortState; import voldemort.store.mysql.MysqlStorageConfiguration; @@ -74,18 +78,22 @@ import voldemort.store.readonly.ReadOnlyStorageFormat; import voldemort.store.readonly.ReadOnlyStorageMetadata; import voldemort.store.readonly.ReadOnlyUtils; +import voldemort.store.routed.NodeValue; import voldemort.store.slop.Slop; import voldemort.store.slop.Slop.Operation; import voldemort.store.socket.SocketDestination; +import voldemort.store.socket.SocketStore; import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; import voldemort.store.system.SystemStoreConstants; import voldemort.store.views.ViewStorageConfiguration; import voldemort.utils.ByteArray; import voldemort.utils.ByteUtils; +import voldemort.utils.ClusterUtils; import voldemort.utils.MetadataVersionStoreUtils; import voldemort.utils.NetworkClassLoader; import voldemort.utils.Pair; import voldemort.utils.RebalanceUtils; +import voldemort.utils.StoreDefinitionUtils; import voldemort.utils.Utils; import voldemort.versioning.VectorClock; import voldemort.versioning.Version; @@ -120,9 +128,6 @@ public class AdminClient { private static final Logger logger = Logger.getLogger(AdminClient.class); - private final ErrorCodeMapper errorMapper; - private final SocketPool pool; - private final NetworkClassLoader networkClassLoader; private static final ClusterMapper clusterMapper = new ClusterMapper(); private static final StoreDefinitionsMapper storeMapper = new StoreDefinitionsMapper(); @@ -130,21 +135,66 @@ public class AdminClient { private static final long INITIAL_DELAY = 250; // Initial delay private static final long PRINT_STATS_THRESHOLD = 10000; private static final long PRINT_STATS_INTERVAL = 5 * 60 * 1000; // 5 minutes - private final AdminClientConfig adminClientConfig; private static final String CLUSTER_VERSION_KEY = "cluster.xml"; private static final int DEFAULT_ZONE_ID = 0; - public final static List restoreStoreEngineBlackList = Arrays.asList(MysqlStorageConfiguration.TYPE_NAME, - ReadOnlyStorageConfiguration.TYPE_NAME, + public final static List restoreStoreEngineBlackList = Arrays.asList(ReadOnlyStorageConfiguration.TYPE_NAME, ViewStorageConfiguration.TYPE_NAME); + private final ErrorCodeMapper errorMapper; + private final SocketPool socketPool; + private final AdminStoreClient adminStoreClient; + private final NetworkClassLoader networkClassLoader; + private final 
AdminClientConfig adminClientConfig; private Cluster currentCluster; - private SystemStore sysStoreVersion = null; private String[] cachedBootstrapURLs = null; private int cachedZoneID = -1; + final public AdminClient.HelperOperations helperOps; + final public AdminClient.RPCOperations rpcOps; + final public AdminClient.MetadataManagementOperations metadataMgmtOps; + final public AdminClient.StoreManagementOperations storeMgmtOps; + final public AdminClient.StoreMaintenanceOperations storeMntOps; + final public AdminClient.BulkStreamingFetchOperations bulkFetchOps; + final public AdminClient.StreamingOperations streamingOps; + final public AdminClient.StoreOperations storeOps; + final public AdminClient.RestoreOperations restoreOps; + final public AdminClient.RebalancingOperations rebalanceOps; + final public AdminClient.ReadOnlySpecificOperations readonlyOps; + + /** + * Common initialization of AdminClient. + * + * @param adminClientConfig Client configuration for SocketPool-based + * operations. + * @param clientConfig Client configurations for + * ClientRequestExecutorPool-based operations via the (private) + * AdminStoreClient. + */ + private AdminClient(AdminClientConfig adminClientConfig, ClientConfig clientConfig) { + this.helperOps = this.new HelperOperations(); + this.rpcOps = this.new RPCOperations(); + this.metadataMgmtOps = this.new MetadataManagementOperations(); + this.storeMgmtOps = this.new StoreManagementOperations(); + this.storeMntOps = this.new StoreMaintenanceOperations(); + this.bulkFetchOps = this.new BulkStreamingFetchOperations(); + this.streamingOps = this.new StreamingOperations(); + this.storeOps = this.new StoreOperations(); + this.restoreOps = this.new RestoreOperations(); + this.rebalanceOps = this.new RebalancingOperations(); + this.readonlyOps = this.new ReadOnlySpecificOperations(); + + this.errorMapper = new ErrorCodeMapper(); + this.networkClassLoader = new NetworkClassLoader(Thread.currentThread() + .getContextClassLoader()); + + this.adminClientConfig = adminClientConfig; + this.socketPool = helperOps.createSocketPool(adminClientConfig); + this.adminStoreClient = new AdminStoreClient(clientConfig); + } + /** * Create an instance of AdminClient given a URL of a node in the cluster. * The bootstrap URL is used to get the cluster metadata. @@ -159,14 +209,12 @@ public class AdminClient { *
  • socket buffer size
  • * */ - public AdminClient(String bootstrapURL, AdminClientConfig adminClientConfig) { - this.currentCluster = getClusterFromBootstrapURL(bootstrapURL); - this.errorMapper = new ErrorCodeMapper(); - this.pool = createSocketPool(adminClientConfig); - this.networkClassLoader = new NetworkClassLoader(Thread.currentThread() - .getContextClassLoader()); - this.adminClientConfig = adminClientConfig; - cacheSystemStoreParams(bootstrapURL, DEFAULT_ZONE_ID); + public AdminClient(String bootstrapURL, + AdminClientConfig adminClientConfig, + ClientConfig clientConfig) { + this(adminClientConfig, clientConfig); + this.currentCluster = helperOps.getClusterFromBootstrapURL(bootstrapURL); + helperOps.cacheSystemStoreParams(bootstrapURL, DEFAULT_ZONE_ID); } /** @@ -183,17 +231,14 @@ public AdminClient(String bootstrapURL, AdminClientConfig adminClientConfig) { *
  • socket buffer size
  • * */ - public AdminClient(Cluster cluster, AdminClientConfig adminClientConfig) { + public AdminClient(Cluster cluster, + AdminClientConfig adminClientConfig, + ClientConfig clientConfig) { + this(adminClientConfig, clientConfig); this.currentCluster = cluster; - this.errorMapper = new ErrorCodeMapper(); - this.pool = createSocketPool(adminClientConfig); - this.networkClassLoader = new NetworkClassLoader(Thread.currentThread() - .getContextClassLoader()); - this.adminClientConfig = adminClientConfig; - Node node = cluster.getNodeById(0); String bootstrapURL = "tcp://" + node.getHost() + ":" + node.getSocketPort(); - cacheSystemStoreParams(bootstrapURL, DEFAULT_ZONE_ID); + helperOps.cacheSystemStoreParams(bootstrapURL, DEFAULT_ZONE_ID); } /** @@ -211,2497 +256,2935 @@ public AdminClient(Cluster cluster, AdminClientConfig adminClientConfig) { * * @param zoneID The primary Zone ID for the purpose of the SystemStore */ - public AdminClient(String bootstrapURL, AdminClientConfig adminClientConfig, int zoneID) { - this(bootstrapURL, adminClientConfig); - cacheSystemStoreParams(bootstrapURL, zoneID); + public AdminClient(String bootstrapURL, + AdminClientConfig adminClientConfig, + ClientConfig clientConfig, + int zoneID) { + this(bootstrapURL, adminClientConfig, clientConfig); + helperOps.cacheSystemStoreParams(bootstrapURL, zoneID); } /** - * Cache the paramater values for the internal system store client. These - * cached values are used every time the system store client needs to be - * initialized (useful when the cluster.xml changes). - * - * @param bootstrapURL The URL to bootstrap from - * @param zoneID Indicates the primary zone of the sytem store client + * Stop the AdminClient cleanly freeing all resources. */ - private void cacheSystemStoreParams(String bootstrapURL, int zoneID) { - String[] bootstrapUrls = new String[1]; - bootstrapUrls[0] = bootstrapURL; - this.cachedBootstrapURLs = bootstrapUrls; - this.cachedZoneID = zoneID; + public void close() { + this.socketPool.close(); + this.adminStoreClient.close(); } /** - * Create a system store client based on the cached bootstrap URLs and Zone - * ID + * Set cluster info for AdminClient to use. + * + * @param cluster Set the current cluster */ - public void initSystemStoreClient() { - if(this.cachedBootstrapURLs != null && this.cachedZoneID >= 0) { - try { - this.sysStoreVersion = new SystemStore(SystemStoreConstants.SystemStoreName.voldsys$_metadata_version_persistence.name(), - this.cachedBootstrapURLs, - this.cachedZoneID); - } catch(Exception e) { - logger.debug("Error while creating a system store client for metadata version store."); - } - - } + public void setAdminClientCluster(Cluster cluster) { + this.currentCluster = cluster; } /** - * Update the metadata version for the given key (cluster or store). The new - * value set is the current timestamp. + * Get the cluster info AdminClient is using. * - * @param versionKey The metadata key for which Version should be - * incremented + * @return Returns the current cluster being used by the admin client */ - public void updateMetadataversion(String versionKey) { - initSystemStoreClient(); - Properties props = MetadataVersionStoreUtils.getProperties(this.sysStoreVersion); - long newValue = 0; - if(props != null && props.getProperty(versionKey) != null) { - logger.debug("Version obtained = " + props.getProperty(versionKey)); - newValue = System.currentTimeMillis(); - } else { - logger.debug("Current version is null. 
Assuming version 0."); - if(props == null) { - props = new Properties(); - } - } - props.setProperty(versionKey, Long.toString(newValue)); - MetadataVersionStoreUtils.setProperties(this.sysStoreVersion, props); + public Cluster getAdminClientCluster() { + return currentCluster; } /** - * Set the metadata versions to the given set + * Encapsulates helper methods used across the admin client * - * @param newProperties The new metadata versions to be set across all the - * nodes in the cluster */ - public void setMetadataversion(Properties newProperties) { - initSystemStoreClient(); - MetadataVersionStoreUtils.setProperties(this.sysStoreVersion, newProperties); - } + public class HelperOperations { - private Cluster getClusterFromBootstrapURL(String bootstrapURL) { - ClientConfig config = new ClientConfig(); - // try to bootstrap metadata from bootstrapUrl - config.setBootstrapUrls(bootstrapURL); - SocketStoreClientFactory factory = new SocketStoreClientFactory(config); - // get Cluster from bootStrapUrl - String clusterXml = factory.bootstrapMetadataWithRetries(MetadataStore.CLUSTER_KEY, - factory.validateUrls(config.getBootstrapUrls())); - // release all threads/sockets hold by the factory. - factory.close(); - - return clusterMapper.readCluster(new StringReader(clusterXml), false); - } + /** + * Cache the paramater values for the internal system store client. + * These cached values are used every time the system store client needs + * to be initialized (useful when the cluster.xml changes). + * + * @param bootstrapURL The URL to bootstrap from + * @param zoneID Indicates the primary zone of the sytem store client + */ + private void cacheSystemStoreParams(String bootstrapURL, int zoneID) { + String[] bootstrapUrls = new String[1]; + bootstrapUrls[0] = bootstrapURL; + AdminClient.this.cachedBootstrapURLs = bootstrapUrls; + AdminClient.this.cachedZoneID = zoneID; + } - private SocketPool createSocketPool(AdminClientConfig config) { - TimeUnit unit = TimeUnit.SECONDS; - return new SocketPool(config.getMaxConnectionsPerNode(), - (int) unit.toMillis(config.getAdminConnectionTimeoutSec()), - (int) unit.toMillis(config.getAdminSocketTimeoutSec()), - config.getAdminSocketBufferSize(), - config.getAdminSocketKeepAlive()); - } + /** + * Create a system store client based on the cached bootstrap URLs and + * Zone ID + */ + private void initSystemStoreClient() { + if(AdminClient.this.cachedBootstrapURLs != null && AdminClient.this.cachedZoneID >= 0) { + try { + sysStoreVersion = new SystemStore(SystemStoreConstants.SystemStoreName.voldsys$_metadata_version_persistence.name(), + AdminClient.this.cachedBootstrapURLs, + AdminClient.this.cachedZoneID); + } catch(Exception e) { + logger.debug("Error while creating a system store client for metadata version store."); + } + } + } - private T sendAndReceive(int nodeId, Message message, T builder) { - Node node = this.getAdminClientCluster().getNodeById(nodeId); - SocketDestination destination = new SocketDestination(node.getHost(), - node.getAdminPort(), - RequestFormatType.ADMIN_PROTOCOL_BUFFERS); - SocketAndStreams sands = pool.checkout(destination); + private Cluster getClusterFromBootstrapURL(String bootstrapURL) { + ClientConfig config = new ClientConfig(); + // try to bootstrap metadata from bootstrapUrl + config.setBootstrapUrls(bootstrapURL); + SocketStoreClientFactory factory = new SocketStoreClientFactory(config); + // get Cluster from bootStrapUrl + String clusterXml = factory.bootstrapMetadataWithRetries(MetadataStore.CLUSTER_KEY, + 
factory.validateUrls(config.getBootstrapUrls())); + // release all threads/sockets hold by the factory. + factory.close(); + + return clusterMapper.readCluster(new StringReader(clusterXml), false); + } - try { - DataOutputStream outputStream = sands.getOutputStream(); - DataInputStream inputStream = sands.getInputStream(); - ProtoUtils.writeMessage(outputStream, message); - outputStream.flush(); + private SocketPool createSocketPool(AdminClientConfig config) { + TimeUnit unit = TimeUnit.SECONDS; + return new SocketPool(config.getMaxConnectionsPerNode(), + (int) unit.toMillis(config.getAdminConnectionTimeoutSec()), + (int) unit.toMillis(config.getAdminSocketTimeoutSec()), + config.getAdminSocketBufferSize(), + config.getAdminSocketKeepAlive()); + } - return ProtoUtils.readToBuilder(inputStream, builder); - } catch(IOException e) { - close(sands.getSocket()); - throw new VoldemortException(e); - } finally { - pool.checkin(destination, sands); + private void close(Socket socket) { + try { + socket.close(); + } catch(IOException e) { + logger.warn("Failed to close socket"); + } } - } - /** - * Update a stream of key/value entries at the given node. The iterator - * entries are streamed from the client to the server: - *
- * 1. Client performs a handshake with the server (sending in the update
- * entries request with a store name and a {@link VoldemortFilter} instance).
- * 2. While entryIterator has entries, the client will keep sending the
- * updates one after another to the server, buffering the data, without
- * waiting for a response from the server.
- * 3. After iteration is complete, send an end of stream message, force a
- * flush of the buffer, and check the response on the server to see if a
- * {@link VoldemortException} has occurred.
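The removed javadoc above documents the streaming update protocol (handshake, buffered entry messages, end-of-stream check). A hedged caller-side sketch: in the refactored class this method is expected to live under the streamingOps group (its body is not part of this hunk), and nodeId, storeName and the entry iterator are placeholders.

import java.util.Iterator;
import voldemort.client.protocol.admin.AdminClient;
import voldemort.utils.ByteArray;
import voldemort.utils.Pair;
import voldemort.versioning.Versioned;

public class StreamingUpdateSketch {
    // One call drives the whole protocol: the first message carries the store
    // name (plus optional filter), later messages are buffered entries, and the
    // end-of-stream marker triggers the final error check on the server reply.
    static void push(AdminClient adminClient,
                     int nodeId,
                     String storeName,
                     Iterator<Pair<ByteArray, Versioned<byte[]>>> entries) {
        adminClient.streamingOps.updateEntries(nodeId, storeName, entries, null /* no filter */);
    }
}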
    - * - * @param nodeId Id of the remote node (where we wish to update the entries) - * @param storeName Store name for the entries - * @param entryIterator Iterator of key-value pairs for the entries - * @param filter Custom filter implementation to filter out entries which - * should not be updated. - * @throws VoldemortException - */ - public void updateEntries(int nodeId, - String storeName, - Iterator>> entryIterator, - VoldemortFilter filter) { - Node node = this.getAdminClientCluster().getNodeById(nodeId); - SocketDestination destination = new SocketDestination(node.getHost(), - node.getAdminPort(), - RequestFormatType.ADMIN_PROTOCOL_BUFFERS); - SocketAndStreams sands = pool.checkout(destination); - DataOutputStream outputStream = sands.getOutputStream(); - DataInputStream inputStream = sands.getInputStream(); - boolean firstMessage = true; - long printStatsTimer = System.currentTimeMillis() + PRINT_STATS_INTERVAL; - long entryCount = 0; - - try { - if(entryIterator.hasNext()) { - while(entryIterator.hasNext()) { - Pair> entry = entryIterator.next(); - VAdminProto.PartitionEntry partitionEntry = VAdminProto.PartitionEntry.newBuilder() - .setKey(ProtoUtils.encodeBytes(entry.getFirst())) - .setVersioned(ProtoUtils.encodeVersioned(entry.getSecond())) - .build(); - VAdminProto.UpdatePartitionEntriesRequest.Builder updateRequest = VAdminProto.UpdatePartitionEntriesRequest.newBuilder() - .setStore(storeName) - .setPartitionEntry(partitionEntry); - entryCount++; - if(firstMessage) { - if(filter != null) { - updateRequest.setFilter(encodeFilter(filter)); - } + // TODO: (refactor) Move this helper method to ClusterInstance + /** + * For a particular node, finds out all the [replica, partition] tuples + * it needs to steal in order to be brought back to normal state + * + * @param restoringNode The id of the node which needs to be restored + * @param cluster The cluster definition + * @param storeDef The store definition to use + * @return Map of node id to map of replica type and corresponding + * partition list + */ + public Map>> getReplicationMapping(int restoringNode, + Cluster cluster, + StoreDefinition storeDef) { + return getReplicationMapping(restoringNode, cluster, storeDef, -1); + } - ProtoUtils.writeMessage(outputStream, - VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.UPDATE_PARTITION_ENTRIES) - .setUpdatePartitionEntries(updateRequest) - .build()); - outputStream.flush(); - firstMessage = false; - } else { - ProtoUtils.writeMessage(outputStream, updateRequest.build()); - if(printStatsTimer <= System.currentTimeMillis() - || 0 == entryCount % PRINT_STATS_THRESHOLD) { - logger.info("UpdatePartitionEntries: fetched " + entryCount - + " to node " + nodeId + " for store " + storeName); - printStatsTimer = System.currentTimeMillis() + PRINT_STATS_INTERVAL; + // TODO: (refactor) Move this helper method to ClusterInstance + /** + * For a particular node, finds out all the [replica, partition] tuples + * it needs to steal in order to be brought back to normal state + * + * @param restoringNode The id of the node which needs to be restored + * @param cluster The cluster definition + * @param storeDef The store definition to use + * @param zoneId zone from which nodes are chosen, -1 means no zone + * preference + * @return Map of node id to map of replica type and corresponding + * partition list + */ + public Map>> getReplicationMapping(int restoringNode, + Cluster cluster, + StoreDefinition storeDef, + int zoneId) { + + Map>> returnMap = 
Maps.newHashMap(); + + RoutingStrategy strategy = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, + cluster); + List restoringNodePartition = cluster.getNodeById(restoringNode) + .getPartitionIds(); + + // Go over every partition. As long as one of them belongs to the + // current node list, find its replica + for(Node node: cluster.getNodes()) { + for(int partitionId: node.getPartitionIds()) { + List replicatingPartitions = strategy.getReplicatingPartitionList(partitionId); + List extraCopyReplicatingPartitions = Lists.newArrayList(replicatingPartitions); + + if(replicatingPartitions.size() <= 1) { + throw new VoldemortException("Store " + + storeDef.getName() + + " cannot be restored from replica because replication factor = 1"); + } + + if(replicatingPartitions.removeAll(restoringNodePartition)) { + if(replicatingPartitions.size() == 0) { + throw new VoldemortException("Found a case where-in the overlap of " + + "the node partition list results in no replicas " + + "being left in replicating list"); } + + addDonorWithZonePreference(replicatingPartitions, + extraCopyReplicatingPartitions, + returnMap, + zoneId, + cluster, + storeDef); } - } - ProtoUtils.writeEndOfStream(outputStream); - outputStream.flush(); - VAdminProto.UpdatePartitionEntriesResponse.Builder updateResponse = ProtoUtils.readToBuilder(inputStream, - VAdminProto.UpdatePartitionEntriesResponse.newBuilder()); - if(updateResponse.hasError()) { - throwException(updateResponse.getError()); + } } - } catch(IOException e) { - close(sands.getSocket()); - throw new VoldemortException(e); - } finally { - pool.checkin(destination, sands); + return returnMap; } - } - private void initiateFetchRequest(DataOutputStream outputStream, - String storeName, - HashMap> replicaToPartitionList, - VoldemortFilter filter, - boolean fetchValues, - boolean fetchMasterEntries, - Cluster initialCluster, - long skipRecords) throws IOException { - HashMap> filteredReplicaToPartitionList = Maps.newHashMap(); - if(fetchMasterEntries) { - if(!replicaToPartitionList.containsKey(0)) { - throw new VoldemortException("Could not find any partitions for primary replica type"); + // TODO: (refactor) Move this helper method to ClusterInstance + /** + * For each partition that need to be restored, find a donor node that + * owns the partition AND has the same zone ID as requested. 
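getReplicationMapping, shown above moving into HelperOperations, answers "which donor node should a dead node's data be pulled from, per replica type and partition". A small sketch, assuming a bootstrapped client and an existing store definition; the generic parameters, stripped in the listing above, are reconstructed from the javadoc:

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import voldemort.client.protocol.admin.AdminClient;
import voldemort.store.StoreDefinition;

public class ReplicationMappingSketch {
    static void printDonors(AdminClient adminClient, StoreDefinition storeDef, int failedNodeId) {
        // Donor node id -> (replica type -> partitions to fetch); -1 zone preference behaviour.
        Map<Integer, HashMap<Integer, List<Integer>>> donors =
                adminClient.helperOps.getReplicationMapping(failedNodeId,
                                                            adminClient.getAdminClientCluster(),
                                                            storeDef);
        for(Map.Entry<Integer, HashMap<Integer, List<Integer>>> e: donors.entrySet())
            System.out.println("fetch from node " + e.getKey() + " -> " + e.getValue());
    }
}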
-1 means no + * zone preference required when finding a donor node needs to steal in + * order to + * + * @param remainderPartitions The replicating partitions without the one + * needed by the restore node + * @param originalPartitions The entire replicating partition list + * (including the one needed by the restore node) + * @param donorMap All donor nodes that will be fetched from + * @param zondId The zone from which donor nodes will be chosen from; -1 + * means all zones are fine + * @param cluster The cluster metadata + * @param storeDef The store to be restored + * @return + */ + private void addDonorWithZonePreference(List remainderPartitions, + List originalPartitions, + Map>> donorMap, + int zoneId, + Cluster cluster, + StoreDefinition storeDef) { + Map partitionToNodeId = ClusterUtils.getCurrentPartitionMapping(cluster); + int nodeId = -1; + int replicaType = -1; + int partition = -1; + boolean found = false; + int index = 0; + + while(!found && index < remainderPartitions.size()) { + replicaType = originalPartitions.indexOf(remainderPartitions.get(index)); + nodeId = partitionToNodeId.get(remainderPartitions.get(index)); + if(-1 == zoneId || cluster.getNodeById(nodeId).getZoneId() == zoneId) { + found = true; + } else { + index++; + } + } + + if(!found) { + throw new VoldemortException("unable to find a node to fetch partition " + + partition + " of replica type " + replicaType + + " for store " + storeDef.getName()); + } + + partition = originalPartitions.get(0); + HashMap> replicaToPartitionList = null; + if(donorMap.containsKey(nodeId)) { + replicaToPartitionList = donorMap.get(nodeId); } else { - filteredReplicaToPartitionList.put(0, replicaToPartitionList.get(0)); + replicaToPartitionList = Maps.newHashMap(); + donorMap.put(nodeId, replicaToPartitionList); } - } else { - filteredReplicaToPartitionList.putAll(replicaToPartitionList); - } - VAdminProto.FetchPartitionEntriesRequest.Builder fetchRequest = VAdminProto.FetchPartitionEntriesRequest.newBuilder() - .setFetchValues(fetchValues) - .addAllReplicaToPartition(ProtoUtils.encodePartitionTuple(filteredReplicaToPartitionList)) - .setStore(storeName) - .setSkipRecords(skipRecords); - try { - if(filter != null) { - fetchRequest.setFilter(encodeFilter(filter)); + List partitions = null; + if(replicaToPartitionList.containsKey(replicaType)) { + partitions = replicaToPartitionList.get(replicaType); + } else { + partitions = Lists.newArrayList(); + replicaToPartitionList.put(replicaType, partitions); } - } catch(IOException e) { - throw new VoldemortException(e); + partitions.add(partition); } - if(initialCluster != null) { - fetchRequest.setInitialCluster(new ClusterMapper().writeCluster(initialCluster)); + public void throwException(VProto.Error error) { + throw AdminClient.this.errorMapper.getError((short) error.getErrorCode(), + error.getErrorMessage()); } - VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.FETCH_PARTITION_ENTRIES) - .setFetchPartitionEntries(fetchRequest) - .build(); - ProtoUtils.writeMessage(outputStream, request); - outputStream.flush(); - - } + private VAdminProto.VoldemortFilter encodeFilter(VoldemortFilter filter) throws IOException { + Class cl = filter.getClass(); + byte[] classBytes = networkClassLoader.dumpClass(cl); + return VAdminProto.VoldemortFilter.newBuilder() + .setName(cl.getName()) + .setData(ProtoUtils.encodeBytes(new ByteArray(classBytes))) + .build(); + } - private VAdminProto.FetchPartitionEntriesResponse 
responseFromStream(DataInputStream inputStream, - int size) - throws IOException { - byte[] input = new byte[size]; - ByteUtils.read(inputStream, input); - VAdminProto.FetchPartitionEntriesResponse.Builder response = VAdminProto.FetchPartitionEntriesResponse.newBuilder(); - response.mergeFrom(input); + // TODO: (refactor) It is weird that a helper method invokes + // metadataMgmtOps.getRemoteStoreDefList. Refactor this method to split + // some of the functionality into ClusterInstance, and then move this + // method to metadataMgmtOps. Or, do the refactoring wrt ClusterInstance + // and change the method interface to require storeDef rather than + // storeName to avoid doing a metadata operation... + /** + * Converts list of partitions to map of replica type to partition list. + * + * @param nodeId Node which is donating data + * @param storeName Name of store + * @param partitions List of partitions ( primary OR replicas ) to move + * @return Map of replica type to partitions + */ + private HashMap> getReplicaToPartitionMap(int nodeId, + String storeName, + List partitions) { + List allStoreDefs = metadataMgmtOps.getRemoteStoreDefList(nodeId) + .getValue(); + allStoreDefs.addAll(SystemStoreConstants.getAllSystemStoreDefs()); + StoreDefinition def = StoreDefinitionUtils.getStoreDefinitionWithName(allStoreDefs, + storeName); + HashMap> replicaToPartitionList = Maps.newHashMap(); + for(int replicaNum = 0; replicaNum < def.getReplicationFactor(); replicaNum++) { + replicaToPartitionList.put(replicaNum, partitions); + } - return response.build(); + return replicaToPartitionList; + } } /** - * Legacy interface for fetching entries. See - * {@link AdminClient#fetchEntries(int, String, HashMap, VoldemortFilter, boolean, Cluster, long)} - * for more information. + * Encapsulates all the RPC helper methods * - * @param nodeId Id of the node to fetch from - * @param storeName Name of the store - * @param partitionList List of the partitions - * @param filter Custom filter implementation to filter out entries which - * should not be fetched. - * @param fetchMasterEntries Fetch an entry only if master replica - * @param skipRecords Number of records to skip - * @return An iterator which allows entries to be streamed as they're being - * iterated over. */ - public Iterator>> fetchEntries(int nodeId, - String storeName, - List partitionList, - VoldemortFilter filter, - boolean fetchMasterEntries, - long skipRecords) { - return fetchEntries(nodeId, - storeName, - getReplicaToPartitionMap(nodeId, storeName, partitionList), - filter, - fetchMasterEntries, - null, - skipRecords); - } + public class RPCOperations { - /** - * Legacy interface for fetching entries. See - * {@link AdminClient#fetchEntries(int, String, HashMap, VoldemortFilter, boolean, Cluster, long)} - * for more information. - * - * @param nodeId Id of the node to fetch from - * @param storeName Name of the store - * @param partitionList List of the partitions - * @param filter Custom filter implementation to filter out entries which - * should not be fetched. - * @param fetchMasterEntries Fetch an entry only if master replica - * @return An iterator which allows entries to be streamed as they're being - * iterated over. 
- */ - public Iterator>> fetchEntries(int nodeId, - String storeName, - List partitionList, - VoldemortFilter filter, - boolean fetchMasterEntries) { - return fetchEntries(nodeId, storeName, partitionList, filter, fetchMasterEntries, 0); - } + private T sendAndReceive(int nodeId, Message message, T builder) { + Node node = AdminClient.this.getAdminClientCluster().getNodeById(nodeId); + SocketDestination destination = new SocketDestination(node.getHost(), + node.getAdminPort(), + RequestFormatType.ADMIN_PROTOCOL_BUFFERS); + SocketAndStreams sands = socketPool.checkout(destination); - /** - * Fetch key/value tuples belonging to this map of replica type to partition - * list - *

    - * - * Streaming API - The server keeps sending the messages as it's - * iterating over the data. Once iteration has finished, the server sends an - * "end of stream" marker and flushes its buffer. A response indicating a - * {@link VoldemortException} may be sent at any time during the process. - *
    - * - *

    - * Entries are being streamed as the iteration happens i.e. the - * whole result set is not buffered in memory. - * - * @param nodeId Id of the node to fetch from - * @param storeName Name of the store - * @param replicaToPartitionList Mapping of replica type to partition list - * @param filter Custom filter implementation to filter out entries which - * should not be fetched. - * @param fetchMasterEntries Fetch an entry only if master replica - * @param initialCluster The cluster metadata to use while making the - * decision to fetch entries. This is important during rebalancing - * where-in we want to fetch keys using an older metadata compared to - * the new one. - * @param skipRecords Number of records to skip - * @return An iterator which allows entries to be streamed as they're being - * iterated over. - */ - public Iterator>> fetchEntries(int nodeId, - String storeName, - HashMap> replicaToPartitionList, - VoldemortFilter filter, - boolean fetchMasterEntries, - Cluster initialCluster, - long skipRecords) { - - Node node = this.getAdminClientCluster().getNodeById(nodeId); - final SocketDestination destination = new SocketDestination(node.getHost(), - node.getAdminPort(), - RequestFormatType.ADMIN_PROTOCOL_BUFFERS); - final SocketAndStreams sands = pool.checkout(destination); - DataOutputStream outputStream = sands.getOutputStream(); - final DataInputStream inputStream = sands.getInputStream(); - - try { - initiateFetchRequest(outputStream, - storeName, - replicaToPartitionList, - filter, - true, - fetchMasterEntries, - initialCluster, - skipRecords); - } catch(IOException e) { - close(sands.getSocket()); - pool.checkin(destination, sands); - throw new VoldemortException(e); - } - - return new AbstractIterator>>() { + try { + DataOutputStream outputStream = sands.getOutputStream(); + DataInputStream inputStream = sands.getInputStream(); + ProtoUtils.writeMessage(outputStream, message); + outputStream.flush(); - @Override - public Pair> computeNext() { - try { - int size = inputStream.readInt(); - if(size == -1) { - pool.checkin(destination, sands); - return endOfData(); - } + return ProtoUtils.readToBuilder(inputStream, builder); + } catch(IOException e) { + helperOps.close(sands.getSocket()); + throw new VoldemortException(e); + } finally { + socketPool.checkin(destination, sands); + } + } - VAdminProto.FetchPartitionEntriesResponse response = responseFromStream(inputStream, - size); + /** + * Get the status of an Async Operation running at (remote) node. + * + * If The operation is complete, then the operation will be removed + * from a list of currently running operations. 
+ * + * @param nodeId Id on which the operation is running + * @param requestId Id of the operation itself + * @return The status of the operation + */ + public AsyncOperationStatus getAsyncRequestStatus(int nodeId, int requestId) { + VAdminProto.AsyncOperationStatusRequest asyncRequest = VAdminProto.AsyncOperationStatusRequest.newBuilder() + .setRequestId(requestId) + .build(); + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.ASYNC_OPERATION_STATUS) + .setAsyncOperationStatus(asyncRequest) + .build(); + VAdminProto.AsyncOperationStatusResponse.Builder response = rpcOps.sendAndReceive(nodeId, + adminRequest, + VAdminProto.AsyncOperationStatusResponse.newBuilder()); + + if(response.hasError()) + helperOps.throwException(response.getError()); + + AsyncOperationStatus status = new AsyncOperationStatus(response.getRequestId(), + response.getDescription()); + status.setStatus(response.getStatus()); + status.setComplete(response.getComplete()); + + return status; + } - if(response.hasError()) { - pool.checkin(destination, sands); - throwException(response.getError()); - } + /** + * Retrieves a list of asynchronous request ids on the server. Does not + * include the completed requests + * + * @param nodeId The id of the node whose request ids we want + * @return List of async request ids + */ + public List getAsyncRequestList(int nodeId) { + return getAsyncRequestList(nodeId, false); + } - VAdminProto.PartitionEntry partitionEntry = response.getPartitionEntry(); + /** + * Retrieves a list of asynchronous request ids on the server. Depending + * on the boolean passed also retrieves the completed requests + * + * @param nodeId The id of the node whose request ids we want + * @param showComplete Boolean to indicate if we want to include the + * completed requests as well + * @return List of async request ids + */ + public List getAsyncRequestList(int nodeId, boolean showComplete) { + VAdminProto.AsyncOperationListRequest asyncOperationListRequest = VAdminProto.AsyncOperationListRequest.newBuilder() + .setShowComplete(showComplete) + .build(); + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.ASYNC_OPERATION_LIST) + .setAsyncOperationList(asyncOperationListRequest) + .build(); + VAdminProto.AsyncOperationListResponse.Builder response = rpcOps.sendAndReceive(nodeId, + adminRequest, + VAdminProto.AsyncOperationListResponse.newBuilder()); + if(response.hasError()) + helperOps.throwException(response.getError()); + + return response.getRequestIdsList(); + } - return Pair.create(ProtoUtils.decodeBytes(partitionEntry.getKey()), - ProtoUtils.decodeVersioned(partitionEntry.getVersioned())); - } catch(IOException e) { - close(sands.getSocket()); - pool.checkin(destination, sands); - throw new VoldemortException(e); - } - } - }; + /** + * To stop an asynchronous request on the particular node + * + * @param nodeId The id of the node on which the request is running + * @param requestId The id of the request to terminate + */ + public void stopAsyncRequest(int nodeId, int requestId) { + VAdminProto.AsyncOperationStopRequest asyncOperationStopRequest = VAdminProto.AsyncOperationStopRequest.newBuilder() + .setRequestId(requestId) + .build(); + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.ASYNC_OPERATION_STOP) + 
.setAsyncOperationStop(asyncOperationStopRequest) + .build(); + VAdminProto.AsyncOperationStopResponse.Builder response = rpcOps.sendAndReceive(nodeId, + adminRequest, + VAdminProto.AsyncOperationStopResponse.newBuilder()); + + if(response.hasError()) + helperOps.throwException(response.getError()); + } - } + /** + * Wait for async task at (remote) nodeId to finish completion, using + * exponential backoff to poll the task completion status. + *

    + * + * Logs the status at each status check if debug is enabled. + * + * @param nodeId Id of the node to poll + * @param requestId Id of the request to check + * @param maxWait Maximum time we'll keep checking a request until we + * give up + * @param timeUnit Unit in which maxWait is expressed. + * @param higherStatus A higher level async operation object. If this + * waiting is being run another async operation this helps us + * propagate the status all the way up. + * @return description The final description attached with the response + * @throws VoldemortException if task failed to finish in specified + * maxWait time. + */ + public String waitForCompletion(int nodeId, + int requestId, + long maxWait, + TimeUnit timeUnit, + AsyncOperationStatus higherStatus) { + long delay = INITIAL_DELAY; + long waitUntil = System.currentTimeMillis() + timeUnit.toMillis(maxWait); + + String description = null; + String oldStatus = ""; + while(System.currentTimeMillis() < waitUntil) { + try { + AsyncOperationStatus status = getAsyncRequestStatus(nodeId, requestId); + if(!status.getStatus().equalsIgnoreCase(oldStatus)) + logger.info("Status from node " + nodeId + " (" + status.getDescription() + + ") - " + status.getStatus()); + oldStatus = status.getStatus(); + + if(higherStatus != null) { + higherStatus.setStatus("Status from node " + nodeId + " (" + + status.getDescription() + ") - " + + status.getStatus()); + } + description = status.getDescription(); + if(status.hasException()) + throw status.getException(); - /** - * Fetch key/value tuples belonging to a node with given key values - * - *
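RPCOperations above groups the async-task plumbing: listing running operations, stopping one, and waitForCompletion, which polls getAsyncRequestStatus with exponential backoff capped by AdminClientConfig.getMaxBackoffDelayMs(). A usage sketch with placeholder node and request ids (generics again reconstructed from the javadoc):

import java.util.concurrent.TimeUnit;
import voldemort.client.protocol.admin.AdminClient;

public class AsyncOperationSketch {
    static void watch(AdminClient adminClient, int nodeId, int requestId) {
        // Running (non-complete) async operations on the node.
        for(int id: adminClient.rpcOps.getAsyncRequestList(nodeId))
            System.out.println("running async op " + id);

        // Poll with exponential backoff; throws VoldemortException if the task
        // does not finish within 30 minutes.
        String status = adminClient.rpcOps.waitForCompletion(nodeId, requestId, 30, TimeUnit.MINUTES);
        System.out.println("op " + requestId + " finished with status: " + status);
    }
}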

    - * Entries are being queried synchronously as the iteration happens - * i.e. the whole result set is not buffered in memory. - * - * @param nodeId Id of the node to fetch from - * @param storeName Name of the store - * @param keys An Iterable of keys - * @return An iterator which allows entries to be streamed as they're being - * iterated over. - */ - public Iterator>, Exception>>> queryKeys(int nodeId, - String storeName, - final Iterator keys) { - - Node node = this.getAdminClientCluster().getNodeById(nodeId); - ClientConfig clientConfig = new ClientConfig(); - final Store store; - final ClientRequestExecutorPool clientPool = new ClientRequestExecutorPool(clientConfig.getSelectors(), - clientConfig.getMaxConnectionsPerNode(), - clientConfig.getConnectionTimeout(TimeUnit.MILLISECONDS), - clientConfig.getSocketTimeout(TimeUnit.MILLISECONDS), - clientConfig.getSocketBufferSize(), - clientConfig.getSocketKeepAlive()); - try { - store = clientPool.create(storeName, - node.getHost(), - node.getSocketPort(), - clientConfig.getRequestFormatType(), - RequestRoutingType.IGNORE_CHECKS); - - } catch(Exception e) { - clientPool.close(); - throw new VoldemortException(e); - } + if(status.isComplete()) + return status.getStatus(); - return new AbstractIterator>, Exception>>>() { + if(delay < adminClientConfig.getMaxBackoffDelayMs()) + delay <<= 1; - @Override - public Pair>, Exception>> computeNext() { - ByteArray key; - Exception exception = null; - List> value = null; - if(!keys.hasNext()) { - clientPool.close(); - return endOfData(); - } else { - key = keys.next(); - } - try { - value = store.get(key, null); + try { + Thread.sleep(delay); + } catch(InterruptedException e) { + Thread.currentThread().interrupt(); + } } catch(Exception e) { - exception = e; + throw new VoldemortException("Failed while waiting for async task (" + + description + ") at node " + nodeId + + " to finish", e); } - return Pair.create(key, Pair.create(value, exception)); } - }; - } + throw new VoldemortException("Failed to finish task requestId: " + requestId + + " in maxWait " + maxWait + " " + timeUnit.toString()); + } - /** - * Legacy interface for fetching entries. See - * {@link AdminClient#fetchKeys(int, String, HashMap, VoldemortFilter, boolean, Cluster, long)} - * for more information. - * - * @param nodeId Id of the node to fetch from - * @param storeName Name of the store - * @param partitionList List of the partitions to retrieve - * @param filter Custom filter implementation to filter out entries which - * should not be fetched. - * @param fetchMasterEntries Fetch a key only if master replica - * @param skipRecords Number of keys to skip - * @return An iterator which allows keys to be streamed as they're being - * iterated over. - */ - public Iterator fetchKeys(int nodeId, - String storeName, - List partitionList, - VoldemortFilter filter, - boolean fetchMasterEntries, - long skipRecords) { - return fetchKeys(nodeId, - storeName, - getReplicaToPartitionMap(nodeId, storeName, partitionList), - filter, - fetchMasterEntries, - null, - skipRecords); - } + /** + * Wait for async task at (remote) nodeId to finish completion, using + * exponential backoff to poll the task completion status. + *

    + * + * Logs the status at each status check if debug is enabled. + * + * @param nodeId Id of the node to poll + * @param requestId Id of the request to check + * @param maxWait Maximum time we'll keep checking a request until we + * give up + * @param timeUnit Unit in which maxWait is expressed. + * @return description The final description attached with the response + * @throws VoldemortException if task failed to finish in specified + * maxWait time. + */ + public String waitForCompletion(int nodeId, int requestId, long maxWait, TimeUnit timeUnit) { + return waitForCompletion(nodeId, requestId, maxWait, timeUnit, null); + } - /** - * Legacy interface for fetching entries. See - * {@link AdminClient#fetchKeys(int, String, HashMap, VoldemortFilter, boolean, Cluster, long)} - * for more information. - * - * @param nodeId Id of the node to fetch from - * @param storeName Name of the store - * @param partitionList List of the partitions to retrieve - * @param filter Custom filter implementation to filter out entries which - * should not be fetched. - * @param fetchMasterEntries Fetch a key only if master replica - * @return An iterator which allows keys to be streamed as they're being - * iterated over. - */ - public Iterator fetchKeys(int nodeId, - String storeName, - List partitionList, - VoldemortFilter filter, - boolean fetchMasterEntries) { - return fetchKeys(nodeId, storeName, partitionList, filter, fetchMasterEntries, 0); - } + /** + * Wait till the passed value matches with the metadata value returned + * by the remote node for the passed key. + *

    + * + * Logs the status at each status check if debug is enabled. + * + * @param nodeId Id of the node to poll + * @param key metadata key to keep checking for current value + * @param value metadata value should match for exit criteria. + * @param maxWait Maximum time we'll keep checking a request until we + * give up + * @param timeUnit Unit in which maxWait is expressed. + */ + public void waitForCompletion(int nodeId, + String key, + String value, + long maxWait, + TimeUnit timeUnit) { + long delay = INITIAL_DELAY; + long waitUntil = System.currentTimeMillis() + timeUnit.toMillis(maxWait); + + while(System.currentTimeMillis() < waitUntil) { + String currentValue = metadataMgmtOps.getRemoteMetadata(nodeId, key).getValue(); + if(value.equals(currentValue)) + return; + + logger.debug("waiting for value " + value + " for metadata key " + key + + " from remote node " + nodeId + " currentValue " + currentValue); - /** - * Fetch all keys belonging to the map of replica type to partition list. - * Identical to {@link AdminClient#fetchEntries} but - * only fetches the keys - * - * @param nodeId The node id from where to fetch the keys - * @param storeName The store name whose keys we want to retrieve - * @param replicaToPartitionList Map of replica type to corresponding - * partition list - * @param filter Custom filter - * @param initialCluster Cluster to use for selecting a key. If null, use - * the default metadata from the metadata store - * @param skipRecords Number of records to skip [ Used for sampling ] - * @return Returns an iterator of the keys - */ - public Iterator fetchKeys(int nodeId, - String storeName, - HashMap> replicaToPartitionList, - VoldemortFilter filter, - boolean fetchMasterEntries, - Cluster initialCluster, - long skipRecords) { - Node node = this.getAdminClientCluster().getNodeById(nodeId); - final SocketDestination destination = new SocketDestination(node.getHost(), - node.getAdminPort(), - RequestFormatType.ADMIN_PROTOCOL_BUFFERS); - final SocketAndStreams sands = pool.checkout(destination); - DataOutputStream outputStream = sands.getOutputStream(); - final DataInputStream inputStream = sands.getInputStream(); - - try { - initiateFetchRequest(outputStream, - storeName, - replicaToPartitionList, - filter, - false, - fetchMasterEntries, - initialCluster, - skipRecords); - } catch(IOException e) { - close(sands.getSocket()); - pool.checkin(destination, sands); - throw new VoldemortException(e); - } - - return new AbstractIterator() { + if(delay < adminClientConfig.getMaxBackoffDelayMs()) + delay <<= 1; - @Override - public ByteArray computeNext() { try { - int size = inputStream.readInt(); - if(size == -1) { - pool.checkin(destination, sands); - return endOfData(); - } - - VAdminProto.FetchPartitionEntriesResponse response = responseFromStream(inputStream, - size); - - if(response.hasError()) { - pool.checkin(destination, sands); - throwException(response.getError()); - } - - return ProtoUtils.decodeBytes(response.getKey()); - } catch(IOException e) { - close(sands.getSocket()); - pool.checkin(destination, sands); - throw new VoldemortException(e); + Thread.sleep(delay); + } catch(InterruptedException e) { + Thread.currentThread().interrupt(); } - } - }; - } + throw new VoldemortException("Failed to get matching value " + value + " for key " + + key + " at remote node " + nodeId + " in maximum wait" + + maxWait + " " + timeUnit.toString() + " time."); + } - /** - * RestoreData from copies on other machines for the given nodeId - *

    - * Recovery mechanism to recover and restore data actively from replicated - * copies in the cluster.
    - * - * @param nodeId Id of the node to restoreData - * @param parallelTransfers number of transfers - * @throws InterruptedException - */ - public void restoreDataFromReplications(int nodeId, int parallelTransfers) { - restoreDataFromReplications(nodeId, parallelTransfers, -1); } /** - * RestoreData from copies on other machines for the given nodeId - *
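The restoreDataFromReplications overloads being removed here drive a full active restore of one node from its replicas, skipping views, blacklisted engines and stores with replication factor 1. In the refactored layout the equivalent entry point is expected under restoreOps (declared earlier in this diff; its body is not shown in this hunk), so the following is only a hedged sketch with placeholder values:

import voldemort.client.protocol.admin.AdminClient;

public class RestoreSketch {
    static void restore(AdminClient adminClient, int failedNodeId) {
        // Pull data for every restorable read-write store on the failed node,
        // four donor transfers in parallel, no zone preference.
        adminClient.restoreOps.restoreDataFromReplications(failedNodeId, 4);
    }
}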

    - * Recovery mechanism to recover and restore data actively from replicated - * copies in the cluster.
    + * Encapsulates all operations that deal with cluster.xml and stores.xml * - * @param nodeId Id of the node to restoreData - * @param parallelTransfers number of transfers - * @param zoneId zone from which the nodes are chosen from, -1 means no zone - * preference - * @throws InterruptedException */ - public void restoreDataFromReplications(int nodeId, int parallelTransfers, int zoneId) { - ExecutorService executors = Executors.newFixedThreadPool(parallelTransfers, - new ThreadFactory() { - - public Thread newThread(Runnable r) { - Thread thread = new Thread(r); - thread.setName("restore-data-thread"); - return thread; - } - }); - try { - List storeDefList = getRemoteStoreDefList(nodeId).getValue(); - Cluster cluster = getRemoteCluster(nodeId).getValue(); - - List writableStores = Lists.newArrayList(); - for(StoreDefinition def: storeDefList) { - if(def.isView()) { - logger.info("Ignoring store " + def.getName() + " since it is a view"); - } else if(restoreStoreEngineBlackList.contains(def.getType())) { - logger.info("Ignoring store " + def.getName() - + " since we don't support restoring for " + def.getType() - + " storage engine"); - } else if(def.getReplicationFactor() == 1) { - logger.info("Ignoring store " + def.getName() - + " since replication factor is set to 1"); - } else { - writableStores.add(def); + public class MetadataManagementOperations { + + /** + * Update the metadata version for the given key (cluster or store). The + * new value set is the current timestamp. + * + * @param versionKey The metadata key for which Version should be + * incremented + */ + public void updateMetadataversion(String versionKey) { + helperOps.initSystemStoreClient(); + Properties props = MetadataVersionStoreUtils.getProperties(AdminClient.this.sysStoreVersion); + long newValue = 0; + if(props != null && props.getProperty(versionKey) != null) { + logger.debug("Version obtained = " + props.getProperty(versionKey)); + newValue = System.currentTimeMillis(); + } else { + logger.debug("Current version is null. 
Assuming version 0."); + if(props == null) { + props = new Properties(); } } - for(StoreDefinition def: writableStores) { - restoreStoreFromReplication(nodeId, cluster, def, executors, zoneId); - } - } finally { - executors.shutdown(); - try { - executors.awaitTermination(adminClientConfig.getRestoreDataTimeoutSec(), - TimeUnit.SECONDS); - } catch(InterruptedException e) { - logger.error("Interrupted while waiting restore operation to finish."); - } - logger.info("Finished restoring data."); + props.setProperty(versionKey, Long.toString(newValue)); + MetadataVersionStoreUtils.setProperties(AdminClient.this.sysStoreVersion, props); } - } - /** - * For a particular node, finds out all the [replica, partition] tuples it - * needs to steal in order to be brought back to normal state - * - * @param restoringNode The id of the node which needs to be restored - * @param cluster The cluster definition - * @param storeDef The store definition to use - * @return Map of node id to map of replica type and corresponding partition - * list - */ - public Map>> getReplicationMapping(int restoringNode, - Cluster cluster, - StoreDefinition storeDef) { - return getReplicationMapping(restoringNode, cluster, storeDef, -1); - } + /** + * Set the metadata versions to the given set + * + * @param newProperties The new metadata versions to be set across all + * the nodes in the cluster + */ + public void setMetadataversion(Properties newProperties) { + helperOps.initSystemStoreClient(); + MetadataVersionStoreUtils.setProperties(AdminClient.this.sysStoreVersion, newProperties); + } - /** - * For a particular node, finds out all the [replica, partition] tuples it - * needs to steal in order to be brought back to normal state - * - * @param restoringNode The id of the node which needs to be restored - * @param cluster The cluster definition - * @param storeDef The store definition to use - * @param zoneId zone from which nodes are chosen, -1 means no zone - * preference - * @return Map of node id to map of replica type and corresponding partition - * list - */ - public Map>> getReplicationMapping(int restoringNode, - Cluster cluster, - StoreDefinition storeDef, - int zoneId) { - - Map>> returnMap = Maps.newHashMap(); - - RoutingStrategy strategy = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, - cluster); - List restoringNodePartition = cluster.getNodeById(restoringNode).getPartitionIds(); - - // Go over every partition. As long as one of them belongs to the - // current node list, find its replica - for(Node node: cluster.getNodes()) { - for(int partitionId: node.getPartitionIds()) { - List replicatingPartitions = strategy.getReplicatingPartitionList(partitionId); - List extraCopyReplicatingPartitions = Lists.newArrayList(replicatingPartitions); - - if(replicatingPartitions.size() <= 1) { - throw new VoldemortException("Store " - + storeDef.getName() - + " cannot be restored from replica because replication factor = 1"); - } + /** + * Update metadata at the given remoteNodeId. + *
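updateMetadataversion and setMetadataversion, now under MetadataManagementOperations, write version stamps into the voldsys$_metadata_version_persistence system store so clients can notice cluster.xml or stores.xml changes. A sketch, assuming a bootstrapped client; "cluster.xml" matches the CLUSTER_VERSION_KEY constant defined earlier in this diff:

import java.util.Properties;
import voldemort.client.protocol.admin.AdminClient;

public class MetadataVersionSketch {
    static void bump(AdminClient adminClient) {
        // Stamp the cluster.xml key with the current timestamp.
        adminClient.metadataMgmtOps.updateMetadataversion("cluster.xml");

        // Or push an explicit set of version properties in one shot.
        Properties versions = new Properties();
        versions.setProperty("cluster.xml", Long.toString(System.currentTimeMillis()));
        adminClient.metadataMgmtOps.setMetadataversion(versions);
    }
}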

    + * + * Metadata keys can be one of {@link MetadataStore#METADATA_KEYS}
    + * eg.
    + *

+ * • cluster metadata (cluster.xml as string)
+ * • stores definitions (stores.xml as string)
+ * • Server states
    + * See {@link voldemort.store.metadata.MetadataStore} for more + * information. + * + * @param remoteNodeId Id of the node + * @param key Metadata key to update + * @param value Value for the metadata key + */ + public void updateRemoteMetadata(int remoteNodeId, String key, Versioned value) { + ByteArray keyBytes = new ByteArray(ByteUtils.getBytes(key, "UTF-8")); + Versioned valueBytes = new Versioned(ByteUtils.getBytes(value.getValue(), + "UTF-8"), + value.getVersion()); - if(replicatingPartitions.removeAll(restoringNodePartition)) { - if(replicatingPartitions.size() == 0) { - throw new VoldemortException("Found a case where-in the overlap of " - + "the node partition list results in no replicas " - + "being left in replicating list"); - } + VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.UPDATE_METADATA) + .setUpdateMetadata(VAdminProto.UpdateMetadataRequest.newBuilder() + .setKey(ByteString.copyFrom(keyBytes.get())) + .setVersioned(ProtoUtils.encodeVersioned(valueBytes)) + .build()) + .build(); + VAdminProto.UpdateMetadataResponse.Builder response = rpcOps.sendAndReceive(remoteNodeId, + request, + VAdminProto.UpdateMetadataResponse.newBuilder()); + if(response.hasError()) + helperOps.throwException(response.getError()); + } - addDonorWithZonePreference(replicatingPartitions, - extraCopyReplicatingPartitions, - returnMap, - zoneId, - cluster, - storeDef); - } + /** + * Wrapper for updateRemoteMetadata function used against a single Node + * It basically loops over the entire list of Nodes that we need to + * execute the required operation against. It also increments the + * version of the corresponding metadata in the system store. + *

    + * + * Metadata keys can be one of {@link MetadataStore#METADATA_KEYS}
    + * eg.
    + *

+ * • cluster metadata (cluster.xml as string)
+ * • stores definitions (stores.xml as string)
+ * • Server states
    + * See {@link voldemort.store.metadata.MetadataStore} for more + * information. + * + * @param remoteNodeId Id of the node + * @param key Metadata key to update + * @param value Value for the metadata key + * + * */ + public void updateRemoteMetadata(List remoteNodeIds, + String key, + Versioned value) { + for(Integer currentNodeId: remoteNodeIds) { + System.out.println("Setting " + key + " for " + + getAdminClientCluster().getNodeById(currentNodeId).getHost() + + ":" + + getAdminClientCluster().getNodeById(currentNodeId).getId()); + updateRemoteMetadata(currentNodeId, key, value); + } + /* + * Assuming everything is fine, we now increment the metadata + * version for the key + */ + if(key.equals(CLUSTER_VERSION_KEY)) { + metadataMgmtOps.updateMetadataversion(key); } } - return returnMap; - } - /** - * For each partition that need to be restored, find a donor node that owns - * the partition AND has the same zone ID as requested. -1 means no zone - * preference required when finding a donor node needs to steal in order to - * - * @param remainderPartitions The replicating partitions without the one - * needed by the restore node - * @param originalPartitions The entire replicating partition list - * (including the one needed by the restore node) - * @param donorMap All donor nodes that will be fetched from - * @param zondId The zone from which donor nodes will be chosen from; -1 - * means all zones are fine - * @param cluster The cluster metadata - * @param storeDef The store to be restored - * @return - */ - private void addDonorWithZonePreference(List remainderPartitions, - List originalPartitions, - Map>> donorMap, - int zoneId, - Cluster cluster, - StoreDefinition storeDef) { - Map partitionToNodeId = RebalanceUtils.getCurrentPartitionMapping(cluster); - int nodeId = -1; - int replicaType = -1; - int partition = -1; - boolean found = false; - int index = 0; - - while(!found && index < remainderPartitions.size()) { - replicaType = originalPartitions.indexOf(remainderPartitions.get(index)); - nodeId = partitionToNodeId.get(remainderPartitions.get(index)); - if(-1 == zoneId || cluster.getNodeById(nodeId).getZoneId() == zoneId) { - found = true; - } else { - index++; - } + /** + * Get the metadata on a remote node. + *
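The list-based updateRemoteMetadata wrapper above pushes one versioned value to each node in turn and then bumps the metadata version when the key is cluster.xml. A sketch pushing a new cluster.xml; the node id list and XML string are placeholders, and the clock handling simply mirrors what updateRemoteStoreDefList does further down in this diff:

import java.util.Arrays;
import java.util.List;
import voldemort.client.protocol.admin.AdminClient;
import voldemort.store.metadata.MetadataStore;
import voldemort.versioning.VectorClock;
import voldemort.versioning.Versioned;

public class PushClusterXmlSketch {
    static void push(AdminClient adminClient, String newClusterXml) {
        List<Integer> nodeIds = Arrays.asList(0, 1, 2); // placeholder node ids

        // Base the new vector clock on the version currently stored on node 0.
        VectorClock clock = (VectorClock) adminClient.metadataMgmtOps
                                                     .getRemoteMetadata(0, MetadataStore.CLUSTER_KEY)
                                                     .getVersion();
        Versioned<String> value = new Versioned<String>(newClusterXml, clock.incremented(0, 1));

        adminClient.metadataMgmtOps.updateRemoteMetadata(nodeIds, MetadataStore.CLUSTER_KEY, value);
    }
}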

    + * Metadata keys can be one of {@link MetadataStore#METADATA_KEYS}
    + * eg.
    + *

+ * • cluster metadata (cluster.xml as string)
+ * • stores definitions (stores.xml as string)
+ * • Server states
    + * See {@link voldemort.store.metadata.MetadataStore} for more + * information. + * + * @param remoteNodeId Id of the node + * @param key Metadata key to update + * @return Metadata with its associated + * {@link voldemort.versioning.Version} + */ + public Versioned getRemoteMetadata(int remoteNodeId, String key) { + ByteArray keyBytes = new ByteArray(ByteUtils.getBytes(key, "UTF-8")); + VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.GET_METADATA) + .setGetMetadata(VAdminProto.GetMetadataRequest.newBuilder() + .setKey(ByteString.copyFrom(keyBytes.get()))) + .build(); + VAdminProto.GetMetadataResponse.Builder response = rpcOps.sendAndReceive(remoteNodeId, + request, + VAdminProto.GetMetadataResponse.newBuilder()); + + if(response.hasError()) + helperOps.throwException(response.getError()); + + Versioned value = ProtoUtils.decodeVersioned(response.getVersion()); + return new Versioned(ByteUtils.getString(value.getValue(), "UTF-8"), + value.getVersion()); } - if(!found) { - throw new VoldemortException("unable to find a node to fetch partition " + partition - + " of replica type " + replicaType + " for store " - + storeDef.getName()); + /** + * Update the cluster information {@link MetadataStore#CLUSTER_KEY} on a + * remote node. + *

    + * + * @param nodeId Id of the remote node + * @param cluster The new cluster object + * @throws VoldemortException + */ + public void updateRemoteCluster(int nodeId, Cluster cluster, Version clock) + throws VoldemortException { + updateRemoteMetadata(nodeId, + MetadataStore.CLUSTER_KEY, + new Versioned(clusterMapper.writeCluster(cluster), clock)); } - partition = originalPartitions.get(0); - HashMap> replicaToPartitionList = null; - if(donorMap.containsKey(nodeId)) { - replicaToPartitionList = donorMap.get(nodeId); - } else { - replicaToPartitionList = Maps.newHashMap(); - donorMap.put(nodeId, replicaToPartitionList); + /** + * Get the cluster information from a remote node. + *

    + * + * @param nodeId Node to retrieve information from + * @return A cluster object with its + * {@link voldemort.versioning.Version} + * @throws VoldemortException + */ + public Versioned getRemoteCluster(int nodeId) throws VoldemortException { + Versioned value = metadataMgmtOps.getRemoteMetadata(nodeId, + MetadataStore.CLUSTER_KEY); + Cluster cluster = clusterMapper.readCluster(new StringReader(value.getValue()), false); + return new Versioned(cluster, value.getVersion()); } - List partitions = null; - if(replicaToPartitionList.containsKey(replicaType)) { - partitions = replicaToPartitionList.get(replicaType); - } else { - partitions = Lists.newArrayList(); - replicaToPartitionList.put(replicaType, partitions); + /** + * Update the store definitions on a remote node. + *
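getRemoteMetadata and getRemoteCluster above are the read side of the same metadata API: the raw form returns the value as a String plus its vector clock, while the cluster helper parses cluster.xml for you. A small sketch with a placeholder node id (generics reconstructed from the javadoc):

import voldemort.client.protocol.admin.AdminClient;
import voldemort.cluster.Cluster;
import voldemort.store.metadata.MetadataStore;
import voldemort.versioning.Versioned;

public class ReadMetadataSketch {
    static void inspect(AdminClient adminClient, int nodeId) {
        // Raw read: stores.xml as a string, with the version it carries on that node.
        Versioned<String> storesXml = adminClient.metadataMgmtOps
                                                 .getRemoteMetadata(nodeId, MetadataStore.STORES_KEY);
        System.out.println("stores.xml version: " + storesXml.getVersion());

        // Typed read: cluster.xml already parsed into a Cluster object.
        Versioned<Cluster> cluster = adminClient.metadataMgmtOps.getRemoteCluster(nodeId);
        System.out.println("cluster has " + cluster.getValue().getNumberOfNodes() + " nodes");
    }
}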

    + * + * @param nodeId The node id of the machine + * @param storesList The new store list + * @throws VoldemortException + */ + public void updateRemoteStoreDefList(int nodeId, List storesList) + throws VoldemortException { + // get current version. + VectorClock oldClock = (VectorClock) metadataMgmtOps.getRemoteStoreDefList(nodeId) + .getVersion(); + + updateRemoteMetadata(nodeId, + MetadataStore.STORES_KEY, + new Versioned(storeMapper.writeStoreList(storesList), + oldClock.incremented(nodeId, 1))); } - partitions.add(partition); - } - /** - * For a particular store and node, runs the replication job. This works - * only for read-write stores - * - * @param restoringNodeId The node which we want to restore - * @param cluster The cluster metadata - * @param storeDef The definition of the store which we want to restore - * @param executorService An executor to allow us to run the replication job - */ - private void restoreStoreFromReplication(final int restoringNodeId, - final Cluster cluster, - final StoreDefinition storeDef, - final ExecutorService executorService, - final int zoneId) { - logger.info("Restoring data for store " + storeDef.getName() + " on node " - + restoringNodeId); - - Map>> restoreMapping = getReplicationMapping(restoringNodeId, - cluster, - storeDef, - zoneId); - // migrate partition - for(final Entry>> replicationEntry: restoreMapping.entrySet()) { - final int donorNodeId = replicationEntry.getKey(); - executorService.submit(new Runnable() { - - public void run() { - try { - logger.info("Restoring data for store " + storeDef.getName() + " at node " - + restoringNodeId + " from node " + replicationEntry.getKey() - + " partitions:" + replicationEntry.getValue()); - - int migrateAsyncId = migratePartitions(donorNodeId, - restoringNodeId, - storeDef.getName(), - replicationEntry.getValue(), - null, - null, - false); - - waitForCompletion(restoringNodeId, - migrateAsyncId, - adminClientConfig.getRestoreDataTimeoutSec(), - TimeUnit.SECONDS); - - logger.info("Restoring data for store:" + storeDef.getName() - + " from node " + donorNodeId + " completed."); - } catch(Exception e) { - logger.error("Restore operation for store " + storeDef.getName() - + "from node " + donorNodeId + " failed.", e); - } - } - }); + /** + * Retrieve the store definitions from a remote node. + *
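updateRemoteStoreDefList above shows the intended read-modify-write cycle for stores.xml: fetch the current list with its clock, then write back with the clock incremented internally. In caller code, with a placeholder node id and the actual edit omitted, that reduces to:

import java.util.List;
import voldemort.client.protocol.admin.AdminClient;
import voldemort.store.StoreDefinition;

public class EditStoresSketch {
    static void rewrite(AdminClient adminClient, int nodeId) {
        // Fetch, edit in place (edit omitted here), and push back; the clock
        // bump happens inside updateRemoteStoreDefList.
        List<StoreDefinition> defs = adminClient.metadataMgmtOps.getRemoteStoreDefList(nodeId)
                                                                .getValue();
        adminClient.metadataMgmtOps.updateRemoteStoreDefList(nodeId, defs);
    }
}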

    + * + * @param nodeId The node id from which we can to remote the store + * definition + * @return The list of store definitions from the remote machine + * @throws VoldemortException + */ + public Versioned> getRemoteStoreDefList(int nodeId) + throws VoldemortException { + Versioned value = metadataMgmtOps.getRemoteMetadata(nodeId, + MetadataStore.STORES_KEY); + List storeList = storeMapper.readStoreList(new StringReader(value.getValue()), + false); + return new Versioned>(storeList, value.getVersion()); } } /** - * Rebalance a stealer-donor node pair for a set of stores. This is run on - * the donor node. + * Encapsulates all operations related to store management (addition, + * deletion) * - * @param stealInfos List of partition steal information - * @return The request id of the async operation */ - public int rebalanceNode(List stealInfos) { - List rebalancePartitionInfoMap = ProtoUtils.encodeRebalancePartitionInfoMap(stealInfos); - VAdminProto.InitiateRebalanceNodeOnDonorRequest rebalanceNodeRequest = VAdminProto.InitiateRebalanceNodeOnDonorRequest.newBuilder() - .addAllRebalancePartitionInfo(rebalancePartitionInfoMap) - .build(); - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.INITIATE_REBALANCE_NODE_ON_DONOR) - .setInitiateRebalanceNodeOnDonor(rebalanceNodeRequest) - .build(); - VAdminProto.AsyncOperationStatusResponse.Builder response = sendAndReceive(stealInfos.get(0) - .getDonorId(), - adminRequest, - VAdminProto.AsyncOperationStatusResponse.newBuilder()); + public class StoreManagementOperations { - if(response.hasError()) - throwException(response.getError()); + /** + * Add a new store definition to all active nodes in the cluster. + *

    + * + * @param def the definition of the store to add + */ + public void addStore(StoreDefinition def) { + for(Node node: currentCluster.getNodes()) { + addStore(def, node.getId()); + } + } - return response.getRequestId(); - } + /** + * Add a new store definition to a particular node + *

    + * + * @param def the definition of the store to add + * @param nodeId Node on which to add the store + */ + public void addStore(StoreDefinition def, int nodeId) { + String value = storeMapper.writeStore(def); - /** - * Rebalance a stealer-donor node pair for a set of stores. This is run on - * the stealer node. - * - * @param stealInfo Partition steal information - * @return The request id of the async operation - */ - public int rebalanceNode(RebalancePartitionsInfo stealInfo) { - VAdminProto.RebalancePartitionInfoMap rebalancePartitionInfoMap = ProtoUtils.encodeRebalancePartitionInfoMap(stealInfo); - VAdminProto.InitiateRebalanceNodeRequest rebalanceNodeRequest = VAdminProto.InitiateRebalanceNodeRequest.newBuilder() - .setRebalancePartitionInfo(rebalancePartitionInfoMap) - .build(); - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.INITIATE_REBALANCE_NODE) - .setInitiateRebalanceNode(rebalanceNodeRequest) - .build(); - VAdminProto.AsyncOperationStatusResponse.Builder response = sendAndReceive(stealInfo.getStealerId(), - adminRequest, - VAdminProto.AsyncOperationStatusResponse.newBuilder()); + VAdminProto.AddStoreRequest.Builder addStoreRequest = VAdminProto.AddStoreRequest.newBuilder() + .setStoreDefinition(value); + VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.ADD_STORE) + .setAddStore(addStoreRequest) + .build(); - if(response.hasError()) - throwException(response.getError()); + Node node = currentCluster.getNodeById(nodeId); + if(null == node) + throw new VoldemortException("Invalid node id (" + nodeId + ") specified"); + + logger.info("Adding store " + def.getName() + " on node " + node.getHost() + ":" + + node.getId()); + VAdminProto.AddStoreResponse.Builder response = rpcOps.sendAndReceive(nodeId, + request, + VAdminProto.AddStoreResponse.newBuilder()); + if(response.hasError()) + helperOps.throwException(response.getError()); + logger.info("Succesfully added " + def.getName() + " on node " + node.getHost() + ":" + + node.getId()); + } - return response.getRequestId(); - } + /** + * Delete a store from all active nodes in the cluster + *

    + * + * @param storeName name of the store to delete + */ + public void deleteStore(String storeName) { + for(Node node: currentCluster.getNodes()) { + deleteStore(storeName, node.getId()); + } + } - /** - * Converts list of partitions to map of replica type to partition list. - * - * @param nodeId Node which is donating data - * @param storeName Name of store - * @param partitions List of partitions ( primary OR replicas ) to move - * @return Map of replica type to partitions - */ - private HashMap> getReplicaToPartitionMap(int nodeId, - String storeName, - List partitions) { - List allStoreDefs = getRemoteStoreDefList(nodeId).getValue(); - allStoreDefs.addAll(SystemStoreConstants.getAllSystemStoreDefs()); - StoreDefinition def = RebalanceUtils.getStoreDefinitionWithName(allStoreDefs, storeName); - HashMap> replicaToPartitionList = Maps.newHashMap(); - for(int replicaNum = 0; replicaNum < def.getReplicationFactor(); replicaNum++) { - replicaToPartitionList.put(replicaNum, partitions); - } - - return replicaToPartitionList; - } + /** + * Delete a store from a particular node + *

    + * + * @param storeName name of the store to delete + * @param nodeId Node on which we want to delete a store + */ + public void deleteStore(String storeName, int nodeId) { + VAdminProto.DeleteStoreRequest.Builder deleteStoreRequest = VAdminProto.DeleteStoreRequest.newBuilder() + .setStoreName(storeName); + VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.DELETE_STORE) + .setDeleteStore(deleteStoreRequest) + .build(); + Node node = currentCluster.getNodeById(nodeId); + if(null == node) + throw new VoldemortException("Invalid node id (" + nodeId + ") specified"); - /** - * Migrate keys/values belonging to stealPartitionList ( can be primary or - * replica ) from donor node to stealer node. Does not delete the - * partitions from donorNode, merely copies them. - *

    - * See - * {@link AdminClient#migratePartitions(int, int, String, HashMap, VoldemortFilter, Cluster, boolean)} - * for more details. - * - * - * @param donorNodeId Node from which the partitions are to be - * streamed. - * @param stealerNodeId Node to which the partitions are to be - * streamed. - * @param storeName Name of the store to stream. - * @param stealPartitionList List of partitions to stream. - * @param filter Custom filter implementation to filter out entries which - * should not be deleted. - * @return The value of the - * {@link voldemort.server.protocol.admin.AsyncOperation} created on - * stealerNodeId which is performing the operation. - */ - public int migratePartitions(int donorNodeId, - int stealerNodeId, - String storeName, - List stealPartitionList, - VoldemortFilter filter) { - return migratePartitions(donorNodeId, - stealerNodeId, - storeName, - getReplicaToPartitionMap(donorNodeId, - storeName, - stealPartitionList), - filter, - null, - false); + logger.info("Deleting " + storeName + " on node " + node.getHost() + ":" + node.getId()); + VAdminProto.DeleteStoreResponse.Builder response = rpcOps.sendAndReceive(node.getId(), + request, + VAdminProto.DeleteStoreResponse.newBuilder()); + if(response.hasError()) + helperOps.throwException(response.getError()); + logger.info("Successfully deleted " + storeName + " on node " + node.getHost() + ":" + + node.getId()); + } } /** - * Migrate keys/values belonging to a map of replica type to partition list - * from donor node to stealer node. Does not delete the partitions from - * donorNode, merely copies them. - *
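A short usage sketch for the store management group that closes above. The storeMgmtOps field name is an assumption (the diff only shows the inner StoreManagementOperations class); the definitions are parsed with the same two-argument readStoreList call the admin client uses internally.

    import java.io.StringReader;
    import java.util.List;

    import voldemort.client.protocol.admin.AdminClient;
    import voldemort.store.StoreDefinition;
    import voldemort.xml.StoreDefinitionsMapper;

    public class StoreManagementSketch {

        // Assumes an already constructed AdminClient (see the earlier sketch).
        static void manageStores(AdminClient adminClient, String storesXmlFragment) {
            List<StoreDefinition> defs =
                    new StoreDefinitionsMapper().readStoreList(new StringReader(storesXmlFragment), false);

            // Register each definition on every active node in the cluster.
            for(StoreDefinition def: defs)
                adminClient.storeMgmtOps.addStore(def);

            // Or target a single node, e.g. drop a store only on node 2.
            adminClient.storeMgmtOps.deleteStore("obsolete-store", 2);
        }
    }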

    - * This is a background operation (see - * {@link voldemort.server.protocol.admin.AsyncOperation} that runs on the - * stealer node where updates are performed. - *

    - * - * @param donorNodeId Node from which the partitions are to be - * streamed. - * @param stealerNodeId Node to which the partitions are to be - * streamed. - * @param storeName Name of the store to stream. - * @param replicaToPartitionList Mapping from replica type to partition to - * be stolen - * @param filter Voldemort post-filter - * @param initialCluster The cluster metadata to use for making the decision - * if the key belongs to these partitions. If not specified, falls - * back to the metadata stored on the box - * @param optimize We can run an optimization at this level where-in we try - * avoid copying of data which already exists ( in the form of a - * replica ). We do need to disable this when we're trying to recover - * a node which was completely damaged ( restore from replica ). - * @return The value of the - * {@link voldemort.server.protocol.admin.AsyncOperation} created on - * stealer node which is performing the operation. - */ - public int migratePartitions(int donorNodeId, - int stealerNodeId, - String storeName, - HashMap> replicaToPartitionList, - VoldemortFilter filter, - Cluster initialCluster, - boolean optimize) { - VAdminProto.InitiateFetchAndUpdateRequest.Builder initiateFetchAndUpdateRequest = VAdminProto.InitiateFetchAndUpdateRequest.newBuilder() - .setNodeId(donorNodeId) - .addAllReplicaToPartition(ProtoUtils.encodePartitionTuple(replicaToPartitionList)) - .setStore(storeName); + * Encapsulates all operations that aid in performing maintenance on the + * actual store's data + * + */ + public class StoreMaintenanceOperations { + + /** + * Migrate keys/values belonging to stealPartitionList ( can be primary + * or replica ) from donor node to stealer node. Does not delete the + * partitions from donorNode, merely copies them. + *

    + * See + * {@link AdminClient#migratePartitions(int, int, String, HashMap, VoldemortFilter, Cluster, boolean)} + * for more details. + * + * + * @param donorNodeId Node from which the partitions are to be + * streamed. + * @param stealerNodeId Node to which the partitions are to be + * streamed. + * @param storeName Name of the store to stream. + * @param stealPartitionList List of partitions to stream. + * @param filter Custom filter implementation to filter out entries + * which should not be deleted. + * @return The value of the + * {@link voldemort.server.protocol.admin.AsyncOperation} + * created on stealerNodeId which is performing the operation. + */ + public int migratePartitions(int donorNodeId, + int stealerNodeId, + String storeName, + List stealPartitionList, + VoldemortFilter filter) { + return migratePartitions(donorNodeId, + stealerNodeId, + storeName, + helperOps.getReplicaToPartitionMap(donorNodeId, + storeName, + stealPartitionList), + filter, + null, + false); + } + + /** + * Migrate keys/values belonging to a map of replica type to partition + * list from donor node to stealer node. Does not delete the + * partitions from donorNode, merely copies them. + *

    + * This is a background operation (see + * {@link voldemort.server.protocol.admin.AsyncOperation} that runs on + * the stealer node where updates are performed. + *

    + * + * @param donorNodeId Node from which the partitions are to be + * streamed. + * @param stealerNodeId Node to which the partitions are to be + * streamed. + * @param storeName Name of the store to stream. + * @param replicaToPartitionList Mapping from replica type to partition + * to be stolen + * @param filter Voldemort post-filter + * @param initialCluster The cluster metadata to use for making the + * decision if the key belongs to these partitions. If not + * specified, falls back to the metadata stored on the box + * @param optimize We can run an optimization at this level where-in we + * try avoid copying of data which already exists ( in the form + * of a replica ). We do need to disable this when we're trying + * to recover a node which was completely damaged ( restore from + * replica ). + * @return The value of the + * {@link voldemort.server.protocol.admin.AsyncOperation} + * created on stealer node which is performing the operation. + */ + public int migratePartitions(int donorNodeId, + int stealerNodeId, + String storeName, + HashMap> replicaToPartitionList, + VoldemortFilter filter, + Cluster initialCluster, + boolean optimize) { + VAdminProto.InitiateFetchAndUpdateRequest.Builder initiateFetchAndUpdateRequest = VAdminProto.InitiateFetchAndUpdateRequest.newBuilder() + .setNodeId(donorNodeId) + .addAllReplicaToPartition(ProtoUtils.encodePartitionTuple(replicaToPartitionList)) + .setStore(storeName); + + try { + if(filter != null) { + initiateFetchAndUpdateRequest.setFilter(helperOps.encodeFilter(filter)); + } + } catch(IOException e) { + throw new VoldemortException(e); + } - try { - if(filter != null) { - initiateFetchAndUpdateRequest.setFilter(encodeFilter(filter)); + if(initialCluster != null) { + initiateFetchAndUpdateRequest.setInitialCluster(new ClusterMapper().writeCluster(initialCluster)); + } + initiateFetchAndUpdateRequest.setOptimize(optimize); + + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setInitiateFetchAndUpdate(initiateFetchAndUpdateRequest) + .setType(VAdminProto.AdminRequestType.INITIATE_FETCH_AND_UPDATE) + .build(); + VAdminProto.AsyncOperationStatusResponse.Builder response = rpcOps.sendAndReceive(stealerNodeId, + adminRequest, + VAdminProto.AsyncOperationStatusResponse.newBuilder()); + + if(response.hasError()) { + helperOps.throwException(response.getError()); } - } catch(IOException e) { - throw new VoldemortException(e); + + return response.getRequestId(); } - if(initialCluster != null) { - initiateFetchAndUpdateRequest.setInitialCluster(new ClusterMapper().writeCluster(initialCluster)); + /** + * Delete the store completely (Deletes all data) from the remote + * node. + *
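To make the maintenance flow concrete, a hedged sketch of copying a few partitions from a donor to a stealer and then blocking on the resulting async operation. The storeMntOps field name is an assumption; rpcOps.waitForCompletion with a timeout mirrors how nativeBackup uses it further down.

    import java.util.Arrays;
    import java.util.List;
    import java.util.concurrent.TimeUnit;

    import voldemort.client.protocol.admin.AdminClient;

    public class MigratePartitionsSketch {

        // Assumes an already constructed AdminClient (see the earlier sketch).
        static void copyPartitions(AdminClient adminClient) {
            int donorNodeId = 0;
            int stealerNodeId = 1;
            List<Integer> partitions = Arrays.asList(4, 5, 6);

            // Starts an async fetch-and-update on the stealer; data is copied, not deleted.
            int requestId = adminClient.storeMntOps.migratePartitions(donorNodeId,
                                                                      stealerNodeId,
                                                                      "test-store",
                                                                      partitions,
                                                                      null);

            // Poll the stealer until the background operation finishes (or 60 minutes pass).
            adminClient.rpcOps.waitForCompletion(stealerNodeId, requestId, 60, TimeUnit.MINUTES);
        }
    }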

    + * + * @param nodeId The node id on which the store is present + * @param storeName The name of the store + */ + public void truncate(int nodeId, String storeName) { + VAdminProto.TruncateEntriesRequest.Builder truncateRequest = VAdminProto.TruncateEntriesRequest.newBuilder() + .setStore(storeName); + + VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.TRUNCATE_ENTRIES) + .setTruncateEntries(truncateRequest) + .build(); + VAdminProto.TruncateEntriesResponse.Builder response = rpcOps.sendAndReceive(nodeId, + request, + VAdminProto.TruncateEntriesResponse.newBuilder()); + + if(response.hasError()) { + helperOps.throwException(response.getError()); + } } - initiateFetchAndUpdateRequest.setOptimize(optimize); - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setInitiateFetchAndUpdate(initiateFetchAndUpdateRequest) - .setType(VAdminProto.AdminRequestType.INITIATE_FETCH_AND_UPDATE) - .build(); - VAdminProto.AsyncOperationStatusResponse.Builder response = sendAndReceive(stealerNodeId, - adminRequest, - VAdminProto.AsyncOperationStatusResponse.newBuilder()); + /** + * Delete all entries belonging to a list of partitions + * + * @param nodeId Node on which the entries to be deleted + * @param storeName Name of the store holding the entries + * @param partitionList List of partitions to delete. + * @param filter Custom filter implementation to filter out entries + * which should not be deleted. + * @return Number of entries deleted + */ + public long deletePartitions(int nodeId, + String storeName, + List partitionList, + VoldemortFilter filter) { + return deletePartitions(nodeId, + storeName, + helperOps.getReplicaToPartitionMap(nodeId, + storeName, + partitionList), + null, + filter); + } + + /** + * Delete all entries belonging to all the partitions passed as a map of + * replica_type to partition list. Works only for RW stores. + * + * @param nodeId Node on which the entries to be deleted + * @param storeName Name of the store holding the entries + * @param replicaToPartitionList Map of replica type to partition list + * @param filter Custom filter implementation to filter out entries + * which should not be deleted. 
+ * @return Number of entries deleted + */ + public long deletePartitions(int nodeId, + String storeName, + HashMap> replicaToPartitionList, + Cluster initialCluster, + VoldemortFilter filter) { + VAdminProto.DeletePartitionEntriesRequest.Builder deleteRequest = VAdminProto.DeletePartitionEntriesRequest.newBuilder() + .addAllReplicaToPartition(ProtoUtils.encodePartitionTuple(replicaToPartitionList)) + .setStore(storeName); + + try { + if(filter != null) { + deleteRequest.setFilter(helperOps.encodeFilter(filter)); + } + } catch(IOException e) { + throw new VoldemortException(e); + } + + if(initialCluster != null) { + deleteRequest.setInitialCluster(new ClusterMapper().writeCluster(initialCluster)); + } - if(response.hasError()) { - throwException(response.getError()); + VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.DELETE_PARTITION_ENTRIES) + .setDeletePartitionEntries(deleteRequest) + .build(); + VAdminProto.DeletePartitionEntriesResponse.Builder response = rpcOps.sendAndReceive(nodeId, + request, + VAdminProto.DeletePartitionEntriesResponse.newBuilder()); + + if(response.hasError()) + helperOps.throwException(response.getError()); + + return response.getCount(); } - return response.getRequestId(); - } + /** + * Repair the stores on a rebalanced node 'nodeId' + *
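A small sketch contrasting the two cleanup paths above: truncate() drops every entry of a store on one node, while deletePartitions() removes only the entries owned by the listed partitions (RW stores only). The storeMntOps field name is an assumption.

    import java.util.Arrays;

    import voldemort.client.protocol.admin.AdminClient;

    public class CleanupSketch {

        // Assumes an already constructed AdminClient (see the earlier sketch).
        static void cleanUp(AdminClient adminClient) {
            // Remove every key/value of "test-store" held by node 3.
            adminClient.storeMntOps.truncate(3, "test-store");

            // Remove only the entries of partitions 10 and 11 on node 3; no custom filter.
            long deleted = adminClient.storeMntOps.deletePartitions(3,
                                                                    "test-store",
                                                                    Arrays.asList(10, 11),
                                                                    null);
            System.out.println("Deleted " + deleted + " entries");
        }
    }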

    + * + * @param nodeId The id of the node on which to do the repair + */ + public void repairJob(int nodeId) { + VAdminProto.RepairJobRequest.Builder repairJobRequest = VAdminProto.RepairJobRequest.newBuilder(); + + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setRepairJob(repairJobRequest) + .setType(VAdminProto.AdminRequestType.REPAIR_JOB) + .build(); + Node node = AdminClient.this.getAdminClientCluster().getNodeById(nodeId); + SocketDestination destination = new SocketDestination(node.getHost(), + node.getAdminPort(), + RequestFormatType.ADMIN_PROTOCOL_BUFFERS); + SocketAndStreams sands = socketPool.checkout(destination); - /** - * Delete the store completely (Deletes all data) from the remote - * node. - *

    - * - * @param nodeId The node id on which the store is present - * @param storeName The name of the store - */ - public void truncate(int nodeId, String storeName) { - VAdminProto.TruncateEntriesRequest.Builder truncateRequest = VAdminProto.TruncateEntriesRequest.newBuilder() - .setStore(storeName); + try { + DataOutputStream outputStream = sands.getOutputStream(); + ProtoUtils.writeMessage(outputStream, adminRequest); + outputStream.flush(); + } catch(IOException e) { + helperOps.close(sands.getSocket()); + throw new VoldemortException(e); + } finally { + socketPool.checkin(destination, sands); + } + return; + } + + /** + * Native backup a store + * + * @param nodeId The node id to backup + * @param storeName The name of the store to backup + * @param destinationDirPath The destination path + * @param minutes to wait for operation to complete + * @param verify should the file checksums be verified + * @param isIncremental is the backup incremental + */ + public void nativeBackup(int nodeId, + String storeName, + String destinationDirPath, + int timeOut, + boolean verify, + boolean isIncremental) { + + VAdminProto.NativeBackupRequest nativeBackupRequest = VAdminProto.NativeBackupRequest.newBuilder() + .setStoreName(storeName) + .setBackupDir(destinationDirPath) + .setIncremental(isIncremental) + .setVerifyFiles(verify) + .build(); + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setNativeBackup(nativeBackupRequest) + .setType(VAdminProto.AdminRequestType.NATIVE_BACKUP) + .build(); + VAdminProto.AsyncOperationStatusResponse.Builder response = rpcOps.sendAndReceive(nodeId, + adminRequest, + VAdminProto.AsyncOperationStatusResponse.newBuilder()); + + if(response.hasError()) { + helperOps.throwException(response.getError()); + } + + int asyncId = response.getRequestId(); + rpcOps.waitForCompletion(nodeId, asyncId, timeOut, TimeUnit.MINUTES); + } - VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.TRUNCATE_ENTRIES) - .setTruncateEntries(truncateRequest) - .build(); - VAdminProto.TruncateEntriesResponse.Builder response = sendAndReceive(nodeId, - request, - VAdminProto.TruncateEntriesResponse.newBuilder()); + /** + * Reserve memory for the stores + * + * @param nodeId The node id to reserve, -1 for entire cluster + * @param stores list of stores for which to reserve + * @param sizeInMB size of reservation + */ + public void reserveMemory(int nodeId, List stores, long sizeInMB) { - if(response.hasError()) { - throwException(response.getError()); + List reserveNodes = new ArrayList(); + if(nodeId == -1) { + // if no node is specified send it to the entire cluster + for(Node node: currentCluster.getNodes()) + reserveNodes.add(node.getId()); + } else { + reserveNodes.add(nodeId); + } + for(String storeName: stores) { + for(Integer reserveNodeId: reserveNodes) { + + VAdminProto.ReserveMemoryRequest reserveRequest = VAdminProto.ReserveMemoryRequest.newBuilder() + .setStoreName(storeName) + .setSizeInMb(sizeInMB) + .build(); + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setReserveMemory(reserveRequest) + .setType(VAdminProto.AdminRequestType.RESERVE_MEMORY) + .build(); + VAdminProto.ReserveMemoryResponse.Builder response = rpcOps.sendAndReceive(reserveNodeId, + adminRequest, + VAdminProto.ReserveMemoryResponse.newBuilder()); + if(response.hasError()) + helperOps.throwException(response.getError()); + } + 
logger.info("Finished reserving memory for store : " + storeName); + } } } /** - * Get the status of an Async Operation running at (remote) node. + * Encapsulates all the operations to forklift data from the cluster * - * If The operation is complete, then the operation will be removed from - * a list of currently running operations. - * - * @param nodeId Id on which the operation is running - * @param requestId Id of the operation itself - * @return The status of the operation */ - public AsyncOperationStatus getAsyncRequestStatus(int nodeId, int requestId) { - VAdminProto.AsyncOperationStatusRequest asyncRequest = VAdminProto.AsyncOperationStatusRequest.newBuilder() - .setRequestId(requestId) - .build(); - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.ASYNC_OPERATION_STATUS) - .setAsyncOperationStatus(asyncRequest) - .build(); - VAdminProto.AsyncOperationStatusResponse.Builder response = sendAndReceive(nodeId, - adminRequest, - VAdminProto.AsyncOperationStatusResponse.newBuilder()); + public class BulkStreamingFetchOperations { - if(response.hasError()) - throwException(response.getError()); + private void initiateFetchRequest(DataOutputStream outputStream, + String storeName, + HashMap> replicaToPartitionList, + VoldemortFilter filter, + boolean fetchValues, + boolean fetchMasterEntries, + Cluster initialCluster, + long recordsPerPartition) throws IOException { + HashMap> filteredReplicaToPartitionList = Maps.newHashMap(); + if(fetchMasterEntries) { + if(!replicaToPartitionList.containsKey(0)) { + throw new VoldemortException("Could not find any partitions for primary replica type"); + } else { + filteredReplicaToPartitionList.put(0, replicaToPartitionList.get(0)); + } + } else { + filteredReplicaToPartitionList.putAll(replicaToPartitionList); + } + VAdminProto.FetchPartitionEntriesRequest.Builder fetchRequest = VAdminProto.FetchPartitionEntriesRequest.newBuilder() + .setFetchValues(fetchValues) + .addAllReplicaToPartition(ProtoUtils.encodePartitionTuple(filteredReplicaToPartitionList)) + .setStore(storeName) + .setRecordsPerPartition(recordsPerPartition); - AsyncOperationStatus status = new AsyncOperationStatus(response.getRequestId(), - response.getDescription()); - status.setStatus(response.getStatus()); - status.setComplete(response.getComplete()); + try { + if(filter != null) { + fetchRequest.setFilter(helperOps.encodeFilter(filter)); + } + } catch(IOException e) { + throw new VoldemortException(e); + } - return status; - } + if(initialCluster != null) { + fetchRequest.setInitialCluster(new ClusterMapper().writeCluster(initialCluster)); + } - /** - * Retrieves a list of asynchronous request ids on the server. Does not - * include the completed requests - * - * @param nodeId The id of the node whose request ids we want - * @return List of async request ids - */ - public List getAsyncRequestList(int nodeId) { - return getAsyncRequestList(nodeId, false); - } + VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.FETCH_PARTITION_ENTRIES) + .setFetchPartitionEntries(fetchRequest) + .build(); + ProtoUtils.writeMessage(outputStream, request); + outputStream.flush(); - /** - * Retrieves a list of asynchronous request ids on the server. 
Depending on - * the boolean passed also retrieves the completed requests - * - * @param nodeId The id of the node whose request ids we want - * @param showComplete Boolean to indicate if we want to include the - * completed requests as well - * @return List of async request ids - */ - public List getAsyncRequestList(int nodeId, boolean showComplete) { - VAdminProto.AsyncOperationListRequest asyncOperationListRequest = VAdminProto.AsyncOperationListRequest.newBuilder() - .setShowComplete(showComplete) - .build(); - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.ASYNC_OPERATION_LIST) - .setAsyncOperationList(asyncOperationListRequest) - .build(); - VAdminProto.AsyncOperationListResponse.Builder response = sendAndReceive(nodeId, - adminRequest, - VAdminProto.AsyncOperationListResponse.newBuilder()); - if(response.hasError()) - throwException(response.getError()); - - return response.getRequestIdsList(); - } + } - /** - * To stop an asynchronous request on the particular node - * - * @param nodeId The id of the node on which the request is running - * @param requestId The id of the request to terminate - */ - public void stopAsyncRequest(int nodeId, int requestId) { - VAdminProto.AsyncOperationStopRequest asyncOperationStopRequest = VAdminProto.AsyncOperationStopRequest.newBuilder() - .setRequestId(requestId) - .build(); - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.ASYNC_OPERATION_STOP) - .setAsyncOperationStop(asyncOperationStopRequest) - .build(); - VAdminProto.AsyncOperationStopResponse.Builder response = sendAndReceive(nodeId, - adminRequest, - VAdminProto.AsyncOperationStopResponse.newBuilder()); - - if(response.hasError()) - throwException(response.getError()); - } + private VAdminProto.FetchPartitionEntriesResponse responseFromStream(DataInputStream inputStream, + int size) + throws IOException { + byte[] input = new byte[size]; + ByteUtils.read(inputStream, input); + VAdminProto.FetchPartitionEntriesResponse.Builder response = VAdminProto.FetchPartitionEntriesResponse.newBuilder(); + response.mergeFrom(input); - private VAdminProto.VoldemortFilter encodeFilter(VoldemortFilter filter) throws IOException { - Class cl = filter.getClass(); - byte[] classBytes = networkClassLoader.dumpClass(cl); - return VAdminProto.VoldemortFilter.newBuilder() - .setName(cl.getName()) - .setData(ProtoUtils.encodeBytes(new ByteArray(classBytes))) - .build(); - } + return response.build(); + } - /** - * Delete all entries belonging to a list of partitions - * - * @param nodeId Node on which the entries to be deleted - * @param storeName Name of the store holding the entries - * @param partitionList List of partitions to delete. - * @param filter Custom filter implementation to filter out entries which - * should not be deleted. - * @return Number of entries deleted - */ - public long deletePartitions(int nodeId, - String storeName, - List partitionList, - VoldemortFilter filter) { - return deletePartitions(nodeId, + /** + * Fetches entries that don't belong to the node, based on current + * metadata and yet persisted on the node + * + * @param nodeId Id of the node to fetch from + * @param storeName Name of the store + * @return An iterator which allows entries to be streamed as they're + * being iterated over. 
+ */ + public Iterator>> fetchOrphanedEntries(int nodeId, + String storeName) { + + Node node = AdminClient.this.getAdminClientCluster().getNodeById(nodeId); + final SocketDestination destination = new SocketDestination(node.getHost(), + node.getAdminPort(), + RequestFormatType.ADMIN_PROTOCOL_BUFFERS); + final SocketAndStreams sands = socketPool.checkout(destination); + DataOutputStream outputStream = sands.getOutputStream(); + final DataInputStream inputStream = sands.getInputStream(); + + try { + VAdminProto.FetchPartitionEntriesRequest.Builder fetchOrphanedRequest = VAdminProto.FetchPartitionEntriesRequest.newBuilder() + .setFetchValues(true) + .setStore(storeName) + .setFetchOrphaned(true); + + VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.FETCH_PARTITION_ENTRIES) + .setFetchPartitionEntries(fetchOrphanedRequest) + .build(); + ProtoUtils.writeMessage(outputStream, request); + outputStream.flush(); + } catch(IOException e) { + helperOps.close(sands.getSocket()); + socketPool.checkin(destination, sands); + throw new VoldemortException(e); + } + + return new AbstractIterator>>() { + + @Override + public Pair> computeNext() { + try { + int size = inputStream.readInt(); + if(size == -1) { + socketPool.checkin(destination, sands); + return endOfData(); + } + + VAdminProto.FetchPartitionEntriesResponse response = responseFromStream(inputStream, + size); + + if(response.hasError()) { + socketPool.checkin(destination, sands); + helperOps.throwException(response.getError()); + } + + VAdminProto.PartitionEntry partitionEntry = response.getPartitionEntry(); + + return Pair.create(ProtoUtils.decodeBytes(partitionEntry.getKey()), + ProtoUtils.decodeVersioned(partitionEntry.getVersioned())); + } catch(IOException e) { + helperOps.close(sands.getSocket()); + socketPool.checkin(destination, sands); + throw new VoldemortException(e); + } + } + }; + } + + /** + * Legacy interface for fetching entries. See + * {@link AdminClient#fetchEntries(int, String, HashMap, VoldemortFilter, boolean, Cluster, long)} + * for more information. + * + * @param nodeId Id of the node to fetch from + * @param storeName Name of the store + * @param partitionList List of the partitions + * @param filter Custom filter implementation to filter out entries + * which should not be fetched. + * @param fetchMasterEntries Fetch an entry only if master replica + * @return An iterator which allows entries to be streamed as they're + * being iterated over. + */ + public Iterator>> fetchEntries(int nodeId, + String storeName, + List partitionList, + VoldemortFilter filter, + boolean fetchMasterEntries, + long recordsPerPartition) { + return fetchEntries(nodeId, storeName, - getReplicaToPartitionMap(nodeId, storeName, partitionList), + helperOps.getReplicaToPartitionMap(nodeId, storeName, partitionList), + filter, + fetchMasterEntries, null, - filter); - } + recordsPerPartition); + } - /** - * Delete all entries belonging to all the partitions passed as a map of - * replica_type to partition list. Works only for RW stores. - * - * @param nodeId Node on which the entries to be deleted - * @param storeName Name of the store holding the entries - * @param replicaToPartitionList Map of replica type to partition list - * @param filter Custom filter implementation to filter out entries which - * should not be deleted. 
- * @return Number of entries deleted - */ - public long deletePartitions(int nodeId, - String storeName, - HashMap> replicaToPartitionList, - Cluster initialCluster, - VoldemortFilter filter) { - VAdminProto.DeletePartitionEntriesRequest.Builder deleteRequest = VAdminProto.DeletePartitionEntriesRequest.newBuilder() - .addAllReplicaToPartition(ProtoUtils.encodePartitionTuple(replicaToPartitionList)) - .setStore(storeName); + /** + * Legacy interface for fetching entries. See + * {@link AdminClient#fetchEntries(int, String, HashMap, VoldemortFilter, boolean, Cluster, long)} + * for more information. + * + * @param nodeId Id of the node to fetch from + * @param storeName Name of the store + * @param partitionList List of the partitions + * @param filter Custom filter implementation to filter out entries + * which should not be fetched. + * @param fetchMasterEntries Fetch an entry only if master replica + * @return An iterator which allows entries to be streamed as they're + * being iterated over. + */ + public Iterator>> fetchEntries(int nodeId, + String storeName, + List partitionList, + VoldemortFilter filter, + boolean fetchMasterEntries) { + return fetchEntries(nodeId, storeName, partitionList, filter, fetchMasterEntries, 0); + } + + // TODO: "HashMap> replicaToPartitionList" is a + // confusing/opaque argument. Can this be made a type, or even + // unrolled/simplified? The replicaType is pretty much meaningless + // anyhow. + + // TODO: The use of "Pair" in the return for a fundamental type is + // awkward. We should have a core KeyValue type that effectively wraps + // up a ByteArray and a Versioned. + /** + * Fetch key/value tuples belonging to this map of replica type to + * partition list + *

    + * + * Streaming API - The server keeps sending the messages as it's + * iterating over the data. Once iteration has finished, the server + * sends an "end of stream" marker and flushes its buffer. A response + * indicating a {@link VoldemortException} may be sent at any time + * during the process.
    + * + *

    + * Entries are being streamed as the iteration happens i.e. the + * whole result set is not buffered in memory. + * + * @param nodeId Id of the node to fetch from + * @param storeName Name of the store + * @param replicaToPartitionList Mapping of replica type to partition + * list + * @param filter Custom filter implementation to filter out entries + * which should not be fetched. + * @param fetchMasterEntries Fetch an entry only if master replica + * @param initialCluster The cluster metadata to use while making the + * decision to fetch entries. This is important during + * rebalancing where-in we want to fetch keys using an older + * metadata compared to the new one. + * @return An iterator which allows entries to be streamed as they're + * being iterated over. + */ + public Iterator>> fetchEntries(int nodeId, + String storeName, + HashMap> replicaToPartitionList, + VoldemortFilter filter, + boolean fetchMasterEntries, + Cluster initialCluster, + long recordsPerPartition) { + + Node node = AdminClient.this.getAdminClientCluster().getNodeById(nodeId); + final SocketDestination destination = new SocketDestination(node.getHost(), + node.getAdminPort(), + RequestFormatType.ADMIN_PROTOCOL_BUFFERS); + final SocketAndStreams sands = socketPool.checkout(destination); + DataOutputStream outputStream = sands.getOutputStream(); + final DataInputStream inputStream = sands.getInputStream(); - try { - if(filter != null) { - deleteRequest.setFilter(encodeFilter(filter)); + try { + initiateFetchRequest(outputStream, + storeName, + replicaToPartitionList, + filter, + true, + fetchMasterEntries, + initialCluster, + recordsPerPartition); + } catch(IOException e) { + helperOps.close(sands.getSocket()); + socketPool.checkin(destination, sands); + throw new VoldemortException(e); } - } catch(IOException e) { - throw new VoldemortException(e); - } - if(initialCluster != null) { - deleteRequest.setInitialCluster(new ClusterMapper().writeCluster(initialCluster)); - } + return new AbstractIterator>>() { - VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.DELETE_PARTITION_ENTRIES) - .setDeletePartitionEntries(deleteRequest) - .build(); - VAdminProto.DeletePartitionEntriesResponse.Builder response = sendAndReceive(nodeId, - request, - VAdminProto.DeletePartitionEntriesResponse.newBuilder()); + @Override + public Pair> computeNext() { + try { + int size = inputStream.readInt(); + if(size == -1) { + socketPool.checkin(destination, sands); + return endOfData(); + } - if(response.hasError()) - throwException(response.getError()); + VAdminProto.FetchPartitionEntriesResponse response = responseFromStream(inputStream, + size); - return response.getCount(); - } + if(response.hasError()) { + socketPool.checkin(destination, sands); + helperOps.throwException(response.getError()); + } - public void throwException(VProto.Error error) { - throw errorMapper.getError((short) error.getErrorCode(), error.getErrorMessage()); - } + VAdminProto.PartitionEntry partitionEntry = response.getPartitionEntry(); + + return Pair.create(ProtoUtils.decodeBytes(partitionEntry.getKey()), + ProtoUtils.decodeVersioned(partitionEntry.getVersioned())); + } catch(IOException e) { + helperOps.close(sands.getSocket()); + socketPool.checkin(destination, sands); + throw new VoldemortException(e); + } + } + }; - private void close(Socket socket) { - try { - socket.close(); - } catch(IOException e) { - logger.warn("Failed to close socket"); } - } - /** - * Stop the 
AdminClient cleanly freeing all resources. - */ - public void stop() { - this.pool.close(); - } + /** + * Fetch all the keys on the node that don't belong to it, based on its + * current metadata and yet stored on the node. i.e all keys orphaned on + * the node due to say not running the repair job after a rebalance + * + * @param nodeId Id of the node to fetch from + * @param storeName Name of the store + * @return An iterator which allows keys to be streamed as they're being + * iterated over. + */ + public Iterator fetchOrphanedKeys(int nodeId, String storeName) { + Node node = AdminClient.this.getAdminClientCluster().getNodeById(nodeId); + final SocketDestination destination = new SocketDestination(node.getHost(), + node.getAdminPort(), + RequestFormatType.ADMIN_PROTOCOL_BUFFERS); + final SocketAndStreams sands = socketPool.checkout(destination); + DataOutputStream outputStream = sands.getOutputStream(); + final DataInputStream inputStream = sands.getInputStream(); - /** - * Wait for async task at (remote) nodeId to finish completion, using - * exponential backoff to poll the task completion status. - *

    - * - * Logs the status at each status check if debug is enabled. - * - * @param nodeId Id of the node to poll - * @param requestId Id of the request to check - * @param maxWait Maximum time we'll keep checking a request until we give - * up - * @param timeUnit Unit in which maxWait is expressed. - * @param higherStatus A higher level async operation object. If this - * waiting is being run another async operation this helps us - * propagate the status all the way up. - * @return description The final description attached with the response - * @throws VoldemortException if task failed to finish in specified maxWait - * time. - */ - public String waitForCompletion(int nodeId, - int requestId, - long maxWait, - TimeUnit timeUnit, - AsyncOperationStatus higherStatus) { - long delay = INITIAL_DELAY; - long waitUntil = System.currentTimeMillis() + timeUnit.toMillis(maxWait); - - String description = null; - while(System.currentTimeMillis() < waitUntil) { try { - AsyncOperationStatus status = getAsyncRequestStatus(nodeId, requestId); - logger.info("Status from node " + nodeId + " (" + status.getDescription() + ") - " - + status.getStatus()); - if(higherStatus != null) { - higherStatus.setStatus("Status from node " + nodeId + " (" - + status.getDescription() + ") - " + status.getStatus()); - } - description = status.getDescription(); - if(status.hasException()) - throw status.getException(); + VAdminProto.FetchPartitionEntriesRequest.Builder fetchOrphanedRequest = VAdminProto.FetchPartitionEntriesRequest.newBuilder() + .setFetchValues(false) + .setStore(storeName) + .setFetchOrphaned(true); + + VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.FETCH_PARTITION_ENTRIES) + .setFetchPartitionEntries(fetchOrphanedRequest) + .build(); + ProtoUtils.writeMessage(outputStream, request); + outputStream.flush(); + } catch(IOException e) { + helperOps.close(sands.getSocket()); + socketPool.checkin(destination, sands); + throw new VoldemortException(e); + } - if(status.isComplete()) - return status.getStatus(); + return new AbstractIterator() { - if(delay < adminClientConfig.getMaxBackoffDelayMs()) - delay <<= 1; + @Override + public ByteArray computeNext() { + try { + int size = inputStream.readInt(); + if(size == -1) { + socketPool.checkin(destination, sands); + return endOfData(); + } + + VAdminProto.FetchPartitionEntriesResponse response = responseFromStream(inputStream, + size); + + if(response.hasError()) { + socketPool.checkin(destination, sands); + helperOps.throwException(response.getError()); + } + + return ProtoUtils.decodeBytes(response.getKey()); + } catch(IOException e) { + helperOps.close(sands.getSocket()); + socketPool.checkin(destination, sands); + throw new VoldemortException(e); + } - try { - Thread.sleep(delay); - } catch(InterruptedException e) { - Thread.currentThread().interrupt(); } - } catch(Exception e) { - throw new VoldemortException("Failed while waiting for async task (" + description - + ") at node " + nodeId + " to finish", e); + }; + } + + /** + * Legacy interface for fetching entries. See + * {@link AdminClient#fetchKeys(int, String, HashMap, VoldemortFilter, boolean, Cluster, long)} + * for more information. + * + * @param nodeId Id of the node to fetch from + * @param storeName Name of the store + * @param partitionList List of the partitions to retrieve + * @param filter Custom filter implementation to filter out entries + * which should not be fetched. 
+ * @param fetchMasterEntries Fetch a key only if master replica + * @return An iterator which allows keys to be streamed as they're being + * iterated over. + */ + public Iterator fetchKeys(int nodeId, + String storeName, + List partitionList, + VoldemortFilter filter, + boolean fetchMasterEntries, + long recordsPerPartition) { + return fetchKeys(nodeId, + storeName, + helperOps.getReplicaToPartitionMap(nodeId, storeName, partitionList), + filter, + fetchMasterEntries, + null, + recordsPerPartition); + } + + /** + * Legacy interface for fetching entries. See + * {@link AdminClient#fetchKeys(int, String, HashMap, VoldemortFilter, boolean, Cluster, long)} + * for more information. + * + * @param nodeId Id of the node to fetch from + * @param storeName Name of the store + * @param partitionList List of the partitions to retrieve + * @param filter Custom filter implementation to filter out entries + * which should not be fetched. + * @param fetchMasterEntries Fetch a key only if master replica + * @return An iterator which allows keys to be streamed as they're being + * iterated over. + */ + public Iterator fetchKeys(int nodeId, + String storeName, + List partitionList, + VoldemortFilter filter, + boolean fetchMasterEntries) { + return fetchKeys(nodeId, storeName, partitionList, filter, fetchMasterEntries, 0); + } + + /** + * Fetch all keys belonging to the map of replica type to partition + * list. Identical to {@link AdminClient#fetchEntries} but + * only fetches the keys + * + * @param nodeId The node id from where to fetch the keys + * @param storeName The store name whose keys we want to retrieve + * @param replicaToPartitionList Map of replica type to corresponding + * partition list + * @param filter Custom filter + * @param initialCluster Cluster to use for selecting a key. 
If null, + * use the default metadata from the metadata store + * @return Returns an iterator of the keys + */ + public Iterator fetchKeys(int nodeId, + String storeName, + HashMap> replicaToPartitionList, + VoldemortFilter filter, + boolean fetchMasterEntries, + Cluster initialCluster, + long recordsPerPartition) { + Node node = AdminClient.this.getAdminClientCluster().getNodeById(nodeId); + final SocketDestination destination = new SocketDestination(node.getHost(), + node.getAdminPort(), + RequestFormatType.ADMIN_PROTOCOL_BUFFERS); + final SocketAndStreams sands = socketPool.checkout(destination); + DataOutputStream outputStream = sands.getOutputStream(); + final DataInputStream inputStream = sands.getInputStream(); + + try { + initiateFetchRequest(outputStream, + storeName, + replicaToPartitionList, + filter, + false, + fetchMasterEntries, + initialCluster, + recordsPerPartition); + } catch(IOException e) { + helperOps.close(sands.getSocket()); + socketPool.checkin(destination, sands); + throw new VoldemortException(e); } + + return new AbstractIterator() { + + @Override + public ByteArray computeNext() { + try { + int size = inputStream.readInt(); + if(size == -1) { + socketPool.checkin(destination, sands); + return endOfData(); + } + + VAdminProto.FetchPartitionEntriesResponse response = responseFromStream(inputStream, + size); + + if(response.hasError()) { + socketPool.checkin(destination, sands); + helperOps.throwException(response.getError()); + } + + return ProtoUtils.decodeBytes(response.getKey()); + } catch(IOException e) { + helperOps.close(sands.getSocket()); + socketPool.checkin(destination, sands); + throw new VoldemortException(e); + } + + } + }; } - throw new VoldemortException("Failed to finish task requestId: " + requestId - + " in maxWait " + maxWait + " " + timeUnit.toString()); } - /** - * Wait for async task at (remote) nodeId to finish completion, using - * exponential backoff to poll the task completion status. - *
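A consumer-side sketch of the streaming fetch calls above. The server writes size-prefixed protobuf messages and the returned iterator materializes one entry at a time, so nothing below buffers the whole result set. The bulkFetchOps field name and the generic parameters (which this rendering of the diff has stripped) are assumptions based on the surrounding code.

    import java.util.Arrays;
    import java.util.Iterator;

    import voldemort.client.protocol.admin.AdminClient;
    import voldemort.utils.ByteArray;
    import voldemort.utils.Pair;
    import voldemort.versioning.Versioned;

    public class StreamingFetchSketch {

        // Assumes an already constructed AdminClient (see the earlier sketch).
        static void streamFromNode(AdminClient adminClient) {
            // Stream every key/value of partitions 0 and 1 held by node 2;
            // no filter, all replica types.
            Iterator<Pair<ByteArray, Versioned<byte[]>>> entries =
                    adminClient.bulkFetchOps.fetchEntries(2, "test-store", Arrays.asList(0, 1), null, false);

            while(entries.hasNext()) {
                Pair<ByteArray, Versioned<byte[]>> entry = entries.next();
                // entry.getFirst() is the key, entry.getSecond() the versioned value.
            }

            // Keys-only variant; fetchOrphanedKeys(nodeId, storeName) behaves the same way
            // for keys the node still holds but no longer owns.
            Iterator<ByteArray> keys =
                    adminClient.bulkFetchOps.fetchKeys(2, "test-store", Arrays.asList(0, 1), null, false);
            while(keys.hasNext())
                keys.next();
        }
    }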

    - * - * Logs the status at each status check if debug is enabled. - * - * @param nodeId Id of the node to poll - * @param requestId Id of the request to check - * @param maxWait Maximum time we'll keep checking a request until we give - * up - * @param timeUnit Unit in which maxWait is expressed. - * @return description The final description attached with the response - * @throws VoldemortException if task failed to finish in specified maxWait - * time. - */ - public String waitForCompletion(int nodeId, int requestId, long maxWait, TimeUnit timeUnit) { - return waitForCompletion(nodeId, requestId, maxWait, timeUnit, null); - } + private class AdminStoreClient { - /** - * Wait till the passed value matches with the metadata value returned by - * the remote node for the passed key. - *

    - * - * Logs the status at each status check if debug is enabled. - * - * @param nodeId Id of the node to poll - * @param key metadata key to keep checking for current value - * @param value metadata value should match for exit criteria. - * @param maxWait Maximum time we'll keep checking a request until we give - * up - * @param timeUnit Unit in which maxWait is expressed. - */ - public void waitForCompletion(int nodeId, - String key, - String value, - long maxWait, - TimeUnit timeUnit) { - long delay = INITIAL_DELAY; - long waitUntil = System.currentTimeMillis() + timeUnit.toMillis(maxWait); - - while(System.currentTimeMillis() < waitUntil) { - String currentValue = getRemoteMetadata(nodeId, key).getValue(); - if(value.equals(currentValue)) - return; + private class NodeStore { - logger.debug("waiting for value " + value + " for metadata key " + key - + " from remote node " + nodeId + " currentValue " + currentValue); + final public Integer nodeId; + final public String storeName; - if(delay < adminClientConfig.getMaxBackoffDelayMs()) - delay <<= 1; + NodeStore(int nodeId, String storeName) { + this.nodeId = new Integer(nodeId); + this.storeName = storeName; + } - try { - Thread.sleep(delay); - } catch(InterruptedException e) { - Thread.currentThread().interrupt(); + @Override + public boolean equals(Object obj) { + if(this == obj) + return true; + if(!(obj instanceof NodeStore)) + return false; + NodeStore other = (NodeStore) obj; + return nodeId.equals(other.nodeId) && storeName.equals(other.storeName); + } + + @Override + public int hashCode() { + return nodeId.hashCode() + storeName.hashCode(); } } - throw new VoldemortException("Failed to get matching value " + value + " for key " + key - + " at remote node " + nodeId + " in maximum wait" + maxWait - + " " + timeUnit.toString() + " time."); - } - /** - * Update metadata at the given remoteNodeId. - *

    - * - * Metadata keys can be one of {@link MetadataStore#METADATA_KEYS}
    - * eg.
    - *

  • cluster metadata (cluster.xml as string) - *
  • stores definitions (stores.xml as string) - *
  • Server states
    - * See {@link voldemort.store.metadata.MetadataStore} for more information. - * - * @param remoteNodeId Id of the node - * @param key Metadata key to update - * @param value Value for the metadata key - */ - public void updateRemoteMetadata(int remoteNodeId, String key, Versioned value) { - ByteArray keyBytes = new ByteArray(ByteUtils.getBytes(key, "UTF-8")); - Versioned valueBytes = new Versioned(ByteUtils.getBytes(value.getValue(), - "UTF-8"), - value.getVersion()); - - VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.UPDATE_METADATA) - .setUpdateMetadata(VAdminProto.UpdateMetadataRequest.newBuilder() - .setKey(ByteString.copyFrom(keyBytes.get())) - .setVersioned(ProtoUtils.encodeVersioned(valueBytes)) - .build()) - .build(); - VAdminProto.UpdateMetadataResponse.Builder response = sendAndReceive(remoteNodeId, - request, - VAdminProto.UpdateMetadataResponse.newBuilder()); - if(response.hasError()) - throwException(response.getError()); - } + final private ClientConfig clientConfig; + final private ClientRequestExecutorPool clientPool; - /** - * Wrapper for updateRemoteMetadata function used against a single Node It - * basically loops over the entire list of Nodes that we need to execute the - * required operation against. It also increments the version of the - * corresponding metadata in the system store. - *

    - * - * Metadata keys can be one of {@link MetadataStore#METADATA_KEYS}
    - * eg.
    - *

  • cluster metadata (cluster.xml as string) - *
  • stores definitions (stores.xml as string) - *
  • Server states
    - * See {@link voldemort.store.metadata.MetadataStore} for more information. - * - * @param remoteNodeId Id of the node - * @param key Metadata key to update - * @param value Value for the metadata key - * - * */ - public void updateRemoteMetadata(List remoteNodeIds, - String key, - Versioned value) { - for(Integer currentNodeId: remoteNodeIds) { - System.out.println("Setting " + key + " for " - + getAdminClientCluster().getNodeById(currentNodeId).getHost() + ":" - + getAdminClientCluster().getNodeById(currentNodeId).getId()); - updateRemoteMetadata(currentNodeId, key, value); + private final ConcurrentMap nodeStoreSocketCache; + + AdminStoreClient(ClientConfig clientConfig) { + this.clientConfig = clientConfig; + clientPool = new ClientRequestExecutorPool(clientConfig.getSelectors(), + clientConfig.getMaxConnectionsPerNode(), + clientConfig.getConnectionTimeout(TimeUnit.MILLISECONDS), + clientConfig.getSocketTimeout(TimeUnit.MILLISECONDS), + clientConfig.getSocketBufferSize(), + clientConfig.getSocketKeepAlive()); + nodeStoreSocketCache = new ConcurrentHashMap(); } - /* - * Assuming everything is fine, we now increment the metadata version - * for the key - */ - if(key.equals(CLUSTER_VERSION_KEY)) { - updateMetadataversion(key); + public SocketStore getSocketStore(int nodeId, String storeName) { + NodeStore nodeStore = new NodeStore(nodeId, storeName); + + SocketStore socketStore = nodeStoreSocketCache.get(nodeStore); + if(socketStore == null) { + Node node = getAdminClientCluster().getNodeById(nodeId); + + SocketStore newSocketStore = null; + try { + newSocketStore = clientPool.create(storeName, + node.getHost(), + node.getSocketPort(), + clientConfig.getRequestFormatType(), + RequestRoutingType.IGNORE_CHECKS); + } catch(Exception e) { + clientPool.close(); + throw new VoldemortException(e); + } + + socketStore = nodeStoreSocketCache.putIfAbsent(nodeStore, newSocketStore); + if(socketStore == null) { + socketStore = newSocketStore; + } else { + newSocketStore.close(); + } + } + + return socketStore; } - } - /** - * Get the metadata on a remote node. - *

    - * Metadata keys can be one of {@link MetadataStore#METADATA_KEYS}
    - * eg.
    - *

  • cluster metadata (cluster.xml as string) - *
  • stores definitions (stores.xml as string) - *
  • Server states
    - * See {@link voldemort.store.metadata.MetadataStore} for more information. - * - * @param remoteNodeId Id of the node - * @param key Metadata key to update - * @return Metadata with its associated {@link voldemort.versioning.Version} - */ - public Versioned getRemoteMetadata(int remoteNodeId, String key) { - ByteArray keyBytes = new ByteArray(ByteUtils.getBytes(key, "UTF-8")); - VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.GET_METADATA) - .setGetMetadata(VAdminProto.GetMetadataRequest.newBuilder() - .setKey(ByteString.copyFrom(keyBytes.get()))) - .build(); - VAdminProto.GetMetadataResponse.Builder response = sendAndReceive(remoteNodeId, - request, - VAdminProto.GetMetadataResponse.newBuilder()); - - if(response.hasError()) - throwException(response.getError()); - - Versioned value = ProtoUtils.decodeVersioned(response.getVersion()); - return new Versioned(ByteUtils.getString(value.getValue(), "UTF-8"), - value.getVersion()); + public void close() { + clientPool.close(); + } } - /** - * Update the cluster information {@link MetadataStore#CLUSTER_KEY} on a - * remote node. - *
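The AdminStoreClient above caches one SocketStore per (node, store) pair and resolves creation races with ConcurrentHashMap.putIfAbsent, closing the redundant copy when another thread wins. A generic, self-contained sketch of that create-or-reuse idiom (all names here are illustrative, not part of the patch):

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.ConcurrentMap;

    public class CreateOrReuseSketch {

        interface PooledResource {
            void close();
        }

        private final ConcurrentMap<String, PooledResource> cache =
                new ConcurrentHashMap<String, PooledResource>();

        PooledResource getOrCreate(String key) {
            PooledResource existing = cache.get(key);
            if(existing == null) {
                PooledResource fresh = create(key);
                // putIfAbsent returns the previously mapped value, or null if we won the race.
                existing = cache.putIfAbsent(key, fresh);
                if(existing == null) {
                    existing = fresh;   // our instance is now the cached one
                } else {
                    fresh.close();      // another thread won; discard our copy
                }
            }
            return existing;
        }

        private PooledResource create(String key) {
            return new PooledResource() {

                public void close() {
                    // release sockets, buffers, etc.
                }
            };
        }
    }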

    - * - * @param nodeId Id of the remote node - * @param cluster The new cluster object - * @throws VoldemortException - */ - public void updateRemoteCluster(int nodeId, Cluster cluster, Version clock) - throws VoldemortException { - updateRemoteMetadata(nodeId, - MetadataStore.CLUSTER_KEY, - new Versioned(clusterMapper.writeCluster(cluster), clock)); - } + public class StoreOperations { - /** - * Get the cluster information from a remote node. - *

    - * - * @param nodeId Node to retrieve information from - * @return A cluster object with its {@link voldemort.versioning.Version} - * @throws VoldemortException - */ - public Versioned getRemoteCluster(int nodeId) throws VoldemortException { - Versioned value = getRemoteMetadata(nodeId, MetadataStore.CLUSTER_KEY); - Cluster cluster = clusterMapper.readCluster(new StringReader(value.getValue()), false); - return new Versioned(cluster, value.getVersion()); - } + /** + * This method updates exactly one key/value for a specific store on a + * specific node. + * + * @param storeName Name of the store + * @param nodeKeyValue A specific key/value to update on a specific + * node. + * @return RepairEntryResult with success/exception details. + */ + public void putNodeKeyValue(String storeName, NodeValue nodeKeyValue) { + SocketStore socketStore = adminStoreClient.getSocketStore(nodeKeyValue.getNodeId(), + storeName); - /** - * Update the store definitions on a remote node. - *

    - * - * @param nodeId The node id of the machine - * @param storesList The new store list - * @throws VoldemortException - */ - public void updateRemoteStoreDefList(int nodeId, List storesList) - throws VoldemortException { - // get current version. - VectorClock oldClock = (VectorClock) getRemoteStoreDefList(nodeId).getVersion(); - - updateRemoteMetadata(nodeId, - MetadataStore.STORES_KEY, - new Versioned(storeMapper.writeStoreList(storesList), - oldClock.incremented(nodeId, 1))); - } + socketStore.put(nodeKeyValue.getKey(), nodeKeyValue.getVersioned(), null); + } - /** - * Retrieve the store definitions from a remote node. - *

    - * - * @param nodeId The node id from which we can to remote the store - * definition - * @return The list of store definitions from the remote machine - * @throws VoldemortException - */ - public Versioned> getRemoteStoreDefList(int nodeId) - throws VoldemortException { - Versioned value = getRemoteMetadata(nodeId, MetadataStore.STORES_KEY); - List storeList = storeMapper.readStoreList(new StringReader(value.getValue()), - false); - return new Versioned>(storeList, value.getVersion()); - } + /** + * Fetch key/value tuple for given key for a specific store on specified + * node. + * + * @param storeName Name of the store + * @param nodeId Id of the node to query from + * @param key for which to query + * @return List> of values for the specified NodeKey. + */ + public List> getNodeKey(String storeName, int nodeId, ByteArray key) { + SocketStore socketStore = adminStoreClient.getSocketStore(nodeId, storeName); + return socketStore.get(key, null); + } - /** - * Update the server state ( - * {@link voldemort.store.metadata.MetadataStore.VoldemortState}) on a - * remote node. - * - * @param nodeId The node id on which we want to update the state - * @param state The state to update to - * @param clock The vector clock - */ - public void updateRemoteServerState(int nodeId, - MetadataStore.VoldemortState state, - Version clock) { - updateRemoteMetadata(nodeId, - MetadataStore.SERVER_STATE_KEY, - new Versioned(state.toString(), clock)); - } + // As needed, add 'getall', 'delete', and so on interfaces... + } + + /** + * Encapsulates all steaming operations that actually read and write + * key-value pairs into the cluster + * + */ + public class StreamingOperations { + + /** + * Update a stream of key/value entries at the given node. The iterator + * entries are streamed from the client to the server: + *

      + *
1. Client performs a handshake with the server (sending in the + * update entries request with a store name and a + * {@link VoldemortFilter} instance).
+ *
2. While entryIterator has entries, the client will keep sending the + * updates one after another to the server, buffering the data, without + * waiting for a response from the server.
+ *
3. After iteration is complete, send an end-of-stream message, force + * a flush of the buffer, and check the response on the server to see if a + * {@link VoldemortException} has occurred.
+ *
    + * + * @param nodeId Id of the remote node (where we wish to update the + * entries) + * @param storeName Store name for the entries + * @param entryIterator Iterator of key-value pairs for the entries + * @param filter Custom filter implementation to filter out entries + * which should not be updated. + * @throws VoldemortException + */ + public void updateEntries(int nodeId, + String storeName, + Iterator>> entryIterator, + VoldemortFilter filter) { + Node node = AdminClient.this.getAdminClientCluster().getNodeById(nodeId); + SocketDestination destination = new SocketDestination(node.getHost(), + node.getAdminPort(), + RequestFormatType.ADMIN_PROTOCOL_BUFFERS); + SocketAndStreams sands = socketPool.checkout(destination); + DataOutputStream outputStream = sands.getOutputStream(); + DataInputStream inputStream = sands.getInputStream(); + boolean firstMessage = true; + long printStatsTimer = System.currentTimeMillis() + PRINT_STATS_INTERVAL; + long entryCount = 0; - /** - * Delete the rebalancing metadata related to the store on the stealer node - * - * @param donorNodeId The donor node id - * @param stealerNodeId The stealer node id - * @param storeName The name of the store - */ - public void deleteStoreRebalanceState(int donorNodeId, int stealerNodeId, String storeName) { - - VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.DELETE_STORE_REBALANCE_STATE) - .setDeleteStoreRebalanceState(VAdminProto.DeleteStoreRebalanceStateRequest.newBuilder() - .setNodeId(donorNodeId) - .setStoreName(storeName) - .build()) - .build(); - VAdminProto.DeleteStoreRebalanceStateResponse.Builder response = sendAndReceive(stealerNodeId, - request, - VAdminProto.DeleteStoreRebalanceStateResponse.newBuilder()); - if(response.hasError()) - throwException(response.getError()); + try { + if(entryIterator.hasNext()) { + while(entryIterator.hasNext()) { + Pair> entry = entryIterator.next(); + VAdminProto.PartitionEntry partitionEntry = VAdminProto.PartitionEntry.newBuilder() + .setKey(ProtoUtils.encodeBytes(entry.getFirst())) + .setVersioned(ProtoUtils.encodeVersioned(entry.getSecond())) + .build(); + VAdminProto.UpdatePartitionEntriesRequest.Builder updateRequest = VAdminProto.UpdatePartitionEntriesRequest.newBuilder() + .setStore(storeName) + .setPartitionEntry(partitionEntry); + entryCount++; + if(firstMessage) { + if(filter != null) { + updateRequest.setFilter(helperOps.encodeFilter(filter)); + } + + ProtoUtils.writeMessage(outputStream, + VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.UPDATE_PARTITION_ENTRIES) + .setUpdatePartitionEntries(updateRequest) + .build()); + outputStream.flush(); + firstMessage = false; + } else { + ProtoUtils.writeMessage(outputStream, updateRequest.build()); + if(printStatsTimer <= System.currentTimeMillis() + || 0 == entryCount % PRINT_STATS_THRESHOLD) { + logger.info("UpdatePartitionEntries: fetched " + entryCount + + " to node " + nodeId + " for store " + storeName); + printStatsTimer = System.currentTimeMillis() + PRINT_STATS_INTERVAL; + } + } + } + ProtoUtils.writeEndOfStream(outputStream); + outputStream.flush(); + VAdminProto.UpdatePartitionEntriesResponse.Builder updateResponse = ProtoUtils.readToBuilder(inputStream, + VAdminProto.UpdatePartitionEntriesResponse.newBuilder()); + if(updateResponse.hasError()) { + helperOps.throwException(updateResponse.getError()); + } + } + } catch(IOException e) { + helperOps.close(sands.getSocket()); + throw new 
VoldemortException(e); + } finally { + socketPool.checkin(destination, sands); + } + } - } + /** + * Fetch key/value tuples belonging to a node with given key values + * + *
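A minimal usage sketch of the streaming update path introduced above. It assumes the public streamingOps handle used elsewhere in this diff, that the generics elided in the hunk are Iterator<Pair<ByteArray, Versioned<byte[]>>>, and that the bootstrap URL and store name are placeholders:

    import java.util.ArrayList;
    import java.util.List;

    import voldemort.client.ClientConfig;
    import voldemort.client.protocol.admin.AdminClient;
    import voldemort.client.protocol.admin.AdminClientConfig;
    import voldemort.utils.ByteArray;
    import voldemort.utils.Pair;
    import voldemort.versioning.Versioned;

    public class UpdateEntriesSketch {

        public static void main(String[] args) {
            AdminClient admin = new AdminClient("tcp://localhost:6666",
                                                new AdminClientConfig(),
                                                new ClientConfig());
            // Build a tiny in-memory batch; in practice this iterator usually
            // wraps a fetchEntries() stream from another node.
            List<Pair<ByteArray, Versioned<byte[]>>> batch = new ArrayList<Pair<ByteArray, Versioned<byte[]>>>();
            batch.add(Pair.create(new ByteArray("hello".getBytes()),
                                  new Versioned<byte[]>("world".getBytes())));

            // Stream the batch to node 0's "test-store"; a null filter means "update everything".
            admin.streamingOps.updateEntries(0, "test-store", batch.iterator(), null);
        }
    }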

    + * Entries are being queried synchronously + * as the iteration happens i.e. the whole result set is + * not buffered in memory. + * + * @param nodeId Id of the node to fetch from + * @param storeName Name of the store + * @param keys An Iterable of keys + * @return An iterator which allows entries to be streamed as they're + * being iterated over. + */ + public Iterator queryKeys(int nodeId, + String storeName, + final Iterator keys) { - /** - * Retrieve the server - * {@link voldemort.store.metadata.MetadataStore.VoldemortState} from a - * remote node. - * - * @param nodeId The node from which we want to retrieve the state - * @return The server state - */ - public Versioned getRemoteServerState(int nodeId) { - Versioned value = getRemoteMetadata(nodeId, MetadataStore.SERVER_STATE_KEY); - return new Versioned(VoldemortState.valueOf(value.getValue()), - value.getVersion()); - } + final Store store; - /** - * Return the remote rebalancer state for remote node - * - * @param nodeId Node id - * @return The rebalancer state - */ - public Versioned getRemoteRebalancerState(int nodeId) { - Versioned value = getRemoteMetadata(nodeId, MetadataStore.REBALANCING_STEAL_INFO); - return new Versioned(RebalancerState.create(value.getValue()), - value.getVersion()); - } + try { + store = adminStoreClient.getSocketStore(nodeId, storeName); - /** - * Add a new store definition to all active nodes in the cluster. - *

    - * - * @param def the definition of the store to add - */ - public void addStore(StoreDefinition def) { - for(Node node: currentCluster.getNodes()) { - addStore(def, node.getId()); + } catch(Exception e) { + throw new VoldemortException(e); + } + + return new AbstractIterator() { + + @Override + public QueryKeyResult computeNext() { + ByteArray key; + List> value = null; + if(!keys.hasNext()) { + return endOfData(); + } else { + key = keys.next(); + } + try { + value = store.get(key, null); + return new QueryKeyResult(key, value); + } catch(Exception e) { + return new QueryKeyResult(key, e); + } + } + }; } - } - /** - * Add a new store definition to a particular node - *
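A usage sketch of the lazy key query above, assuming queryKeys is exposed on the same streamingOps handle as updateEntries, that the elided generics are Iterator<ByteArray> in and Iterator<QueryKeyResult> out, and that QueryKeyResult lives alongside AdminClient in voldemort.client.protocol.admin:

    import java.util.Arrays;
    import java.util.Iterator;

    import voldemort.client.ClientConfig;
    import voldemort.client.protocol.admin.AdminClient;
    import voldemort.client.protocol.admin.AdminClientConfig;
    import voldemort.client.protocol.admin.QueryKeyResult;
    import voldemort.utils.ByteArray;

    public class QueryKeysSketch {

        public static void main(String[] args) {
            AdminClient admin = new AdminClient("tcp://localhost:6666",
                                                new AdminClientConfig(),
                                                new ClientConfig());
            Iterator<ByteArray> keys = Arrays.asList(new ByteArray("k1".getBytes()),
                                                     new ByteArray("k2".getBytes())).iterator();

            // Results are produced lazily, one store.get() per key, as computeNext() above shows.
            Iterator<QueryKeyResult> results = admin.streamingOps.queryKeys(0, "test-store", keys);
            while(results.hasNext()) {
                QueryKeyResult result = results.next();
                // Each result carries the key plus either its versioned values
                // or the exception hit while reading it.
                System.out.println(result);
            }
        }
    }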

    - * - * @param def the definition of the store to add - * @param nodeId Node on which to add the store - */ - public void addStore(StoreDefinition def, int nodeId) { - String value = storeMapper.writeStore(def); - - VAdminProto.AddStoreRequest.Builder addStoreRequest = VAdminProto.AddStoreRequest.newBuilder() - .setStoreDefinition(value); - VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.ADD_STORE) - .setAddStore(addStoreRequest) - .build(); - - Node node = currentCluster.getNodeById(nodeId); - if(null == node) - throw new VoldemortException("Invalid node id (" + nodeId + ") specified"); - - logger.info("Adding store " + def.getName() + " on node " + node.getHost() + ":" - + node.getId()); - VAdminProto.AddStoreResponse.Builder response = sendAndReceive(nodeId, - request, - VAdminProto.AddStoreResponse.newBuilder()); - if(response.hasError()) - throwException(response.getError()); - logger.info("Succesfully added " + def.getName() + " on node " + node.getHost() + ":" - + node.getId()); - } + /** + * Update slops which may be meant for multiple stores + * + * @param nodeId The id of the node + * @param entryIterator An iterator over all the slops for this + * particular node + */ + public void updateSlopEntries(int nodeId, Iterator> entryIterator) { + Node node = AdminClient.this.getAdminClientCluster().getNodeById(nodeId); + SocketDestination destination = new SocketDestination(node.getHost(), + node.getAdminPort(), + RequestFormatType.ADMIN_PROTOCOL_BUFFERS); + SocketAndStreams sands = socketPool.checkout(destination); + DataOutputStream outputStream = sands.getOutputStream(); + DataInputStream inputStream = sands.getInputStream(); + boolean firstMessage = true; + + try { + if(entryIterator.hasNext()) { + while(entryIterator.hasNext()) { + Versioned versionedSlop = entryIterator.next(); + Slop slop = versionedSlop.getValue(); + + // Build the message + RequestType requestType = null; + if(slop.getOperation().equals(Operation.PUT)) { + requestType = RequestType.PUT; + } else if(slop.getOperation().equals(Operation.DELETE)) { + requestType = RequestType.DELETE; + } else { + logger.error("Unsupported operation. 
Skipping"); + continue; + } + VAdminProto.UpdateSlopEntriesRequest.Builder updateRequest = VAdminProto.UpdateSlopEntriesRequest.newBuilder() + .setStore(slop.getStoreName()) + .setKey(ProtoUtils.encodeBytes(slop.getKey())) + .setVersion(ProtoUtils.encodeClock(versionedSlop.getVersion())) + .setRequestType(requestType); + // Add transforms and value only if required + if(slop.getTransforms() != null) + updateRequest.setTransform(ProtoUtils.encodeTransform(slop.getTransforms())); + if(slop.getValue() != null) + updateRequest.setValue(ByteString.copyFrom(slop.getValue())); + + if(firstMessage) { + ProtoUtils.writeMessage(outputStream, + VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.UPDATE_SLOP_ENTRIES) + .setUpdateSlopEntries(updateRequest) + .build()); + outputStream.flush(); + firstMessage = false; + } else { + ProtoUtils.writeMessage(outputStream, updateRequest.build()); + } + } + ProtoUtils.writeEndOfStream(outputStream); + outputStream.flush(); + VAdminProto.UpdateSlopEntriesResponse.Builder updateResponse = ProtoUtils.readToBuilder(inputStream, + VAdminProto.UpdateSlopEntriesResponse.newBuilder()); + if(updateResponse.hasError()) { + helperOps.throwException(updateResponse.getError()); + } + } + } catch(IOException e) { + helperOps.close(sands.getSocket()); + throw new VoldemortException(e); + } finally { + socketPool.checkin(destination, sands); + } - /** - * Delete a store from all active nodes in the cluster - *

    - * - * @param storeName name of the store to delete - */ - public void deleteStore(String storeName) { - for(Node node: currentCluster.getNodes()) { - deleteStore(storeName, node.getId()); } } /** - * Delete a store from a particular node - *

    + * Encapsulates all operations concerning cluster expansion * - * @param storeName name of the store to delete - * @param nodeId Node on which we want to delete a store */ - public void deleteStore(String storeName, int nodeId) { - VAdminProto.DeleteStoreRequest.Builder deleteStoreRequest = VAdminProto.DeleteStoreRequest.newBuilder() - .setStoreName(storeName); - VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.DELETE_STORE) - .setDeleteStore(deleteStoreRequest) - .build(); - Node node = currentCluster.getNodeById(nodeId); - if(null == node) - throw new VoldemortException("Invalid node id (" + nodeId + ") specified"); - - logger.info("Deleting " + storeName + " on node " + node.getHost() + ":" + node.getId()); - VAdminProto.DeleteStoreResponse.Builder response = sendAndReceive(node.getId(), - request, - VAdminProto.DeleteStoreResponse.newBuilder()); - if(response.hasError()) - throwException(response.getError()); - logger.info("Successfully deleted " + storeName + " on node " + node.getHost() + ":" - + node.getId()); - } + public class RebalancingOperations { - /** - * Set cluster info for AdminClient to use. - * - * @param cluster Set the current cluster - */ - public void setAdminClientCluster(Cluster cluster) { - this.currentCluster = cluster; - } + /** + * Rebalance a stealer-donor node pair for a set of stores. This is run + * on the donor node. + * + * @param stealInfos List of partition steal information + * @return The request id of the async operation + */ + public int rebalanceNode(List stealInfos) { + List rebalancePartitionInfoMap = ProtoUtils.encodeRebalancePartitionInfoMap(stealInfos); + VAdminProto.InitiateRebalanceNodeOnDonorRequest rebalanceNodeRequest = VAdminProto.InitiateRebalanceNodeOnDonorRequest.newBuilder() + .addAllRebalancePartitionInfo(rebalancePartitionInfoMap) + .build(); + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.INITIATE_REBALANCE_NODE_ON_DONOR) + .setInitiateRebalanceNodeOnDonor(rebalanceNodeRequest) + .build(); + VAdminProto.AsyncOperationStatusResponse.Builder response = rpcOps.sendAndReceive(stealInfos.get(0) + .getDonorId(), + adminRequest, + VAdminProto.AsyncOperationStatusResponse.newBuilder()); + + if(response.hasError()) + helperOps.throwException(response.getError()); + + return response.getRequestId(); + } - /** - * Get the cluster info AdminClient is using. - * - * @return Returns the current cluster being used by the admin client - */ - public Cluster getAdminClientCluster() { - return currentCluster; - } + /** + * Rebalance a stealer-donor node pair for a set of stores. This is run + * on the stealer node. 
+ * + * @param stealInfo Partition steal information + * @return The request id of the async operation + */ + public int rebalanceNode(RebalancePartitionsInfo stealInfo) { + VAdminProto.RebalancePartitionInfoMap rebalancePartitionInfoMap = ProtoUtils.encodeRebalancePartitionInfoMap(stealInfo); + VAdminProto.InitiateRebalanceNodeRequest rebalanceNodeRequest = VAdminProto.InitiateRebalanceNodeRequest.newBuilder() + .setRebalancePartitionInfo(rebalancePartitionInfoMap) + .build(); + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.INITIATE_REBALANCE_NODE) + .setInitiateRebalanceNode(rebalanceNodeRequest) + .build(); + VAdminProto.AsyncOperationStatusResponse.Builder response = rpcOps.sendAndReceive(stealInfo.getStealerId(), + adminRequest, + VAdminProto.AsyncOperationStatusResponse.newBuilder()); + + if(response.hasError()) + helperOps.throwException(response.getError()); + + return response.getRequestId(); + } - /** - * Rollback RO store to most recent backup of the current store - *
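A sketch of driving one stealer-side task with the method above and waiting on the returned async request id. The rebalanceOps field name is hypothetical (the enclosing field is not shown in this hunk); rpcOps.waitForCompletion with a TimeUnit is taken from its use elsewhere in this diff, and the timeout is a placeholder:

    import java.util.concurrent.TimeUnit;

    import voldemort.client.protocol.admin.AdminClient;
    import voldemort.client.rebalance.RebalancePartitionsInfo;

    public class RebalanceNodeSketch {

        /**
         * Kicks off a single stealer-side rebalance task and blocks until the
         * async operation finishes or the timeout elapses.
         */
        public static void rebalanceAndWait(AdminClient admin, RebalancePartitionsInfo stealInfo) {
            int requestId = admin.rebalanceOps.rebalanceNode(stealInfo); // hypothetical field name
            admin.rpcOps.waitForCompletion(stealInfo.getStealerId(),
                                           requestId,
                                           60,
                                           TimeUnit.MINUTES);
        }
    }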

    - * - * @param nodeId The node id on which to rollback - * @param storeName The name of the RO Store to rollback - * @param pushVersion The version of the push to revert back to - */ - public void rollbackStore(int nodeId, String storeName, long pushVersion) { - VAdminProto.RollbackStoreRequest.Builder rollbackStoreRequest = VAdminProto.RollbackStoreRequest.newBuilder() - .setStoreName(storeName) - .setPushVersion(pushVersion); - - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setRollbackStore(rollbackStoreRequest) - .setType(VAdminProto.AdminRequestType.ROLLBACK_STORE) - .build(); - VAdminProto.RollbackStoreResponse.Builder response = sendAndReceive(nodeId, - adminRequest, - VAdminProto.RollbackStoreResponse.newBuilder()); - if(response.hasError()) { - throwException(response.getError()); - } - return; - } + /** + * Delete the rebalancing metadata related to the store on the stealer + * node + * + * @param donorNodeId The donor node id + * @param stealerNodeId The stealer node id + * @param storeName The name of the store + */ + public void deleteStoreRebalanceState(int donorNodeId, int stealerNodeId, String storeName) { + + VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.DELETE_STORE_REBALANCE_STATE) + .setDeleteStoreRebalanceState(VAdminProto.DeleteStoreRebalanceStateRequest.newBuilder() + .setNodeId(donorNodeId) + .setStoreName(storeName) + .build()) + .build(); + VAdminProto.DeleteStoreRebalanceStateResponse.Builder response = rpcOps.sendAndReceive(stealerNodeId, + request, + VAdminProto.DeleteStoreRebalanceStateResponse.newBuilder()); + if(response.hasError()) + helperOps.throwException(response.getError()); - /** - * Repair the stores on a rebalanced node 'nodeId' - *

    - * - * @param nodeId The id of the node on which to do the repair - */ - public void repairJob(int nodeId) { - VAdminProto.RepairJobRequest.Builder repairJobRequest = VAdminProto.RepairJobRequest.newBuilder(); - - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setRepairJob(repairJobRequest) - .setType(VAdminProto.AdminRequestType.REPAIR_JOB) - .build(); - Node node = this.getAdminClientCluster().getNodeById(nodeId); - SocketDestination destination = new SocketDestination(node.getHost(), - node.getAdminPort(), - RequestFormatType.ADMIN_PROTOCOL_BUFFERS); - SocketAndStreams sands = pool.checkout(destination); - - try { - DataOutputStream outputStream = sands.getOutputStream(); - ProtoUtils.writeMessage(outputStream, adminRequest); - outputStream.flush(); - } catch(IOException e) { - close(sands.getSocket()); - throw new VoldemortException(e); - } finally { - pool.checkin(destination, sands); } - return; - } - /** - * Fetch data from directory 'storeDir' on node id - *

    - * - * @param nodeId The id of the node on which to fetch the data - * @param storeName The name of the store - * @param storeDir The directory from where to read the data - * @param pushVersion The version of the push - * @param timeoutMs Time timeout in milliseconds - * @return The path of the directory where the data is stored finally - */ - public String fetchStore(int nodeId, - String storeName, - String storeDir, - long pushVersion, - long timeoutMs) { - VAdminProto.FetchStoreRequest.Builder fetchStoreRequest = VAdminProto.FetchStoreRequest.newBuilder() - .setStoreName(storeName) - .setStoreDir(storeDir); - if(pushVersion > 0) { - fetchStoreRequest.setPushVersion(pushVersion); - } - - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setFetchStore(fetchStoreRequest) - .setType(VAdminProto.AdminRequestType.FETCH_STORE) - .build(); - VAdminProto.AsyncOperationStatusResponse.Builder response = sendAndReceive(nodeId, - adminRequest, - VAdminProto.AsyncOperationStatusResponse.newBuilder()); + /** + * Retrieve the server + * {@link voldemort.store.metadata.MetadataStore.VoldemortState} from a + * remote node. + * + * @param nodeId The node from which we want to retrieve the state + * @return The server state + */ + public Versioned getRemoteServerState(int nodeId) { + Versioned value = metadataMgmtOps.getRemoteMetadata(nodeId, + MetadataStore.SERVER_STATE_KEY); + return new Versioned(VoldemortState.valueOf(value.getValue()), + value.getVersion()); + } - if(response.hasError()) { - throwException(response.getError()); + /** + * Return the remote rebalancer state for remote node + * + * @param nodeId Node id + * @return The rebalancer state + */ + public Versioned getRemoteRebalancerState(int nodeId) { + Versioned value = metadataMgmtOps.getRemoteMetadata(nodeId, + MetadataStore.REBALANCING_STEAL_INFO); + return new Versioned(RebalancerState.create(value.getValue()), + value.getVersion()); } - int asyncId = response.getRequestId(); - return waitForCompletion(nodeId, asyncId, timeoutMs, TimeUnit.MILLISECONDS); - } + /** + * Used in rebalancing to indicate change in states. Groups the + * partition plans on the basis of stealer nodes and sends them over. + * + * The various combinations and their order of execution is given below + * + *

    +         * | swapRO | changeClusterMetadata | changeRebalanceState | Order |
    +         * | f | t | t | cluster -> rebalance | 
    +         * | f | f | t | rebalance |
    +         * | t | t | f | cluster -> swap |
    +         * | t | t | t | cluster -> swap -> rebalance |
    +         * 
    + * + * + * Similarly for rollback: + * + *
    +         * | swapRO | changeClusterMetadata | changeRebalanceState | Order |
    +         * | f | t | t | remove from rebalance -> cluster  | 
    +         * | f | f | t | remove from rebalance |
    +         * | t | t | f | cluster -> swap |
    +         * | t | t | t | remove from rebalance -> cluster -> swap  |
    +         * 
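Putting the flag combinations tabulated above into a concrete call; a sketch only, with the rebalanceOps field name assumed and the clusters and plans supplied by the caller:

    import java.util.List;

    import voldemort.client.protocol.admin.AdminClient;
    import voldemort.client.rebalance.RebalancePartitionsInfo;
    import voldemort.cluster.Cluster;

    public class StateChangeSketch {

        /**
         * Typical non-RO forward transition (row "f | t | t" above): update the
         * cluster metadata, then the rebalance state on every stealer, rolling
         * back completed nodes if any individual state change fails.
         */
        public static void transition(AdminClient admin,
                                      Cluster currentCluster,
                                      Cluster targetCluster,
                                      List<RebalancePartitionsInfo> plans) {
            admin.rebalanceOps.rebalanceStateChange(currentCluster,   // hypothetical field name
                                                    targetCluster,
                                                    plans,
                                                    false,  // swapRO
                                                    true,   // changeClusterMetadata
                                                    true,   // changeRebalanceState
                                                    true,   // rollback on failure
                                                    false); // failEarly
        }
    }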
    + * + * + * @param existingCluster Current cluster + * @param transitionCluster Transition cluster + * @param rebalancePartitionPlanList The list of rebalance partition + * info plans + * @param swapRO Boolean indicating if we need to swap RO stores + * @param changeClusterMetadata Boolean indicating if we need to change + * cluster metadata + * @param changeRebalanceState Boolean indicating if we need to change + * rebalancing state + * @param rollback Do we want to do a rollback step in case of failures? + * @param failEarly Do we want to fail early while doing state change? + */ + public void rebalanceStateChange(Cluster existingCluster, + Cluster transitionCluster, + List rebalancePartitionPlanList, + boolean swapRO, + boolean changeClusterMetadata, + boolean changeRebalanceState, + boolean rollback, + boolean failEarly) { + HashMap> stealerNodeToPlan = RebalanceUtils.groupPartitionsInfoByNode(rebalancePartitionPlanList, + true); + Set completedNodeIds = Sets.newHashSet(); + + int nodeId = 0; + HashMap exceptions = Maps.newHashMap(); - /** - * When a fetch store fails, we don't need to keep the pushed data around. - * This function deletes its... - * - * @param nodeId The node id on which to delete the data - * @param storeName The name of the store - * @param storeDir The directory to delete - */ - public void failedFetchStore(int nodeId, String storeName, String storeDir) { - VAdminProto.FailedFetchStoreRequest.Builder failedFetchStoreRequest = VAdminProto.FailedFetchStoreRequest.newBuilder() - .setStoreDir(storeDir) - .setStoreName(storeName); - - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setFailedFetchStore(failedFetchStoreRequest) - .setType(VAdminProto.AdminRequestType.FAILED_FETCH_STORE) - .build(); - VAdminProto.FailedFetchStoreResponse.Builder response = sendAndReceive(nodeId, - adminRequest, - VAdminProto.FailedFetchStoreResponse.newBuilder()); - if(response.hasError()) { - throwException(response.getError()); - } - return; - } + try { + while(nodeId < transitionCluster.getNumberOfNodes()) { - /** - * Swap store data atomically on a single node - *

    - * - * @param nodeId The node id where we would want to swap the data - * @param storeName Name of the store - * @param storeDir The directory where the data is present - * @return Returns the location of the previous directory - */ - public String swapStore(int nodeId, String storeName, String storeDir) { - VAdminProto.SwapStoreRequest.Builder swapStoreRequest = VAdminProto.SwapStoreRequest.newBuilder() - .setStoreDir(storeDir) - .setStoreName(storeName); - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setSwapStore(swapStoreRequest) - .setType(VAdminProto.AdminRequestType.SWAP_STORE) - .build(); - VAdminProto.SwapStoreResponse.Builder response = sendAndReceive(nodeId, - adminRequest, - VAdminProto.SwapStoreResponse.newBuilder()); - if(response.hasError()) { - throwException(response.getError()); - } - return response.getPreviousStoreDir(); - } + try { + individualStateChange(nodeId, + transitionCluster, + stealerNodeToPlan.get(nodeId), + swapRO, + changeClusterMetadata, + changeRebalanceState, + false); + completedNodeIds.add(nodeId); + } catch(Exception e) { + exceptions.put(nodeId, e); + if(failEarly) { + throw e; + } + } + nodeId++; - /** - * Returns the read-only storage format - {@link ReadOnlyStorageFormat} for - * a list of stores - * - * @param nodeId The id of the node on which the stores are present - * @param storeNames List of all the store names - * @return Returns a map of store name to its corresponding RO storage - * format - */ - public Map getROStorageFormat(int nodeId, List storeNames) { - VAdminProto.GetROStorageFormatRequest.Builder getRORequest = VAdminProto.GetROStorageFormatRequest.newBuilder() - .addAllStoreName(storeNames); - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setGetRoStorageFormat(getRORequest) - .setType(VAdminProto.AdminRequestType.GET_RO_STORAGE_FORMAT) - .build(); - VAdminProto.GetROStorageFormatResponse.Builder response = sendAndReceive(nodeId, - adminRequest, - VAdminProto.GetROStorageFormatResponse.newBuilder()); - if(response.hasError()) { - throwException(response.getError()); - } - - Map storeToValues = ProtoUtils.encodeROMap(response.getRoStoreVersionsList()); - - if(storeToValues.size() != storeNames.size()) { - storeNames.removeAll(storeToValues.keySet()); - throw new VoldemortException("Did not retrieve values for " + storeNames); - } - return storeToValues; - } + } - /** - * Returns the max version of push currently being used by read-only store. 
- * Important to remember that this may not be the 'current' version since - * multiple pushes (with greater version numbers) may be in progress - * currently - * - * @param nodeId The id of the node on which the store is present - * @param storeNames List of all the stores - * @return Returns a map of store name to the respective store directory - */ - public Map getROMaxVersionDir(int nodeId, List storeNames) { - VAdminProto.GetROMaxVersionDirRequest.Builder getRORequest = VAdminProto.GetROMaxVersionDirRequest.newBuilder() - .addAllStoreName(storeNames); - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setGetRoMaxVersionDir(getRORequest) - .setType(VAdminProto.AdminRequestType.GET_RO_MAX_VERSION_DIR) - .build(); - VAdminProto.GetROMaxVersionDirResponse.Builder response = sendAndReceive(nodeId, - adminRequest, - VAdminProto.GetROMaxVersionDirResponse.newBuilder()); - if(response.hasError()) { - throwException(response.getError()); - } - - Map storeToValues = ProtoUtils.encodeROMap(response.getRoStoreVersionsList()); - - if(storeToValues.size() != storeNames.size()) { - storeNames.removeAll(storeToValues.keySet()); - throw new VoldemortException("Did not retrieve values for " + storeNames); - } - return storeToValues; - } + if(exceptions.size() > 0) { + throw new VoldemortRebalancingException("Got exceptions from nodes " + + exceptions.keySet()); + } - /** - * Returns the 'current' versions of all RO stores provided - * - * @param nodeId The id of the node on which the store is present - * @param storeNames List of all the RO stores - * @return Returns a map of store name to the respective max version - * directory - */ - public Map getROCurrentVersionDir(int nodeId, List storeNames) { - VAdminProto.GetROCurrentVersionDirRequest.Builder getRORequest = VAdminProto.GetROCurrentVersionDirRequest.newBuilder() - .addAllStoreName(storeNames); - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setGetRoCurrentVersionDir(getRORequest) - .setType(VAdminProto.AdminRequestType.GET_RO_CURRENT_VERSION_DIR) - .build(); - VAdminProto.GetROCurrentVersionDirResponse.Builder response = sendAndReceive(nodeId, - adminRequest, - VAdminProto.GetROCurrentVersionDirResponse.newBuilder()); - if(response.hasError()) { - throwException(response.getError()); - } - - Map storeToValues = ProtoUtils.encodeROMap(response.getRoStoreVersionsList()); - - if(storeToValues.size() != storeNames.size()) { - storeNames.removeAll(storeToValues.keySet()); - throw new VoldemortException("Did not retrieve values for " + storeNames); - } - return storeToValues; - } + /* + * If everything went smoothly, update the version of the + * cluster metadata + */ + if(changeClusterMetadata) { + try { + metadataMgmtOps.updateMetadataversion(CLUSTER_VERSION_KEY); + } catch(Exception e) { + logger.info("Exception occurred while setting cluster metadata version during Rebalance state change !!!"); + } + } + } catch(Exception e) { - /** - * Returns the 'current' version of RO store - * - * @param nodeId The id of the node on which the store is present - * @param storeNames List of all the stores - * @return Returns a map of store name to the respective max version number - */ - public Map getROCurrentVersion(int nodeId, List storeNames) { - Map returnMap = Maps.newHashMapWithExpectedSize(storeNames.size()); - Map versionDirs = getROCurrentVersionDir(nodeId, storeNames); - for(String storeName: versionDirs.keySet()) { - returnMap.put(storeName, - 
ReadOnlyUtils.getVersionId(new File(versionDirs.get(storeName)))); - } - return returnMap; - } + if(rollback) { + logger.error("Got exceptions from nodes " + exceptions.keySet() + + " while changing state. Rolling back state on " + + completedNodeIds); + + // Rollback changes on completed nodes + for(int completedNodeId: completedNodeIds) { + try { + individualStateChange(completedNodeId, + existingCluster, + stealerNodeToPlan.get(completedNodeId), + swapRO, + changeClusterMetadata, + changeRebalanceState, + true); + } catch(Exception exception) { + logger.error("Error while reverting back state change for completed node " + + completedNodeId, + exception); + } + } + } else { + logger.error("Got exceptions from nodes " + exceptions.keySet() + + " while changing state"); + } + throw new VoldemortRebalancingException("Got exceptions from nodes " + + exceptions.keySet() + + " while changing state", + Lists.newArrayList(exceptions.values())); + } - /** - * Returns the max version of push currently being used by read-only store. - * Important to remember that this may not be the 'current' version since - * multiple pushes (with greater version numbers) may be in progress - * currently - * - * @param nodeId The id of the node on which the store is present - * @param storeNames List of all the stores - * @return Returns a map of store name to the respective max version number - */ - public Map getROMaxVersion(int nodeId, List storeNames) { - Map returnMap = Maps.newHashMapWithExpectedSize(storeNames.size()); - Map versionDirs = getROMaxVersionDir(nodeId, storeNames); - for(String storeName: versionDirs.keySet()) { - returnMap.put(storeName, - ReadOnlyUtils.getVersionId(new File(versionDirs.get(storeName)))); - } - return returnMap; - } + } - /** - * This is a wrapper around - * {@link voldemort.client.protocol.admin.AdminClient#getROMaxVersion(int, List)} - * where-in we find the max versions on each machine and then return the max - * of all of them - * - * @param storeNames List of all read-only stores - * @return A map of store-name to their corresponding max version id - */ - public Map getROMaxVersion(List storeNames) { - Map storeToMaxVersion = Maps.newHashMapWithExpectedSize(storeNames.size()); - for(String storeName: storeNames) { - storeToMaxVersion.put(storeName, 0L); - } - - for(Node node: currentCluster.getNodes()) { - Map currentNodeVersions = getROMaxVersion(node.getId(), storeNames); - for(String storeName: currentNodeVersions.keySet()) { - Long maxVersion = storeToMaxVersion.get(storeName); - if(maxVersion != null && maxVersion < currentNodeVersions.get(storeName)) { - storeToMaxVersion.put(storeName, currentNodeVersions.get(storeName)); + /** + * Single node rebalance state change + * + * @param nodeId Stealer node id + * @param cluster Cluster information which we need to update + * @param rebalancePartitionPlanList The list of rebalance partition + * info plans + * @param swapRO Boolean indicating if we need to swap RO stores + * @param changeClusterMetadata Boolean indicating if we need to change + * cluster metadata + * @param changeRebalanceState Boolean indicating if we need to change + * rebalancing state + * @param rollback Are we doing a rollback or a normal state? 
+ */ + private void individualStateChange(int nodeId, + Cluster cluster, + List rebalancePartitionPlanList, + boolean swapRO, + boolean changeClusterMetadata, + boolean changeRebalanceState, + boolean rollback) { + + // If we do not want to change the metadata and are not one of the + // stealer nodes, nothing to do + if(!changeClusterMetadata && rebalancePartitionPlanList == null) { + return; + } + + logger.info("Node " + + nodeId + + "] Performing " + + (rollback ? "rollback" : "normal") + + " rebalance state change " + + (swapRO ? "" : "") + + (changeClusterMetadata ? "" : "") + + (changeRebalanceState ? "" : "")); + + VAdminProto.RebalanceStateChangeRequest.Builder getRebalanceStateChangeRequestBuilder = VAdminProto.RebalanceStateChangeRequest.newBuilder(); + + if(rebalancePartitionPlanList != null) { + List map = Lists.newArrayList(); + for(RebalancePartitionsInfo stealInfo: rebalancePartitionPlanList) { + RebalancePartitionInfoMap infoMap = ProtoUtils.encodeRebalancePartitionInfoMap(stealInfo); + map.add(infoMap); } + getRebalanceStateChangeRequestBuilder.addAllRebalancePartitionInfoList(map); + } + + VAdminProto.RebalanceStateChangeRequest getRebalanceStateChangeRequest = getRebalanceStateChangeRequestBuilder.setSwapRo(swapRO) + .setChangeClusterMetadata(changeClusterMetadata) + .setChangeRebalanceState(changeRebalanceState) + .setClusterString(clusterMapper.writeCluster(cluster)) + .setRollback(rollback) + .build(); + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setRebalanceStateChange(getRebalanceStateChangeRequest) + .setType(VAdminProto.AdminRequestType.REBALANCE_STATE_CHANGE) + .build(); + VAdminProto.RebalanceStateChangeResponse.Builder response = rpcOps.sendAndReceive(nodeId, + adminRequest, + VAdminProto.RebalanceStateChangeResponse.newBuilder()); + if(response.hasError()) { + helperOps.throwException(response.getError()); } } - return storeToMaxVersion; } /** - * Update slops which may be meant for multiple stores + * Encapsulates all operations to restore data in the cluster * - * @param nodeId The id of the node - * @param entryIterator An iterator over all the slops for this particular - * node */ - public void updateSlopEntries(int nodeId, Iterator> entryIterator) { - Node node = this.getAdminClientCluster().getNodeById(nodeId); - SocketDestination destination = new SocketDestination(node.getHost(), - node.getAdminPort(), - RequestFormatType.ADMIN_PROTOCOL_BUFFERS); - SocketAndStreams sands = pool.checkout(destination); - DataOutputStream outputStream = sands.getOutputStream(); - DataInputStream inputStream = sands.getInputStream(); - boolean firstMessage = true; - - try { - if(entryIterator.hasNext()) { - while(entryIterator.hasNext()) { - Versioned versionedSlop = entryIterator.next(); - Slop slop = versionedSlop.getValue(); - - // Build the message - RequestType requestType = null; - if(slop.getOperation().equals(Operation.PUT)) { - requestType = RequestType.PUT; - } else if(slop.getOperation().equals(Operation.DELETE)) { - requestType = RequestType.DELETE; - } else { - logger.error("Unsupported operation. 
Skipping"); - continue; - } - VAdminProto.UpdateSlopEntriesRequest.Builder updateRequest = VAdminProto.UpdateSlopEntriesRequest.newBuilder() - .setStore(slop.getStoreName()) - .setKey(ProtoUtils.encodeBytes(slop.getKey())) - .setVersion(ProtoUtils.encodeClock(versionedSlop.getVersion())) - .setRequestType(requestType); - // Add transforms and value only if required - if(slop.getTransforms() != null) - updateRequest.setTransform(ProtoUtils.encodeTransform(slop.getTransforms())); - if(slop.getValue() != null) - updateRequest.setValue(ByteString.copyFrom(slop.getValue())); - - if(firstMessage) { - ProtoUtils.writeMessage(outputStream, - VAdminProto.VoldemortAdminRequest.newBuilder() - .setType(VAdminProto.AdminRequestType.UPDATE_SLOP_ENTRIES) - .setUpdateSlopEntries(updateRequest) - .build()); - outputStream.flush(); - firstMessage = false; + public class RestoreOperations { + + /** + * RestoreData from copies on other machines for the given nodeId + *

    + * Recovery mechanism that actively restores data from replicated + * copies in the cluster.

    + * + * @param nodeId Id of the node to restoreData + * @param parallelTransfers number of transfers + * @throws InterruptedException + */ + public void restoreDataFromReplications(int nodeId, int parallelTransfers) { + restoreDataFromReplications(nodeId, parallelTransfers, -1); + } + + /** + * RestoreData from copies on other machines for the given nodeId + *
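A short sketch of restoring a node with the overload above (the zone-aware overload follows below and receives zoneId = -1 from this one). The restoreOps field name is an assumption; node id, URL and parallelism are placeholders:

    import voldemort.client.ClientConfig;
    import voldemort.client.protocol.admin.AdminClient;
    import voldemort.client.protocol.admin.AdminClientConfig;

    public class RestoreSketch {

        public static void main(String[] args) {
            AdminClient admin = new AdminClient("tcp://localhost:6666",
                                                new AdminClientConfig(),
                                                new ClientConfig());
            // Rebuild node 2 from its replicas using 4 parallel donor transfers,
            // with no zone preference.
            admin.restoreOps.restoreDataFromReplications(2, 4); // hypothetical field name
        }
    }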

    + * Recovery mechanism that actively restores data from replicated + * copies in the cluster.
    + * + * @param nodeId Id of the node to restoreData + * @param parallelTransfers number of transfers + * @param zoneId zone from which the nodes are chosen from, -1 means no + * zone preference + * @throws InterruptedException + */ + public void restoreDataFromReplications(int nodeId, int parallelTransfers, int zoneId) { + ExecutorService executors = Executors.newFixedThreadPool(parallelTransfers, + new ThreadFactory() { + + @Override + public Thread newThread(Runnable r) { + Thread thread = new Thread(r); + thread.setName("restore-data-thread"); + return thread; + } + }); + try { + List storeDefList = metadataMgmtOps.getRemoteStoreDefList(nodeId) + .getValue(); + Cluster cluster = metadataMgmtOps.getRemoteCluster(nodeId).getValue(); + + List writableStores = Lists.newArrayList(); + for(StoreDefinition def: storeDefList) { + if(def.isView()) { + logger.info("Ignoring store " + def.getName() + " since it is a view"); + } else if(restoreStoreEngineBlackList.contains(def.getType())) { + logger.info("Ignoring store " + def.getName() + + " since we don't support restoring for " + def.getType() + + " storage engine"); + } else if(def.getReplicationFactor() == 1) { + logger.info("Ignoring store " + def.getName() + + " since replication factor is set to 1"); } else { - ProtoUtils.writeMessage(outputStream, updateRequest.build()); + writableStores.add(def); } } - ProtoUtils.writeEndOfStream(outputStream); - outputStream.flush(); - VAdminProto.UpdateSlopEntriesResponse.Builder updateResponse = ProtoUtils.readToBuilder(inputStream, - VAdminProto.UpdateSlopEntriesResponse.newBuilder()); - if(updateResponse.hasError()) { - throwException(updateResponse.getError()); + for(StoreDefinition def: writableStores) { + restoreStoreFromReplication(nodeId, cluster, def, executors, zoneId); + } + } finally { + executors.shutdown(); + try { + executors.awaitTermination(adminClientConfig.getRestoreDataTimeoutSec(), + TimeUnit.SECONDS); + } catch(InterruptedException e) { + logger.error("Interrupted while waiting restore operation to finish."); } + logger.info("Finished restoring data."); + } + } + + /** + * For a particular store and node, runs the replication job. 
This works + * only for read-write stores + * + * @param restoringNodeId The node which we want to restore + * @param cluster The cluster metadata + * @param storeDef The definition of the store which we want to restore + * @param executorService An executor to allow us to run the replication + * job + */ + private void restoreStoreFromReplication(final int restoringNodeId, + final Cluster cluster, + final StoreDefinition storeDef, + final ExecutorService executorService, + final int zoneId) { + logger.info("Restoring data for store " + storeDef.getName() + " on node " + + restoringNodeId); + + Map>> restoreMapping = helperOps.getReplicationMapping(restoringNodeId, + cluster, + storeDef, + zoneId); + // migrate partition + for(final Entry>> replicationEntry: restoreMapping.entrySet()) { + final int donorNodeId = replicationEntry.getKey(); + executorService.submit(new Runnable() { + + @Override + public void run() { + try { + logger.info("Restoring data for store " + storeDef.getName() + + " at node " + restoringNodeId + " from node " + + replicationEntry.getKey() + " partitions:" + + replicationEntry.getValue()); + + int migrateAsyncId = storeMntOps.migratePartitions(donorNodeId, + restoringNodeId, + storeDef.getName(), + replicationEntry.getValue(), + null, + null, + false); + + rpcOps.waitForCompletion(restoringNodeId, + migrateAsyncId, + adminClientConfig.getRestoreDataTimeoutSec(), + TimeUnit.SECONDS); + + logger.info("Restoring data for store:" + storeDef.getName() + + " from node " + donorNodeId + " completed."); + } catch(Exception e) { + logger.error("Restore operation for store " + storeDef.getName() + + "from node " + donorNodeId + " failed.", e); + } + } + }); } - } catch(IOException e) { - close(sands.getSocket()); - throw new VoldemortException(e); - } finally { - pool.checkin(destination, sands); } + /** + * Mirror data from another voldemort server + * + * @param nodeId node in the current cluster to mirror to + * @param nodeIdToMirrorFrom node from which to mirror data + * @param urlToMirrorFrom cluster bootstrap url to mirror from + * @param stores set of stores to be mirrored + * + */ + public void mirrorData(final int nodeId, + final int nodeIdToMirrorFrom, + final String urlToMirrorFrom, + List stores) { + final AdminClient mirrorAdminClient = new AdminClient(urlToMirrorFrom, + new AdminClientConfig(), + new ClientConfig()); + final AdminClient currentAdminClient = AdminClient.this; + + // determine the partitions residing on the mirror node + Node mirrorNode = mirrorAdminClient.getAdminClientCluster() + .getNodeById(nodeIdToMirrorFrom); + Node currentNode = currentAdminClient.getAdminClientCluster().getNodeById(nodeId); + + if(mirrorNode == null) { + logger.error("Mirror node specified does not exist in the mirror cluster"); + return; + } + + if(currentNode == null) { + logger.error("node specified does not exist in the current cluster"); + return; + } + + // compare the mirror-from and mirrored-to nodes have same set of + // stores + List currentStoreList = StoreUtils.getStoreNames(currentAdminClient.metadataMgmtOps.getRemoteStoreDefList(nodeId) + .getValue(), + true); + List mirrorStoreList = StoreUtils.getStoreNames(mirrorAdminClient.metadataMgmtOps.getRemoteStoreDefList(nodeIdToMirrorFrom) + .getValue(), + true); + if(stores == null) + stores = currentStoreList; + + if(!currentStoreList.containsAll(stores) || !mirrorStoreList.containsAll(stores)) { + logger.error("Make sure the set of stores match on both sides"); + return; + } + + // check if the partitions are 
same on both the nodes + if(!currentNode.getPartitionIds().equals(mirrorNode.getPartitionIds())) { + logger.error("Make sure the same set of partitions exist on both sides"); + return; + } + + ExecutorService executors = Executors.newFixedThreadPool(stores.size(), + new ThreadFactory() { + + @Override + public Thread newThread(Runnable r) { + Thread thread = new Thread(r); + thread.setName("mirror-data-thread"); + return thread; + } + }); + + final List partitionIdList = mirrorNode.getPartitionIds(); + final CountDownLatch waitLatch = new CountDownLatch(stores.size()); + try { + for(final String storeName: stores) + executors.submit(new Runnable() { + + @Override + public void run() { + try { + logger.info("Mirroring data for store " + storeName + " from node " + + nodeIdToMirrorFrom + "(" + urlToMirrorFrom + + ") to node " + nodeId + " partitions:" + + partitionIdList); + + Iterator>> iterator = mirrorAdminClient.bulkFetchOps.fetchEntries(nodeIdToMirrorFrom, + storeName, + partitionIdList, + null, + false); + currentAdminClient.streamingOps.updateEntries(nodeId, + storeName, + iterator, + null); + + logger.info("Mirroring data for store:" + storeName + " from node " + + nodeIdToMirrorFrom + " completed."); + } catch(Exception e) { + logger.error("Mirroring operation for store " + storeName + + "from node " + nodeIdToMirrorFrom + " failed.", e); + } finally { + waitLatch.countDown(); + } + } + }); + waitLatch.await(); + } catch(Exception e) { + logger.error("Mirroring operation failed.", e); + } finally { + executors.shutdown(); + logger.info("Finished mirroring data."); + } + } } /** - * Fetch read-only store files to a specified directory. This is run on the - * stealer node side + * Encapsulates all operations specific to read-only stores alone * - * @param nodeId The node id from where to copy - * @param storeName The name of the read-only store - * @param replicaToPartitionList Map of replica type to partition list - * @param destinationDirPath The destination path - * @param notAcceptedBuckets These are Pair< partition, replica > which we - * cannot copy AT all. This is because these are current mmap-ed and - * are serving traffic. - * @param running A boolean which will control when we want to stop the - * copying of files. As long this is true, we will continue copying. 
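A sketch of the mirroring call defined just above, which streams fetchEntries output from the source cluster into updateEntries on the local one. As the checks above require, the two nodes must own the same partition ids and both clusters must define the listed stores. The restoreOps field name is an assumption; node ids, URLs and the store list are placeholders:

    import java.util.Arrays;

    import voldemort.client.ClientConfig;
    import voldemort.client.protocol.admin.AdminClient;
    import voldemort.client.protocol.admin.AdminClientConfig;

    public class MirrorDataSketch {

        public static void main(String[] args) {
            AdminClient admin = new AdminClient("tcp://localhost:6666",
                                                new AdminClientConfig(),
                                                new ClientConfig());
            // Pull "test-store" for local node 1 from node 1 of the source cluster.
            admin.restoreOps.mirrorData(1,                               // hypothetical field name
                                        1,
                                        "tcp://source-cluster:6666",
                                        Arrays.asList("test-store"));
        }
    }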
- * Once this is changed to false we'll disable the copying */ - public void fetchPartitionFiles(int nodeId, - String storeName, - HashMap> replicaToPartitionList, - String destinationDirPath, - Set notAcceptedBuckets, - AtomicBoolean running) { - if(!Utils.isReadableDir(destinationDirPath)) { - throw new VoldemortException("The destination path (" + destinationDirPath - + ") to store " + storeName + " does not exist"); - } - - Node node = this.getAdminClientCluster().getNodeById(nodeId); - final SocketDestination destination = new SocketDestination(node.getHost(), - node.getAdminPort(), - RequestFormatType.ADMIN_PROTOCOL_BUFFERS); - final SocketAndStreams sands = pool.checkout(destination); - DataOutputStream outputStream = sands.getOutputStream(); - final DataInputStream inputStream = sands.getInputStream(); - - try { - - // Add the metadata file if it doesn't exist - We do this because - // for new nodes the stores don't start with any metadata file - - File metadataFile = new File(destinationDirPath, ".metadata"); - if(!metadataFile.exists()) { - ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata(); - metadata.add(ReadOnlyStorageMetadata.FORMAT, - ReadOnlyStorageFormat.READONLY_V2.getCode()); - FileUtils.writeStringToFile(metadataFile, metadata.toJsonString()); - } - - VAdminProto.FetchPartitionFilesRequest fetchPartitionFileRequest = VAdminProto.FetchPartitionFilesRequest.newBuilder() - .addAllReplicaToPartition(ProtoUtils.encodePartitionTuple(replicaToPartitionList)) - .setStore(storeName) - .build(); - VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() - .setFetchPartitionFiles(fetchPartitionFileRequest) - .setType(VAdminProto.AdminRequestType.FETCH_PARTITION_FILES) - .build(); - ProtoUtils.writeMessage(outputStream, request); - outputStream.flush(); + public class ReadOnlySpecificOperations { - while(true && running.get()) { - int size = 0; + /** + * Rollback RO store to most recent backup of the current store + *

    + * + * @param nodeId The node id on which to rollback + * @param storeName The name of the RO Store to rollback + * @param pushVersion The version of the push to revert back to + */ + public void rollbackStore(int nodeId, String storeName, long pushVersion) { + VAdminProto.RollbackStoreRequest.Builder rollbackStoreRequest = VAdminProto.RollbackStoreRequest.newBuilder() + .setStoreName(storeName) + .setPushVersion(pushVersion); + + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setRollbackStore(rollbackStoreRequest) + .setType(VAdminProto.AdminRequestType.ROLLBACK_STORE) + .build(); + VAdminProto.RollbackStoreResponse.Builder response = rpcOps.sendAndReceive(nodeId, + adminRequest, + VAdminProto.RollbackStoreResponse.newBuilder()); + if(response.hasError()) { + helperOps.throwException(response.getError()); + } + return; + } - try { - size = inputStream.readInt(); - } catch(IOException e) { - logger.error("Received IOException while fetching files", e); - throw e; - } + /** + * Fetch data from directory 'storeDir' on node id + *

    + * + * @param nodeId The id of the node on which to fetch the data + * @param storeName The name of the store + * @param storeDir The directory from where to read the data + * @param pushVersion The version of the push + * @param timeoutMs Time timeout in milliseconds + * @return The path of the directory where the data is stored finally + */ + public String fetchStore(int nodeId, + String storeName, + String storeDir, + long pushVersion, + long timeoutMs) { + VAdminProto.FetchStoreRequest.Builder fetchStoreRequest = VAdminProto.FetchStoreRequest.newBuilder() + .setStoreName(storeName) + .setStoreDir(storeDir); + if(pushVersion > 0) { + fetchStoreRequest.setPushVersion(pushVersion); + } - if(size == -1) { - close(sands.getSocket()); - break; - } + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setFetchStore(fetchStoreRequest) + .setType(VAdminProto.AdminRequestType.FETCH_STORE) + .build(); + VAdminProto.AsyncOperationStatusResponse.Builder response = rpcOps.sendAndReceive(nodeId, + adminRequest, + VAdminProto.AsyncOperationStatusResponse.newBuilder()); - byte[] input = new byte[size]; - ByteUtils.read(inputStream, input); - VAdminProto.FileEntry fileEntry = VAdminProto.FileEntry.newBuilder() - .mergeFrom(input) - .build(); - - if(notAcceptedBuckets != null) { - Pair partitionReplicaTuple = ReadOnlyUtils.getPartitionReplicaTuple(fileEntry.getFileName()); - if(notAcceptedBuckets.contains(partitionReplicaTuple)) { - throw new VoldemortException("Cannot copy file " + fileEntry.getFileName() - + " since it is one of the mmap-ed files"); - } - } - logger.info("Receiving file " + fileEntry.getFileName()); + if(response.hasError()) { + helperOps.throwException(response.getError()); + } - FileChannel fileChannel = new FileOutputStream(new File(destinationDirPath, - fileEntry.getFileName())).getChannel(); - ReadableByteChannel channelIn = Channels.newChannel(inputStream); - fileChannel.transferFrom(channelIn, 0, fileEntry.getFileSizeBytes()); - fileChannel.force(true); - fileChannel.close(); + int asyncId = response.getRequestId(); + return rpcOps.waitForCompletion(nodeId, asyncId, timeoutMs, TimeUnit.MILLISECONDS); + } - logger.info("Completed file " + fileEntry.getFileName()); + /** + * When a fetch store fails, we don't need to keep the pushed data + * around. This function deletes its... + * + * @param nodeId The node id on which to delete the data + * @param storeName The name of the store + * @param storeDir The directory to delete + */ + public void failedFetchStore(int nodeId, String storeName, String storeDir) { + VAdminProto.FailedFetchStoreRequest.Builder failedFetchStoreRequest = VAdminProto.FailedFetchStoreRequest.newBuilder() + .setStoreDir(storeDir) + .setStoreName(storeName); + + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setFailedFetchStore(failedFetchStoreRequest) + .setType(VAdminProto.AdminRequestType.FAILED_FETCH_STORE) + .build(); + VAdminProto.FailedFetchStoreResponse.Builder response = rpcOps.sendAndReceive(nodeId, + adminRequest, + VAdminProto.FailedFetchStoreResponse.newBuilder()); + if(response.hasError()) { + helperOps.throwException(response.getError()); } + return; + } - } catch(IOException e) { - close(sands.getSocket()); - throw new VoldemortException(e); - } finally { - pool.checkin(destination, sands); + /** + * Swap store data atomically on a single node + *
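A sketch of one node's leg of a read-only push built from fetchStore above, swapStore (defined just below), and rollbackStore. The readonlyOps field name, the directory layout, and the "previous version = pushVersion - 1" arithmetic are illustrative assumptions, not the project's actual push driver:

    import voldemort.VoldemortException;
    import voldemort.client.protocol.admin.AdminClient;

    public class ReadOnlyPushSketch {

        public static void pushToNode(AdminClient admin, int nodeId, long pushVersion) {
            String store = "test-ro-store";
            String sourceDir = "hdfs://namenode/build/test-ro-store/" + pushVersion;
            // Pull the new version onto the node; blocks on the async op for up to an hour.
            String fetchedDir = admin.readonlyOps.fetchStore(nodeId, store, sourceDir, // hypothetical field name
                                                             pushVersion,
                                                             60 * 60 * 1000);
            try {
                // Atomically swap the fetched directory in as the live version.
                admin.readonlyOps.swapStore(nodeId, store, fetchedDir);
            } catch(VoldemortException e) {
                // Swap failed: revert to the prior push (illustrative version arithmetic).
                admin.readonlyOps.rollbackStore(nodeId, store, pushVersion - 1);
                throw e;
            }
        }
    }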

    + * + * @param nodeId The node id where we would want to swap the data + * @param storeName Name of the store + * @param storeDir The directory where the data is present + * @return Returns the location of the previous directory + */ + public String swapStore(int nodeId, String storeName, String storeDir) { + VAdminProto.SwapStoreRequest.Builder swapStoreRequest = VAdminProto.SwapStoreRequest.newBuilder() + .setStoreDir(storeDir) + .setStoreName(storeName); + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setSwapStore(swapStoreRequest) + .setType(VAdminProto.AdminRequestType.SWAP_STORE) + .build(); + VAdminProto.SwapStoreResponse.Builder response = rpcOps.sendAndReceive(nodeId, + adminRequest, + VAdminProto.SwapStoreResponse.newBuilder()); + if(response.hasError()) { + helperOps.throwException(response.getError()); + } + return response.getPreviousStoreDir(); } - } + /** + * Returns the read-only storage format - {@link ReadOnlyStorageFormat} + * for a list of stores + * + * @param nodeId The id of the node on which the stores are present + * @param storeNames List of all the store names + * @return Returns a map of store name to its corresponding RO storage + * format + */ + public Map getROStorageFormat(int nodeId, List storeNames) { + VAdminProto.GetROStorageFormatRequest.Builder getRORequest = VAdminProto.GetROStorageFormatRequest.newBuilder() + .addAllStoreName(storeNames); + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setGetRoStorageFormat(getRORequest) + .setType(VAdminProto.AdminRequestType.GET_RO_STORAGE_FORMAT) + .build(); + VAdminProto.GetROStorageFormatResponse.Builder response = rpcOps.sendAndReceive(nodeId, + adminRequest, + VAdminProto.GetROStorageFormatResponse.newBuilder()); + if(response.hasError()) { + helperOps.throwException(response.getError()); + } - /** - * Used in rebalancing to indicate change in states. Groups the partition - * plans on the basis of stealer nodes and sends them over. - * - * The various combinations and their order of execution is given below - * - *

    -     * | swapRO | changeClusterMetadata | changeRebalanceState | Order |
    -     * | f | t | t | cluster -> rebalance | 
    -     * | f | f | t | rebalance |
    -     * | t | t | f | cluster -> swap |
    -     * | t | t | t | cluster -> swap -> rebalance |
    -     * 
    - * - * - * Similarly for rollback: - * - *
    -     * | swapRO | changeClusterMetadata | changeRebalanceState | Order |
    -     * | f | t | t | remove from rebalance -> cluster  | 
    -     * | f | f | t | remove from rebalance |
    -     * | t | t | f | cluster -> swap |
    -     * | t | t | t | remove from rebalance -> cluster -> swap  |
    -     * 
    - * - * - * @param existingCluster Current cluster - * @param transitionCluster Transition cluster - * @param rebalancePartitionPlanList The list of rebalance partition info - * plans - * @param swapRO Boolean indicating if we need to swap RO stores - * @param changeClusterMetadata Boolean indicating if we need to change - * cluster metadata - * @param changeRebalanceState Boolean indicating if we need to change - * rebalancing state - * @param rollback Do we want to do a rollback step in case of failures? - * @param failEarly Do we want to fail early while doing state change? - */ - public void rebalanceStateChange(Cluster existingCluster, - Cluster transitionCluster, - List rebalancePartitionPlanList, - boolean swapRO, - boolean changeClusterMetadata, - boolean changeRebalanceState, - boolean rollback, - boolean failEarly) { - HashMap> stealerNodeToPlan = RebalanceUtils.groupPartitionsInfoByNode(rebalancePartitionPlanList, - true); - Set completedNodeIds = Sets.newHashSet(); - - int nodeId = 0; - HashMap exceptions = Maps.newHashMap(); - - try { - while(nodeId < transitionCluster.getNumberOfNodes()) { + Map storeToValues = ProtoUtils.encodeROMap(response.getRoStoreVersionsList()); - try { - individualStateChange(nodeId, - transitionCluster, - stealerNodeToPlan.get(nodeId), - swapRO, - changeClusterMetadata, - changeRebalanceState, - false); - completedNodeIds.add(nodeId); - } catch(Exception e) { - exceptions.put(nodeId, e); - if(failEarly) { - throw e; - } - } - nodeId++; + if(storeToValues.size() != storeNames.size()) { + storeNames.removeAll(storeToValues.keySet()); + throw new VoldemortException("Did not retrieve values for " + storeNames); + } + return storeToValues; + } + /** + * Returns the max version of push currently being used by read-only + * store. 
Important to remember that this may not be the 'current' + * version since multiple pushes (with greater version numbers) may be + * in progress currently + * + * @param nodeId The id of the node on which the store is present + * @param storeNames List of all the stores + * @return Returns a map of store name to the respective store directory + */ + public Map getROMaxVersionDir(int nodeId, List storeNames) { + VAdminProto.GetROMaxVersionDirRequest.Builder getRORequest = VAdminProto.GetROMaxVersionDirRequest.newBuilder() + .addAllStoreName(storeNames); + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setGetRoMaxVersionDir(getRORequest) + .setType(VAdminProto.AdminRequestType.GET_RO_MAX_VERSION_DIR) + .build(); + VAdminProto.GetROMaxVersionDirResponse.Builder response = rpcOps.sendAndReceive(nodeId, + adminRequest, + VAdminProto.GetROMaxVersionDirResponse.newBuilder()); + if(response.hasError()) { + helperOps.throwException(response.getError()); } - if(exceptions.size() > 0) { - throw new VoldemortRebalancingException("Got exceptions from nodes " - + exceptions.keySet()); + Map storeToValues = ProtoUtils.encodeROMap(response.getRoStoreVersionsList()); + + if(storeToValues.size() != storeNames.size()) { + storeNames.removeAll(storeToValues.keySet()); + throw new VoldemortException("Did not retrieve values for " + storeNames); } + return storeToValues; + } - /* - * If everything went smoothly, update the version of the cluster - * metadata - */ - if(changeClusterMetadata) { - try { - updateMetadataversion(CLUSTER_VERSION_KEY); - } catch(Exception e) { - logger.info("Exception occurred while setting cluster metadata version during Rebalance state change !!!"); - } + /** + * Returns the 'current' versions of all RO stores provided + * + * @param nodeId The id of the node on which the store is present + * @param storeNames List of all the RO stores + * @return Returns a map of store name to the respective max version + * directory + */ + public Map getROCurrentVersionDir(int nodeId, List storeNames) { + VAdminProto.GetROCurrentVersionDirRequest.Builder getRORequest = VAdminProto.GetROCurrentVersionDirRequest.newBuilder() + .addAllStoreName(storeNames); + VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() + .setGetRoCurrentVersionDir(getRORequest) + .setType(VAdminProto.AdminRequestType.GET_RO_CURRENT_VERSION_DIR) + .build(); + VAdminProto.GetROCurrentVersionDirResponse.Builder response = rpcOps.sendAndReceive(nodeId, + adminRequest, + VAdminProto.GetROCurrentVersionDirResponse.newBuilder()); + if(response.hasError()) { + helperOps.throwException(response.getError()); } - } catch(Exception e) { - if(rollback) { - logger.error("Got exceptions from nodes " + exceptions.keySet() - + " while changing state. 
Rolling back state on " + completedNodeIds); + Map storeToValues = ProtoUtils.encodeROMap(response.getRoStoreVersionsList()); - // Rollback changes on completed nodes - for(int completedNodeId: completedNodeIds) { - try { - individualStateChange(completedNodeId, - existingCluster, - stealerNodeToPlan.get(completedNodeId), - swapRO, - changeClusterMetadata, - changeRebalanceState, - true); - } catch(Exception exception) { - logger.error("Error while reverting back state change for completed node " - + completedNodeId, exception); - } - } - } else { - logger.error("Got exceptions from nodes " + exceptions.keySet() - + " while changing state"); + if(storeToValues.size() != storeNames.size()) { + storeNames.removeAll(storeToValues.keySet()); + throw new VoldemortException("Did not retrieve values for " + storeNames); } - throw new VoldemortRebalancingException("Got exceptions from nodes " - + exceptions.keySet() - + " while changing state", - Lists.newArrayList(exceptions.values())); + return storeToValues; } - } - - /** - * Single node rebalance state change - * - * @param nodeId Stealer node id - * @param cluster Cluster information which we need to update - * @param rebalancePartitionPlanList The list of rebalance partition info - * plans - * @param swapRO Boolean indicating if we need to swap RO stores - * @param changeClusterMetadata Boolean indicating if we need to change - * cluster metadata - * @param changeRebalanceState Boolean indicating if we need to change - * rebalancing state - * @param rollback Are we doing a rollback or a normal state? - */ - private void individualStateChange(int nodeId, - Cluster cluster, - List rebalancePartitionPlanList, - boolean swapRO, - boolean changeClusterMetadata, - boolean changeRebalanceState, - boolean rollback) { - - // If we do not want to change the metadata and are not one of the - // stealer nodes, nothing to do - if(!changeClusterMetadata && rebalancePartitionPlanList == null) { - return; + /** + * Returns the 'current' version of RO store + * + * @param nodeId The id of the node on which the store is present + * @param storeNames List of all the stores + * @return Returns a map of store name to the respective max version + * number + */ + public Map getROCurrentVersion(int nodeId, List storeNames) { + Map returnMap = Maps.newHashMapWithExpectedSize(storeNames.size()); + Map versionDirs = getROCurrentVersionDir(nodeId, storeNames); + for(String storeName: versionDirs.keySet()) { + returnMap.put(storeName, + ReadOnlyUtils.getVersionId(new File(versionDirs.get(storeName)))); + } + return returnMap; } - logger.info("Node " - + nodeId - + "] Performing " - + (rollback ? "rollback" : "normal") - + " rebalance state change " - + (swapRO ? "" : "") - + (changeClusterMetadata ? "" : "") - + (changeRebalanceState ? 
"" : "")); - - VAdminProto.RebalanceStateChangeRequest.Builder getRebalanceStateChangeRequestBuilder = VAdminProto.RebalanceStateChangeRequest.newBuilder(); - - if(rebalancePartitionPlanList != null) { - List map = Lists.newArrayList(); - for(RebalancePartitionsInfo stealInfo: rebalancePartitionPlanList) { - RebalancePartitionInfoMap infoMap = ProtoUtils.encodeRebalancePartitionInfoMap(stealInfo); - map.add(infoMap); - } - getRebalanceStateChangeRequestBuilder.addAllRebalancePartitionInfoList(map); - } - - VAdminProto.RebalanceStateChangeRequest getRebalanceStateChangeRequest = getRebalanceStateChangeRequestBuilder.setSwapRo(swapRO) - .setChangeClusterMetadata(changeClusterMetadata) - .setChangeRebalanceState(changeRebalanceState) - .setClusterString(clusterMapper.writeCluster(cluster)) - .setRollback(rollback) - .build(); - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setRebalanceStateChange(getRebalanceStateChangeRequest) - .setType(VAdminProto.AdminRequestType.REBALANCE_STATE_CHANGE) - .build(); - VAdminProto.RebalanceStateChangeResponse.Builder response = sendAndReceive(nodeId, - adminRequest, - VAdminProto.RebalanceStateChangeResponse.newBuilder()); - if(response.hasError()) { - throwException(response.getError()); + /** + * Returns the max version of push currently being used by read-only + * store. Important to remember that this may not be the 'current' + * version since multiple pushes (with greater version numbers) may be + * in progress currently + * + * @param nodeId The id of the node on which the store is present + * @param storeNames List of all the stores + * @return Returns a map of store name to the respective max version + * number + */ + public Map getROMaxVersion(int nodeId, List storeNames) { + Map returnMap = Maps.newHashMapWithExpectedSize(storeNames.size()); + Map versionDirs = getROMaxVersionDir(nodeId, storeNames); + for(String storeName: versionDirs.keySet()) { + returnMap.put(storeName, + ReadOnlyUtils.getVersionId(new File(versionDirs.get(storeName)))); + } + return returnMap; } - } - /** - * Given a list of partition infos, generates a map of stealer node to list - * of partition infos - * - * @param rebalancePartitionPlanList Complete list of partition plans - * @return Flattens it into a map on a per stealer node basis - */ - private HashMap> groupPartitionsInfoByStealerNode(List rebalancePartitionPlanList) { - HashMap> stealerNodeToPlan = Maps.newHashMap(); - if(rebalancePartitionPlanList != null) { - for(RebalancePartitionsInfo partitionInfo: rebalancePartitionPlanList) { - List partitionInfos = stealerNodeToPlan.get(partitionInfo.getStealerId()); - if(partitionInfos == null) { - partitionInfos = Lists.newArrayList(); - stealerNodeToPlan.put(partitionInfo.getStealerId(), partitionInfos); + /** + * This is a wrapper around + * {@link voldemort.client.protocol.admin.AdminClient#getROMaxVersion(int, List)} + * where-in we find the max versions on each machine and then return the + * max of all of them + * + * @param storeNames List of all read-only stores + * @return A map of store-name to their corresponding max version id + */ + public Map getROMaxVersion(List storeNames) { + Map storeToMaxVersion = Maps.newHashMapWithExpectedSize(storeNames.size()); + for(String storeName: storeNames) { + storeToMaxVersion.put(storeName, 0L); + } + + for(Node node: currentCluster.getNodes()) { + Map currentNodeVersions = getROMaxVersion(node.getId(), storeNames); + for(String storeName: currentNodeVersions.keySet()) { 
+ Long maxVersion = storeToMaxVersion.get(storeName); + if(maxVersion != null && maxVersion < currentNodeVersions.get(storeName)) { + storeToMaxVersion.put(storeName, currentNodeVersions.get(storeName)); + } } - partitionInfos.add(partitionInfo); } + return storeToMaxVersion; } - return stealerNodeToPlan; - } - /** - * Native backup a store - * - * @param nodeId The node id to backup - * @param storeName The name of the store to backup - * @param destinationDirPath The destination path - * @param minutes to wait for operation to complete - * @param verify should the file checksums be verified - * @param isIncremental is the backup incremental - */ - public void nativeBackup(int nodeId, - String storeName, - String destinationDirPath, - int timeOut, - boolean verify, - boolean isIncremental) { - - VAdminProto.NativeBackupRequest nativeBackupRequest = VAdminProto.NativeBackupRequest.newBuilder() - .setStoreName(storeName) - .setBackupDir(destinationDirPath) - .setIncremental(isIncremental) - .setVerifyFiles(verify) + /** + * Fetch read-only store files to a specified directory. This is run on + * the stealer node side + * + * @param nodeId The node id from where to copy + * @param storeName The name of the read-only store + * @param replicaToPartitionList Map of replica type to partition list + * @param destinationDirPath The destination path + * @param notAcceptedBuckets These are Pair< partition, replica > which + * we cannot copy AT all. This is because these are current + * mmap-ed and are serving traffic. + * @param running A boolean which will control when we want to stop the + * copying of files. As long this is true, we will continue + * copying. Once this is changed to false we'll disable the + * copying + */ + public void fetchPartitionFiles(int nodeId, + String storeName, + HashMap> replicaToPartitionList, + String destinationDirPath, + Set notAcceptedBuckets, + AtomicBoolean running) { + if(!Utils.isReadableDir(destinationDirPath)) { + throw new VoldemortException("The destination path (" + destinationDirPath + + ") to store " + storeName + " does not exist"); + } + + Node node = AdminClient.this.getAdminClientCluster().getNodeById(nodeId); + final SocketDestination destination = new SocketDestination(node.getHost(), + node.getAdminPort(), + RequestFormatType.ADMIN_PROTOCOL_BUFFERS); + final SocketAndStreams sands = socketPool.checkout(destination); + DataOutputStream outputStream = sands.getOutputStream(); + final DataInputStream inputStream = sands.getInputStream(); + + try { + + // Add the metadata file if it doesn't exist - We do this + // because + // for new nodes the stores don't start with any metadata file + + File metadataFile = new File(destinationDirPath, ".metadata"); + if(!metadataFile.exists()) { + ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata(); + metadata.add(ReadOnlyStorageMetadata.FORMAT, + ReadOnlyStorageFormat.READONLY_V2.getCode()); + FileUtils.writeStringToFile(metadataFile, metadata.toJsonString()); + } + + VAdminProto.FetchPartitionFilesRequest fetchPartitionFileRequest = VAdminProto.FetchPartitionFilesRequest.newBuilder() + .addAllReplicaToPartition(ProtoUtils.encodePartitionTuple(replicaToPartitionList)) + .setStore(storeName) + .build(); + VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() + .setFetchPartitionFiles(fetchPartitionFileRequest) + .setType(VAdminProto.AdminRequestType.FETCH_PARTITION_FILES) .build(); - VAdminProto.VoldemortAdminRequest adminRequest = 
VAdminProto.VoldemortAdminRequest.newBuilder() - .setNativeBackup(nativeBackupRequest) - .setType(VAdminProto.AdminRequestType.NATIVE_BACKUP) - .build(); - VAdminProto.AsyncOperationStatusResponse.Builder response = sendAndReceive(nodeId, - adminRequest, - VAdminProto.AsyncOperationStatusResponse.newBuilder()); + ProtoUtils.writeMessage(outputStream, request); + outputStream.flush(); - if(response.hasError()) { - throwException(response.getError()); - } + while(true && running.get()) { + int size = 0; - int asyncId = response.getRequestId(); - waitForCompletion(nodeId, asyncId, timeOut, TimeUnit.MINUTES); - } + try { + size = inputStream.readInt(); + } catch(IOException e) { + logger.error("Received IOException while fetching files", e); + throw e; + } + + if(size == -1) { + helperOps.close(sands.getSocket()); + break; + } + + byte[] input = new byte[size]; + ByteUtils.read(inputStream, input); + VAdminProto.FileEntry fileEntry = VAdminProto.FileEntry.newBuilder() + .mergeFrom(input) + .build(); + + if(notAcceptedBuckets != null) { + Pair partitionReplicaTuple = ReadOnlyUtils.getPartitionReplicaTuple(fileEntry.getFileName()); + if(notAcceptedBuckets.contains(partitionReplicaTuple)) { + throw new VoldemortException("Cannot copy file " + + fileEntry.getFileName() + + " since it is one of the mmap-ed files"); + } + } + logger.info("Receiving file " + fileEntry.getFileName()); + + FileChannel fileChannel = new FileOutputStream(new File(destinationDirPath, + fileEntry.getFileName())).getChannel(); + ReadableByteChannel channelIn = Channels.newChannel(inputStream); + fileChannel.transferFrom(channelIn, 0, fileEntry.getFileSizeBytes()); + fileChannel.force(true); + fileChannel.close(); + + logger.info("Completed file " + fileEntry.getFileName()); + } + + } catch(IOException e) { + helperOps.close(sands.getSocket()); + throw new VoldemortException(e); + } finally { + socketPool.checkin(destination, sands); + } - /** - * Reserve memory for the stores - * - * @param nodeId The node id to reserve, -1 for entire cluster - * @param stores list of stores for which to reserve - * @param sizeInMB size of reservation - */ - public void reserveMemory(int nodeId, List stores, long sizeInMB) { - - List reserveNodes = new ArrayList(); - if(nodeId == -1) { - // if no node is specified send it to the entire cluster - for(Node node: currentCluster.getNodes()) - reserveNodes.add(node.getId()); - } else { - reserveNodes.add(nodeId); - } - for(String storeName: stores) { - for(Integer reserveNodeId: reserveNodes) { - - VAdminProto.ReserveMemoryRequest reserveRequest = VAdminProto.ReserveMemoryRequest.newBuilder() - .setStoreName(storeName) - .setSizeInMb(sizeInMB) - .build(); - VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() - .setReserveMemory(reserveRequest) - .setType(VAdminProto.AdminRequestType.RESERVE_MEMORY) - .build(); - VAdminProto.ReserveMemoryResponse.Builder response = sendAndReceive(reserveNodeId, - adminRequest, - VAdminProto.ReserveMemoryResponse.newBuilder()); - if(response.hasError()) - throwException(response.getError()); - } - logger.info("Finished reserving memory for store : " + storeName); } } } diff --git a/src/java/voldemort/client/protocol/admin/QueryKeyResult.java b/src/java/voldemort/client/protocol/admin/QueryKeyResult.java new file mode 100644 index 0000000000..48bb3be9d6 --- /dev/null +++ b/src/java/voldemort/client/protocol/admin/QueryKeyResult.java @@ -0,0 +1,70 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache 
License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package voldemort.client.protocol.admin; + +import java.util.List; + +import voldemort.utils.ByteArray; +import voldemort.versioning.Versioned; + +/** + * Return type of AdminClient.QueryKeys. Intended to ensure the following + * invariant: .hasValues() == !.hasException() + */ +public class QueryKeyResult { + + private final ByteArray key; + private final List> values; + private final Exception exception; + + public QueryKeyResult(ByteArray key, List> values) { + this.key = key; + this.values = values; + this.exception = null; + } + + public QueryKeyResult(ByteArray key, Exception exception) { + this.key = key; + this.values = null; + this.exception = exception; + } + + public ByteArray getKey() { + return key; + } + + /** + * @return true iff values were returned. + */ + public boolean hasValues() { + return (values != null); + } + + public List> getValues() { + return values; + } + + /** + * @return true iff exception occured during queryKeys. + */ + public boolean hasException() { + return (exception != null); + } + + public Exception getException() { + return exception; + } +} \ No newline at end of file diff --git a/src/java/voldemort/client/protocol/admin/StreamingClient.java b/src/java/voldemort/client/protocol/admin/StreamingClient.java new file mode 100644 index 0000000000..4de4dda835 --- /dev/null +++ b/src/java/voldemort/client/protocol/admin/StreamingClient.java @@ -0,0 +1,714 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.client.protocol.admin; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.net.Socket; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +import org.apache.log4j.Logger; + +import voldemort.VoldemortException; +import voldemort.client.ClientConfig; +import voldemort.client.protocol.RequestFormatType; +import voldemort.client.protocol.pb.ProtoUtils; +import voldemort.client.protocol.pb.VAdminProto; +import voldemort.cluster.Node; +import voldemort.routing.RoutingStrategy; +import voldemort.routing.RoutingStrategyFactory; +import voldemort.store.StoreDefinition; +import voldemort.store.socket.SocketDestination; +import voldemort.utils.ByteArray; +import voldemort.utils.EventThrottler; +import voldemort.utils.Pair; +import voldemort.versioning.Versioned; + +/** + * + * @author anagpal + * + * The streaming API allows for send events into voldemort stores in the + * async fashion. All the partition and replication logic will be taken + * care of internally. + * + * The users is expected to provide two callbacks, one for performing + * period checkpoints and one for recovering the streaming process from + * the last checkpoint. + * + * NOTE: The API is not thread safe, since if multiple threads use this + * API we cannot make any guarantees about correctness of the + * checkpointing mechanism. + * + * Right now we expect this to used by a single thread per data source + * + */ +public class StreamingClient { + + private static final Logger logger = Logger.getLogger(StreamingClient.class); + @SuppressWarnings("rawtypes") + private Callable checkpointCallback = null; + @SuppressWarnings("rawtypes") + private Callable recoveryCallback = null; + private boolean allowMerge = false; + + private SocketPool streamingSocketPool; + + List remoteStoreDefs; + protected RoutingStrategy routingStrategy; + + boolean newBatch; + boolean cleanedUp = false; + + boolean isMultiSession; + ExecutorService streamingresults; + + // Every batch size we commit + private static int CHECKPOINT_COMMIT_SIZE; + + // TODO + // provide knobs to tune this + private static int TIME_COMMIT_SIZE = 30; + // we have to throttle to a certain qps + private static int THROTTLE_QPS; + private int entriesProcessed; + + // In case a Recovery callback fails we got to stop it from getting any + // worse + // so we mark the session as bad and dont take any more requests + private static boolean MARKED_BAD = false; + + protected EventThrottler throttler; + + AdminClient adminClient; + AdminClientConfig adminClientConfig; + + String bootstrapURL; + + // Data structures for the streaming maps from Pair to + // Resource + + private HashMap storeToRoutingStrategy; + private HashMap, Boolean> nodeIdStoreInitialized; + + private HashMap, SocketDestination> nodeIdStoreToSocketRequest; + private HashMap, DataOutputStream> nodeIdStoreToOutputStreamRequest; + private HashMap, DataInputStream> nodeIdStoreToInputStreamRequest; + + private HashMap, SocketAndStreams> nodeIdStoreToSocketAndStreams; + + private List storeNames; + + private List nodesToStream; + private List blackListedNodes; + + private final static int 
MAX_STORES_PER_SESSION = 100; + + Calendar calendar = Calendar.getInstance(); + + public StreamingClient(StreamingClientConfig config) { + this.bootstrapURL = config.getBootstrapURL(); + CHECKPOINT_COMMIT_SIZE = config.getBatchSize(); + THROTTLE_QPS = config.getThrottleQPS(); + + } + + public synchronized void updateThrottleLimit(int throttleQPS) { + THROTTLE_QPS = throttleQPS; + + this.throttler = new EventThrottler(THROTTLE_QPS); + } + + /** + ** + * @param store - the name of the store to be streamed to + * + * @param checkpointCallback - the callback that allows for the user to + * record the progress, up to the last event delivered. This callable + * would be invoked every so often internally. + * + * @param recoveryCallback - the callback that allows the user to rewind the + * upstream to the position recorded by the last complete call on + * checkpointCallback whenever an exception occurs during the + * streaming session. + * + * @param allowMerge - whether to allow for the streaming event to be merged + * with online writes. If not, all online writes since the completion + * of the last streaming session will be lost at the end of the + * current streaming session. + **/ + @SuppressWarnings({ "rawtypes", "unchecked" }) + public synchronized void initStreamingSession(String store, + Callable checkpointCallback, + Callable recoveryCallback, + boolean allowMerge) { + + // internally call initsessions with a single store + List stores = new ArrayList(); + stores.add(store); + initStreamingSessions(stores, checkpointCallback, recoveryCallback, allowMerge); + + } + + /** + * A Streaming Put call + ** + * @param key - The key + * + * @param value - The value + **/ + @SuppressWarnings({}) + public synchronized void streamingPut(ByteArray key, Versioned value) { + + if(MARKED_BAD) { + logger.error("Cannot stream more entries since Recovery Callback Failed!"); + throw new VoldemortException("Cannot stream more entries since Recovery Callback Failed!"); + } + + for(String store: storeNames) { + streamingPut(key, value, store); + } + + } + + /** + ** + * @param resetCheckpointCallback - the callback that allows for the user to + * clean up the checkpoint at the end of the streaming session so a + * new session could, if necessary, start from 0 position. + **/ + @SuppressWarnings({ "rawtypes" }) + public synchronized void closeStreamingSession(Callable resetCheckpointCallback) { + + closeStreamingSessions(resetCheckpointCallback); + + } + + /** + * Close the streaming session Flush all n/w buffers and call the commit + * callback + **/ + @SuppressWarnings({}) + public synchronized void closeStreamingSession() { + + closeStreamingSessions(); + + } + + private void close(Socket socket) { + try { + socket.close(); + } catch(IOException e) { + logger.warn("Failed to close socket"); + } + } + + @Override + protected void finalize() { + + if(!cleanedUp) { + cleanupSessions(); + } + + } + + /** + ** + * @param stores - the list of name of the stores to be streamed to + * + * + * @param checkpointCallback - the callback that allows for the user to + * record the progress, up to the last event delivered. This callable + * would be invoked every so often internally. + * + * @param recoveryCallback - the callback that allows the user to rewind the + * upstream to the position recorded by the last complete call on + * checkpointCallback whenever an exception occurs during the + * streaming session. + * + * @param allowMerge - whether to allow for the streaming event to be merged + * with online writes. 
If not, all online writes since the completion + * of the last streaming session will be lost at the end of the + * current streaming session. + **/ + @SuppressWarnings({ "rawtypes" }) + public synchronized void initStreamingSessions(List stores, + Callable checkpointCallback, + Callable recoveryCallback, + boolean allowMerge) { + + initStreamingSessions(stores, checkpointCallback, recoveryCallback, allowMerge, null); + + } + + /** + ** + * @param stores - the list of name of the stores to be streamed to + * + * + * @param checkpointCallback - the callback that allows for the user to + * record the progress, up to the last event delivered. This callable + * would be invoked every so often internally. + * + * @param recoveryCallback - the callback that allows the user to rewind the + * upstream to the position recorded by the last complete call on + * checkpointCallback whenever an exception occurs during the + * streaming session. + * + * @param allowMerge - whether to allow for the streaming event to be merged + * with online writes. If not, all online writes since the completion + * of the last streaming session will be lost at the end of the + * current streaming session. + * + * @param blackListedNodes - the list of Nodes not to stream to; we can + * probably recover them later from the replicas + **/ + + @SuppressWarnings({ "unchecked", "rawtypes" }) + public synchronized void initStreamingSessions(List stores, + Callable checkpointCallback, + Callable recoveryCallback, + boolean allowMerge, + List blackListedNodes) { + + logger.info("Initializing a streaming session"); + adminClientConfig = new AdminClientConfig(); + adminClient = new AdminClient(bootstrapURL, adminClientConfig, new ClientConfig()); + this.checkpointCallback = checkpointCallback; + this.recoveryCallback = recoveryCallback; + this.allowMerge = allowMerge; + streamingresults = Executors.newFixedThreadPool(3); + entriesProcessed = 0; + newBatch = true; + isMultiSession = true; + storeNames = new ArrayList(); + this.throttler = new EventThrottler(THROTTLE_QPS); + + TimeUnit unit = TimeUnit.SECONDS; + + Collection nodesInCluster = adminClient.getAdminClientCluster().getNodes(); + nodesToStream = new ArrayList(); + + if(blackListedNodes != null && blackListedNodes.size() > 0) { + + this.blackListedNodes = blackListedNodes; + + } + for(Node node: nodesInCluster) { + if(blackListedNodes != null && blackListedNodes.size() > 0) { + if(!blackListedNodes.contains(node.getId())) { + nodesToStream.add(node); + } + } else + nodesToStream.add(node); + + } + // socket pool + streamingSocketPool = new SocketPool(adminClient.getAdminClientCluster().getNumberOfNodes() + * MAX_STORES_PER_SESSION, + (int) unit.toMillis(adminClientConfig.getAdminConnectionTimeoutSec()), + (int) unit.toMillis(adminClientConfig.getAdminSocketTimeoutSec()), + adminClientConfig.getAdminSocketBufferSize(), + adminClientConfig.getAdminSocketKeepAlive()); + + nodeIdStoreToSocketRequest = new HashMap(); + nodeIdStoreToOutputStreamRequest = new HashMap(); + nodeIdStoreToInputStreamRequest = new HashMap(); + nodeIdStoreInitialized = new HashMap(); + storeToRoutingStrategy = new HashMap(); + nodeIdStoreToSocketAndStreams = new HashMap(); + for(String store: stores) { + + addStoreToSession(store); + } + + } + + /** + * Add another store destination to an existing streaming session + * + * + * @param store: the name of the store to stream to + */ + @SuppressWarnings({ "unchecked", "rawtypes" }) + private void addStoreToSession(String store) { + + storeNames.add(store); 
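+        // For every node being streamed to, check out a socket from the
+        // streaming pool and cache its streams keyed by (store, node id) so
+        // that later streamingPut() calls can reuse the connections; the
+        // store's routing strategy is then resolved from the remote store
+        // definitions fetched below.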
+ + for(Node node: nodesToStream) { + + SocketDestination destination = new SocketDestination(node.getHost(), + node.getAdminPort(), + RequestFormatType.ADMIN_PROTOCOL_BUFFERS); + SocketAndStreams sands = streamingSocketPool.checkout(destination); + + try { + DataOutputStream outputStream = sands.getOutputStream(); + DataInputStream inputStream = sands.getInputStream(); + + nodeIdStoreToSocketRequest.put(new Pair(store, node.getId()), destination); + nodeIdStoreToOutputStreamRequest.put(new Pair(store, node.getId()), outputStream); + nodeIdStoreToInputStreamRequest.put(new Pair(store, node.getId()), inputStream); + nodeIdStoreToSocketAndStreams.put(new Pair(store, node.getId()), sands); + nodeIdStoreInitialized.put(new Pair(store, node.getId()), false); + + remoteStoreDefs = adminClient.metadataMgmtOps.getRemoteStoreDefList(node.getId()) + .getValue(); + + } catch(Exception e) { + close(sands.getSocket()); + streamingSocketPool.checkin(destination, sands); + throw new VoldemortException(e); + } + + } + + boolean foundStore = false; + + for(StoreDefinition remoteStoreDef: remoteStoreDefs) { + if(remoteStoreDef.getName().equals(store)) { + RoutingStrategyFactory factory = new RoutingStrategyFactory(); + RoutingStrategy storeRoutingStrategy = factory.updateRoutingStrategy(remoteStoreDef, + adminClient.getAdminClientCluster()); + + storeToRoutingStrategy.put(store, storeRoutingStrategy); + foundStore = true; + break; + } + } + if(!foundStore) { + logger.error("Store Name not found on the cluster"); + throw new VoldemortException("Store Name not found on the cluster"); + + } + + } + + /** + * Remove a list of stores from the session + * + * First commit all entries for these stores and then cleanup resources + * + * @param storeNameToRemove List of stores to be removed from the current + * streaming session + * + **/ + @SuppressWarnings({}) + public synchronized void removeStoreFromSession(List storeNameToRemove) { + + logger.info("closing the Streaming session for a few stores"); + + commitToVoldemort(storeNameToRemove); + cleanupSessions(storeNameToRemove); + + } + + /** + ** + * @param key - The key + * + * @param value - The value + * + * @param storeName takes an additional store name as a parameter + * + * If a store is added mid way through a streaming session we do not + * play catchup and entries that were processed earlier during the + * session will not be applied for the store. + * + **/ + @SuppressWarnings({ "unchecked", "rawtypes" }) + public synchronized void streamingPut(ByteArray key, Versioned value, String storeName) { + + // If store does not exist in the stores list + // add it and checkout a socket + if(!storeNames.contains(storeName)) { + addStoreToSession(storeName); + } + + if(MARKED_BAD) { + logger.error("Cannot stream more entries since Recovery Callback Failed!"); + throw new VoldemortException("Cannot stream more entries since Recovery Callback Failed! You Need to restart the session"); + } + + List nodeList = storeToRoutingStrategy.get(storeName).routeRequest(key.get()); + + // sent the k/v pair to the nodes + for(Node node: nodeList) { + + if(blackListedNodes != null && blackListedNodes.size() > 0) { + if(blackListedNodes.contains(node.getId())) + continue; + } + // if node! 
in blacklistednodes + + VAdminProto.PartitionEntry partitionEntry = VAdminProto.PartitionEntry.newBuilder() + .setKey(ProtoUtils.encodeBytes(key)) + .setVersioned(ProtoUtils.encodeVersioned(value)) + .build(); + + VAdminProto.UpdatePartitionEntriesRequest.Builder updateRequest = VAdminProto.UpdatePartitionEntriesRequest.newBuilder() + .setStore(storeName) + .setPartitionEntry(partitionEntry); + + DataOutputStream outputStream = nodeIdStoreToOutputStreamRequest.get(new Pair(storeName, + node.getId())); + try { + if(nodeIdStoreInitialized.get(new Pair(storeName, node.getId()))) { + ProtoUtils.writeMessage(outputStream, updateRequest.build()); + } else { + ProtoUtils.writeMessage(outputStream, + VAdminProto.VoldemortAdminRequest.newBuilder() + .setType(VAdminProto.AdminRequestType.UPDATE_PARTITION_ENTRIES) + .setUpdatePartitionEntries(updateRequest) + .build()); + outputStream.flush(); + nodeIdStoreInitialized.put(new Pair(storeName, node.getId()), true); + + } + + entriesProcessed++; + + } catch(IOException e) { + logger.warn("Invoking the Recovery Callback"); + Future future = streamingresults.submit(recoveryCallback); + try { + future.get(); + + } catch(InterruptedException e1) { + MARKED_BAD = true; + logger.error("Recovery Callback failed"); + e1.printStackTrace(); + throw new VoldemortException("Recovery Callback failed"); + } catch(ExecutionException e1) { + MARKED_BAD = true; + logger.error("Recovery Callback failed"); + e1.printStackTrace(); + throw new VoldemortException("Recovery Callback failed"); + } + + e.printStackTrace(); + } + + } + + int secondsTime = calendar.get(Calendar.SECOND); + if(entriesProcessed == CHECKPOINT_COMMIT_SIZE || secondsTime % TIME_COMMIT_SIZE == 0) { + entriesProcessed = 0; + + commitToVoldemort(); + + } + + throttler.maybeThrottle(1); + + } + + /** + * Flush the network buffer and write all entries to the server Wait for an + * ack from the server This is a blocking call. It is invoked on every + * Commit batch size of entries It is also called on the close session call + */ + + public synchronized void commitToVoldemort() { + entriesProcessed = 0; + commitToVoldemort(storeNames); + } + + /** + * Flush the network buffer and write all entries to the serve. then wait + * for an ack from the server. This is a blocking call. 
It is invoked on + * every Commit batch size of entries, It is also called on the close + * session call + * + * @param storeNameToCommit List of stores to be flushed and committed + * + */ + @SuppressWarnings({ "unchecked", "rawtypes", "unused" }) + private void commitToVoldemort(List storeNamesToCommit) { + + if(logger.isDebugEnabled()) { + logger.debug("Trying to commit to Voldemort"); + } + for(Node node: nodesToStream) { + + for(String store: storeNamesToCommit) { + if(!nodeIdStoreInitialized.get(new Pair(store, node.getId()))) + continue; + + nodeIdStoreInitialized.put(new Pair(store, node.getId()), false); + + DataOutputStream outputStream = nodeIdStoreToOutputStreamRequest.get(new Pair(store, + node.getId())); + + try { + ProtoUtils.writeEndOfStream(outputStream); + outputStream.flush(); + DataInputStream inputStream = nodeIdStoreToInputStreamRequest.get(new Pair(store, + node.getId())); + VAdminProto.UpdatePartitionEntriesResponse.Builder updateResponse = ProtoUtils.readToBuilder(inputStream, + VAdminProto.UpdatePartitionEntriesResponse.newBuilder()); + if(updateResponse.hasError()) { + logger.warn("Invoking the Recovery Callback"); + Future future = streamingresults.submit(recoveryCallback); + try { + future.get(); + + } catch(InterruptedException e1) { + MARKED_BAD = true; + logger.error("Recovery Callback failed"); + e1.printStackTrace(); + throw new VoldemortException("Recovery Callback failed"); + } catch(ExecutionException e1) { + MARKED_BAD = true; + logger.error("Recovery Callback failed"); + e1.printStackTrace(); + throw new VoldemortException("Recovery Callback failed"); + } + } else { + if(logger.isDebugEnabled()) { + logger.debug("Commit successful"); + logger.debug("calling checkpoint callback"); + } + Future future = streamingresults.submit(checkpointCallback); + try { + future.get(); + + } catch(InterruptedException e1) { + + logger.warn("Checkpoint callback failed!"); + e1.printStackTrace(); + } catch(ExecutionException e1) { + logger.warn("Checkpoint callback failed!"); + e1.printStackTrace(); + } + } + + } catch(IOException e) { + + logger.warn("Invoking the Recovery Callback"); + Future future = streamingresults.submit(recoveryCallback); + try { + future.get(); + + } catch(InterruptedException e1) { + MARKED_BAD = true; + logger.error("Recovery Callback failed"); + e1.printStackTrace(); + throw new VoldemortException("Recovery Callback failed"); + } catch(ExecutionException e1) { + MARKED_BAD = true; + logger.error("Recovery Callback failed"); + e1.printStackTrace(); + throw new VoldemortException("Recovery Callback failed"); + } + + e.printStackTrace(); + } + } + + } + + } + + /** + ** + * @param resetCheckpointCallback - the callback that allows for the user to + * clean up the checkpoint at the end of the streaming session so a + * new session could, if necessary, start from 0 position. 
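For reference, a minimal sketch of driving this streaming API end to end, assuming a placeholder bootstrap URL ("tcp://localhost:6666") and store name ("test-store"); the checkpoint and recovery Callables below are stubs standing in for application-specific logic:

    import java.util.concurrent.Callable;

    import voldemort.client.protocol.admin.StreamingClient;
    import voldemort.client.protocol.admin.StreamingClientConfig;
    import voldemort.utils.ByteArray;
    import voldemort.versioning.Versioned;

    public class StreamingClientSketch {

        public static void main(String[] args) {
            // Placeholder settings; the batch size and QPS mirror the defaults
            // defined in StreamingClientConfig
            StreamingClientConfig config = new StreamingClientConfig();
            config.setBootstrapURL("tcp://localhost:6666");
            config.setBatchSize(10000);
            config.setThrottleQPS(3000);

            StreamingClient client = new StreamingClient(config);

            // Stub callbacks: checkpoint would persist the upstream position,
            // recovery would rewind the source to the last saved position
            Callable<Void> checkpoint = new Callable<Void>() {
                public Void call() { return null; }
            };
            Callable<Void> recovery = new Callable<Void>() {
                public Void call() { return null; }
            };

            // One session per data source; the API is not thread safe
            client.initStreamingSession("test-store", checkpoint, recovery, true);

            client.streamingPut(new ByteArray("some-key".getBytes()),
                                new Versioned<byte[]>("some-value".getBytes()));

            // Flushes the network buffers, waits for acks and fires the
            // checkpoint callback before releasing the sockets
            client.closeStreamingSession();
        }
    }
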
+ **/ + @SuppressWarnings({ "unchecked", "rawtypes" }) + public synchronized void closeStreamingSessions(Callable resetCheckpointCallback) { + + closeStreamingSessions(); + + Future future = streamingresults.submit(resetCheckpointCallback); + try { + future.get(); + + } catch(InterruptedException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } catch(ExecutionException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } + } + + /** + * Close the streaming session Flush all n/w buffers and call the commit + * callback + **/ + @SuppressWarnings({}) + public synchronized void closeStreamingSessions() { + + logger.info("closing the Streaming session"); + + commitToVoldemort(); + cleanupSessions(); + + } + + /** + * Helper method to Close all open socket connections and checkin back to + * the pool + */ + private void cleanupSessions() { + + cleanupSessions(storeNames); + } + + /** + * Helper method to Close all open socket connections and checkin back to + * the pool + * + * @param storeNameToCleanUp List of stores to be cleanedup from the current + * streaming session + */ + @SuppressWarnings({ "rawtypes", "unchecked" }) + private void cleanupSessions(List storeNamesToCleanUp) { + + logger.info("Performing cleanup"); + for(String store: storeNamesToCleanUp) { + + for(Node node: nodesToStream) { + + SocketAndStreams sands = nodeIdStoreToSocketAndStreams.get(new Pair(store, + node.getId())); + close(sands.getSocket()); + SocketDestination destination = nodeIdStoreToSocketRequest.get(new Pair(store, + node.getId())); + streamingSocketPool.checkin(destination, sands); + } + + } + + cleanedUp = true; + + } + +} diff --git a/src/java/voldemort/client/protocol/admin/StreamingClientConfig.java b/src/java/voldemort/client/protocol/admin/StreamingClientConfig.java new file mode 100644 index 0000000000..308a2ba154 --- /dev/null +++ b/src/java/voldemort/client/protocol/admin/StreamingClientConfig.java @@ -0,0 +1,73 @@ +package voldemort.client.protocol.admin; + +import java.io.Serializable; + +import voldemort.VoldemortException; +import voldemort.utils.Props; +import voldemort.utils.UndefinedPropertyException; + +public class StreamingClientConfig implements Serializable { + + private static final long serialVersionUID = 1L; + + private static final int DEFAULT_BATCH_SIZE = 10000; + private static final int DEFAULT_THROTTLE_QPS = 3000; + + private int batchSize; + private int throttleQPS; + + private String bootstrapURL; + + public StreamingClientConfig() { + + } + + public StreamingClientConfig(Props props) { + + this.batchSize = props.getInt("streaming.platform.commit.batch", DEFAULT_BATCH_SIZE); + this.throttleQPS = props.getInt("streaming.platform.throttle.qps", DEFAULT_THROTTLE_QPS); + + try { + this.bootstrapURL = props.getString("streaming.platform.bootstrapURL"); + } catch(UndefinedPropertyException e) { + throw new VoldemortException("BootStrap URL Not defined"); + } + + validateParams(); + + } + + public int getBatchSize() { + return batchSize; + } + + public void setBatchSize(int batchSize) { + this.batchSize = batchSize; + } + + public int getThrottleQPS() { + return throttleQPS; + } + + public void setThrottleQPS(int throttleQPS) { + this.throttleQPS = throttleQPS; + } + + public String getBootstrapURL() { + return bootstrapURL; + } + + public void setBootstrapURL(String bootstrapURL) { + this.bootstrapURL = bootstrapURL; + } + + private void validateParams() { + + if(batchSize < 0) + throw new 
IllegalArgumentException("streaming.platform.commit.batch cannot be less than 1"); + + if(throttleQPS < 0) + throw new IllegalArgumentException("streaming.platform.throttle.qps cannot be less than 1"); + + } +} diff --git a/src/java/voldemort/client/protocol/pb/ProtoUtils.java b/src/java/voldemort/client/protocol/pb/ProtoUtils.java index efc97b8538..7eef3359b0 100644 --- a/src/java/voldemort/client/protocol/pb/ProtoUtils.java +++ b/src/java/voldemort/client/protocol/pb/ProtoUtils.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -147,6 +147,12 @@ public static List encodePartitionTuple(HashMap> decodePartitionTuple(List partitionTuples) { HashMap> replicaToPartitionList = Maps.newHashMap(); for(PartitionTuple tuple: partitionTuples) { diff --git a/src/java/voldemort/client/protocol/pb/VAdminProto.java b/src/java/voldemort/client/protocol/pb/VAdminProto.java index 4fed345bc4..e34aec3563 100644 --- a/src/java/voldemort/client/protocol/pb/VAdminProto.java +++ b/src/java/voldemort/client/protocol/pb/VAdminProto.java @@ -4486,12 +4486,12 @@ public voldemort.client.protocol.pb.VAdminProto.PartitionTuple getReplicaToParti public boolean hasFetchValues() { return hasFetchValues; } public boolean getFetchValues() { return fetchValues_; } - // optional int64 skip_records = 5; - public static final int SKIP_RECORDS_FIELD_NUMBER = 5; - private boolean hasSkipRecords; - private long skipRecords_ = 0L; - public boolean hasSkipRecords() { return hasSkipRecords; } - public long getSkipRecords() { return skipRecords_; } + // optional int64 OBSOLETE__DO_NOT_USE__skip_records = 5; + public static final int OBSOLETE__DO_NOT_USE__SKIP_RECORDS_FIELD_NUMBER = 5; + private boolean hasOBSOLETEDONOTUSESkipRecords; + private long oBSOLETEDONOTUSESkipRecords_ = 0L; + public boolean hasOBSOLETEDONOTUSESkipRecords() { return hasOBSOLETEDONOTUSESkipRecords; } + public long getOBSOLETEDONOTUSESkipRecords() { return oBSOLETEDONOTUSESkipRecords_; } // optional string initial_cluster = 6; public static final int INITIAL_CLUSTER_FIELD_NUMBER = 6; @@ -4500,6 +4500,20 @@ public voldemort.client.protocol.pb.VAdminProto.PartitionTuple getReplicaToParti public boolean hasInitialCluster() { return hasInitialCluster; } public java.lang.String getInitialCluster() { return initialCluster_; } + // optional bool fetch_orphaned = 7; + public static final int FETCH_ORPHANED_FIELD_NUMBER = 7; + private boolean hasFetchOrphaned; + private boolean fetchOrphaned_ = false; + public boolean hasFetchOrphaned() { return hasFetchOrphaned; } + public boolean getFetchOrphaned() { return fetchOrphaned_; } + + // optional int64 records_per_partition = 8; + public static final int RECORDS_PER_PARTITION_FIELD_NUMBER = 8; + private boolean hasRecordsPerPartition; + private long recordsPerPartition_ = 0L; + public boolean hasRecordsPerPartition() { return hasRecordsPerPartition; } + public long getRecordsPerPartition() { return recordsPerPartition_; } + private void initFields() { filter_ = voldemort.client.protocol.pb.VAdminProto.VoldemortFilter.getDefaultInstance(); } @@ -4529,12 +4543,18 @@ public void writeTo(com.google.protobuf.CodedOutputStream output) if (hasFetchValues()) { output.writeBool(4, getFetchValues()); } - if (hasSkipRecords()) { - output.writeInt64(5, getSkipRecords()); + if (hasOBSOLETEDONOTUSESkipRecords()) { + 
output.writeInt64(5, getOBSOLETEDONOTUSESkipRecords()); } if (hasInitialCluster()) { output.writeString(6, getInitialCluster()); } + if (hasFetchOrphaned()) { + output.writeBool(7, getFetchOrphaned()); + } + if (hasRecordsPerPartition()) { + output.writeInt64(8, getRecordsPerPartition()); + } getUnknownFields().writeTo(output); } @@ -4560,14 +4580,22 @@ public int getSerializedSize() { size += com.google.protobuf.CodedOutputStream .computeBoolSize(4, getFetchValues()); } - if (hasSkipRecords()) { + if (hasOBSOLETEDONOTUSESkipRecords()) { size += com.google.protobuf.CodedOutputStream - .computeInt64Size(5, getSkipRecords()); + .computeInt64Size(5, getOBSOLETEDONOTUSESkipRecords()); } if (hasInitialCluster()) { size += com.google.protobuf.CodedOutputStream .computeStringSize(6, getInitialCluster()); } + if (hasFetchOrphaned()) { + size += com.google.protobuf.CodedOutputStream + .computeBoolSize(7, getFetchOrphaned()); + } + if (hasRecordsPerPartition()) { + size += com.google.protobuf.CodedOutputStream + .computeInt64Size(8, getRecordsPerPartition()); + } size += getUnknownFields().getSerializedSize(); memoizedSerializedSize = size; return size; @@ -4745,12 +4773,18 @@ public Builder mergeFrom(voldemort.client.protocol.pb.VAdminProto.FetchPartition if (other.hasFetchValues()) { setFetchValues(other.getFetchValues()); } - if (other.hasSkipRecords()) { - setSkipRecords(other.getSkipRecords()); + if (other.hasOBSOLETEDONOTUSESkipRecords()) { + setOBSOLETEDONOTUSESkipRecords(other.getOBSOLETEDONOTUSESkipRecords()); } if (other.hasInitialCluster()) { setInitialCluster(other.getInitialCluster()); } + if (other.hasFetchOrphaned()) { + setFetchOrphaned(other.getFetchOrphaned()); + } + if (other.hasRecordsPerPartition()) { + setRecordsPerPartition(other.getRecordsPerPartition()); + } this.mergeUnknownFields(other.getUnknownFields()); return this; } @@ -4800,13 +4834,21 @@ public Builder mergeFrom( break; } case 40: { - setSkipRecords(input.readInt64()); + setOBSOLETEDONOTUSESkipRecords(input.readInt64()); break; } case 50: { setInitialCluster(input.readString()); break; } + case 56: { + setFetchOrphaned(input.readBool()); + break; + } + case 64: { + setRecordsPerPartition(input.readInt64()); + break; + } } } } @@ -4939,21 +4981,21 @@ public Builder clearFetchValues() { return this; } - // optional int64 skip_records = 5; - public boolean hasSkipRecords() { - return result.hasSkipRecords(); + // optional int64 OBSOLETE__DO_NOT_USE__skip_records = 5; + public boolean hasOBSOLETEDONOTUSESkipRecords() { + return result.hasOBSOLETEDONOTUSESkipRecords(); } - public long getSkipRecords() { - return result.getSkipRecords(); + public long getOBSOLETEDONOTUSESkipRecords() { + return result.getOBSOLETEDONOTUSESkipRecords(); } - public Builder setSkipRecords(long value) { - result.hasSkipRecords = true; - result.skipRecords_ = value; + public Builder setOBSOLETEDONOTUSESkipRecords(long value) { + result.hasOBSOLETEDONOTUSESkipRecords = true; + result.oBSOLETEDONOTUSESkipRecords_ = value; return this; } - public Builder clearSkipRecords() { - result.hasSkipRecords = false; - result.skipRecords_ = 0L; + public Builder clearOBSOLETEDONOTUSESkipRecords() { + result.hasOBSOLETEDONOTUSESkipRecords = false; + result.oBSOLETEDONOTUSESkipRecords_ = 0L; return this; } @@ -4978,6 +5020,42 @@ public Builder clearInitialCluster() { return this; } + // optional bool fetch_orphaned = 7; + public boolean hasFetchOrphaned() { + return result.hasFetchOrphaned(); + } + public boolean getFetchOrphaned() { + return 
result.getFetchOrphaned(); + } + public Builder setFetchOrphaned(boolean value) { + result.hasFetchOrphaned = true; + result.fetchOrphaned_ = value; + return this; + } + public Builder clearFetchOrphaned() { + result.hasFetchOrphaned = false; + result.fetchOrphaned_ = false; + return this; + } + + // optional int64 records_per_partition = 8; + public boolean hasRecordsPerPartition() { + return result.hasRecordsPerPartition(); + } + public long getRecordsPerPartition() { + return result.getRecordsPerPartition(); + } + public Builder setRecordsPerPartition(long value) { + result.hasRecordsPerPartition = true; + result.recordsPerPartition_ = value; + return this; + } + public Builder clearRecordsPerPartition() { + result.hasRecordsPerPartition = false; + result.recordsPerPartition_ = 0L; + return this; + } + // @@protoc_insertion_point(builder_scope:voldemort.FetchPartitionEntriesRequest) } @@ -23085,174 +23163,176 @@ public Builder clearReserveMemory() { "e\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"d\n\032Fe" + "tchPartitionFilesRequest\022\r\n\005store\030\001 \002(\t\022" + "7\n\024replica_to_partition\030\002 \003(\0132\031.voldemor" + - "t.PartitionTuple\"\327\001\n\034FetchPartitionEntri" + + "t.PartitionTuple\"\244\002\n\034FetchPartitionEntri" + "esRequest\0227\n\024replica_to_partition\030\001 \003(\0132" + "\031.voldemort.PartitionTuple\022\r\n\005store\030\002 \002(" + "\t\022*\n\006filter\030\003 \001(\0132\032.voldemort.VoldemortF", - "ilter\022\024\n\014fetch_values\030\004 \001(\010\022\024\n\014skip_reco" + - "rds\030\005 \001(\003\022\027\n\017initial_cluster\030\006 \001(\t\"\201\001\n\035F" + - "etchPartitionEntriesResponse\0222\n\017partitio" + - "n_entry\030\001 \001(\0132\031.voldemort.PartitionEntry" + - "\022\013\n\003key\030\002 \001(\014\022\037\n\005error\030\003 \001(\0132\020.voldemort" + - ".Error\"\254\001\n\035DeletePartitionEntriesRequest" + - "\022\r\n\005store\030\001 \002(\t\0227\n\024replica_to_partition\030" + - "\002 \003(\0132\031.voldemort.PartitionTuple\022*\n\006filt" + - "er\030\003 \001(\0132\032.voldemort.VoldemortFilter\022\027\n\017" + - "initial_cluster\030\004 \001(\t\"P\n\036DeletePartition", - "EntriesResponse\022\r\n\005count\030\001 \001(\003\022\037\n\005error\030" + - "\002 \001(\0132\020.voldemort.Error\"\317\001\n\035InitiateFetc" + - "hAndUpdateRequest\022\017\n\007node_id\030\001 \002(\005\022\r\n\005st" + - "ore\030\002 \002(\t\022*\n\006filter\030\003 \001(\0132\032.voldemort.Vo" + - "ldemortFilter\0227\n\024replica_to_partition\030\004 " + - "\003(\0132\031.voldemort.PartitionTuple\022\027\n\017initia" + - "l_cluster\030\005 \001(\t\022\020\n\010optimize\030\006 \001(\010\"1\n\033Asy" + - "ncOperationStatusRequest\022\022\n\nrequest_id\030\001" + - " \002(\005\"/\n\031AsyncOperationStopRequest\022\022\n\nreq" + - "uest_id\030\001 \002(\005\"=\n\032AsyncOperationStopRespo", - "nse\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"2\n\031" + - "AsyncOperationListRequest\022\025\n\rshow_comple" + - "te\030\002 \002(\010\"R\n\032AsyncOperationListResponse\022\023" + - "\n\013request_ids\030\001 \003(\005\022\037\n\005error\030\002 \001(\0132\020.vol" + - "demort.Error\":\n\016PartitionTuple\022\024\n\014replic" + - "a_type\030\001 \002(\005\022\022\n\npartitions\030\002 \003(\005\"e\n\026PerS" + - "torePartitionTuple\022\022\n\nstore_name\030\001 \002(\t\0227" + - "\n\024replica_to_partition\030\002 \003(\0132\031.voldemort" + - ".PartitionTuple\"\370\001\n\031RebalancePartitionIn" + - 
"foMap\022\022\n\nstealer_id\030\001 \002(\005\022\020\n\010donor_id\030\002 ", - "\002(\005\022\017\n\007attempt\030\003 \002(\005\022C\n\030replica_to_add_p" + - "artition\030\004 \003(\0132!.voldemort.PerStoreParti" + - "tionTuple\022F\n\033replica_to_delete_partition" + - "\030\005 \003(\0132!.voldemort.PerStorePartitionTupl" + - "e\022\027\n\017initial_cluster\030\006 \002(\t\"f\n\034InitiateRe" + - "balanceNodeRequest\022F\n\030rebalance_partitio" + - "n_info\030\001 \002(\0132$.voldemort.RebalancePartit" + - "ionInfoMap\"m\n#InitiateRebalanceNodeOnDon" + - "orRequest\022F\n\030rebalance_partition_info\030\001 " + - "\003(\0132$.voldemort.RebalancePartitionInfoMa", - "p\"\212\001\n\034AsyncOperationStatusResponse\022\022\n\nre" + - "quest_id\030\001 \001(\005\022\023\n\013description\030\002 \001(\t\022\016\n\006s" + - "tatus\030\003 \001(\t\022\020\n\010complete\030\004 \001(\010\022\037\n\005error\030\005" + - " \001(\0132\020.voldemort.Error\"\'\n\026TruncateEntrie" + - "sRequest\022\r\n\005store\030\001 \002(\t\":\n\027TruncateEntri" + - "esResponse\022\037\n\005error\030\001 \001(\0132\020.voldemort.Er" + - "ror\"*\n\017AddStoreRequest\022\027\n\017storeDefinitio" + - "n\030\001 \002(\t\"3\n\020AddStoreResponse\022\037\n\005error\030\001 \001" + - "(\0132\020.voldemort.Error\"\'\n\022DeleteStoreReque" + - "st\022\021\n\tstoreName\030\001 \002(\t\"6\n\023DeleteStoreResp", - "onse\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"P\n" + - "\021FetchStoreRequest\022\022\n\nstore_name\030\001 \002(\t\022\021" + - "\n\tstore_dir\030\002 \002(\t\022\024\n\014push_version\030\003 \001(\003\"" + - "9\n\020SwapStoreRequest\022\022\n\nstore_name\030\001 \002(\t\022" + - "\021\n\tstore_dir\030\002 \002(\t\"P\n\021SwapStoreResponse\022" + - "\037\n\005error\030\001 \001(\0132\020.voldemort.Error\022\032\n\022prev" + - "ious_store_dir\030\002 \001(\t\"@\n\024RollbackStoreReq" + - "uest\022\022\n\nstore_name\030\001 \002(\t\022\024\n\014push_version" + - "\030\002 \002(\003\"8\n\025RollbackStoreResponse\022\037\n\005error" + - "\030\001 \001(\0132\020.voldemort.Error\"&\n\020RepairJobReq", - "uest\022\022\n\nstore_name\030\001 \001(\t\"4\n\021RepairJobRes" + - "ponse\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"=" + - "\n\024ROStoreVersionDirMap\022\022\n\nstore_name\030\001 \002" + - "(\t\022\021\n\tstore_dir\030\002 \002(\t\"/\n\031GetROMaxVersion" + - "DirRequest\022\022\n\nstore_name\030\001 \003(\t\"y\n\032GetROM" + - "axVersionDirResponse\022:\n\021ro_store_version" + - "s\030\001 \003(\0132\037.voldemort.ROStoreVersionDirMap" + - "\022\037\n\005error\030\002 \001(\0132\020.voldemort.Error\"3\n\035Get" + - "ROCurrentVersionDirRequest\022\022\n\nstore_name" + - "\030\001 \003(\t\"}\n\036GetROCurrentVersionDirResponse", - "\022:\n\021ro_store_versions\030\001 \003(\0132\037.voldemort." 
+ - "ROStoreVersionDirMap\022\037\n\005error\030\002 \001(\0132\020.vo" + - "ldemort.Error\"/\n\031GetROStorageFormatReque" + - "st\022\022\n\nstore_name\030\001 \003(\t\"y\n\032GetROStorageFo" + - "rmatResponse\022:\n\021ro_store_versions\030\001 \003(\0132" + - "\037.voldemort.ROStoreVersionDirMap\022\037\n\005erro" + - "r\030\002 \001(\0132\020.voldemort.Error\"@\n\027FailedFetch" + - "StoreRequest\022\022\n\nstore_name\030\001 \002(\t\022\021\n\tstor" + - "e_dir\030\002 \002(\t\";\n\030FailedFetchStoreResponse\022" + - "\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"\346\001\n\033Reb", - "alanceStateChangeRequest\022K\n\035rebalance_pa" + - "rtition_info_list\030\001 \003(\0132$.voldemort.Reba" + - "lancePartitionInfoMap\022\026\n\016cluster_string\030" + - "\002 \002(\t\022\017\n\007swap_ro\030\003 \002(\010\022\037\n\027change_cluster" + - "_metadata\030\004 \002(\010\022\036\n\026change_rebalance_stat" + - "e\030\005 \002(\010\022\020\n\010rollback\030\006 \002(\010\"?\n\034RebalanceSt" + - "ateChangeResponse\022\037\n\005error\030\001 \001(\0132\020.volde" + - "mort.Error\"G\n DeleteStoreRebalanceStateR" + - "equest\022\022\n\nstore_name\030\001 \002(\t\022\017\n\007node_id\030\002 " + - "\002(\005\"D\n!DeleteStoreRebalanceStateResponse", - "\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"h\n\023Nat" + - "iveBackupRequest\022\022\n\nstore_name\030\001 \002(\t\022\022\n\n" + - "backup_dir\030\002 \002(\t\022\024\n\014verify_files\030\003 \002(\010\022\023" + - "\n\013incremental\030\004 \002(\010\">\n\024ReserveMemoryRequ" + - "est\022\022\n\nstore_name\030\001 \002(\t\022\022\n\nsize_in_mb\030\002 " + - "\002(\003\"8\n\025ReserveMemoryResponse\022\037\n\005error\030\001 " + - "\001(\0132\020.voldemort.Error\"\360\016\n\025VoldemortAdmin" + - "Request\022)\n\004type\030\001 \002(\0162\033.voldemort.AdminR" + - "equestType\0223\n\014get_metadata\030\002 \001(\0132\035.volde" + - "mort.GetMetadataRequest\0229\n\017update_metada", - "ta\030\003 \001(\0132 .voldemort.UpdateMetadataReque" + - "st\022J\n\030update_partition_entries\030\004 \001(\0132(.v" + - "oldemort.UpdatePartitionEntriesRequest\022H" + - "\n\027fetch_partition_entries\030\005 \001(\0132\'.voldem" + - "ort.FetchPartitionEntriesRequest\022J\n\030dele" + - "te_partition_entries\030\006 \001(\0132(.voldemort.D" + - "eletePartitionEntriesRequest\022K\n\031initiate" + - "_fetch_and_update\030\007 \001(\0132(.voldemort.Init" + - "iateFetchAndUpdateRequest\022F\n\026async_opera" + - "tion_status\030\010 \001(\0132&.voldemort.AsyncOpera", - "tionStatusRequest\022H\n\027initiate_rebalance_" + - "node\030\t \001(\0132\'.voldemort.InitiateRebalance" + - "NodeRequest\022B\n\024async_operation_stop\030\n \001(" + - "\0132$.voldemort.AsyncOperationStopRequest\022" + - "B\n\024async_operation_list\030\013 \001(\0132$.voldemor" + - "t.AsyncOperationListRequest\022;\n\020truncate_" + - "entries\030\014 \001(\0132!.voldemort.TruncateEntrie" + - "sRequest\022-\n\tadd_store\030\r \001(\0132\032.voldemort." + - "AddStoreRequest\0223\n\014delete_store\030\016 \001(\0132\035." 
+ - "voldemort.DeleteStoreRequest\0221\n\013fetch_st", - "ore\030\017 \001(\0132\034.voldemort.FetchStoreRequest\022" + - "/\n\nswap_store\030\020 \001(\0132\033.voldemort.SwapStor" + - "eRequest\0227\n\016rollback_store\030\021 \001(\0132\037.volde" + - "mort.RollbackStoreRequest\022D\n\026get_ro_max_" + - "version_dir\030\022 \001(\0132$.voldemort.GetROMaxVe" + - "rsionDirRequest\022L\n\032get_ro_current_versio" + - "n_dir\030\023 \001(\0132(.voldemort.GetROCurrentVers" + - "ionDirRequest\022D\n\025fetch_partition_files\030\024" + - " \001(\0132%.voldemort.FetchPartitionFilesRequ" + - "est\022@\n\023update_slop_entries\030\026 \001(\0132#.volde", - "mort.UpdateSlopEntriesRequest\022>\n\022failed_" + - "fetch_store\030\030 \001(\0132\".voldemort.FailedFetc" + - "hStoreRequest\022C\n\025get_ro_storage_format\030\031" + - " \001(\0132$.voldemort.GetROStorageFormatReque" + - "st\022F\n\026rebalance_state_change\030\032 \001(\0132&.vol" + - "demort.RebalanceStateChangeRequest\022/\n\nre" + - "pair_job\030\033 \001(\0132\033.voldemort.RepairJobRequ" + - "est\022X\n initiate_rebalance_node_on_donor\030" + - "\034 \001(\0132..voldemort.InitiateRebalanceNodeO" + - "nDonorRequest\022Q\n\034delete_store_rebalance_", - "state\030\035 \001(\0132+.voldemort.DeleteStoreRebal" + - "anceStateRequest\0225\n\rnative_backup\030\036 \001(\0132" + - "\036.voldemort.NativeBackupRequest\0227\n\016reser" + - "ve_memory\030\037 \001(\0132\037.voldemort.ReserveMemor" + - "yRequest*\310\005\n\020AdminRequestType\022\020\n\014GET_MET" + - "ADATA\020\000\022\023\n\017UPDATE_METADATA\020\001\022\034\n\030UPDATE_P" + - "ARTITION_ENTRIES\020\002\022\033\n\027FETCH_PARTITION_EN" + - "TRIES\020\003\022\034\n\030DELETE_PARTITION_ENTRIES\020\004\022\035\n" + - "\031INITIATE_FETCH_AND_UPDATE\020\005\022\032\n\026ASYNC_OP" + - "ERATION_STATUS\020\006\022\033\n\027INITIATE_REBALANCE_N", - "ODE\020\007\022\030\n\024ASYNC_OPERATION_STOP\020\010\022\030\n\024ASYNC" + - "_OPERATION_LIST\020\t\022\024\n\020TRUNCATE_ENTRIES\020\n\022" + - "\r\n\tADD_STORE\020\013\022\020\n\014DELETE_STORE\020\014\022\017\n\013FETC" + - "H_STORE\020\r\022\016\n\nSWAP_STORE\020\016\022\022\n\016ROLLBACK_ST" + - "ORE\020\017\022\032\n\026GET_RO_MAX_VERSION_DIR\020\020\022\036\n\032GET" + - "_RO_CURRENT_VERSION_DIR\020\021\022\031\n\025FETCH_PARTI" + - "TION_FILES\020\022\022\027\n\023UPDATE_SLOP_ENTRIES\020\024\022\026\n" + - "\022FAILED_FETCH_STORE\020\026\022\031\n\025GET_RO_STORAGE_" + - "FORMAT\020\027\022\032\n\026REBALANCE_STATE_CHANGE\020\030\022\016\n\n" + - "REPAIR_JOB\020\031\022$\n INITIATE_REBALANCE_NODE_", - "ON_DONOR\020\032\022 \n\034DELETE_STORE_REBALANCE_STA" + - "TE\020\033\022\021\n\rNATIVE_BACKUP\020\034\022\022\n\016RESERVE_MEMOR" + - "Y\020\035B-\n\034voldemort.client.protocol.pbB\013VAd" + - "minProtoH\001" + "ilter\022\024\n\014fetch_values\030\004 \001(\010\022*\n\"OBSOLETE_" + + "_DO_NOT_USE__skip_records\030\005 \001(\003\022\027\n\017initi" + + "al_cluster\030\006 \001(\t\022\026\n\016fetch_orphaned\030\007 \001(\010" + + "\022\035\n\025records_per_partition\030\010 \001(\003\"\201\001\n\035Fetc" + + "hPartitionEntriesResponse\0222\n\017partition_e" + + "ntry\030\001 \001(\0132\031.voldemort.PartitionEntry\022\013\n" + + "\003key\030\002 \001(\014\022\037\n\005error\030\003 \001(\0132\020.voldemort.Er" + + "ror\"\254\001\n\035DeletePartitionEntriesRequest\022\r\n" + + "\005store\030\001 \002(\t\0227\n\024replica_to_partition\030\002 \003" + + "(\0132\031.voldemort.PartitionTuple\022*\n\006filter\030", + "\003 
\001(\0132\032.voldemort.VoldemortFilter\022\027\n\017ini" + + "tial_cluster\030\004 \001(\t\"P\n\036DeletePartitionEnt" + + "riesResponse\022\r\n\005count\030\001 \001(\003\022\037\n\005error\030\002 \001" + + "(\0132\020.voldemort.Error\"\317\001\n\035InitiateFetchAn" + + "dUpdateRequest\022\017\n\007node_id\030\001 \002(\005\022\r\n\005store" + + "\030\002 \002(\t\022*\n\006filter\030\003 \001(\0132\032.voldemort.Volde" + + "mortFilter\0227\n\024replica_to_partition\030\004 \003(\013" + + "2\031.voldemort.PartitionTuple\022\027\n\017initial_c" + + "luster\030\005 \001(\t\022\020\n\010optimize\030\006 \001(\010\"1\n\033AsyncO" + + "perationStatusRequest\022\022\n\nrequest_id\030\001 \002(", + "\005\"/\n\031AsyncOperationStopRequest\022\022\n\nreques" + + "t_id\030\001 \002(\005\"=\n\032AsyncOperationStopResponse" + + "\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"2\n\031Asy" + + "ncOperationListRequest\022\025\n\rshow_complete\030" + + "\002 \002(\010\"R\n\032AsyncOperationListResponse\022\023\n\013r" + + "equest_ids\030\001 \003(\005\022\037\n\005error\030\002 \001(\0132\020.voldem" + + "ort.Error\":\n\016PartitionTuple\022\024\n\014replica_t" + + "ype\030\001 \002(\005\022\022\n\npartitions\030\002 \003(\005\"e\n\026PerStor" + + "ePartitionTuple\022\022\n\nstore_name\030\001 \002(\t\0227\n\024r" + + "eplica_to_partition\030\002 \003(\0132\031.voldemort.Pa", + "rtitionTuple\"\370\001\n\031RebalancePartitionInfoM" + + "ap\022\022\n\nstealer_id\030\001 \002(\005\022\020\n\010donor_id\030\002 \002(\005" + + "\022\017\n\007attempt\030\003 \002(\005\022C\n\030replica_to_add_part" + + "ition\030\004 \003(\0132!.voldemort.PerStorePartitio" + + "nTuple\022F\n\033replica_to_delete_partition\030\005 " + + "\003(\0132!.voldemort.PerStorePartitionTuple\022\027" + + "\n\017initial_cluster\030\006 \002(\t\"f\n\034InitiateRebal" + + "anceNodeRequest\022F\n\030rebalance_partition_i" + + "nfo\030\001 \002(\0132$.voldemort.RebalancePartition" + + "InfoMap\"m\n#InitiateRebalanceNodeOnDonorR", + "equest\022F\n\030rebalance_partition_info\030\001 \003(\013" + + "2$.voldemort.RebalancePartitionInfoMap\"\212" + + "\001\n\034AsyncOperationStatusResponse\022\022\n\nreque" + + "st_id\030\001 \001(\005\022\023\n\013description\030\002 \001(\t\022\016\n\006stat" + + "us\030\003 \001(\t\022\020\n\010complete\030\004 \001(\010\022\037\n\005error\030\005 \001(" + + "\0132\020.voldemort.Error\"\'\n\026TruncateEntriesRe" + + "quest\022\r\n\005store\030\001 \002(\t\":\n\027TruncateEntriesR" + + "esponse\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error" + + "\"*\n\017AddStoreRequest\022\027\n\017storeDefinition\030\001" + + " \002(\t\"3\n\020AddStoreResponse\022\037\n\005error\030\001 \001(\0132", + "\020.voldemort.Error\"\'\n\022DeleteStoreRequest\022" + + "\021\n\tstoreName\030\001 \002(\t\"6\n\023DeleteStoreRespons" + + "e\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"P\n\021Fe" + + "tchStoreRequest\022\022\n\nstore_name\030\001 \002(\t\022\021\n\ts" + + "tore_dir\030\002 \002(\t\022\024\n\014push_version\030\003 \001(\003\"9\n\020" + + "SwapStoreRequest\022\022\n\nstore_name\030\001 \002(\t\022\021\n\t" + + "store_dir\030\002 \002(\t\"P\n\021SwapStoreResponse\022\037\n\005" + + "error\030\001 \001(\0132\020.voldemort.Error\022\032\n\022previou" + + "s_store_dir\030\002 \001(\t\"@\n\024RollbackStoreReques" + + "t\022\022\n\nstore_name\030\001 \002(\t\022\024\n\014push_version\030\002 ", + "\002(\003\"8\n\025RollbackStoreResponse\022\037\n\005error\030\001 " + + 
"\001(\0132\020.voldemort.Error\"&\n\020RepairJobReques" + + "t\022\022\n\nstore_name\030\001 \001(\t\"4\n\021RepairJobRespon" + + "se\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"=\n\024R" + + "OStoreVersionDirMap\022\022\n\nstore_name\030\001 \002(\t\022" + + "\021\n\tstore_dir\030\002 \002(\t\"/\n\031GetROMaxVersionDir" + + "Request\022\022\n\nstore_name\030\001 \003(\t\"y\n\032GetROMaxV" + + "ersionDirResponse\022:\n\021ro_store_versions\030\001" + + " \003(\0132\037.voldemort.ROStoreVersionDirMap\022\037\n" + + "\005error\030\002 \001(\0132\020.voldemort.Error\"3\n\035GetROC", + "urrentVersionDirRequest\022\022\n\nstore_name\030\001 " + + "\003(\t\"}\n\036GetROCurrentVersionDirResponse\022:\n" + + "\021ro_store_versions\030\001 \003(\0132\037.voldemort.ROS" + + "toreVersionDirMap\022\037\n\005error\030\002 \001(\0132\020.volde" + + "mort.Error\"/\n\031GetROStorageFormatRequest\022" + + "\022\n\nstore_name\030\001 \003(\t\"y\n\032GetROStorageForma" + + "tResponse\022:\n\021ro_store_versions\030\001 \003(\0132\037.v" + + "oldemort.ROStoreVersionDirMap\022\037\n\005error\030\002" + + " \001(\0132\020.voldemort.Error\"@\n\027FailedFetchSto" + + "reRequest\022\022\n\nstore_name\030\001 \002(\t\022\021\n\tstore_d", + "ir\030\002 \002(\t\";\n\030FailedFetchStoreResponse\022\037\n\005" + + "error\030\001 \001(\0132\020.voldemort.Error\"\346\001\n\033Rebala" + + "nceStateChangeRequest\022K\n\035rebalance_parti" + + "tion_info_list\030\001 \003(\0132$.voldemort.Rebalan" + + "cePartitionInfoMap\022\026\n\016cluster_string\030\002 \002" + + "(\t\022\017\n\007swap_ro\030\003 \002(\010\022\037\n\027change_cluster_me" + + "tadata\030\004 \002(\010\022\036\n\026change_rebalance_state\030\005" + + " \002(\010\022\020\n\010rollback\030\006 \002(\010\"?\n\034RebalanceState" + + "ChangeResponse\022\037\n\005error\030\001 \001(\0132\020.voldemor" + + "t.Error\"G\n DeleteStoreRebalanceStateRequ", + "est\022\022\n\nstore_name\030\001 \002(\t\022\017\n\007node_id\030\002 \002(\005" + + "\"D\n!DeleteStoreRebalanceStateResponse\022\037\n" + + "\005error\030\001 \001(\0132\020.voldemort.Error\"h\n\023Native" + + "BackupRequest\022\022\n\nstore_name\030\001 \002(\t\022\022\n\nbac" + + "kup_dir\030\002 \002(\t\022\024\n\014verify_files\030\003 \002(\010\022\023\n\013i" + + "ncremental\030\004 \002(\010\">\n\024ReserveMemoryRequest" + + "\022\022\n\nstore_name\030\001 \002(\t\022\022\n\nsize_in_mb\030\002 \002(\003" + + "\"8\n\025ReserveMemoryResponse\022\037\n\005error\030\001 \001(\013" + + "2\020.voldemort.Error\"\360\016\n\025VoldemortAdminReq" + + "uest\022)\n\004type\030\001 \002(\0162\033.voldemort.AdminRequ", + "estType\0223\n\014get_metadata\030\002 \001(\0132\035.voldemor" + + "t.GetMetadataRequest\0229\n\017update_metadata\030" + + "\003 \001(\0132 .voldemort.UpdateMetadataRequest\022" + + "J\n\030update_partition_entries\030\004 \001(\0132(.vold" + + "emort.UpdatePartitionEntriesRequest\022H\n\027f" + + "etch_partition_entries\030\005 \001(\0132\'.voldemort" + + ".FetchPartitionEntriesRequest\022J\n\030delete_" + + "partition_entries\030\006 \001(\0132(.voldemort.Dele" + + "tePartitionEntriesRequest\022K\n\031initiate_fe" + + "tch_and_update\030\007 \001(\0132(.voldemort.Initiat", + "eFetchAndUpdateRequest\022F\n\026async_operatio" + + "n_status\030\010 \001(\0132&.voldemort.AsyncOperatio" + + "nStatusRequest\022H\n\027initiate_rebalance_nod" + + "e\030\t \001(\0132\'.voldemort.InitiateRebalanceNod" + + "eRequest\022B\n\024async_operation_stop\030\n \001(\0132$" + + 
".voldemort.AsyncOperationStopRequest\022B\n\024" + + "async_operation_list\030\013 \001(\0132$.voldemort.A" + + "syncOperationListRequest\022;\n\020truncate_ent" + + "ries\030\014 \001(\0132!.voldemort.TruncateEntriesRe" + + "quest\022-\n\tadd_store\030\r \001(\0132\032.voldemort.Add", + "StoreRequest\0223\n\014delete_store\030\016 \001(\0132\035.vol" + + "demort.DeleteStoreRequest\0221\n\013fetch_store" + + "\030\017 \001(\0132\034.voldemort.FetchStoreRequest\022/\n\n" + + "swap_store\030\020 \001(\0132\033.voldemort.SwapStoreRe" + + "quest\0227\n\016rollback_store\030\021 \001(\0132\037.voldemor" + + "t.RollbackStoreRequest\022D\n\026get_ro_max_ver" + + "sion_dir\030\022 \001(\0132$.voldemort.GetROMaxVersi" + + "onDirRequest\022L\n\032get_ro_current_version_d" + + "ir\030\023 \001(\0132(.voldemort.GetROCurrentVersion" + + "DirRequest\022D\n\025fetch_partition_files\030\024 \001(", + "\0132%.voldemort.FetchPartitionFilesRequest" + + "\022@\n\023update_slop_entries\030\026 \001(\0132#.voldemor" + + "t.UpdateSlopEntriesRequest\022>\n\022failed_fet" + + "ch_store\030\030 \001(\0132\".voldemort.FailedFetchSt" + + "oreRequest\022C\n\025get_ro_storage_format\030\031 \001(" + + "\0132$.voldemort.GetROStorageFormatRequest\022" + + "F\n\026rebalance_state_change\030\032 \001(\0132&.voldem" + + "ort.RebalanceStateChangeRequest\022/\n\nrepai" + + "r_job\030\033 \001(\0132\033.voldemort.RepairJobRequest" + + "\022X\n initiate_rebalance_node_on_donor\030\034 \001", + "(\0132..voldemort.InitiateRebalanceNodeOnDo" + + "norRequest\022Q\n\034delete_store_rebalance_sta" + + "te\030\035 \001(\0132+.voldemort.DeleteStoreRebalanc" + + "eStateRequest\0225\n\rnative_backup\030\036 \001(\0132\036.v" + + "oldemort.NativeBackupRequest\0227\n\016reserve_" + + "memory\030\037 \001(\0132\037.voldemort.ReserveMemoryRe" + + "quest*\310\005\n\020AdminRequestType\022\020\n\014GET_METADA" + + "TA\020\000\022\023\n\017UPDATE_METADATA\020\001\022\034\n\030UPDATE_PART" + + "ITION_ENTRIES\020\002\022\033\n\027FETCH_PARTITION_ENTRI" + + "ES\020\003\022\034\n\030DELETE_PARTITION_ENTRIES\020\004\022\035\n\031IN", + "ITIATE_FETCH_AND_UPDATE\020\005\022\032\n\026ASYNC_OPERA" + + "TION_STATUS\020\006\022\033\n\027INITIATE_REBALANCE_NODE" + + "\020\007\022\030\n\024ASYNC_OPERATION_STOP\020\010\022\030\n\024ASYNC_OP" + + "ERATION_LIST\020\t\022\024\n\020TRUNCATE_ENTRIES\020\n\022\r\n\t" + + "ADD_STORE\020\013\022\020\n\014DELETE_STORE\020\014\022\017\n\013FETCH_S" + + "TORE\020\r\022\016\n\nSWAP_STORE\020\016\022\022\n\016ROLLBACK_STORE" + + "\020\017\022\032\n\026GET_RO_MAX_VERSION_DIR\020\020\022\036\n\032GET_RO" + + "_CURRENT_VERSION_DIR\020\021\022\031\n\025FETCH_PARTITIO" + + "N_FILES\020\022\022\027\n\023UPDATE_SLOP_ENTRIES\020\024\022\026\n\022FA" + + "ILED_FETCH_STORE\020\026\022\031\n\025GET_RO_STORAGE_FOR", + "MAT\020\027\022\032\n\026REBALANCE_STATE_CHANGE\020\030\022\016\n\nREP" + + "AIR_JOB\020\031\022$\n INITIATE_REBALANCE_NODE_ON_" + + "DONOR\020\032\022 \n\034DELETE_STORE_REBALANCE_STATE\020" + + "\033\022\021\n\rNATIVE_BACKUP\020\034\022\022\n\016RESERVE_MEMORY\020\035" + + "B-\n\034voldemort.client.protocol.pbB\013VAdmin" + + "ProtoH\001" }; com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() { @@ -23360,7 +23440,7 @@ public com.google.protobuf.ExtensionRegistry assignDescriptors( internal_static_voldemort_FetchPartitionEntriesRequest_fieldAccessorTable = new 
com.google.protobuf.GeneratedMessage.FieldAccessorTable( internal_static_voldemort_FetchPartitionEntriesRequest_descriptor, - new java.lang.String[] { "ReplicaToPartition", "Store", "Filter", "FetchValues", "SkipRecords", "InitialCluster", }, + new java.lang.String[] { "ReplicaToPartition", "Store", "Filter", "FetchValues", "OBSOLETEDONOTUSESkipRecords", "InitialCluster", "FetchOrphaned", "RecordsPerPartition", }, voldemort.client.protocol.pb.VAdminProto.FetchPartitionEntriesRequest.class, voldemort.client.protocol.pb.VAdminProto.FetchPartitionEntriesRequest.Builder.class); internal_static_voldemort_FetchPartitionEntriesResponse_descriptor = diff --git a/src/java/voldemort/client/protocol/vold/VoldemortNativeClientRequestFormat.java b/src/java/voldemort/client/protocol/vold/VoldemortNativeClientRequestFormat.java index 4953f05b0f..b7640f17c9 100644 --- a/src/java/voldemort/client/protocol/vold/VoldemortNativeClientRequestFormat.java +++ b/src/java/voldemort/client/protocol/vold/VoldemortNativeClientRequestFormat.java @@ -30,11 +30,11 @@ import voldemort.VoldemortException; import voldemort.client.protocol.RequestFormat; import voldemort.common.VoldemortOpCode; +import voldemort.common.nio.ByteBufferBackedInputStream; import voldemort.server.RequestRoutingType; import voldemort.store.ErrorCodeMapper; import voldemort.store.StoreUtils; import voldemort.utils.ByteArray; -import voldemort.utils.ByteBufferBackedInputStream; import voldemort.utils.ByteUtils; import voldemort.versioning.VectorClock; import voldemort.versioning.Version; diff --git a/src/java/voldemort/client/rebalance/RebalanceCLI.java b/src/java/voldemort/client/rebalance/RebalanceCLI.java index 716565a774..7f746c65ae 100644 --- a/src/java/voldemort/client/rebalance/RebalanceCLI.java +++ b/src/java/voldemort/client/rebalance/RebalanceCLI.java @@ -1,3 +1,19 @@ +/* + * Copyright 2012-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + package voldemort.client.rebalance; import java.io.File; @@ -14,9 +30,11 @@ import voldemort.VoldemortException; import voldemort.cluster.Cluster; import voldemort.store.StoreDefinition; +import voldemort.utils.ClusterInstance; import voldemort.utils.CmdUtils; import voldemort.utils.Entropy; -import voldemort.utils.RebalanceUtils; +import voldemort.utils.Pair; +import voldemort.utils.RebalanceClusterUtils; import voldemort.xml.ClusterMapper; import voldemort.xml.StoreDefinitionsMapper; @@ -29,6 +47,13 @@ public class RebalanceCLI { private final static int HELP_EXIT_CODE = 2; private final static Logger logger = Logger.getLogger(RebalanceCLI.class); + private final static int DEFAULT_GENERATE_RANDOM_SWAP_ATTEMPTS = 100; + private final static int DEFAULT_GENERATE_RANDOM_SWAP_SUCCESSES = 100; + private final static int DEFAULT_GENERATE_GREEDY_SWAP_ATTEMPTS = 5; + private final static int DEFAULT_GENERATE_GREEDY_MAX_PARTITIONS_PER_NODE = 5; + private final static int DEFAULT_GENERATE_GREEDY_MAX_PARTITIONS_PER_ZONE = 25; + private final static int DEFAULT_GENERATE_MAX_CONTIGUOUS_PARTITIONS = 0; + public static void main(String[] args) throws Exception { int exitCode = ERROR_EXIT_CODE; RebalanceController rebalanceController = null; @@ -53,13 +78,65 @@ public static void main(String[] args) throws Exception { .describedAs("parallelism"); parser.accepts("tries", "(1) Tries during rebalance [ Default: " - + RebalanceClientConfig.MAX_TRIES + + RebalanceClientConfig.MAX_TRIES_REBALANCING + " ] (2) Number of tries while generating new metadata") .withRequiredArg() .ofType(Integer.class) .describedAs("num-tries"); parser.accepts("generate", "Optimize the target cluster which has new nodes with empty partitions"); + parser.accepts("generate-disable-primary-balancing", + "Make sure that all nodes within every zone have the same (within one) number of primary partitions [default: enabled]"); + parser.accepts("generate-enable-xzone-primary-moves", + "Allow primary partitions to move across zones [Default: disabled]"); + parser.accepts("generate-enable-any-xzone-nary-moves", + "Allow non-primary partitions to move across zones at any time (i.e., does not check for xzone moves) [Default: disabled]"); + parser.accepts("generate-enable-last-resort-xzone-nary-moves", + "Allow non-primary partitions to move across zones as a last resort (i.e., checks for xzone moves and prefers to avoid them, unless a xzone move is required to achieve balance) [Default: disabled]"); + parser.accepts("generate-enable-xzone-shuffle", + "Allows non-primary partitions to move across zones in random or greedy shuffles. [Default: disabled]"); + parser.accepts("generate-enable-random-swaps", + "Enable attempts to improve balance by random partition swaps within a zone. [Default: disabled]"); + parser.accepts("generate-random-swap-attempts", + "Number of random swaps to attempt. [Default:" + + DEFAULT_GENERATE_RANDOM_SWAP_ATTEMPTS + " ]") + .withRequiredArg() + .ofType(Integer.class) + .describedAs("num-attempts"); + parser.accepts("generate-random-swap-successes", + "Number of successful random swaps to permit exit before completing all swap attempts. [Default:" + + DEFAULT_GENERATE_RANDOM_SWAP_SUCCESSES + " ]") + .withRequiredArg() + .ofType(Integer.class) + .describedAs("num-successes"); + parser.accepts("generate-enable-greedy-swaps", + "Enable attempts to improve balance by greedily swapping (random) partitions within a zone. 
[Default: disabled]"); + parser.accepts("generate-greedy-swap-attempts", + "Number of greedy (random) swaps to attempt. [Default:" + + DEFAULT_GENERATE_GREEDY_SWAP_ATTEMPTS + " ]") + .withRequiredArg() + .ofType(Integer.class) + .describedAs("num-attempts"); + parser.accepts("generate-greedy-max-partitions-per-node", + "Max number of partitions per-node to evaluate swapping with other partitions within the zone. [Default:" + + DEFAULT_GENERATE_GREEDY_MAX_PARTITIONS_PER_NODE + " ]") + .withRequiredArg() + .ofType(Integer.class) + .describedAs("max-partitions-per-node"); + parser.accepts("generate-greedy-max-partitions-per-zone", + "Max number of (random) partitions per-zone to evaluate swapping with partitions from node being evaluated. [Default:" + + DEFAULT_GENERATE_GREEDY_MAX_PARTITIONS_PER_ZONE + " ]") + .withRequiredArg() + .ofType(Integer.class) + .describedAs("max-partitions-per-zone"); + parser.accepts("generate-max-contiguous-partitions", + "Limit the number of contiguous partition IDs allowed within a zone. [Default:" + + DEFAULT_GENERATE_MAX_CONTIGUOUS_PARTITIONS + + " (indicating no limit)]") + .withRequiredArg() + .ofType(Integer.class) + .describedAs("num-contiguous"); + parser.accepts("analyze", "Analyze how balanced given cluster is."); parser.accepts("entropy", "True - if we want to run the entropy calculator. False - if we want to store keys") .withRequiredArg() @@ -114,7 +191,7 @@ public static void main(String[] args) throws Exception { RebalanceClientConfig.MAX_PARALLEL_REBALANCING); int maxTriesRebalancing = CmdUtils.valueOf(options, "tries", - RebalanceClientConfig.MAX_TRIES); + RebalanceClientConfig.MAX_TRIES_REBALANCING); boolean enabledShowPlan = options.has("show-plan"); long rebalancingTimeoutSeconds = CmdUtils.valueOf(options, "timeout", @@ -125,6 +202,31 @@ public static void main(String[] args) throws Exception { boolean stealerBasedRebalancing = CmdUtils.valueOf(options, "stealer-based", RebalanceClientConfig.STEALER_BASED_REBALANCING); + boolean generateDisablePrimaryBalancing = options.has("generate-disable-primary-balancing"); + boolean generateEnableXzonePrimary = options.has("generate-enable-xzone-primary-moves"); + boolean generateEnableAllXzoneNary = options.has("generate-enable-any-xzone-nary-moves"); + boolean generateEnableLastResortXzoneNary = options.has("generate-enable-last-resort-xzone-nary-moves"); + boolean generateEnableXzoneShuffle = options.has("generate-enable-xzone-shuffle"); + boolean generateEnableRandomSwaps = options.has("generate-enable-random-swaps"); + int generateRandomSwapAttempts = CmdUtils.valueOf(options, + "generate-random-swap-attempts", + DEFAULT_GENERATE_RANDOM_SWAP_ATTEMPTS); + int generateRandomSwapSuccesses = CmdUtils.valueOf(options, + "generate-random-swap-successes", + DEFAULT_GENERATE_RANDOM_SWAP_SUCCESSES); + boolean generateEnableGreedySwaps = options.has("generate-enable-greedy-swaps"); + int generateGreedySwapAttempts = CmdUtils.valueOf(options, + "generate-greedy-swap-attempts", + DEFAULT_GENERATE_GREEDY_SWAP_ATTEMPTS); + int generateGreedyMaxPartitionsPerNode = CmdUtils.valueOf(options, + "generate-greedy-max-partitions-per-node", + DEFAULT_GENERATE_GREEDY_MAX_PARTITIONS_PER_NODE); + int generateGreedyMaxPartitionsPerZone = CmdUtils.valueOf(options, + "generate-greedy-max-partitions-per-zone", + DEFAULT_GENERATE_GREEDY_MAX_PARTITIONS_PER_ZONE); + int generateMaxContiguousPartitionsPerZone = CmdUtils.valueOf(options, + "generate-max-contiguous-partitions", + DEFAULT_GENERATE_MAX_CONTIGUOUS_PARTITIONS); 
RebalanceClientConfig config = new RebalanceClientConfig(); config.setMaxParallelRebalancing(parallelism); @@ -191,6 +293,12 @@ public static void main(String[] args) throws Exception { } + if(options.has("analyze")) { + Pair analysis = new ClusterInstance(currentCluster, storeDefs).analyzeBalanceVerbose(); + System.out.println(analysis.getSecond()); + return; + } + if(!options.has("target-cluster")) { System.err.println("Missing required arguments: target-cluster"); printHelp(System.err, parser); @@ -201,11 +309,61 @@ public static void main(String[] args) throws Exception { Cluster targetCluster = new ClusterMapper().readCluster(new File(targetClusterXML)); if(options.has("generate")) { - RebalanceUtils.generateMinCluster(currentCluster, - targetCluster, - storeDefs, - config.getOutputDirectory(), - config.getMaxTriesRebalancing()); + if(generateDisablePrimaryBalancing && !generateEnableRandomSwaps + && !generateEnableGreedySwaps && generateMaxContiguousPartitionsPerZone == 0) { + System.err.println("Specified generate but did not enable any forms for generation (balance primary partitions, greedy swaps, random swaps, max contiguous partitions)."); + printHelp(System.err, parser); + System.exit(ERROR_EXIT_CODE); + } + if((options.has("generate-random-swap-attempts") || options.has("generate-random-swap-successes")) + && !generateEnableRandomSwaps) { + System.err.println("Provided arguments for generate random swaps but disabled the feature"); + printHelp(System.err, parser); + System.exit(ERROR_EXIT_CODE); + } + if((options.has("generate-greedy-swap-attempts") + || options.has("generate-greedy-max-partitions-per-node") || options.has("generate-greedy-max-partitions-per-zone")) + && !generateEnableGreedySwaps) { + System.err.println("Provided arguments for generate greedy swaps but disabled the feature"); + printHelp(System.err, parser); + System.exit(ERROR_EXIT_CODE); + } + if(generateEnableAllXzoneNary && generateEnableLastResortXzoneNary) { + System.err.println("Specified both generate-enable-any-xzone-nary-moves and generate-enable-last-resort-xzone-nary-moves. 
Please specify at most one of these mutually exclusive options."); + printHelp(System.err, parser); + System.exit(ERROR_EXIT_CODE); + } + if(generateDisablePrimaryBalancing + && (generateEnableAllXzoneNary || generateEnableLastResortXzoneNary)) { + System.err.println("Specified generate-disable-primary-balancing but also specified either generate-enable-any-xzone-nary-moves or generate-enable-last-resort-xzone-nary-moves which will have no effect."); + printHelp(System.err, parser); + System.exit(ERROR_EXIT_CODE); + } + if(generateEnableXzoneShuffle + && !(generateEnableRandomSwaps || generateEnableGreedySwaps)) { + System.err.println("Specified generate-enable-xzone-shuffle but did not specify one of generate-enable-random-swaps or generate-enable-greedy-swaps."); + printHelp(System.err, parser); + System.exit(ERROR_EXIT_CODE); + } + + RebalanceClusterUtils.balanceTargetCluster(currentCluster, + targetCluster, + storeDefs, + config.getOutputDirectory(), + config.getMaxTriesRebalancing(), + generateDisablePrimaryBalancing, + generateEnableXzonePrimary, + generateEnableAllXzoneNary, + generateEnableLastResortXzoneNary, + generateEnableXzoneShuffle, + generateEnableRandomSwaps, + generateRandomSwapAttempts, + generateRandomSwapSuccesses, + generateEnableGreedySwaps, + generateGreedySwapAttempts, + generateGreedyMaxPartitionsPerNode, + generateGreedyMaxPartitionsPerZone, + generateMaxContiguousPartitionsPerZone); return; } @@ -236,29 +394,51 @@ public static void main(String[] args) throws Exception { public static void printHelp(PrintStream stream, OptionParser parser) throws IOException { stream.println("Commands supported"); stream.println("------------------"); - stream.println("REBALANCE"); + stream.println(); + stream.println("REBALANCE (RUN PROCESS)"); stream.println("a) --url --target-cluster [ Run the actual rebalancing process ] "); - stream.println("b) --current-cluster --current-stores --target-cluster [ Generates the plan ]"); - stream.println("\t (i) --no-delete [ Will not delete the data after rebalancing ]"); - stream.println("\t (ii) --show-plan [ Will generate only the plan ]"); - stream.println("\t (iii) --output-dir [ Path to output dir where we store intermediate metadata ]"); - stream.println("\t (iv) --parallelism [ Number of parallel stealer - donor node tasks to run in parallel ] "); - stream.println("\t (v) --tries [ Number of times we try to move the data before declaring failure ]"); - stream.println("\t (vi) --timeout [ Timeout in seconds for one rebalancing task ( stealer - donor tuple ) ]"); - stream.println("\t (vii) --batch [ Number of primary partitions to move together ]"); - stream.println("\t (viii) --stealer-based [ Run the rebalancing from the stealers perspective ]"); + + stream.println(); + stream.println("REBALANCE (GENERATE PLAN)"); + stream.println("b) --current-cluster --current-stores --target-cluster "); + stream.println("\t (1) --no-delete [ Will not delete the data after rebalancing ]"); + stream.println("\t (2) --show-plan [ Will generate only the plan ]"); + stream.println("\t (3) --output-dir [ Path to output dir where we store intermediate metadata ]"); + stream.println("\t (4) --parallelism [ Number of parallel stealer - donor node tasks to run in parallel ] "); + stream.println("\t (5) --tries [ Number of times we try to move the data before declaring failure ]"); + stream.println("\t (6) --timeout [ Timeout in seconds for one rebalancing task ( stealer - donor tuple ) ]"); + stream.println("\t (7) --batch [ Number of primary partitions 
to move together ]"); + stream.println("\t (8) --stealer-based [ Run the rebalancing from the stealers perspective ]"); + stream.println(); stream.println("GENERATE"); stream.println("a) --current-cluster --current-stores --target-cluster --generate [ Generates a new cluster xml with least number of movements." + " Uses target cluster i.e. current-cluster + new nodes ( with empty partitions ) ]"); - stream.println("\t (i) --output-dir [ Output directory is where we store the optimized cluster ]"); - stream.println("\t (ii) --tries [ Number of optimization cycles ] "); + stream.println("\t (1) --output-dir [ Output directory is where we store the optimized cluster ]"); + stream.println("\t (2) --tries [ Number of optimization cycles ] "); + stream.println("\t (3) --generate-disable-primary-balancing [ Do not balance number of primary partitions across nodes within each zone ] "); + stream.println("\t (4) --generate-enable-xzone-primary-moves [ Allow primary partitions to move across zones ] "); + stream.println("\t (5) --generate-enable-any-xzone-nary-moves [ Allow non-primary partitions to move across zones. ]"); + stream.println("\t (6) --generate-enable-last-resort-xzone-nary-moves [ Allow non-primary partitions to move across zones as a last resort --- Will only do such a move if all possible moves result in xzone move.] "); + stream.println("\t (7) --generate-enable-xzone-shuffle [ Allow non-primary partitions to move across zones for random swaps or greedy swaps.] "); + stream.println("\t (8) --generate-enable-random-swaps [ Attempt to randomly swap partitions within a zone to improve balance ] "); + stream.println("\t (9) --generate-random-swap-attempts num-attempts [ Number of random swaps to attempt in hopes of improving balance ] "); + stream.println("\t(10) --generate-random-swap-successes num-successes [ Stop after num-successes successful random swap attempts ] "); + stream.println("\t(11) --generate-enable-greedy-swaps [ Attempt to greedily (randomly) swap partitions within a zone to improve balance. Greedily/randomly means sample many swaps for each node and choose best swap. ] "); + stream.println("\t(12) --generate-greedy-swap-attempts num-attempts [ Number of greedy swap passes to attempt. Each pass can be fairly expensive. ] "); + stream.println("\t(13) --generate-greedy-max-partitions-per-node num-partitions [ num-partitions per node to consider in each greedy pass. Partitions selected randomly from each node. ] "); + stream.println("\t(14) --generate-greedy-max-partitions-per-zone num-partitions [ num-partitions per zone to consider in each greedy pass. Partitions selected randomly from all partitions in zone not on node being considered. ] "); + stream.println("\t(15) --generate-max-contiguous-partitions num-contiguous [ Max allowed contiguous partition IDs within a zone ] "); + + stream.println(); + stream.println("ANALYZE"); + stream.println("a) --current-cluster --current-stores --analyze [ Analyzes a cluster xml for balance]"); stream.println(); stream.println("ENTROPY"); stream.println("a) --current-cluster --current-stores --entropy --output-dir [ Runs the entropy calculator if " + "--entropy is true. 
Else dumps keys to the directory ]"); - stream.println("\t (i) --keys [ Number of keys ( per store ) we calculate entropy for ]"); - stream.println("\t (ii) --verbose-logging [ print keys found missing during entropy ]"); + stream.println("\t (1) --keys [ Number of keys ( per store ) we calculate entropy for ]"); + stream.println("\t (2) --verbose-logging [ print keys found missing during entropy ]"); parser.printHelpOn(stream); } } diff --git a/src/java/voldemort/client/rebalance/RebalanceClientConfig.java b/src/java/voldemort/client/rebalance/RebalanceClientConfig.java index 0154a4e76b..04ce8f1673 100644 --- a/src/java/voldemort/client/rebalance/RebalanceClientConfig.java +++ b/src/java/voldemort/client/rebalance/RebalanceClientConfig.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2010 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -24,13 +24,13 @@ public class RebalanceClientConfig extends AdminClientConfig { public final static int MAX_PARALLEL_REBALANCING = 1; - public final static int MAX_TRIES = 2; + public final static int MAX_TRIES_REBALANCING = 2; public final static long REBALANCING_CLIENT_TIMEOUT_SEC = 30 * 24 * 60 * 60; public final static int PRIMARY_PARTITION_BATCH_SIZE = 1; public final static boolean STEALER_BASED_REBALANCING = true; private int maxParallelRebalancing = MAX_PARALLEL_REBALANCING; - private int maxTriesRebalancing = MAX_TRIES; + private int maxTriesRebalancing = MAX_TRIES_REBALANCING; private long rebalancingClientTimeoutSeconds = REBALANCING_CLIENT_TIMEOUT_SEC; private int primaryPartitionBatchSize = PRIMARY_PARTITION_BATCH_SIZE; private boolean stealerBasedRebalancing = STEALER_BASED_REBALANCING; diff --git a/src/java/voldemort/client/rebalance/RebalanceController.java b/src/java/voldemort/client/rebalance/RebalanceController.java index ed0a2206a3..fb52897487 100644 --- a/src/java/voldemort/client/rebalance/RebalanceController.java +++ b/src/java/voldemort/client/rebalance/RebalanceController.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2010 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -29,6 +29,7 @@ import org.apache.log4j.Logger; import voldemort.VoldemortException; +import voldemort.client.ClientConfig; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.rebalance.task.DonorBasedRebalanceTask; import voldemort.client.rebalance.task.RebalanceTask; @@ -37,7 +38,9 @@ import voldemort.cluster.Node; import voldemort.server.rebalance.VoldemortRebalancingException; import voldemort.store.StoreDefinition; +import voldemort.utils.NodeUtils; import voldemort.utils.RebalanceUtils; +import voldemort.utils.StoreDefinitionUtils; import voldemort.utils.Time; import voldemort.versioning.Versioned; import voldemort.xml.ClusterMapper; @@ -55,12 +58,12 @@ public class RebalanceController { private final RebalanceClientConfig rebalanceConfig; public RebalanceController(String bootstrapUrl, RebalanceClientConfig rebalanceConfig) { - this.adminClient = new AdminClient(bootstrapUrl, rebalanceConfig); + this.adminClient = new AdminClient(bootstrapUrl, rebalanceConfig, new ClientConfig()); this.rebalanceConfig = rebalanceConfig; } public RebalanceController(Cluster cluster, RebalanceClientConfig config) { - this.adminClient = new AdminClient(cluster, config); + this.adminClient = new AdminClient(cluster, config, new ClientConfig()); this.rebalanceConfig = config; } @@ -73,8 +76,8 @@ public RebalanceController(Cluster cluster, RebalanceClientConfig config) { public void rebalance(final Cluster targetCluster) { // Retrieve the latest cluster metadata from the existing nodes - Versioned currentVersionedCluster = RebalanceUtils.getLatestCluster(RebalanceUtils.getNodeIds(Lists.newArrayList(adminClient.getAdminClientCluster() - .getNodes())), + Versioned currentVersionedCluster = RebalanceUtils.getLatestCluster(NodeUtils.getNodeIds(Lists.newArrayList(adminClient.getAdminClientCluster() + .getNodes())), adminClient); Cluster currentCluster = currentVersionedCluster.getValue(); @@ -184,6 +187,12 @@ private void rebalancePerClusterTransition(Cluster currentCluster, // Used for creating clones ClusterMapper mapper = new ClusterMapper(); + // Output initial and final cluster + if(rebalanceConfig.hasOutputDirectory()) + RebalanceUtils.dumpCluster(currentCluster, + targetCluster, + new File(rebalanceConfig.getOutputDirectory())); + // Start first dry run to compute the stolen partitions for(Node stealerNode: targetCluster.getNodes()) { List stolenPrimaryPartitions = RebalanceUtils.getStolenPrimaryPartitions(currentCluster, @@ -246,6 +255,7 @@ private void rebalancePerClusterTransition(Cluster currentCluster, logger.info("Total number of tasks : " + numTasks); int tasksCompleted = 0; + int batchCounter = 0; int primaryPartitionId = 0; double totalTimeMs = 0.0; @@ -267,6 +277,7 @@ private void rebalancePerClusterTransition(Cluster currentCluster, if(primaryPartitionBatchSize == rebalanceConfig.getPrimaryPartitionBatchSize()) break; } + batchCounter++; // Remove the partitions moved + Prepare message to print StringBuffer buffer = new StringBuffer(); @@ -301,7 +312,8 @@ private void rebalancePerClusterTransition(Cluster currentCluster, if(rebalanceConfig.hasOutputDirectory()) RebalanceUtils.dumpCluster(currentCluster, transitionCluster, - new File(rebalanceConfig.getOutputDirectory())); + new File(rebalanceConfig.getOutputDirectory()), + "batch-" + Integer.toString(batchCounter) + "."); long startTimeMs = System.currentTimeMillis(); rebalancePerPartitionTransition(orderedClusterTransition); @@ -375,10 +387,10 @@ private void 
rebalancePerPartitionTransition(final OrderedClusterTransition orde List rebalancePartitionPlanList = rebalancePartitionsInfoList; // Split the store definitions - List readOnlyStoreDefs = RebalanceUtils.filterStores(orderedClusterTransition.getStoreDefs(), - true); - List readWriteStoreDefs = RebalanceUtils.filterStores(orderedClusterTransition.getStoreDefs(), - false); + List readOnlyStoreDefs = StoreDefinitionUtils.filterStores(orderedClusterTransition.getStoreDefs(), + true); + List readWriteStoreDefs = StoreDefinitionUtils.filterStores(orderedClusterTransition.getStoreDefs(), + false); boolean hasReadOnlyStores = readOnlyStoreDefs != null && readOnlyStoreDefs.size() > 0; boolean hasReadWriteStores = readWriteStoreDefs != null && readWriteStoreDefs.size() > 0; @@ -497,52 +509,52 @@ private void rebalanceStateChange(final int taskId, logger, "Cluster metadata change + rebalance state change"); if(!rebalanceConfig.isShowPlanEnabled()) - adminClient.rebalanceStateChange(currentCluster, - transitionCluster, - rebalancePartitionPlanList, - false, - true, - true, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(currentCluster, + transitionCluster, + rebalancePartitionPlanList, + false, + true, + true, + true, + true); } else if(hasReadOnlyStores && !finishedReadOnlyStores) { // Case 1 / 3 - rebalance state change RebalanceUtils.printLog(taskId, logger, "Rebalance state change"); if(!rebalanceConfig.isShowPlanEnabled()) - adminClient.rebalanceStateChange(currentCluster, - transitionCluster, - rebalancePartitionPlanList, - false, - false, - true, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(currentCluster, + transitionCluster, + rebalancePartitionPlanList, + false, + false, + true, + true, + true); } else if(hasReadOnlyStores && !hasReadWriteStores && finishedReadOnlyStores) { // Case 2 - swap + cluster change RebalanceUtils.printLog(taskId, logger, "Swap + Cluster metadata change"); if(!rebalanceConfig.isShowPlanEnabled()) - adminClient.rebalanceStateChange(currentCluster, - transitionCluster, - rebalancePartitionPlanList, - true, - true, - false, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(currentCluster, + transitionCluster, + rebalancePartitionPlanList, + true, + true, + false, + true, + true); } else { // Case 0 - swap + cluster change + rebalance state change RebalanceUtils.printLog(taskId, logger, "Swap + Cluster metadata change + rebalance state change"); if(!rebalanceConfig.isShowPlanEnabled()) - adminClient.rebalanceStateChange(currentCluster, - transitionCluster, - rebalancePartitionPlanList, - true, - true, - true, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(currentCluster, + transitionCluster, + rebalancePartitionPlanList, + true, + true, + true, + true, + true); } } catch(VoldemortRebalancingException e) { @@ -664,24 +676,24 @@ private void rebalancePerTaskTransition(final int taskId, if(hasReadOnlyStores && hasReadWriteStores && finishedReadOnlyStores) { // Case 0 - adminClient.rebalanceStateChange(null, - currentCluster, - null, - true, - true, - false, - false, - false); + adminClient.rebalanceOps.rebalanceStateChange(null, + currentCluster, + null, + true, + true, + false, + false, + false); } else if(hasReadWriteStores && finishedReadOnlyStores) { // Case 4 - adminClient.rebalanceStateChange(null, - currentCluster, - null, - false, - true, - false, - false, - false); + adminClient.rebalanceOps.rebalanceStateChange(null, + currentCluster, + null, + false, + true, + false, + false, + false); } throw 
e; @@ -719,8 +731,8 @@ private List executeTasks(final int taskId, false); for(Entry> entries: donorNodeBasedPartitionsInfo.entrySet()) { try { - Thread.sleep(10000); - } catch (InterruptedException e) {} + Thread.sleep(10000); + } catch(InterruptedException e) {} DonorBasedRebalanceTask rebalanceTask = new DonorBasedRebalanceTask(taskId, entries.getValue(), rebalanceConfig, @@ -740,7 +752,7 @@ public AdminClient getAdminClient() { } public void stop() { - adminClient.stop(); + adminClient.close(); } } diff --git a/src/java/voldemort/client/rebalance/RebalancePartitionsInfo.java b/src/java/voldemort/client/rebalance/RebalancePartitionsInfo.java index 078bbac332..330a81d021 100644 --- a/src/java/voldemort/client/rebalance/RebalancePartitionsInfo.java +++ b/src/java/voldemort/client/rebalance/RebalancePartitionsInfo.java @@ -1,3 +1,18 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ package voldemort.client.rebalance; import java.io.StringReader; @@ -6,8 +21,8 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.Map.Entry; +import java.util.Set; import voldemort.VoldemortException; import voldemort.cluster.Cluster; @@ -165,7 +180,8 @@ public ImmutableMap asMap() { replicaToAddPartition.get(replicaNum)); } else { builder.put(unbalancedStore + "replicaToAddPartitionList" - + Integer.toString(replicaNum), Lists.newArrayList()); + + Integer.toString(replicaNum), + Lists.newArrayList()); } if(replicaToDeletePartition != null @@ -175,7 +191,8 @@ public ImmutableMap asMap() { replicaToDeletePartition.get(replicaNum)); } else { builder.put(unbalancedStore + "replicaToDeletePartitionList" - + Integer.toString(replicaNum), Lists.newArrayList()); + + Integer.toString(replicaNum), + Lists.newArrayList()); } } } @@ -261,7 +278,9 @@ public List getStealMasterPartitions() { @Override public String toString() { StringBuffer sb = new StringBuffer(); - sb.append("\nRebalancePartitionsInfo(" + getStealerId() + " <--- " + getDonorId() + " "); + sb.append("\nRebalancePartitionsInfo(" + getStealerId() + " [" + + initialCluster.getNodeById(getStealerId()).getHost() + "] <--- " + getDonorId() + + " [" + initialCluster.getNodeById(getDonorId()).getHost() + "] "); for(String unbalancedStore: storeToReplicaToAddPartitionList.keySet()) { diff --git a/src/java/voldemort/client/rebalance/task/DonorBasedRebalanceTask.java b/src/java/voldemort/client/rebalance/task/DonorBasedRebalanceTask.java index 743adcae88..9f50b3a5a3 100644 --- a/src/java/voldemort/client/rebalance/task/DonorBasedRebalanceTask.java +++ b/src/java/voldemort/client/rebalance/task/DonorBasedRebalanceTask.java @@ -44,13 +44,13 @@ public void run() { RebalanceUtils.printLog(taskId, logger, "Starting on node " + donorNodeId + " rebalancing task " + stealInfos); - rebalanceAsyncId = adminClient.rebalanceNode(stealInfos); + rebalanceAsyncId = adminClient.rebalanceOps.rebalanceNode(stealInfos); // Wait for the task to get over - 
adminClient.waitForCompletion(donorNodeId, - rebalanceAsyncId, - config.getRebalancingClientTimeoutSeconds(), - TimeUnit.SECONDS); + adminClient.rpcOps.waitForCompletion(donorNodeId, + rebalanceAsyncId, + config.getRebalancingClientTimeoutSeconds(), + TimeUnit.SECONDS); RebalanceUtils.printLog(taskId, logger, "Succesfully finished rebalance for async operation id " @@ -59,8 +59,9 @@ public void run() { } catch(UnreachableStoreException e) { exception = e; logger.error("Donor node " + donorNodeId - + " is unreachable, please make sure it is up and running : " - + e.getMessage(), e); + + " is unreachable, please make sure it is up and running : " + + e.getMessage(), + e); } catch(Exception e) { exception = e; logger.error("Rebalance failed : " + e.getMessage(), e); diff --git a/src/java/voldemort/client/rebalance/task/StealerBasedRebalanceTask.java b/src/java/voldemort/client/rebalance/task/StealerBasedRebalanceTask.java index c3840a11c7..572418818c 100644 --- a/src/java/voldemort/client/rebalance/task/StealerBasedRebalanceTask.java +++ b/src/java/voldemort/client/rebalance/task/StealerBasedRebalanceTask.java @@ -49,7 +49,7 @@ private int startNodeRebalancing() { RebalanceUtils.printLog(taskId, logger, "Starting on node " + stealerNodeId + " rebalancing task " + stealInfos.get(0)); - int asyncOperationId = adminClient.rebalanceNode(stealInfos.get(0)); + int asyncOperationId = adminClient.rebalanceOps.rebalanceNode(stealInfos.get(0)); return asyncOperationId; } catch(AlreadyRebalancingException e) { @@ -58,11 +58,11 @@ private int startNodeRebalancing() { "Node " + stealerNodeId + " is currently rebalancing. Waiting till completion"); - adminClient.waitForCompletion(stealerNodeId, - MetadataStore.SERVER_STATE_KEY, - VoldemortState.NORMAL_SERVER.toString(), - config.getRebalancingClientTimeoutSeconds(), - TimeUnit.SECONDS); + adminClient.rpcOps.waitForCompletion(stealerNodeId, + MetadataStore.SERVER_STATE_KEY, + VoldemortState.NORMAL_SERVER.toString(), + config.getRebalancingClientTimeoutSeconds(), + TimeUnit.SECONDS); rebalanceException = e; } } @@ -83,10 +83,10 @@ public void run() { rebalanceAsyncId = startNodeRebalancing(); // Wait for the task to get over - adminClient.waitForCompletion(stealerNodeId, - rebalanceAsyncId, - config.getRebalancingClientTimeoutSeconds(), - TimeUnit.SECONDS); + adminClient.rpcOps.waitForCompletion(stealerNodeId, + rebalanceAsyncId, + config.getRebalancingClientTimeoutSeconds(), + TimeUnit.SECONDS); RebalanceUtils.printLog(taskId, logger, "Succesfully finished rebalance for async operation id " @@ -95,8 +95,9 @@ public void run() { } catch(UnreachableStoreException e) { exception = e; logger.error("Stealer node " + stealerNodeId - + " is unreachable, please make sure it is up and running : " - + e.getMessage(), e); + + " is unreachable, please make sure it is up and running : " + + e.getMessage(), + e); } catch(Exception e) { exception = e; logger.error("Rebalance failed : " + e.getMessage(), e); diff --git a/src/java/voldemort/cluster/Cluster.java b/src/java/voldemort/cluster/Cluster.java index 13933a2ea6..be7bd75196 100644 --- a/src/java/voldemort/cluster/Cluster.java +++ b/src/java/voldemort/cluster/Cluster.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -23,6 +23,8 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.TreeSet; import voldemort.VoldemortException; import voldemort.annotations.concurrency.Threadsafe; @@ -47,6 +49,8 @@ public class Cluster implements Serializable { private final int numberOfTags; private final Map nodesById; private final Map zonesById; + private final Map> nodesPerZone; + private final Map> partitionsPerZone; public Cluster(String name, List nodes) { this(name, nodes, new ArrayList()); @@ -54,6 +58,9 @@ public Cluster(String name, List nodes) { public Cluster(String name, List nodes, List zones) { this.name = Utils.notNull(name); + this.partitionsPerZone = new LinkedHashMap>(); + this.nodesPerZone = new LinkedHashMap>(); + if(zones.size() != 0) { zonesById = new LinkedHashMap(zones.size()); for(Zone zone: zones) { @@ -61,11 +68,16 @@ public Cluster(String name, List nodes, List zones) { throw new IllegalArgumentException("Zone id " + zone.getId() + " appears twice in the zone list."); zonesById.put(zone.getId(), zone); + nodesPerZone.put(zone, new ArrayList()); + partitionsPerZone.put(zone, new ArrayList()); } } else { // Add default zone zonesById = new LinkedHashMap(1); - zonesById.put(Zone.DEFAULT_ZONE_ID, new Zone()); + Zone defaultZone = new Zone(); + zonesById.put(defaultZone.getId(), defaultZone); + nodesPerZone.put(defaultZone, new ArrayList()); + partitionsPerZone.put(defaultZone, new ArrayList()); } this.nodesById = new LinkedHashMap(nodes.size()); @@ -74,14 +86,19 @@ public Cluster(String name, List nodes, List zones) { throw new IllegalArgumentException("Node id " + node.getId() + " appears twice in the node list."); nodesById.put(node.getId(), node); + + Zone nodesZone = zonesById.get(node.getZoneId()); + nodesPerZone.get(nodesZone).add(node.getId()); + partitionsPerZone.get(nodesZone).addAll(node.getPartitionIds()); } this.numberOfTags = getNumberOfTags(nodes); } private int getNumberOfTags(List nodes) { List tags = new ArrayList(); - for(Node node: nodes) + for(Node node: nodes) { tags.addAll(node.getPartitionIds()); + } Collections.sort(tags); for(int i = 0; i < numberOfTags; i++) { if(tags.get(i).intValue() != i) @@ -99,6 +116,23 @@ public Collection getNodes() { return nodesById.values(); } + /** + * @return Sorted set of node Ids + */ + public Set getNodeIds() { + Set nodeIds = nodesById.keySet(); + return new TreeSet(nodeIds); + } + + /** + * + * @return Sorted set of Zone Ids + */ + public Set getZoneIds() { + Set zoneIds = zonesById.keySet(); + return new TreeSet(zoneIds); + } + public Collection getZones() { return zonesById.values(); } @@ -133,6 +167,28 @@ public int getNumberOfZones() { return zonesById.size(); } + public int getNumberOfPartitionsInZone(Integer zoneId) { + return partitionsPerZone.get(getZoneById(zoneId)).size(); + } + + public int getNumberOfNodesInZone(Integer zoneId) { + return nodesPerZone.get(getZoneById(zoneId)).size(); + } + + /** + * @return Sorted set of node Ids for given zone + */ + public Set getNodeIdsInZone(Integer zoneId) { + return new TreeSet(nodesPerZone.get(getZoneById(zoneId))); + } + + /** + * @return Sorted set of partition Ids for given zone + */ + public Set getPartitionIdsInZone(Integer zoneId) { + return new TreeSet(partitionsPerZone.get(getZoneById(zoneId))); + } + public Node getNodeById(int id) { Node node = nodesById.get(id); if(node == null) diff --git a/src/java/voldemort/cluster/Node.java b/src/java/voldemort/cluster/Node.java index 
b1b7e1d4ea..5140f95941 100644 --- a/src/java/voldemort/cluster/Node.java +++ b/src/java/voldemort/cluster/Node.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -150,6 +150,7 @@ public int hashCode() { return getId(); } + @Override public int compareTo(Node other) { return Integer.valueOf(this.id).compareTo(other.getId()); } diff --git a/src/java/voldemort/cluster/failuredetector/AbstractFailureDetector.java b/src/java/voldemort/cluster/failuredetector/AbstractFailureDetector.java index ab6c70427e..24dce0c3b4 100644 --- a/src/java/voldemort/cluster/failuredetector/AbstractFailureDetector.java +++ b/src/java/voldemort/cluster/failuredetector/AbstractFailureDetector.java @@ -1,5 +1,5 @@ /* - * Copyright 2009 Mustard Grain, Inc., 2009-2010 LinkedIn, Inc. + * Copyright 2009 Mustard Grain, Inc., 2009-2012 LinkedIn, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -186,11 +186,11 @@ protected void setAvailable(Node node) { protected void setUnavailable(Node node, UnreachableStoreException e) { NodeStatus nodeStatus = getNodeStatus(node); - if(logger.isEnabledFor(Level.WARN)) { + if(logger.isDebugEnabled()) { if(e != null) - logger.warn("Node " + node.getId() + " set as unavailable", e); + logger.debug("Node " + node.getId() + " set as unavailable", e); else - logger.warn("Node " + node.getId() + " set as unavailable"); + logger.debug("Node " + node.getId() + " set as unavailable"); } // We need to distinguish the case where we're newly unavailable and the diff --git a/src/java/voldemort/cluster/failuredetector/AsyncRecoveryFailureDetector.java b/src/java/voldemort/cluster/failuredetector/AsyncRecoveryFailureDetector.java index 5bcd1337cc..32e9801d13 100644 --- a/src/java/voldemort/cluster/failuredetector/AsyncRecoveryFailureDetector.java +++ b/src/java/voldemort/cluster/failuredetector/AsyncRecoveryFailureDetector.java @@ -1,5 +1,5 @@ /* - * Copyright 2009 Mustard Grain, Inc., 2009-2010 LinkedIn, Inc. + * Copyright 2009 Mustard Grain, Inc., 2009-2012 LinkedIn, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -117,12 +117,12 @@ public void run() { if(logger.isDebugEnabled()) logger.debug("Verified previously unavailable node " + node.getId() - + ", will mark as available..."); + + "is now available."); nodeRecovered(node); } catch(UnreachableStoreException e) { - if(logger.isEnabledFor(Level.WARN)) - logger.warn("Node " + node.getId() + " still unavailable", e); + if(logger.isDebugEnabled()) + logger.debug("Node " + node.getId() + " still unavailable."); } catch(Exception e) { if(logger.isEnabledFor(Level.ERROR)) logger.error("Node " + node.getId() + " unavailable due to error", e); diff --git a/src/java/voldemort/cluster/failuredetector/ThresholdFailureDetector.java b/src/java/voldemort/cluster/failuredetector/ThresholdFailureDetector.java index f577892fa7..5e3ec8c9b7 100644 --- a/src/java/voldemort/cluster/failuredetector/ThresholdFailureDetector.java +++ b/src/java/voldemort/cluster/failuredetector/ThresholdFailureDetector.java @@ -1,5 +1,5 @@ /* - * Copyright 2009-2010 LinkedIn, Inc. + * Copyright 2009-2012 LinkedIn, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -145,6 +145,10 @@ protected void update(Node node, int successDelta, UnreachableStoreException e) String catastrophicError = getCatastrophicError(e); NodeStatus nodeStatus = getNodeStatus(node); + boolean invokeSetAvailable = false; + boolean invokeSetUnavailable = false; + // Protect all logic to decide on available/unavailable w/in + // synchronized section synchronized(nodeStatus) { if(currentTime >= nodeStatus.getStartMillis() + getConfig().getThresholdInterval()) { // We've passed into a new interval, so reset our counts @@ -165,7 +169,7 @@ protected void update(Node node, int successDelta, UnreachableStoreException e) logger.trace("Node " + node.getId() + " experienced catastrophic error: " + catastrophicError); - setUnavailable(node, e); + invokeSetUnavailable = true; } else if(nodeStatus.getFailure() >= getConfig().getThresholdCountMinimum()) { long percentage = (nodeStatus.getSuccess() * 100) / nodeStatus.getTotal(); @@ -173,12 +177,20 @@ protected void update(Node node, int successDelta, UnreachableStoreException e) logger.trace("Node " + node.getId() + " percentage: " + percentage + "%"); if(percentage >= getConfig().getThreshold()) - setAvailable(node); + invokeSetAvailable = true; else - setUnavailable(node, e); + invokeSetUnavailable = true; } } } + // Actually call set(Un)Available outside of synchronized section. This + // ensures that side effects are not w/in a sync section (e.g., alerting + // all the failure detector listeners). + if(invokeSetAvailable) { + setAvailable(node); + } else if(invokeSetUnavailable) { + setUnavailable(node, e); + } } protected String getCatastrophicError(UnreachableStoreException e) { diff --git a/src/java/voldemort/utils/ByteBufferBackedInputStream.java b/src/java/voldemort/common/nio/ByteBufferBackedInputStream.java similarity index 65% rename from src/java/voldemort/utils/ByteBufferBackedInputStream.java rename to src/java/voldemort/common/nio/ByteBufferBackedInputStream.java index aeddbe62b0..90bd195d5e 100644 --- a/src/java/voldemort/utils/ByteBufferBackedInputStream.java +++ b/src/java/voldemort/common/nio/ByteBufferBackedInputStream.java @@ -14,12 +14,14 @@ * the License. 
*/ -package voldemort.utils; +package voldemort.common.nio; import java.io.IOException; import java.io.InputStream; import java.nio.ByteBuffer; +import org.apache.commons.lang.mutable.MutableLong; + import voldemort.annotations.concurrency.NotThreadsafe; /** @@ -38,16 +40,35 @@ public class ByteBufferBackedInputStream extends InputStream { private ByteBuffer buffer; + /** + * Reference to a size tracking object, that tracks the size of the buffer + * in bytes + */ + private MutableLong sizeTracker; + public ByteBufferBackedInputStream(ByteBuffer buffer) { this.buffer = buffer; + this.sizeTracker = null; + } + + public ByteBufferBackedInputStream(ByteBuffer buffer, MutableLong sizeTracker) { + this.buffer = buffer; + this.sizeTracker = sizeTracker; + if(buffer != null) + this.sizeTracker.add(buffer.capacity()); } public ByteBuffer getBuffer() { return buffer; } - public void setBuffer(ByteBuffer buffer) { - this.buffer = buffer; + public void setBuffer(ByteBuffer newBuffer) { + // update the size tracker with the new buffer size + if((sizeTracker != null && this.buffer != null && newBuffer != null)) { + sizeTracker.add(newBuffer.capacity()); + sizeTracker.subtract(this.buffer.capacity()); + } + this.buffer = newBuffer; } @Override @@ -68,4 +89,9 @@ public int read(byte[] bytes, int off, int len) throws IOException { return len; } + public void close() { + if(sizeTracker != null && this.buffer != null) { + sizeTracker.subtract(this.buffer.capacity()); + } + } } diff --git a/src/java/voldemort/utils/ByteBufferBackedOutputStream.java b/src/java/voldemort/common/nio/ByteBufferBackedOutputStream.java similarity index 64% rename from src/java/voldemort/utils/ByteBufferBackedOutputStream.java rename to src/java/voldemort/common/nio/ByteBufferBackedOutputStream.java index 0be7802622..ce8067d35a 100644 --- a/src/java/voldemort/utils/ByteBufferBackedOutputStream.java +++ b/src/java/voldemort/common/nio/ByteBufferBackedOutputStream.java @@ -14,13 +14,16 @@ * the License. 
*/ -package voldemort.utils; +package voldemort.common.nio; import java.io.IOException; import java.io.OutputStream; import java.nio.ByteBuffer; +import org.apache.commons.lang.mutable.MutableLong; + import voldemort.annotations.concurrency.NotThreadsafe; +import voldemort.utils.ByteUtils; /** * ByteBufferBackedOutputStream serves two purposes: @@ -46,17 +49,37 @@ public class ByteBufferBackedOutputStream extends OutputStream { private boolean wasExpanded; + /** + * Reference to a size tracking object, that tracks the size of the buffer + * in bytes + */ + private MutableLong sizeTracker; + public ByteBufferBackedOutputStream(ByteBuffer buffer) { this.buffer = buffer; wasExpanded = false; + this.sizeTracker = null; + } + + public ByteBufferBackedOutputStream(ByteBuffer buffer, MutableLong sizeTracker) { + this.buffer = buffer; + wasExpanded = false; + this.sizeTracker = sizeTracker; + if(buffer != null) + this.sizeTracker.add(buffer.capacity()); } public ByteBuffer getBuffer() { return buffer; } - public void setBuffer(ByteBuffer buffer) { - this.buffer = buffer; + public void setBuffer(ByteBuffer newBuffer) { + // update the size tracker with the new buffer size + if((sizeTracker != null && this.buffer != null && newBuffer != null)) { + sizeTracker.add(newBuffer.capacity()); + sizeTracker.subtract(this.buffer.capacity()); + } + this.buffer = newBuffer; wasExpanded = false; } @@ -78,8 +101,13 @@ private void expandIfNeeded(int len) { if(need <= 0) return; - int newCapacity = buffer.capacity() + need; - buffer = ByteUtils.expand(buffer, newCapacity * 2); + int newCapacity = (buffer.capacity() + need) * 2; + // update the size tracker with the new buffer size + if(sizeTracker != null) { + sizeTracker.add(newCapacity); + sizeTracker.subtract(this.buffer.capacity()); + } + buffer = ByteUtils.expand(buffer, newCapacity); wasExpanded = true; } @@ -87,4 +115,9 @@ public boolean wasExpanded() { return wasExpanded; } + public void close() { + if(sizeTracker != null && this.buffer != null) { + sizeTracker.subtract(this.buffer.capacity()); + } + } } diff --git a/src/java/voldemort/common/nio/CommBufferSizeStats.java b/src/java/voldemort/common/nio/CommBufferSizeStats.java new file mode 100644 index 0000000000..ba71b5b1de --- /dev/null +++ b/src/java/voldemort/common/nio/CommBufferSizeStats.java @@ -0,0 +1,28 @@ +package voldemort.common.nio; + +import org.apache.commons.lang.mutable.MutableLong; + +/** + * Statistics object to track the communication buffer sizes across all the + * connections, handled by the selector managers + * + */ +public class CommBufferSizeStats { + + private MutableLong commReadBufferSizeBytes; + + private MutableLong commWriteBufferSizeBytes; + + public CommBufferSizeStats() { + commReadBufferSizeBytes = new MutableLong(0); + commWriteBufferSizeBytes = new MutableLong(0); + } + + public MutableLong getCommReadBufferSizeTracker() { + return commReadBufferSizeBytes; + } + + public MutableLong getCommWriteBufferSizeTracker() { + return commWriteBufferSizeBytes; + } +} diff --git a/src/java/voldemort/utils/SelectorManager.java b/src/java/voldemort/common/nio/SelectorManager.java similarity index 90% rename from src/java/voldemort/utils/SelectorManager.java rename to src/java/voldemort/common/nio/SelectorManager.java index 652f5daecc..b68e2258d8 100644 --- a/src/java/voldemort/utils/SelectorManager.java +++ b/src/java/voldemort/common/nio/SelectorManager.java @@ -14,7 +14,7 @@ * the License. 
*/ -package voldemort.utils; +package voldemort.common.nio; import java.io.IOException; import java.nio.channels.ClosedSelectorException; @@ -98,6 +98,23 @@ public class SelectorManager implements Runnable { protected final Logger logger = Logger.getLogger(getClass()); + // statistics about the current select loop + /** + * Number of connections selected (meaning they have some data to be + * read/written) in the current processing loop + */ + protected int selectCount = -1; + /** + * Amount of time taken to process all the connections selected in this + * processing loop + */ + protected long processingTimeMs = -1; + /** + * Amount of time spent in the select() call. This is an indicator of how + * busy the thread is + */ + protected long selectTimeMs = -1; + public SelectorManager() { try { this.selector = Selector.open(); @@ -172,7 +189,10 @@ public void run() { processEvents(); try { + selectTimeMs = System.currentTimeMillis(); int selected = selector.select(SELECTOR_POLL_MS); + selectTimeMs = System.currentTimeMillis() - selectTimeMs; + selectCount = selected; if(isClosed.get()) { if(logger.isInfoEnabled()) @@ -182,6 +202,7 @@ public void run() { } if(selected > 0) { + processingTimeMs = System.currentTimeMillis(); Iterator i = selector.selectedKeys().iterator(); while(i.hasNext()) { @@ -194,6 +215,7 @@ public void run() { worker.run(); } } + processingTimeMs = System.currentTimeMillis() - processingTimeMs; } } catch(ClosedSelectorException e) { if(logger.isDebugEnabled()) @@ -217,5 +239,4 @@ public void run() { } } } - } diff --git a/src/java/voldemort/utils/SelectorManagerWorker.java b/src/java/voldemort/common/nio/SelectorManagerWorker.java similarity index 92% rename from src/java/voldemort/utils/SelectorManagerWorker.java rename to src/java/voldemort/common/nio/SelectorManagerWorker.java index 77169c5da7..f378e560f5 100644 --- a/src/java/voldemort/utils/SelectorManagerWorker.java +++ b/src/java/voldemort/common/nio/SelectorManagerWorker.java @@ -14,7 +14,7 @@ * the License. */ -package voldemort.utils; +package voldemort.common.nio; import java.io.EOFException; import java.io.IOException; @@ -29,6 +29,8 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; +import voldemort.utils.ByteUtils; + /** * SelectorManagerWorker manages a Selector, SocketChannel, and IO streams * implementation. At the point that the run method is invoked, the Selector @@ -62,13 +64,16 @@ public abstract class SelectorManagerWorker implements Runnable { public SelectorManagerWorker(Selector selector, SocketChannel socketChannel, - int socketBufferSize) { + int socketBufferSize, + CommBufferSizeStats commBufferStats) { this.selector = selector; this.socketChannel = socketChannel; this.socketBufferSize = socketBufferSize; this.resizeThreshold = socketBufferSize * 2; // This is arbitrary... 
- this.inputStream = new ByteBufferBackedInputStream(ByteBuffer.allocate(socketBufferSize)); - this.outputStream = new ByteBufferBackedOutputStream(ByteBuffer.allocate(socketBufferSize)); + this.inputStream = new ByteBufferBackedInputStream(ByteBuffer.allocate(socketBufferSize), + commBufferStats.getCommReadBufferSizeTracker()); + this.outputStream = new ByteBufferBackedOutputStream(ByteBuffer.allocate(socketBufferSize), + commBufferStats.getCommWriteBufferSizeTracker()); this.createTimestamp = System.nanoTime(); this.isClosed = new AtomicBoolean(false); @@ -162,6 +167,10 @@ protected void closeInternal() { logger.warn(e.getMessage(), e); } } + + // close the streams, so we account for comm buffer frees + inputStream.close(); + outputStream.close(); } public boolean isClosed() { diff --git a/src/java/voldemort/common/service/AbstractService.java b/src/java/voldemort/common/service/AbstractService.java index 222faed014..25a1496030 100644 --- a/src/java/voldemort/common/service/AbstractService.java +++ b/src/java/voldemort/common/service/AbstractService.java @@ -68,6 +68,7 @@ public void stop() { synchronized(this) { if(!isStarted()) { logger.info("The service is already stopped, ignoring duplicate attempt."); + return; } stopInner(); diff --git a/src/java/voldemort/common/service/SchedulerService.java b/src/java/voldemort/common/service/SchedulerService.java index 645be4015d..2cde4dd98a 100644 --- a/src/java/voldemort/common/service/SchedulerService.java +++ b/src/java/voldemort/common/service/SchedulerService.java @@ -164,10 +164,25 @@ public void schedule(String id, Runnable runnable, Date timeToRun) { } public void schedule(String id, Runnable runnable, Date nextRun, long periodMs) { - ScheduledFuture future = scheduler.scheduleWithFixedDelay(runnable, - delayMs(nextRun), - periodMs, - TimeUnit.MILLISECONDS); + schedule(id, runnable, nextRun, periodMs, false); + } + + public void schedule(String id, + Runnable runnable, + Date nextRun, + long periodMs, + boolean scheduleAtFixedRate) { + ScheduledFuture future = null; + if(scheduleAtFixedRate) + future = scheduler.scheduleAtFixedRate(runnable, + delayMs(nextRun), + periodMs, + TimeUnit.MILLISECONDS); + else + future = scheduler.scheduleWithFixedDelay(runnable, + delayMs(nextRun), + periodMs, + TimeUnit.MILLISECONDS); if(!allJobs.containsKey(id)) { allJobs.put(id, new ScheduledRunnable(runnable, nextRun, periodMs)); } diff --git a/src/java/voldemort/common/service/ServiceType.java b/src/java/voldemort/common/service/ServiceType.java index 0fe7c7932b..30b29182be 100644 --- a/src/java/voldemort/common/service/ServiceType.java +++ b/src/java/voldemort/common/service/ServiceType.java @@ -15,7 +15,8 @@ public enum ServiceType { VOLDEMORT("voldemort-server"), ASYNC_SCHEDULER("async-scheduler"), GOSSIP("gossip-service"), - REBALANCE("rebalance-service"); + REBALANCE("rebalance-service"), + COORDINATOR("coordinator-service"); private final String display; diff --git a/src/java/voldemort/coordinator/CoordinatorConfig.java b/src/java/voldemort/coordinator/CoordinatorConfig.java new file mode 100644 index 0000000000..670f278b6d --- /dev/null +++ b/src/java/voldemort/coordinator/CoordinatorConfig.java @@ -0,0 +1,148 @@ +package voldemort.coordinator; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.Properties; + +import org.apache.commons.io.IOUtils; + +import voldemort.utils.ConfigurationException; +import 
voldemort.utils.Props; +import voldemort.utils.Utils; + +public class CoordinatorConfig { + + private volatile List bootstrapURLs = null; + private volatile String fatClientConfigPath = null; + private volatile int fatClientWrapperMaxPoolSize = 20; + private volatile int fatClientWrapperCorePoolSize = 20; + private volatile int fatClientWrapperKeepAliveInSecs = 60; + private volatile int metadataCheckIntervalInMs = 5000; + + /* Propery names for propery-based configuration */ + public static final String BOOTSTRAP_URLS_PROPERTY = "bootstrap_urls"; + public static final String FAT_CLIENTS_CONFIG_FILE_PATH_PROPERTY = "fat_clients_config_file_path"; + public static final String FAT_CLIENT_WRAPPER_MAX_POOL_SIZE_PROPERTY = "fat_client_wrapper_max_pool_size"; + public static final String FAT_CLIENT_WRAPPER_CORE_POOL_SIZE_PROPERTY = "fat_client_wrapper_core_pool_size"; + public static final String FAT_CLIENT_WRAPPER_POOL_KEEPALIVE_IN_SECS = "fat_client_wrapper_pool_keepalive_in_secs"; + public static final String METADATA_CHECK_INTERVAL_IN_MS = "metadata_check_interval_in_ms"; + + /** + * Instantiate the coordinator config using a properties file + * + * @param propertyFile Properties file + */ + public CoordinatorConfig(File propertyFile) { + Properties properties = new Properties(); + InputStream input = null; + try { + input = new BufferedInputStream(new FileInputStream(propertyFile.getAbsolutePath())); + properties.load(input); + } catch(IOException e) { + throw new ConfigurationException(e); + } finally { + IOUtils.closeQuietly(input); + } + setProperties(properties); + } + + /** + * Initiate the coordinator config from a set of properties. This is useful + * for wiring from Spring or for externalizing client properties to a + * properties file + * + * @param properties The properties to use + */ + public CoordinatorConfig(Properties properties) { + setProperties(properties); + } + + private void setProperties(Properties properties) { + Props props = new Props(properties); + if(props.containsKey(BOOTSTRAP_URLS_PROPERTY)) { + setBootstrapURLs(props.getList(BOOTSTRAP_URLS_PROPERTY)); + } + + if(props.containsKey(FAT_CLIENTS_CONFIG_FILE_PATH_PROPERTY)) { + setFatClientConfigPath(props.getString(FAT_CLIENTS_CONFIG_FILE_PATH_PROPERTY)); + } + + if(props.containsKey(FAT_CLIENT_WRAPPER_CORE_POOL_SIZE_PROPERTY)) { + setFatClientWrapperCorePoolSize(props.getInt(FAT_CLIENT_WRAPPER_CORE_POOL_SIZE_PROPERTY, + this.fatClientWrapperCorePoolSize)); + } + + if(props.containsKey(FAT_CLIENT_WRAPPER_MAX_POOL_SIZE_PROPERTY)) { + setFatClientWrapperMaxPoolSize(props.getInt(FAT_CLIENT_WRAPPER_MAX_POOL_SIZE_PROPERTY, + this.fatClientWrapperMaxPoolSize)); + } + + if(props.containsKey(FAT_CLIENT_WRAPPER_POOL_KEEPALIVE_IN_SECS)) { + setFatClientWrapperKeepAliveInSecs(props.getInt(FAT_CLIENT_WRAPPER_POOL_KEEPALIVE_IN_SECS, + this.fatClientWrapperKeepAliveInSecs)); + } + + if(props.containsKey(METADATA_CHECK_INTERVAL_IN_MS)) { + setMetadataCheckIntervalInMs(props.getInt(METADATA_CHECK_INTERVAL_IN_MS, + this.metadataCheckIntervalInMs)); + } + } + + public String[] getBootstrapURLs() { + if(this.bootstrapURLs == null) + throw new IllegalStateException("No bootstrap urls have been set."); + return this.bootstrapURLs.toArray(new String[this.bootstrapURLs.size()]); + } + + public CoordinatorConfig setBootstrapURLs(List bootstrapUrls) { + this.bootstrapURLs = Utils.notNull(bootstrapUrls); + if(this.bootstrapURLs.size() <= 0) + throw new IllegalArgumentException("Must provide at least one bootstrap URL."); + return this; + } 
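+
+    /*
+     * Minimal usage sketch (the values below are assumptions for illustration,
+     * not defaults shipped with this patch): a coordinator properties file fed
+     * through setProperties() could look like
+     *
+     *   bootstrap_urls=tcp://localhost:6666
+     *   fat_clients_config_file_path=config/fat_client_config.avro
+     *   fat_client_wrapper_max_pool_size=20
+     *   fat_client_wrapper_core_pool_size=20
+     *   fat_client_wrapper_pool_keepalive_in_secs=60
+     *   metadata_check_interval_in_ms=5000
+     *
+     * and would be loaded with
+     *
+     *   CoordinatorConfig config = new CoordinatorConfig(new File("coordinator.properties"));
+     *
+     * Keys that are left unset simply keep the hard-coded defaults declared above.
+     */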
+ + public String getFatClientConfigPath() { + return fatClientConfigPath; + } + + public void setFatClientConfigPath(String fatClientConfigPath) { + this.fatClientConfigPath = fatClientConfigPath; + } + + public int getFatClientWrapperMaxPoolSize() { + return fatClientWrapperMaxPoolSize; + } + + public void setFatClientWrapperMaxPoolSize(int fatClientWrapperMaxPoolSize) { + this.fatClientWrapperMaxPoolSize = fatClientWrapperMaxPoolSize; + } + + public int getFatClientWrapperCorePoolSize() { + return fatClientWrapperCorePoolSize; + } + + public void setFatClientWrapperCorePoolSize(int fatClientWrapperCorePoolSize) { + this.fatClientWrapperCorePoolSize = fatClientWrapperCorePoolSize; + } + + public int getFatClientWrapperKeepAliveInSecs() { + return fatClientWrapperKeepAliveInSecs; + } + + public void setFatClientWrapperKeepAliveInSecs(int fatClientWrapperKeepAliveInSecs) { + this.fatClientWrapperKeepAliveInSecs = fatClientWrapperKeepAliveInSecs; + } + + public int getMetadataCheckIntervalInMs() { + return metadataCheckIntervalInMs; + } + + public void setMetadataCheckIntervalInMs(int metadataCheckIntervalInMs) { + this.metadataCheckIntervalInMs = metadataCheckIntervalInMs; + } + +} diff --git a/src/java/voldemort/coordinator/CoordinatorPipelineFactory.java b/src/java/voldemort/coordinator/CoordinatorPipelineFactory.java new file mode 100644 index 0000000000..1646f02130 --- /dev/null +++ b/src/java/voldemort/coordinator/CoordinatorPipelineFactory.java @@ -0,0 +1,63 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.coordinator; + +import static org.jboss.netty.channel.Channels.pipeline; + +import java.util.Map; + +import org.jboss.netty.channel.ChannelPipeline; +import org.jboss.netty.channel.ChannelPipelineFactory; +import org.jboss.netty.handler.codec.http.HttpChunkAggregator; +import org.jboss.netty.handler.codec.http.HttpContentCompressor; +import org.jboss.netty.handler.codec.http.HttpRequestDecoder; +import org.jboss.netty.handler.codec.http.HttpResponseEncoder; + +/** + * A PipelineFactory implementation to setup the Netty Pipeline in the + * Coordinator + * + */ +public class CoordinatorPipelineFactory implements ChannelPipelineFactory { + + private boolean noop = false; + private Map fatClientMap; + + public CoordinatorPipelineFactory(Map fatClientMap, boolean noop) { + this.fatClientMap = fatClientMap; + this.noop = noop; + } + + @Override + public ChannelPipeline getPipeline() throws Exception { + // Create a default pipeline implementation. + ChannelPipeline pipeline = pipeline(); + + pipeline.addLast("decoder", new HttpRequestDecoder()); + pipeline.addLast("aggregator", new HttpChunkAggregator(1048576)); + pipeline.addLast("encoder", new HttpResponseEncoder()); + // Remove the following line if you don't want automatic content + // compression. 
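+        // (HttpContentCompressor only compresses when the request advertises
+        // support via its Accept-Encoding header, so clients that omit the
+        // header receive uncompressed responses either way.)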
+ pipeline.addLast("deflater", new HttpContentCompressor()); + if(this.noop) { + pipeline.addLast("handler", new NoopHttpRequestHandler()); + } else { + pipeline.addLast("handler", new VoldemortHttpRequestHandler(this.fatClientMap)); + } + return pipeline; + } +} diff --git a/src/java/voldemort/coordinator/CoordinatorService.java b/src/java/voldemort/coordinator/CoordinatorService.java new file mode 100644 index 0000000000..a6a8a508bc --- /dev/null +++ b/src/java/voldemort/coordinator/CoordinatorService.java @@ -0,0 +1,267 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.coordinator; + +import static voldemort.utils.Utils.croak; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.StringReader; +import java.net.InetSocketAddress; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.concurrent.Callable; +import java.util.concurrent.Executors; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.io.JsonDecoder; +import org.apache.avro.util.Utf8; +import org.apache.commons.io.IOUtils; +import org.apache.log4j.Logger; +import org.jboss.netty.bootstrap.ServerBootstrap; +import org.jboss.netty.channel.socket.nio.NioServerSocketChannelFactory; + +import voldemort.client.ClientConfig; +import voldemort.client.SocketStoreClientFactory; +import voldemort.client.SystemStoreRepository; +import voldemort.client.scheduler.AsyncMetadataVersionManager; +import voldemort.common.service.AbstractService; +import voldemort.common.service.SchedulerService; +import voldemort.common.service.ServiceType; +import voldemort.server.VoldemortServer; +import voldemort.store.StoreDefinition; +import voldemort.store.metadata.MetadataStore; +import voldemort.utils.SystemTime; +import voldemort.utils.Utils; +import voldemort.xml.StoreDefinitionsMapper; + +import com.google.common.base.Joiner; + +/** + * A Netty based HTTP service that accepts REST requests from the Voldemort thin + * clients and invokes the corresponding Fat client API. 
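+ *
+ * A minimal startup sketch (the file name is only an example; any coordinator
+ * properties file will do), mirroring what main() does:
+ *
+ * <pre>
+ * CoordinatorConfig config = new CoordinatorConfig(new File("coordinator.properties"));
+ * CoordinatorService coordinator = new CoordinatorService(config);
+ * if(!coordinator.isStarted())
+ *     coordinator.start();
+ * </pre>
+ *
+ * startInner() then bootstraps one FatClientWrapper per store and binds the
+ * Netty REST endpoint on port 8080.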
+ * + */ +public class CoordinatorService extends AbstractService { + + private CoordinatorConfig config = null; + + public CoordinatorService(CoordinatorConfig config) { + super(ServiceType.COORDINATOR); + this.config = config; + } + + private static boolean noop = false; + private static SocketStoreClientFactory storeClientFactory = null; + private static AsyncMetadataVersionManager asyncMetadataManager = null; + private static SchedulerService schedulerService = null; + private static final Logger logger = Logger.getLogger(CoordinatorService.class); + private static Map fatClientMap = null; + public final static Schema CLIENT_CONFIGS_AVRO_SCHEMA = Schema.parse("{ \"name\": \"clientConfigs\", \"type\":\"array\"," + + "\"items\": { \"name\": \"clientConfig\", \"type\": \"map\", \"values\":\"string\" }}}"); + private static final String STORE_NAME_KEY = "store_name"; + + /** + * Initializes all the Fat clients (1 per store) for the cluster that this + * Coordinator talks to. This is invoked once during startup and then every + * time the Metadata manager detects changes to the cluster and stores + * metadata. + */ + private void initializeFatClients() { + StoreDefinitionsMapper storeMapper = new StoreDefinitionsMapper(); + + // Fetch the state once and use this to initialize all the Fat clients + String storesXml = storeClientFactory.bootstrapMetadataWithRetries(MetadataStore.STORES_KEY); + String clusterXml = storeClientFactory.bootstrapMetadataWithRetries(MetadataStore.CLUSTER_KEY); + + List storeDefList = storeMapper.readStoreList(new StringReader(storesXml), + false); + Map fatClientConfigMap = readClientConfig(this.config.getFatClientConfigPath(), + this.config.getBootstrapURLs()); + // For now Simply create the map of store definition to + // FatClientWrappers + // TODO: After the fat client improvements is done, modify this to + // - Fetch cluster.xml and stores.xml + // - Pass these on to each FatClientWrapper + // - Set up AsyncMetadataVersionManager + fatClientMap = new HashMap(); + for(StoreDefinition storeDef: storeDefList) { + String storeName = storeDef.getName(); + logger.info("Creating a Fat client wrapper for store: " + storeName); + logger.info("Using config: " + fatClientConfigMap.get(storeName)); + fatClientMap.put(storeName, new FatClientWrapper(storeName, + this.config, + fatClientConfigMap.get(storeName), + storesXml, + clusterXml)); + } + } + + @Override + protected void startInner() { + + // Initialize the Voldemort Metadata + ClientConfig clientConfig = new ClientConfig(); + clientConfig.setBootstrapUrls(this.config.getBootstrapURLs()); + storeClientFactory = new SocketStoreClientFactory(clientConfig); + initializeFatClients(); + + // Setup the Async Metadata checker + SystemStoreRepository sysRepository = new SystemStoreRepository(); + String clusterXml = storeClientFactory.bootstrapMetadataWithRetries(MetadataStore.CLUSTER_KEY); + + sysRepository.createSystemStores(clientConfig, + clusterXml, + storeClientFactory.getFailureDetector()); + // Create a callback for re-bootstrapping the client + Callable rebootstrapCallback = new Callable() { + + @Override + public Void call() throws Exception { + initializeFatClients(); + return null; + } + + }; + + asyncMetadataManager = new AsyncMetadataVersionManager(sysRepository, + rebootstrapCallback, + null); + + schedulerService = new SchedulerService(1, SystemTime.INSTANCE, true); + schedulerService.schedule(asyncMetadataManager.getClass().getName(), + asyncMetadataManager, + new Date(), + 
this.config.getMetadataCheckIntervalInMs()); + + // Configure the server. + ServerBootstrap bootstrap = new ServerBootstrap(new NioServerSocketChannelFactory(Executors.newCachedThreadPool(), + Executors.newCachedThreadPool())); + bootstrap.setOption("backlog", 1000); + + // Set up the event pipeline factory. + bootstrap.setPipelineFactory(new CoordinatorPipelineFactory(fatClientMap, noop)); + + // Bind and start to accept incoming connections. + bootstrap.bind(new InetSocketAddress(8080)); + } + + /** + * A function to parse the specified Avro file in order to obtain the config + * for each fat client managed by this coordinator. + * + * @param configFilePath Path of the Avro file containing fat client configs + * @param bootstrapURLs The server URLs used during bootstrap + * @return Map of store name to the corresponding fat client config + */ + @SuppressWarnings("unchecked") + private static Map readClientConfig(String configFilePath, + String[] bootstrapURLs) { + String line; + Map storeNameConfigMap = new HashMap(); + try { + line = Joiner.on(" ") + .join(IOUtils.readLines(new FileReader(new File(configFilePath)))) + .trim(); + + JsonDecoder decoder = new JsonDecoder(CLIENT_CONFIGS_AVRO_SCHEMA, line); + GenericDatumReader datumReader = new GenericDatumReader(CLIENT_CONFIGS_AVRO_SCHEMA); + GenericData.Array> flowMaps = (GenericData.Array>) datumReader.read(null, + decoder); + + // Flows to return back + if(flowMaps != null && flowMaps.size() > 0) { + for(Map flowMap: flowMaps) { + Properties props = new Properties(); + for(Utf8 key: flowMap.keySet()) { + props.put(key.toString(), flowMap.get(key).toString()); + } + + String storeName = flowMap.get(new Utf8(STORE_NAME_KEY)).toString(); + + storeName = props.getProperty(STORE_NAME_KEY); + if(storeName == null || storeName.length() == 0) { + throw new Exception("Illegal Store Name !!!"); + } + + ClientConfig config = new ClientConfig(props); + config.setBootstrapUrls(bootstrapURLs) + .setEnableCompressionLayer(false) + .setEnableSerializationLayer(false) + .enableDefaultClient(true) + .setEnableLazy(false); + + storeNameConfigMap.put(storeName, config); + + } + } + } catch(FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch(IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch(Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + return storeNameConfigMap; + } + + @Override + protected void stopInner() {} + + public static void main(String[] args) throws Exception { + CoordinatorConfig config = null; + try { + if(args.length != 1) { + croak("USAGE: java " + VoldemortServer.class.getName() + + " [coordinator_config_file]"); + + System.exit(-1); + } + + config = new CoordinatorConfig(new File(args[0])); + } catch(Exception e) { + logger.error(e); + Utils.croak("Error while loading configuration: " + e.getMessage()); + } + + final CoordinatorService coordinator = new CoordinatorService(config); + if(!coordinator.isStarted()) { + coordinator.start(); + } + + // add a shutdown hook to stop the coordinator + Runtime.getRuntime().addShutdownHook(new Thread() { + + @Override + public void run() { + if(coordinator.isStarted()) + coordinator.stop(); + } + }); + } +} diff --git a/src/java/voldemort/coordinator/DynamicTimeoutStoreClient.java b/src/java/voldemort/coordinator/DynamicTimeoutStoreClient.java new file mode 100644 index 0000000000..17f3d71af5 --- /dev/null +++ b/src/java/voldemort/coordinator/DynamicTimeoutStoreClient.java @@ -0,0 
+1,209 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.coordinator; + +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.log4j.Logger; + +import voldemort.VoldemortException; +import voldemort.client.AbstractStoreClientFactory; +import voldemort.client.DefaultStoreClient; +import voldemort.client.StoreClientFactory; +import voldemort.store.CompositeVersionedPutVoldemortRequest; +import voldemort.store.CompositeVoldemortRequest; +import voldemort.store.InvalidMetadataException; +import voldemort.store.StoreTimeoutException; +import voldemort.versioning.ObsoleteVersionException; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Version; +import voldemort.versioning.Versioned; + +import com.google.common.collect.Maps; + +/** + * A special store client to invoke Voldemort operations with the following new + * features: 1) Per call timeout facility 2) Ability to disable resolution per + * call + * + * @param Type of the Key + * @param Type of the Value + */ +public class DynamicTimeoutStoreClient extends DefaultStoreClient { + + private final Logger logger = Logger.getLogger(DynamicTimeoutStoreClient.class); + + /** + * + * @param storeName Name of the store this client connects to + * @param storeFactory Reference to the factory used to create this client + * @param maxMetadataRefreshAttempts Number of retries to retrieve the state + * @param storesXml The storesXml used during bootstrap + * @param clusterXml The clusterXml used during bootstrap + */ + public DynamicTimeoutStoreClient(String storeName, + StoreClientFactory storeFactory, + int maxMetadataRefreshAttempts, + String storesXml, + String clusterXml) { + this.storeName = storeName; + this.storeFactory = storeFactory; + this.metadataRefreshAttempts = maxMetadataRefreshAttempts; + bootStrap(clusterXml, storesXml); + } + + // Bootstrap using the given cluster xml and stores xml + // The super class bootStrap() method is used to handle the + // InvalidMetadataException + public void bootStrap(String customClusterXml, String customStoresXml) { + AbstractStoreClientFactory factory = (AbstractStoreClientFactory) this.storeFactory; + this.store = factory.getRawStore(storeName, null, customStoresXml, customClusterXml, null); + } + + public Versioned getWithCustomTimeout(CompositeVoldemortRequest requestWrapper) { + validateTimeout(requestWrapper.getRoutingTimeoutInMs()); + for(int attempts = 0; attempts < this.metadataRefreshAttempts; attempts++) { + try { + List> items = store.get(requestWrapper); + return getItemOrThrow(requestWrapper.getKey(), requestWrapper.getValue(), items); + } catch(InvalidMetadataException e) { + logger.info("Received invalid metadata exception during get [ " + e.getMessage() + + " ] on store '" + storeName + "'. 
Rebootstrapping"); + bootStrap(); + } + } + throw new VoldemortException(this.metadataRefreshAttempts + + " metadata refresh attempts failed."); + } + + public Version putWithCustomTimeout(CompositeVoldemortRequest requestWrapper) { + validateTimeout(requestWrapper.getRoutingTimeoutInMs()); + Versioned versioned; + long startTime = System.currentTimeMillis(); + + // We use the full timeout for doing the Get. In this, we're being + // optimistic that the subsequent put might be faster all the steps + // might finish within the alloted time + versioned = getWithCustomTimeout(requestWrapper); + + long endTime = System.currentTimeMillis(); + if(versioned == null) + versioned = Versioned.value(requestWrapper.getRawValue(), new VectorClock()); + else + versioned.setObject(requestWrapper.getRawValue()); + + // This should not happen unless there's a bug in the + // getWithCustomTimeout + if((endTime - startTime) > requestWrapper.getRoutingTimeoutInMs()) { + throw new StoreTimeoutException("PUT request timed out"); + } + + return putVersionedWithCustomTimeout(new CompositeVersionedPutVoldemortRequest(requestWrapper.getKey(), + versioned, + (requestWrapper.getRoutingTimeoutInMs() - (endTime - startTime)))); + } + + public Version putVersionedWithCustomTimeout(CompositeVoldemortRequest requestWrapper) + throws ObsoleteVersionException { + validateTimeout(requestWrapper.getRoutingTimeoutInMs()); + for(int attempts = 0; attempts < this.metadataRefreshAttempts; attempts++) { + try { + store.put(requestWrapper); + return requestWrapper.getValue().getVersion(); + } catch(InvalidMetadataException e) { + logger.info("Received invalid metadata exception during put [ " + e.getMessage() + + " ] on store '" + storeName + "'. Rebootstrapping"); + bootStrap(); + } + } + throw new VoldemortException(this.metadataRefreshAttempts + + " metadata refresh attempts failed."); + } + + public Map> getAllWithCustomTimeout(CompositeVoldemortRequest requestWrapper) { + validateTimeout(requestWrapper.getRoutingTimeoutInMs()); + Map>> items = null; + for(int attempts = 0;; attempts++) { + if(attempts >= this.metadataRefreshAttempts) + throw new VoldemortException(this.metadataRefreshAttempts + + " metadata refresh attempts failed."); + try { + items = store.getAll(requestWrapper); + break; + } catch(InvalidMetadataException e) { + logger.info("Received invalid metadata exception during getAll [ " + + e.getMessage() + " ] on store '" + storeName + "'. Rebootstrapping"); + bootStrap(); + } + } + Map> result = Maps.newHashMapWithExpectedSize(items.size()); + + for(Entry>> mapEntry: items.entrySet()) { + Versioned value = getItemOrThrow(mapEntry.getKey(), null, mapEntry.getValue()); + result.put(mapEntry.getKey(), value); + } + return result; + } + + public boolean deleteWithCustomTimeout(CompositeVoldemortRequest deleteRequestObject) { + validateTimeout(deleteRequestObject.getRoutingTimeoutInMs()); + if(deleteRequestObject.getVersion() == null) { + + long startTimeInMs = System.currentTimeMillis(); + + // We use the full timeout for doing the Get. 
In this, we're being + // optimistic that the subsequent delete might be faster all the + // steps might finish within the alloted time + Versioned versioned = getWithCustomTimeout(deleteRequestObject); + if(versioned == null) { + return false; + } + + long endTimeInMs = System.currentTimeMillis(); + long diffInMs = endTimeInMs - startTimeInMs; + + // This should not happen unless there's a bug in the + // getWithCustomTimeout + if(diffInMs > deleteRequestObject.getRoutingTimeoutInMs()) { + throw new StoreTimeoutException("DELETE request timed out"); + } + + // Update the version and the new timeout + deleteRequestObject.setVersion(versioned.getVersion()); + deleteRequestObject.setRoutingTimeoutInMs(deleteRequestObject.getRoutingTimeoutInMs() + - diffInMs); + + } + + return store.delete(deleteRequestObject); + } + + // Make sure that the timeout specified is valid + private void validateTimeout(long opTimeoutInMs) { + if(opTimeoutInMs <= 0) { + throw new IllegalArgumentException("Illegal parameter: Timeout is too low: " + + opTimeoutInMs); + } + } + + public String getStoreName() { + return this.storeName; + } + +} diff --git a/src/java/voldemort/coordinator/FatClientWrapper.java b/src/java/voldemort/coordinator/FatClientWrapper.java new file mode 100644 index 0000000000..81f0e08b31 --- /dev/null +++ b/src/java/voldemort/coordinator/FatClientWrapper.java @@ -0,0 +1,225 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.coordinator; + +import java.util.concurrent.ExecutorService; +import java.util.concurrent.RejectedExecutionException; +import java.util.concurrent.RejectedExecutionHandler; +import java.util.concurrent.SynchronousQueue; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; + +import org.apache.log4j.Logger; +import org.jboss.netty.channel.MessageEvent; + +import voldemort.client.ClientConfig; +import voldemort.client.SocketStoreClientFactory; +import voldemort.store.CompositeVoldemortRequest; +import voldemort.utils.ByteArray; + +/** + * A Wrapper class to provide asynchronous API for calling the fat client + * methods. 
These methods will be invoked by the Netty request handler instead + * of invoking the Fat Client methods on its own + * + */ +public class FatClientWrapper { + + private ExecutorService fatClientExecutor; + private SocketStoreClientFactory storeClientFactory; + private DynamicTimeoutStoreClient dynamicTimeoutClient; + private final CoordinatorConfig config; + private final Logger logger = Logger.getLogger(FatClientWrapper.class); + + /** + * + * @param storeName Store to connect to via this fat client + * @param config Bootstrap URLs for the intended cluster + * @param clientConfig The config used to bootstrap the fat client + * @param storesXml Stores XML used to bootstrap the fat client + * @param clusterXml Cluster XML used to bootstrap the fat client + */ + public FatClientWrapper(String storeName, + CoordinatorConfig config, + ClientConfig clientConfig, + String storesXml, + String clusterXml) { + + this.config = config; + + // TODO: Import this from Config + this.fatClientExecutor = new ThreadPoolExecutor(this.config.getFatClientWrapperCorePoolSize(), + this.config.getFatClientWrapperMaxPoolSize(), + this.config.getFatClientWrapperKeepAliveInSecs(), // Keepalive + TimeUnit.SECONDS, // Keepalive + // Timeunit + new SynchronousQueue(), // Queue + // for + // pending + // tasks + + new ThreadFactory() { + + @Override + public Thread newThread(Runnable r) { + Thread t = new Thread(r); + t.setName("FatClientExecutor"); + return t; + } + }, + + new RejectedExecutionHandler() { // Handler + + // for + // rejected + // tasks + + @Override + public void rejectedExecution(Runnable r, + ThreadPoolExecutor executor) { + + } + }); + // this.fatClientRequestQueue = new SynchronousQueue(); + + this.storeClientFactory = new SocketStoreClientFactory(clientConfig); + this.dynamicTimeoutClient = new DynamicTimeoutStoreClient(storeName, + this.storeClientFactory, + 1, + storesXml, + clusterXml); + + } + + /** + * Perform a get operation on the fat client asynchronously + * + * @param getRequestObject Contains the key used in the get operation + * @param getRequestMessageEvent MessageEvent to write the response back to + */ + void submitGetRequest(final CompositeVoldemortRequest getRequestObject, + final MessageEvent getRequestMessageEvent) { + try { + + this.fatClientExecutor.submit(new HttpGetRequestExecutor(getRequestObject, + getRequestMessageEvent, + this.dynamicTimeoutClient)); + if(logger.isDebugEnabled()) { + logger.debug("Submitted a get request"); + } + + // Keep track of this request for monitoring + // this.fatClientRequestQueue.add(f); + } catch(RejectedExecutionException rej) { + handleRejectedException(getRequestMessageEvent); + } + } + + /** + * Perform a getAll operation on the fat client asynchronously + * + * @param getAllRequestObject Contains the keys used in the getAll oepration + * @param getAllRequestMessageEvent MessageEvent to write the response back + * to + */ + void submitGetAllRequest(final CompositeVoldemortRequest getAllRequestObject, + final MessageEvent getAllRequestMessageEvent, + final String storeName) { + try { + + this.fatClientExecutor.submit(new HttpGetAllRequestExecutor(getAllRequestObject, + getAllRequestMessageEvent, + this.dynamicTimeoutClient, + storeName)); + if(logger.isDebugEnabled()) { + logger.debug("Submitted a get all request"); + } + + // Keep track of this request for monitoring + // this.fatClientRequestQueue.add(f); + } catch(RejectedExecutionException rej) { + handleRejectedException(getAllRequestMessageEvent); + } + } + + /** + * Interface to 
perform put operation on the Fat client + * + * @param key: ByteArray representation of the key to put + * @param value: value corresponding to the key to put + * @param putRequest: MessageEvent to write the response on. + * @param operationTimeoutInMs The timeout value for this operation + */ + void submitPutRequest(final CompositeVoldemortRequest putRequestObject, + final MessageEvent putRequest) { + try { + + this.fatClientExecutor.submit(new HttpPutRequestExecutor(putRequestObject, + putRequest, + this.dynamicTimeoutClient)); + if(logger.isDebugEnabled()) { + logger.debug("Submitted a put request"); + } + + // Keep track of this request for monitoring + // this.fatClientRequestQueue.add(f); + } catch(RejectedExecutionException rej) { + handleRejectedException(putRequest); + } + } + + /** + * Interface to perform delete operation on the fat client + * + * @param deleteRequestObject Contains the key and the version used in the + * delete operation + * @param deleteRequestEvent MessageEvent to write the response back to + */ + public void submitDeleteRequest(CompositeVoldemortRequest deleteRequestObject, + MessageEvent deleteRequestEvent) { + try { + + this.fatClientExecutor.submit(new HttpDeleteRequestExecutor(deleteRequestObject, + deleteRequestEvent, + this.dynamicTimeoutClient)); + + // Keep track of this request for monitoring + // this.fatClientRequestQueue.add(f); + } catch(RejectedExecutionException rej) { + handleRejectedException(deleteRequestEvent); + } + + } + + // TODO: Add a custom HTTP Error status 429: Too many requests + private void handleRejectedException(MessageEvent getRequest) { + logger.error("rejected !!!"); + getRequest.getChannel().write(null); // Write error back to the thin + // client + // String errorDescription = + // "Request queue for store " + + // this.dynamicTimeoutClient.getStoreName() + // + " is full !"); + // logger.error(errorDescription); + // RESTErrorHandler.handleError(REQUEST_TIMEOUT, + // this.getRequestMessageEvent, + // false, + // errorDescription); + } + +} diff --git a/src/java/voldemort/coordinator/HttpDeleteRequestExecutor.java b/src/java/voldemort/coordinator/HttpDeleteRequestExecutor.java new file mode 100644 index 0000000000..9009c19d16 --- /dev/null +++ b/src/java/voldemort/coordinator/HttpDeleteRequestExecutor.java @@ -0,0 +1,118 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.coordinator; + +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_LENGTH; +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_TRANSFER_ENCODING; +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_TYPE; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.INTERNAL_SERVER_ERROR; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.NOT_FOUND; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.OK; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.REQUEST_TIMEOUT; +import static org.jboss.netty.handler.codec.http.HttpVersion.HTTP_1_1; + +import org.apache.log4j.Logger; +import org.jboss.netty.channel.ChannelFuture; +import org.jboss.netty.channel.ChannelFutureListener; +import org.jboss.netty.channel.MessageEvent; +import org.jboss.netty.handler.codec.http.DefaultHttpResponse; +import org.jboss.netty.handler.codec.http.HttpResponse; + +import voldemort.VoldemortException; +import voldemort.store.CompositeVoldemortRequest; +import voldemort.store.StoreTimeoutException; +import voldemort.utils.ByteArray; + +/** + * A Runnable class that uses the specified Fat client to perform a Voldemort + * DELETE operation. This is invoked by a FatClientWrapper thread to satisfy a + * corresponding REST DELETE request. + * + */ +public class HttpDeleteRequestExecutor implements Runnable { + + private MessageEvent deleteRequestMessageEvent; + DynamicTimeoutStoreClient storeClient; + private final Logger logger = Logger.getLogger(HttpDeleteRequestExecutor.class); + private final CompositeVoldemortRequest deleteRequestObject; + + /** + * + * @param deleteRequestObject The request object containing key and version + * values + * @param requestEvent Reference to the MessageEvent for the response / + * error + * @param storeClient Reference to the fat client for performing this Delete + * operation + */ + public HttpDeleteRequestExecutor(CompositeVoldemortRequest deleteRequestObject, + MessageEvent requestEvent, + DynamicTimeoutStoreClient storeClient) { + this.deleteRequestMessageEvent = requestEvent; + this.storeClient = storeClient; + this.deleteRequestObject = deleteRequestObject; + } + + public void writeResponse() { + // 1. Create the Response object + HttpResponse response = new DefaultHttpResponse(HTTP_1_1, OK); + + // 2. Set the right headers + response.setHeader(CONTENT_TYPE, "binary"); + response.setHeader(CONTENT_TRANSFER_ENCODING, "binary"); + response.setHeader(CONTENT_LENGTH, "0"); + + // Write the response to the Netty Channel + ChannelFuture future = this.deleteRequestMessageEvent.getChannel().write(response); + + // Close the non-keep-alive connection after the write operation is + // done. 
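+        // (ChannelFutureListener.CLOSE tears the channel down as soon as the
+        // write completes, so the connection is closed unconditionally here,
+        // whether or not the client asked for keep-alive.)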
+ future.addListener(ChannelFutureListener.CLOSE); + + } + + @Override + public void run() { + try { + boolean isDeleted = storeClient.deleteWithCustomTimeout(this.deleteRequestObject); + if(isDeleted) { + writeResponse(); + } else { + RESTErrorHandler.handleError(NOT_FOUND, + this.deleteRequestMessageEvent, + false, + "Requested Key with the specified version does not exist"); + } + + } catch(StoreTimeoutException timeoutException) { + String errorDescription = "DELETE Request timed out: " + timeoutException.getMessage(); + logger.error(errorDescription); + RESTErrorHandler.handleError(REQUEST_TIMEOUT, + this.deleteRequestMessageEvent, + false, + errorDescription); + } catch(VoldemortException ve) { + ve.printStackTrace(); + String errorDescription = "Voldemort Exception: " + ve.getMessage(); + RESTErrorHandler.handleError(INTERNAL_SERVER_ERROR, + this.deleteRequestMessageEvent, + false, + errorDescription); + } + } + +} diff --git a/src/java/voldemort/coordinator/HttpGetAllRequestExecutor.java b/src/java/voldemort/coordinator/HttpGetAllRequestExecutor.java new file mode 100644 index 0000000000..7ef491e677 --- /dev/null +++ b/src/java/voldemort/coordinator/HttpGetAllRequestExecutor.java @@ -0,0 +1,207 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.coordinator; + +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_LENGTH; +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_LOCATION; +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_TRANSFER_ENCODING; +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_TYPE; +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.ETAG; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.BAD_REQUEST; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.INTERNAL_SERVER_ERROR; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.NOT_FOUND; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.OK; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.REQUEST_TIMEOUT; +import static org.jboss.netty.handler.codec.http.HttpVersion.HTTP_1_1; + +import java.io.IOException; +import java.util.Map; +import java.util.Map.Entry; + +import javax.mail.MessagingException; +import javax.mail.internet.MimeBodyPart; +import javax.mail.internet.MimeMultipart; + +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.io.output.ByteArrayOutputStream; +import org.apache.log4j.Logger; +import org.codehaus.jackson.JsonGenerationException; +import org.codehaus.jackson.map.JsonMappingException; +import org.codehaus.jackson.map.ObjectMapper; +import org.jboss.netty.buffer.ChannelBuffer; +import org.jboss.netty.buffer.ChannelBuffers; +import org.jboss.netty.channel.ChannelFuture; +import org.jboss.netty.channel.ChannelFutureListener; +import org.jboss.netty.channel.MessageEvent; +import org.jboss.netty.handler.codec.http.DefaultHttpResponse; +import org.jboss.netty.handler.codec.http.HttpResponse; + +import voldemort.VoldemortException; +import voldemort.store.CompositeVoldemortRequest; +import voldemort.store.StoreTimeoutException; +import voldemort.utils.ByteArray; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Versioned; + +/** + * A Runnable class that uses the specified Fat client to perform a Voldemort + * GET operation. This is invoked by a FatClientWrapper thread to satisfy a + * corresponding REST GET request. 
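+ *
+ * The response produced by writeResponse() is a MIME multipart message with
+ * one body part per key found. Sketch of a single part (the store name and
+ * base64 key below are made-up examples; the ETag carries the JSON-serialized
+ * vector clock):
+ *
+ * <pre>
+ * Content-Type: application/octet-stream
+ * Content-Location: /test/c29tZS1rZXk=
+ * Content-Transfer-Encoding: binary
+ * Content-Length: &lt;value length&gt;
+ * ETag: &lt;JSON vector clock&gt;
+ * </pre>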
+ * + */ +public class HttpGetAllRequestExecutor implements Runnable { + + private MessageEvent getRequestMessageEvent; + DynamicTimeoutStoreClient storeClient; + private final Logger logger = Logger.getLogger(HttpGetRequestExecutor.class); + private final CompositeVoldemortRequest getAllRequestObject; + private final String storeName; + + /** + * + * @param getAllRequestObject The request object containing key and timeout + * values + * @param requestEvent Reference to the MessageEvent for the response / + * error + * @param storeClient Reference to the fat client for performing this Get + * operation + */ + public HttpGetAllRequestExecutor(CompositeVoldemortRequest getAllRequestObject, + MessageEvent requestMessageEvent, + DynamicTimeoutStoreClient storeClient, + String storeName) { + this.getRequestMessageEvent = requestMessageEvent; + this.storeClient = storeClient; + this.getAllRequestObject = getAllRequestObject; + this.storeName = storeName; + } + + public void writeResponse(Map> responseVersioned) { + + // Multipart response + MimeMultipart mp = new MimeMultipart(); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + + try { + + for(Entry> entry: responseVersioned.entrySet()) { + Versioned value = entry.getValue(); + ByteArray keyByteArray = entry.getKey(); + String base64Key = new String(Base64.encodeBase64(keyByteArray.get())); + String contentLocationKey = "/" + this.storeName + "/" + base64Key; + + byte[] responseValue = value.getValue(); + + VectorClock vc = (VectorClock) value.getVersion(); + VectorClockWrapper vcWrapper = new VectorClockWrapper(vc); + ObjectMapper mapper = new ObjectMapper(); + String eTag = ""; + try { + eTag = mapper.writeValueAsString(vcWrapper); + } catch(JsonGenerationException e) { + e.printStackTrace(); + } catch(JsonMappingException e) { + e.printStackTrace(); + } catch(IOException e) { + e.printStackTrace(); + } + + if(logger.isDebugEnabled()) { + logger.debug("ETAG : " + eTag); + } + + // Create the individual body part + MimeBodyPart body = new MimeBodyPart(); + body.addHeader(CONTENT_TYPE, "application/octet-stream"); + body.addHeader(CONTENT_LOCATION, contentLocationKey); + body.addHeader(CONTENT_TRANSFER_ENCODING, "binary"); + body.addHeader(CONTENT_LENGTH, "" + responseValue.length); + body.addHeader(ETAG, eTag); + body.setContent(responseValue, "application/octet-stream"); + mp.addBodyPart(body); + } + + // At this point we have a complete multi-part response + mp.writeTo(outputStream); + + } catch(MessagingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch(IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + ChannelBuffer responseContent = ChannelBuffers.dynamicBuffer(); + responseContent.writeBytes(outputStream.toByteArray()); + + // 1. Create the Response object + HttpResponse response = new DefaultHttpResponse(HTTP_1_1, OK); + + // 2. Set the right headers + response.setHeader(CONTENT_TYPE, "multipart/binary"); + response.setHeader(CONTENT_TRANSFER_ENCODING, "binary"); + + // 3. Copy the data into the payload + response.setContent(responseContent); + response.setHeader(CONTENT_LENGTH, response.getContent().readableBytes()); + + // Write the response to the Netty Channel + ChannelFuture future = this.getRequestMessageEvent.getChannel().write(response); + + // Close the non-keep-alive connection after the write operation is + // done. 
+ future.addListener(ChannelFutureListener.CLOSE); + + } + + @Override + public void run() { + try { + Map> responseVersioned = storeClient.getAllWithCustomTimeout(this.getAllRequestObject); + if(responseVersioned == null) { + RESTErrorHandler.handleError(NOT_FOUND, + this.getRequestMessageEvent, + false, + "Requested Key does not exist"); + } + writeResponse(responseVersioned); + } catch(IllegalArgumentException illegalArgsException) { + String errorDescription = "GETALL Failed !!! Illegal Arguments : " + + illegalArgsException.getMessage(); + logger.error(errorDescription); + RESTErrorHandler.handleError(BAD_REQUEST, + this.getRequestMessageEvent, + false, + errorDescription); + } catch(StoreTimeoutException timeoutException) { + String errorDescription = "GET Request timed out: " + timeoutException.getMessage(); + logger.error(errorDescription); + RESTErrorHandler.handleError(REQUEST_TIMEOUT, + this.getRequestMessageEvent, + false, + errorDescription); + } catch(VoldemortException ve) { + String errorDescription = "Voldemort Exception: " + ve.getMessage(); + RESTErrorHandler.handleError(INTERNAL_SERVER_ERROR, + this.getRequestMessageEvent, + false, + errorDescription); + } + } + +} \ No newline at end of file diff --git a/src/java/voldemort/coordinator/HttpGetRequestExecutor.java b/src/java/voldemort/coordinator/HttpGetRequestExecutor.java new file mode 100644 index 0000000000..61c4a543c8 --- /dev/null +++ b/src/java/voldemort/coordinator/HttpGetRequestExecutor.java @@ -0,0 +1,169 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.coordinator; + +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_LENGTH; +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_TRANSFER_ENCODING; +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_TYPE; +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.ETAG; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.BAD_REQUEST; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.INTERNAL_SERVER_ERROR; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.NOT_FOUND; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.OK; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.REQUEST_TIMEOUT; +import static org.jboss.netty.handler.codec.http.HttpVersion.HTTP_1_1; + +import java.io.IOException; + +import org.apache.log4j.Logger; +import org.codehaus.jackson.JsonGenerationException; +import org.codehaus.jackson.map.JsonMappingException; +import org.codehaus.jackson.map.ObjectMapper; +import org.jboss.netty.buffer.ChannelBuffer; +import org.jboss.netty.buffer.ChannelBuffers; +import org.jboss.netty.channel.MessageEvent; +import org.jboss.netty.handler.codec.http.DefaultHttpResponse; +import org.jboss.netty.handler.codec.http.HttpResponse; + +import voldemort.VoldemortException; +import voldemort.store.CompositeVoldemortRequest; +import voldemort.store.StoreTimeoutException; +import voldemort.utils.ByteArray; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Versioned; + +/** + * A Runnable class that uses the specified Fat client to perform a Voldemort + * GET operation. This is invoked by a FatClientWrapper thread to satisfy a + * corresponding REST GET request. + * + */ +public class HttpGetRequestExecutor implements Runnable { + + private MessageEvent getRequestMessageEvent; + private ChannelBuffer responseContent; + DynamicTimeoutStoreClient storeClient; + private final Logger logger = Logger.getLogger(HttpGetRequestExecutor.class); + private final CompositeVoldemortRequest getRequestObject; + + /** + * + * @param getRequestObject The request object containing key and timeout + * values + * @param requestEvent Reference to the MessageEvent for the response / + * error + * @param storeClient Reference to the fat client for performing this Get + * operation + */ + public HttpGetRequestExecutor(CompositeVoldemortRequest getRequestObject, + MessageEvent requestEvent, + DynamicTimeoutStoreClient storeClient) { + this.getRequestMessageEvent = requestEvent; + this.storeClient = storeClient; + this.getRequestObject = getRequestObject; + } + + public void writeResponse(Versioned responseVersioned) { + + byte[] value = responseVersioned.getValue(); + + // Set the value as the HTTP response payload + byte[] responseValue = responseVersioned.getValue(); + this.responseContent = ChannelBuffers.dynamicBuffer(responseValue.length); + this.responseContent.writeBytes(value); + + VectorClock vc = (VectorClock) responseVersioned.getVersion(); + VectorClockWrapper vcWrapper = new VectorClockWrapper(vc); + ObjectMapper mapper = new ObjectMapper(); + String eTag = ""; + try { + eTag = mapper.writeValueAsString(vcWrapper); + } catch(JsonGenerationException e) { + e.printStackTrace(); + } catch(JsonMappingException e) { + e.printStackTrace(); + } catch(IOException e) { + e.printStackTrace(); + } + + if(logger.isDebugEnabled()) { + logger.debug("ETAG : " + eTag); + } + + // 1. 
Create the Response object + HttpResponse response = new DefaultHttpResponse(HTTP_1_1, OK); + + // 2. Set the right headers + response.setHeader(CONTENT_TYPE, "binary"); + response.setHeader(CONTENT_TRANSFER_ENCODING, "binary"); + response.setHeader(ETAG, eTag); + + // 3. Copy the data into the payload + response.setContent(responseContent); + response.setHeader(CONTENT_LENGTH, response.getContent().readableBytes()); + + if(logger.isDebugEnabled()) { + logger.debug("Response = " + response); + } + + // Write the response to the Netty Channel + this.getRequestMessageEvent.getChannel().write(response); + } + + @Override + public void run() { + try { + Versioned responseVersioned = storeClient.getWithCustomTimeout(this.getRequestObject); + if(responseVersioned == null) { + if(this.getRequestObject.getValue() != null) { + responseVersioned = this.getRequestObject.getValue(); + } else { + RESTErrorHandler.handleError(NOT_FOUND, + this.getRequestMessageEvent, + false, + "Requested Key does not exist"); + } + if(logger.isDebugEnabled()) { + logger.debug("GET successful !"); + } + } + writeResponse(responseVersioned); + } catch(IllegalArgumentException illegalArgsException) { + String errorDescription = "PUT Failed !!! Illegal Arguments : " + + illegalArgsException.getMessage(); + logger.error(errorDescription); + RESTErrorHandler.handleError(BAD_REQUEST, + this.getRequestMessageEvent, + false, + errorDescription); + } catch(StoreTimeoutException timeoutException) { + String errorDescription = "GET Request timed out: " + timeoutException.getMessage(); + logger.error(errorDescription); + RESTErrorHandler.handleError(REQUEST_TIMEOUT, + this.getRequestMessageEvent, + false, + errorDescription); + } catch(VoldemortException ve) { + String errorDescription = "Voldemort Exception: " + ve.getMessage(); + RESTErrorHandler.handleError(INTERNAL_SERVER_ERROR, + this.getRequestMessageEvent, + false, + errorDescription); + } + } + +} \ No newline at end of file diff --git a/src/java/voldemort/coordinator/HttpPutRequestExecutor.java b/src/java/voldemort/coordinator/HttpPutRequestExecutor.java new file mode 100644 index 0000000000..ebbc7acc0d --- /dev/null +++ b/src/java/voldemort/coordinator/HttpPutRequestExecutor.java @@ -0,0 +1,131 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.coordinator; + +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_LENGTH; +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_TYPE; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.BAD_REQUEST; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.INTERNAL_SERVER_ERROR; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.OK; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.PRECONDITION_FAILED; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.REQUEST_TIMEOUT; +import static org.jboss.netty.handler.codec.http.HttpVersion.HTTP_1_1; + +import org.apache.log4j.Logger; +import org.jboss.netty.channel.MessageEvent; +import org.jboss.netty.handler.codec.http.DefaultHttpResponse; +import org.jboss.netty.handler.codec.http.HttpResponse; + +import voldemort.VoldemortException; +import voldemort.store.CompositeVoldemortRequest; +import voldemort.store.StoreTimeoutException; +import voldemort.utils.ByteArray; +import voldemort.versioning.ObsoleteVersionException; + +/** + * A Runnable class that uses the specified Fat client to perform a Voldemort + * PUT operation. This is invoked by a FatClientWrapper thread to satisfy a + * corresponding REST POST (PUT) request. + * + */ +public class HttpPutRequestExecutor implements Runnable { + + private MessageEvent putRequestMessageEvent; + DynamicTimeoutStoreClient storeClient; + private final Logger logger = Logger.getLogger(HttpPutRequestExecutor.class); + private final CompositeVoldemortRequest putRequestObject; + + public HttpPutRequestExecutor(MessageEvent requestEvent) { + this.putRequestMessageEvent = requestEvent; + this.putRequestObject = null; + } + + /** + * + * @param putRequestObject The request object containing key and timeout + * values + * @param requestEvent Reference to the MessageEvent for the response / + * error + * @param storeClient Reference to the fat client for performing this Get + * operation + */ + public HttpPutRequestExecutor(CompositeVoldemortRequest putRequestObject, + MessageEvent requestEvent, + DynamicTimeoutStoreClient storeClient) { + this.putRequestMessageEvent = requestEvent; + this.storeClient = storeClient; + this.putRequestObject = putRequestObject; + } + + public void writeResponse() { + // 1. Create the Response object + HttpResponse response = new DefaultHttpResponse(HTTP_1_1, OK); + + // 2. Set the right headers + response.setHeader(CONTENT_TYPE, "application/json"); + + // 3. Copy the data into the payload + response.setHeader(CONTENT_LENGTH, 0); + + // Write the response to the Netty Channel + this.putRequestMessageEvent.getChannel().write(response); + } + + @Override + public void run() { + + try { + this.storeClient.putWithCustomTimeout(putRequestObject); + if(logger.isDebugEnabled()) { + logger.debug("PUT successful !"); + } + writeResponse(); + + } catch(IllegalArgumentException illegalArgsException) { + String errorDescription = "PUT Failed !!! Illegal Arguments : " + + illegalArgsException.getMessage(); + logger.error(errorDescription); + RESTErrorHandler.handleError(BAD_REQUEST, + this.putRequestMessageEvent, + false, + errorDescription); + } catch(ObsoleteVersionException oe) { + String errorDescription = "PUT Failed !!! 
Obsolete version exception: " + + oe.getMessage(); + RESTErrorHandler.handleError(PRECONDITION_FAILED, + this.putRequestMessageEvent, + false, + errorDescription); + + } catch(StoreTimeoutException timeoutException) { + String errorDescription = "GET Request timed out: " + timeoutException.getMessage(); + logger.error(errorDescription); + RESTErrorHandler.handleError(REQUEST_TIMEOUT, + this.putRequestMessageEvent, + false, + errorDescription); + + } catch(VoldemortException ve) { + String errorDescription = "Voldemort Exception: " + ve.getMessage(); + RESTErrorHandler.handleError(INTERNAL_SERVER_ERROR, + this.putRequestMessageEvent, + false, + errorDescription); + } + } + +} \ No newline at end of file diff --git a/src/java/voldemort/coordinator/NoopHttpRequestHandler.java b/src/java/voldemort/coordinator/NoopHttpRequestHandler.java new file mode 100644 index 0000000000..7b0574453a --- /dev/null +++ b/src/java/voldemort/coordinator/NoopHttpRequestHandler.java @@ -0,0 +1,77 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.coordinator; + +import org.jboss.netty.channel.ChannelHandlerContext; +import org.jboss.netty.channel.MessageEvent; +import org.jboss.netty.handler.codec.http.HttpMethod; +import org.jboss.netty.handler.codec.http.HttpRequest; + +import voldemort.common.VoldemortOpCode; +import voldemort.store.CompositeGetVoldemortRequest; +import voldemort.utils.ByteArray; +import voldemort.versioning.Versioned; + +/** + * A class that does a Noop after handling a REST request from the thin client. + * This is used for benchmarking purposes. 
+ * + * + */ +public class NoopHttpRequestHandler extends VoldemortHttpRequestHandler { + + public NoopHttpRequestHandler() {} + + @Override + public void messageReceived(ChannelHandlerContext ctx, MessageEvent e) throws Exception { + this.request = (HttpRequest) e.getMessage(); + byte operationType = getOperationType(this.request.getMethod()); + + switch(operationType) { + case VoldemortOpCode.GET_OP_CODE: + HttpGetRequestExecutor getExecutor = new HttpGetRequestExecutor(new CompositeGetVoldemortRequest(null, + 0l, + false), + e, + null); + + Versioned responseVersioned = null; + byte[] nullByteArray = new byte[1]; + nullByteArray[0] = 0; + responseVersioned = new Versioned(nullByteArray); + getExecutor.writeResponse(responseVersioned); + break; + case VoldemortOpCode.PUT_OP_CODE: + HttpPutRequestExecutor putRequestExecutor = new HttpPutRequestExecutor(e); + putRequestExecutor.writeResponse(); + break; + default: + System.err.println("Illegal operation."); + return; + } + } + + private byte getOperationType(HttpMethod method) { + if(method.equals(HttpMethod.POST)) { + return VoldemortOpCode.PUT_OP_CODE; + } else if(method.equals(HttpMethod.GET)) { + return VoldemortOpCode.GET_OP_CODE; + } + + return -1; + } +} diff --git a/src/java/voldemort/coordinator/RESTErrorHandler.java b/src/java/voldemort/coordinator/RESTErrorHandler.java new file mode 100644 index 0000000000..f83e923248 --- /dev/null +++ b/src/java/voldemort/coordinator/RESTErrorHandler.java @@ -0,0 +1,58 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.coordinator; + +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_TYPE; +import static org.jboss.netty.handler.codec.http.HttpVersion.HTTP_1_1; + +import org.jboss.netty.buffer.ChannelBuffers; +import org.jboss.netty.channel.ChannelFuture; +import org.jboss.netty.channel.ChannelFutureListener; +import org.jboss.netty.channel.MessageEvent; +import org.jboss.netty.handler.codec.http.DefaultHttpResponse; +import org.jboss.netty.handler.codec.http.HttpResponse; +import org.jboss.netty.handler.codec.http.HttpResponseStatus; +import org.jboss.netty.util.CharsetUtil; + +/** + * A Generic class used to propagate the error back to the client over the Netty + * channel + * + */ +public class RESTErrorHandler { + + public static void handleError(HttpResponseStatus status, + MessageEvent e, + boolean keepAlive, + String message) { + // 1. Create the Response object + HttpResponse response = new DefaultHttpResponse(HTTP_1_1, status); + + response.setHeader(CONTENT_TYPE, "text/plain; charset=UTF-8"); + response.setContent(ChannelBuffers.copiedBuffer("Failure: " + status.toString() + ". " + + message + "\r\n", CharsetUtil.UTF_8)); + + // Write the response to the Netty Channel + ChannelFuture future = e.getChannel().write(response); + + // Close the non-keep-alive connection after the write operation is + // done. 
+ if(!keepAlive) { + future.addListener(ChannelFutureListener.CLOSE); + } + } +} diff --git a/src/java/voldemort/coordinator/VectorClockWrapper.java b/src/java/voldemort/coordinator/VectorClockWrapper.java new file mode 100644 index 0000000000..169bf71fb5 --- /dev/null +++ b/src/java/voldemort/coordinator/VectorClockWrapper.java @@ -0,0 +1,60 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.coordinator; + +import java.util.ArrayList; +import java.util.List; + +import voldemort.versioning.ClockEntry; +import voldemort.versioning.VectorClock; + +/** + * A wrapper for Vector clock used for serialization purposes. This Wrapper is + * then converted to a JSON string which in turn gets embedded in a HTTP header + * field. + * + */ +public class VectorClockWrapper { + + private List versions; + private long timestamp; + + public VectorClockWrapper() { + this.versions = new ArrayList(); + } + + public VectorClockWrapper(VectorClock vc) { + this.versions = vc.getEntries(); + this.setTimestamp(vc.getTimestamp()); + } + + public List getVersions() { + return versions; + } + + public void setVersions(List vectorClock) { + versions = vectorClock; + } + + public long getTimestamp() { + return timestamp; + } + + public void setTimestamp(long timestamp) { + this.timestamp = timestamp; + } +} diff --git a/src/java/voldemort/coordinator/VoldemortHttpRequestHandler.java b/src/java/voldemort/coordinator/VoldemortHttpRequestHandler.java new file mode 100644 index 0000000000..0af1a280c0 --- /dev/null +++ b/src/java/voldemort/coordinator/VoldemortHttpRequestHandler.java @@ -0,0 +1,402 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.coordinator; + +import static org.jboss.netty.handler.codec.http.HttpHeaders.isKeepAlive; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.BAD_REQUEST; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.commons.codec.binary.Base64; +import org.apache.log4j.Logger; +import org.codehaus.jackson.JsonParseException; +import org.codehaus.jackson.map.JsonMappingException; +import org.codehaus.jackson.map.ObjectMapper; +import org.jboss.netty.buffer.ChannelBuffer; +import org.jboss.netty.channel.ChannelHandlerContext; +import org.jboss.netty.channel.ExceptionEvent; +import org.jboss.netty.channel.MessageEvent; +import org.jboss.netty.channel.SimpleChannelUpstreamHandler; +import org.jboss.netty.handler.codec.http.HttpChunk; +import org.jboss.netty.handler.codec.http.HttpChunkTrailer; +import org.jboss.netty.handler.codec.http.HttpMethod; +import org.jboss.netty.handler.codec.http.HttpRequest; +import org.jboss.netty.util.CharsetUtil; + +import voldemort.common.VoldemortOpCode; +import voldemort.store.CompositeDeleteVoldemortRequest; +import voldemort.store.CompositeGetAllVoldemortRequest; +import voldemort.store.CompositeGetVoldemortRequest; +import voldemort.store.CompositePutVoldemortRequest; +import voldemort.store.CompositeVoldemortRequest; +import voldemort.utils.ByteArray; +import voldemort.versioning.VectorClock; + +/** + * A class to handle the HTTP request and execute the same on behalf of the thin + * client. + * + * Currently, we're using a fat client to handle this request. + * + */ +public class VoldemortHttpRequestHandler extends SimpleChannelUpstreamHandler { + + public HttpRequest request; + private boolean readingChunks; + /** Buffer that stores the response content */ + private final StringBuilder buf = new StringBuilder(); + private Map fatClientMap; + private final Logger logger = Logger.getLogger(VoldemortHttpRequestHandler.class); + public static final String X_VOLD_REQUEST_TIMEOUT_MS = "X-VOLD-Request-Timeout-ms"; + public static final String X_VOLD_INCONSISTENCY_RESOLVER = "X-VOLD-Inconsistency-Resolver"; + private static final String X_VOLD_VECTOR_CLOCK = "X-VOLD-Vector-Clock"; + public static final String CUSTOM_RESOLVING_STRATEGY = "custom"; + public static final String DEFAULT_RESOLVING_STRATEGY = "timestamp"; + + // Implicit constructor defined for the derived classes + public VoldemortHttpRequestHandler() {} + + public VoldemortHttpRequestHandler(Map fatClientMap) { + this.fatClientMap = fatClientMap; + } + + /** + * Function to parse the HTTP headers and build a Voldemort request object + * + * @param requestURI URI of the REST request + * @param httpMethod Message Event object used to write the response to + * @param e The REST (Voldemort) operation type + * @return true if a valid request was received. False otherwise + */ + private CompositeVoldemortRequest parseRequest(String requestURI, + MessageEvent e, + HttpMethod httpMethod) { + CompositeVoldemortRequest requestWrapper = null; + long operationTimeoutInMs = 1500; + boolean resolveConflicts = true; + + // Retrieve the timeout value from the REST request + String timeoutValStr = this.request.getHeader(X_VOLD_REQUEST_TIMEOUT_MS); + if(timeoutValStr != null) { + try { + Long.parseLong(timeoutValStr); + } catch(NumberFormatException nfe) { + handleBadRequest(e, "Incorrect timeout parameter. Cannot parse this to long: " + + timeoutValStr + ". 
Details: " + nfe.getMessage()); + return null; + } + } + + // Retrieve the inconsistency resolving strategy from the REST request + String inconsistencyResolverOption = this.request.getHeader(X_VOLD_INCONSISTENCY_RESOLVER); + if(inconsistencyResolverOption != null) { + if(inconsistencyResolverOption.equalsIgnoreCase(CUSTOM_RESOLVING_STRATEGY)) { + resolveConflicts = false; + } else if(!inconsistencyResolverOption.equalsIgnoreCase(DEFAULT_RESOLVING_STRATEGY)) { + handleBadRequest(e, + "Invalid Inconsistency Resolving strategy specified in the Request : " + + inconsistencyResolverOption); + return null; + } + } + + List keyList = readKey(requestURI); + if(keyList == null) { + handleBadRequest(e, "Error: No key specified !"); + return null; + } + + byte operationType = getOperationType(httpMethod, keyList); + + // Build the request object based on the operation type + switch(operationType) { + case VoldemortOpCode.GET_OP_CODE: + requestWrapper = new CompositeGetVoldemortRequest(keyList.get(0), + operationTimeoutInMs, + resolveConflicts); + break; + case VoldemortOpCode.GET_ALL_OP_CODE: + requestWrapper = new CompositeGetAllVoldemortRequest(keyList, + operationTimeoutInMs, + resolveConflicts); + break; + + case VoldemortOpCode.PUT_OP_CODE: + ChannelBuffer content = request.getContent(); + if(!content.readable()) { + handleBadRequest(e, "Contents not readable"); + return null; + } + + ByteArray putKey = null; + if(keyList.size() == 1) { + putKey = keyList.get(0); + } else { + handleBadRequest(e, "Cannot have multiple keys in a put operation"); + return null; + } + byte[] putValue = readValue(content); + requestWrapper = new CompositePutVoldemortRequest(putKey, + putValue, + operationTimeoutInMs); + + break; + case VoldemortOpCode.DELETE_OP_CODE: + VectorClock vc = getVectorClock(this.request.getHeader(X_VOLD_VECTOR_CLOCK)); + if(vc == null) { + // handleBadRequest(e, + // "Incorrect vector clock specified in the request"); + } + requestWrapper = new CompositeDeleteVoldemortRequest(keyList.get(0), + vc, + operationTimeoutInMs); + + break; + + default: + handleBadRequest(e, "Illegal Operation."); + return null; + } + + return requestWrapper; + } + + @Override + public void messageReceived(ChannelHandlerContext ctx, MessageEvent e) throws Exception { + + if(!readingChunks) { + HttpRequest request = this.request = (HttpRequest) e.getMessage(); + String requestURI = this.request.getUri(); + if(logger.isDebugEnabled()) { + logger.debug("Request URI: " + requestURI); + } + + if(request.isChunked()) { + readingChunks = true; + } else { + + CompositeVoldemortRequest requestObject = parseRequest(requestURI, + e, + this.request.getMethod()); + + // Get the store name from the REST request and the + // corresponding Fat client + String storeName = getStoreName(requestURI); + FatClientWrapper fatClientWrapper = null; + if(storeName != null) { + fatClientWrapper = this.fatClientMap.get(storeName); + } + + if(storeName == null || fatClientWrapper == null) { + handleBadRequest(e, "Invalid store name. 
Critical error."); + return; + } + + if(requestObject == null) { + handleBadRequest(e, "Illegal request."); + return; + } + + switch(requestObject.getOperationType()) { + case VoldemortOpCode.GET_OP_CODE: + if(logger.isDebugEnabled()) { + logger.debug("Incoming get request"); + } + fatClientWrapper.submitGetRequest(requestObject, e); + break; + case VoldemortOpCode.GET_ALL_OP_CODE: + fatClientWrapper.submitGetAllRequest(requestObject, e, storeName); + break; + + case VoldemortOpCode.PUT_OP_CODE: + if(logger.isDebugEnabled()) { + logger.debug("Incoming put request"); + } + fatClientWrapper.submitPutRequest(requestObject, e); + break; + case VoldemortOpCode.DELETE_OP_CODE: + fatClientWrapper.submitDeleteRequest(requestObject, e); + break; + default: + String errorMessage = "Illegal operation."; + logger.error(errorMessage); + RESTErrorHandler.handleError(BAD_REQUEST, + e, + isKeepAlive(request), + errorMessage); + return; + } + + } + } else { + HttpChunk chunk = (HttpChunk) e.getMessage(); + if(chunk.isLast()) { + readingChunks = false; + buf.append("END OF CONTENT\r\n"); + + HttpChunkTrailer trailer = (HttpChunkTrailer) chunk; + if(!trailer.getHeaderNames().isEmpty()) { + buf.append("\r\n"); + for(String name: trailer.getHeaderNames()) { + for(String value: trailer.getHeaders(name)) { + buf.append("TRAILING HEADER: " + name + " = " + value + "\r\n"); + } + } + buf.append("\r\n"); + } + + } else { + buf.append("CHUNK: " + chunk.getContent().toString(CharsetUtil.UTF_8) + "\r\n"); + } + + } + } + + /** + * Parse and return a VectorClock object from the request header + * + * @param vectorClockHeader Header containing the Vector clock in JSON + * format + * @return Equivalent VectorClock object + */ + private VectorClock getVectorClock(String vectorClockHeader) { + VectorClock vc = null; + ObjectMapper mapper = new ObjectMapper(); + if(logger.isDebugEnabled()) { + logger.debug("Received vector clock : " + vectorClockHeader); + } + try { + VectorClockWrapper vcWrapper = mapper.readValue(vectorClockHeader, + VectorClockWrapper.class); + vc = new VectorClock(vcWrapper.getVersions(), vcWrapper.getTimestamp()); + } catch(NullPointerException npe) { + // npe.printStackTrace(); + } catch(JsonParseException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch(JsonMappingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch(IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + return vc; + } + + /** + * Send a BAD_REQUEST HTTP error back to the client with the specified + * message. 
+ * + * @param e Message event to write the error to + * @param msg Error message + */ + private void handleBadRequest(MessageEvent e, String msg) { + String errorMessage = msg; + logger.error(errorMessage); + RESTErrorHandler.handleError(BAD_REQUEST, e, false, errorMessage); + } + + /** + * Method to determine the operation type + * + * @param httpMethod The HTTP Method object received by the Netty handler + * @param keyList + * @return A voldemortOpCode object representing the operation type + */ + protected byte getOperationType(HttpMethod httpMethod, List keyList) { + if(httpMethod.equals(HttpMethod.POST)) { + return VoldemortOpCode.PUT_OP_CODE; + } else if(httpMethod.equals(HttpMethod.GET)) { + if(keyList.size() == 1) { + return VoldemortOpCode.GET_OP_CODE; + } else if(keyList.size() > 1) { + return VoldemortOpCode.GET_ALL_OP_CODE; + } + } else if(httpMethod.equals(HttpMethod.DELETE)) { + return VoldemortOpCode.DELETE_OP_CODE; + } + + return -1; + } + + /** + * Method to read a value for a put operation + * + * @param content The ChannelBuffer object containing the value + * @return The byte[] array representing the value + */ + private byte[] readValue(ChannelBuffer content) { + byte[] value = new byte[content.capacity()]; + content.readBytes(value); + return value; + } + + /** + * Method to read a key (or keys) present in the HTTP request URI. The URI + * must be of the format //[,,...] + * + * @param requestURI The URI of the HTTP request + * @return the List representing the key (or keys) + */ + private List readKey(String requestURI) { + List keyList = null; + String[] parts = requestURI.split("/"); + if(parts.length > 2) { + String base64KeyList = parts[2]; + keyList = new ArrayList(); + + if(!base64KeyList.contains(",")) { + String rawKey = base64KeyList.trim(); + keyList.add(new ByteArray(Base64.decodeBase64(rawKey.getBytes()))); + } else { + String[] base64KeyArray = base64KeyList.split(","); + for(String base64Key: base64KeyArray) { + String rawKey = base64Key.trim(); + keyList.add(new ByteArray(Base64.decodeBase64(rawKey.getBytes()))); + } + } + } + return keyList; + } + + /** + * Retrieve the store name from the URI + * + * @param requestURI The URI of the HTTP request + * @return The string representing the store name + */ + private String getStoreName(String requestURI) { + String storeName = null; + String[] parts = requestURI.split("/"); + if(parts.length > 1 && this.fatClientMap.containsKey(parts[1])) { + storeName = parts[1]; + } + + return storeName; + } + + @Override + public void exceptionCaught(ChannelHandlerContext ctx, ExceptionEvent e) throws Exception { + e.getCause().printStackTrace(); + e.getChannel().close(); + } +} diff --git a/src/java/voldemort/routing/ConsistentRoutingStrategy.java b/src/java/voldemort/routing/ConsistentRoutingStrategy.java index c3c8777429..fa226c6ca5 100644 --- a/src/java/voldemort/routing/ConsistentRoutingStrategy.java +++ b/src/java/voldemort/routing/ConsistentRoutingStrategy.java @@ -150,6 +150,16 @@ public List getReplicatingPartitionList(int index) { return replicationPartitionsList; } + /** + * Obtain the master partition for a given key + * + * @param key + * @return + */ + public Integer getMasterPartition(byte[] key) { + return abs(hash.hash(key)) % (Math.max(1, this.partitionToNode.length)); + } + public Set getNodes() { Set s = Sets.newHashSetWithExpectedSize(partitionToNode.length); for(Node n: this.partitionToNode) @@ -172,7 +182,7 @@ Set getPartitionsByNode(Node n) { public List getPartitionList(byte[] key) { // hash 
the key and perform a modulo on the total number of partitions, // to get the master partition - int index = abs(hash.hash(key)) % (Math.max(1, this.partitionToNode.length)); + int index = getMasterPartition(key); if(logger.isDebugEnabled()) { logger.debug("Key " + ByteUtils.toHexString(key) + " primary partition " + index); } diff --git a/src/java/voldemort/routing/RouteToAllStrategy.java b/src/java/voldemort/routing/RouteToAllStrategy.java index 760ba37adc..a8246cca46 100644 --- a/src/java/voldemort/routing/RouteToAllStrategy.java +++ b/src/java/voldemort/routing/RouteToAllStrategy.java @@ -57,6 +57,16 @@ public List getReplicatingPartitionList(int partitionId) { throw new UnsupportedOperationException("Not yet implemented."); } + /** + * Obtain the master partition for a given key + * + * @param key + * @return + */ + public Integer getMasterPartition(byte[] key) { + throw new UnsupportedOperationException("Not yet implemented."); + } + public String getType() { return RoutingStrategyType.TO_ALL_STRATEGY; } diff --git a/src/java/voldemort/routing/RoutingStrategy.java b/src/java/voldemort/routing/RoutingStrategy.java index 4a9716fa37..827d861657 100644 --- a/src/java/voldemort/routing/RoutingStrategy.java +++ b/src/java/voldemort/routing/RoutingStrategy.java @@ -54,6 +54,14 @@ public interface RoutingStrategy { */ public List getPartitionList(byte[] key); + /** + * Obtain the master partition for a given key + * + * @param key The key being operated on + * @return The partition that owns the key + */ + public Integer getMasterPartition(byte[] key); + /** * Get the replication partitions list for the given partition. * diff --git a/src/java/voldemort/server/StoreRepository.java b/src/java/voldemort/server/StoreRepository.java index 94cfaaaf21..5325385d9b 100644 --- a/src/java/voldemort/server/StoreRepository.java +++ b/src/java/voldemort/server/StoreRepository.java @@ -12,7 +12,9 @@ import voldemort.store.StorageEngine; import voldemort.store.Store; import voldemort.store.slop.SlopStorageEngine; +import voldemort.store.stats.StreamingStats; import voldemort.utils.ByteArray; +import voldemort.utils.JmxUtils; import voldemort.utils.Pair; /** @@ -59,6 +61,16 @@ public class StoreRepository { */ private final ConcurrentMap> storageEngines; + /** + * Aggregated statistics about streaming operations + */ + private StreamingStats aggregatedStreamStats; + /** + * Maintains statistics about streaming reads/writes performed against all + * the local storage engines in this node + */ + private ConcurrentMap streamingStatsMap; + /* * Routed stores that write and read from multiple nodes */ @@ -80,10 +92,21 @@ public class StoreRepository { */ private RepairJob repairJob; + /** + * Constructor invoked by tests + */ public StoreRepository() { + this(true); + } + + public StoreRepository(boolean jmxEnabled) { super(); this.localStores = new ConcurrentHashMap>(); this.storageEngines = new ConcurrentHashMap>(); + if(jmxEnabled) { + this.streamingStatsMap = new ConcurrentHashMap(); + this.aggregatedStreamStats = new StreamingStats(); + } this.routedStores = new ConcurrentHashMap>(); this.nodeStores = new ConcurrentHashMap, Store>(); this.redirectingSocketStores = new ConcurrentHashMap, Store>(); @@ -127,9 +150,38 @@ public void addStorageEngine(StorageEngine engine) { if(found != null) throw new VoldemortException("Storage Engine '" + engine.getName() + "' has already been initialized."); + + // register streaming stats object for the store + if(streamingStatsMap != null) { + // lazily register the 
aggregated mbean + if(storageEngines.size() == 1) { + JmxUtils.registerMbean(aggregatedStreamStats, + JmxUtils.createObjectName(this.getClass().getCanonicalName(), + "aggregated-streaming-stats")); + } + + StreamingStats stat = new StreamingStats(aggregatedStreamStats); + JmxUtils.registerMbean(stat, JmxUtils.createObjectName(this.getClass() + .getCanonicalName(), + engine.getName() + + "-streaming-stats")); + streamingStatsMap.putIfAbsent(engine.getName(), stat); + } } public Store removeStorageEngine(String storeName) { + // register streaming stats object for the store + if(streamingStatsMap != null) { + JmxUtils.unregisterMbean(JmxUtils.createObjectName(this.getClass().getCanonicalName(), + storeName)); + streamingStatsMap.remove(storeName); + // lazily unregister the aggregated mbean + if(storageEngines.size() == 1) { + JmxUtils.unregisterMbean(JmxUtils.createObjectName(this.getClass() + .getCanonicalName(), + "aggregated-streaming-stats")); + } + } return this.storageEngines.remove(storeName); } @@ -234,4 +286,8 @@ public RepairJob getRepairJob() { public void registerRepairJob(RepairJob job) { repairJob = job; } + + public StreamingStats getStreamingStats(String store) { + return streamingStatsMap.get(store); + } } diff --git a/src/java/voldemort/server/VoldemortConfig.java b/src/java/voldemort/server/VoldemortConfig.java index abdaa8744c..d7eb7bdfb6 100644 --- a/src/java/voldemort/server/VoldemortConfig.java +++ b/src/java/voldemort/server/VoldemortConfig.java @@ -21,19 +21,38 @@ import java.io.Serializable; import java.util.List; import java.util.Properties; +import java.util.Timer; +import java.util.TimerTask; +import voldemort.client.ClientConfig; +import voldemort.client.DefaultStoreClient; import voldemort.client.TimeoutConfig; import voldemort.client.protocol.RequestFormatType; +import voldemort.client.protocol.pb.VAdminProto.VoldemortFilter; import voldemort.cluster.failuredetector.FailureDetectorConfig; import voldemort.common.OpTimeMap; import voldemort.common.VoldemortOpCode; +import voldemort.common.service.SchedulerService; +import voldemort.server.http.HttpService; +import voldemort.server.niosocket.NioSocketService; +import voldemort.server.protocol.admin.AsyncOperation; +import voldemort.server.scheduler.DataCleanupJob; +import voldemort.server.scheduler.slop.BlockingSlopPusherJob; import voldemort.server.scheduler.slop.StreamingSlopPusherJob; +import voldemort.server.storage.RepairJob; +import voldemort.store.InvalidMetadataException; +import voldemort.store.StorageEngine; import voldemort.store.bdb.BdbStorageConfiguration; +import voldemort.store.invalidmetadata.InvalidMetadataCheckingStore; +import voldemort.store.logging.LoggingStore; import voldemort.store.memory.CacheStorageConfiguration; import voldemort.store.memory.InMemoryStorageConfiguration; import voldemort.store.mysql.MysqlStorageConfiguration; import voldemort.store.readonly.BinarySearchStrategy; +import voldemort.store.readonly.InterpolationSearchStrategy; import voldemort.store.readonly.ReadOnlyStorageConfiguration; +import voldemort.store.readonly.ReadOnlyStorageEngine; +import voldemort.store.stats.StatTrackingStore; import voldemort.utils.ConfigurationException; import voldemort.utils.Props; import voldemort.utils.Time; @@ -57,8 +76,11 @@ public class VoldemortConfig implements Serializable { public static final long REPORTING_INTERVAL_BYTES = 25 * 1024 * 1024; public static final int DEFAULT_BUFFER_SIZE = 64 * 1024; - private int nodeId; + // Kerberos support for read-only fetches (constants) + 
public static String DEFAULT_KERBEROS_PRINCIPAL = "voldemrt"; + public static String DEFAULT_KEYTAB_PATH = "/voldemrt.headless.keytab"; + private int nodeId; private String voldemortHome; private String dataDirectory; private String metadataDirectory; @@ -66,7 +88,6 @@ public class VoldemortConfig implements Serializable { private long bdbCacheSize; private boolean bdbWriteTransactions; private boolean bdbFlushTransactions; - private boolean bdbSortedDuplicates; private String bdbDataDirectory; private long bdbMaxLogFileSize; private int bdbBtreeFanout; @@ -76,6 +97,7 @@ public class VoldemortConfig implements Serializable { private int bdbCleanerMinFileUtilization; private int bdbCleanerMinUtilization; private int bdbCleanerLookAheadCacheSize; + private long bdbCleanerBytesInterval; private boolean bdbCheckpointerHighPriority; private int bdbCleanerMaxBatchFiles; private boolean bdbReadUncommitted; @@ -88,6 +110,13 @@ public class VoldemortConfig implements Serializable { private long bdbStatsCacheTtlMs; private boolean bdbExposeSpaceUtilization; private long bdbMinimumSharedCache; + private boolean bdbCleanerLazyMigration; + private boolean bdbCacheModeEvictLN; + private boolean bdbMinimizeScanImpact; + private boolean bdbPrefixKeysWithPartitionId; + private boolean bdbLevelBasedEviction; + private boolean bdbProactiveBackgroundMigration; + private boolean bdbCheckpointerOffForBatchWrites; private String mysqlUsername; private String mysqlPassword; @@ -95,21 +124,25 @@ public class VoldemortConfig implements Serializable { private String mysqlHost; private int mysqlPort; - private int readOnlyBackups; + private int numReadOnlyVersions; private String readOnlyStorageDir; private String readOnlySearchStrategy; private int readOnlyDeleteBackupTimeMs; - private long maxBytesPerSecond; - private long minBytesPerSecond; - private long reportingIntervalBytes; + private long readOnlyFetcherMaxBytesPerSecond; + private long readOnlyFetcherMinBytesPerSecond; + private long readOnlyFetcherReportingIntervalBytes; private int fetcherBufferSize; + private String readOnlyKeytabPath; + private String readOnlyKerberosUser; + private String hadoopConfigPath; + // flag to indicate if we will mlock and pin index pages in memory + private boolean useMlock; private OpTimeMap testingSlowQueueingDelays; private OpTimeMap testingSlowConcurrentDelays; private int coreThreads; private int maxThreads; - private int socketTimeoutMs; private int socketBufferSize; private boolean socketKeepAlive; @@ -120,25 +153,22 @@ public class VoldemortConfig implements Serializable { private int nioAcceptorBacklog; private int clientSelectors; - private int clientRoutingTimeoutMs; private TimeoutConfig clientTimeoutConfig; private int clientMaxConnectionsPerNode; private int clientConnectionTimeoutMs; + private int clientRoutingTimeoutMs; private int clientMaxThreads; private int clientThreadIdleMs; private int clientMaxQueuedRequests; - private int schedulerThreads; private boolean mayInterruptService; private int numScanPermits; - private RequestFormatType requestFormatType; private boolean enableSlop; private boolean enableSlopPusherJob; private boolean enableRepair; - private boolean enableGui; private boolean enableHttpServer; private boolean enableSocketServer; private boolean enableAdminServer; @@ -156,11 +186,9 @@ public class VoldemortConfig implements Serializable { private List storageConfigurations; private Props allProps; - private String slopStoreType; private String pusherType; private long slopFrequencyMs; - 
private long repairStartMs; private long slopMaxWriteBytesPerSec; private long slopMaxReadBytesPerSec; private int slopBatchSize; @@ -174,8 +202,8 @@ public class VoldemortConfig implements Serializable { private long streamMaxReadBytesPerSec; private long streamMaxWriteBytesPerSec; + private int gossipIntervalMs; - private int gossipInterval; private String failureDetectorImplementation; private long failureDetectorBannagePeriod; private int failureDetectorThreshold; @@ -187,11 +215,14 @@ public class VoldemortConfig implements Serializable { private int retentionCleanupFirstStartTimeInHour; private int retentionCleanupScheduledPeriodInHour; - - private int maxRebalancingAttempt; + private int retentionCleanupFirstStartDayOfWeek; + private boolean retentionCleanupPinStartTime; + private boolean enforceRetentionPolicyOnRead; + private boolean deleteExpiredValuesOnRead; private long rebalancingTimeoutSec; private int maxParallelStoresRebalancing; private boolean rebalancingOptimization; + private boolean usePartitionScanForRebalance; public VoldemortConfig(Properties props) { this(new Props(props)); @@ -216,16 +247,18 @@ public VoldemortConfig(Props props) { + File.separator + "bdb"); this.bdbMaxLogFileSize = props.getBytes("bdb.max.logfile.size", 60 * 1024 * 1024); this.bdbBtreeFanout = props.getInt("bdb.btree.fanout", 512); - this.bdbCheckpointBytes = props.getLong("bdb.checkpoint.interval.bytes", 20 * 1024 * 1024); + this.bdbCheckpointBytes = props.getLong("bdb.checkpoint.interval.bytes", 200 * 1024 * 1024); this.bdbCheckpointMs = props.getLong("bdb.checkpoint.interval.ms", 30 * Time.MS_PER_SECOND); - this.bdbSortedDuplicates = props.getBoolean("bdb.enable.sorted.duplicates", true); this.bdbOneEnvPerStore = props.getBoolean("bdb.one.env.per.store", false); - this.bdbCleanerMinFileUtilization = props.getInt("bdb.cleaner.min.file.utilization", 5); + this.bdbCleanerMinFileUtilization = props.getInt("bdb.cleaner.min.file.utilization", 0); this.bdbCleanerMinUtilization = props.getInt("bdb.cleaner.minUtilization", 50); this.bdbCleanerThreads = props.getInt("bdb.cleaner.threads", 1); + // by default, wake up the cleaner everytime we write log file size * + // utilization% bytes. 
So, by default 30MB + this.bdbCleanerBytesInterval = props.getLong("bdb.cleaner.interval.bytes", 30 * 1024 * 1024); this.bdbCleanerLookAheadCacheSize = props.getInt("bdb.cleaner.lookahead.cache.size", 8192); this.bdbLockTimeoutMs = props.getLong("bdb.lock.timeout.ms", 500); - this.bdbLockNLockTables = props.getInt("bdb.lock.nLockTables", 1); + this.bdbLockNLockTables = props.getInt("bdb.lock.nLockTables", 7); this.bdbLogFaultReadSize = props.getInt("bdb.log.fault.read.size", 2048); this.bdbLogIteratorReadSize = props.getInt("bdb.log.iterator.read.size", 8192); this.bdbFairLatches = props.getBoolean("bdb.fair.latches", false); @@ -235,20 +268,38 @@ public VoldemortConfig(Props props) { this.bdbStatsCacheTtlMs = props.getLong("bdb.stats.cache.ttl.ms", 5 * Time.MS_PER_SECOND); this.bdbExposeSpaceUtilization = props.getBoolean("bdb.expose.space.utilization", true); this.bdbMinimumSharedCache = props.getLong("bdb.minimum.shared.cache", 0); - - this.readOnlyBackups = props.getInt("readonly.backups", 1); + this.bdbCleanerLazyMigration = props.getBoolean("bdb.cleaner.lazy.migration", false); + this.bdbCacheModeEvictLN = props.getBoolean("bdb.cache.evictln", true); + this.bdbMinimizeScanImpact = props.getBoolean("bdb.minimize.scan.impact", true); + this.bdbPrefixKeysWithPartitionId = props.getBoolean("bdb.prefix.keys.with.partitionid", + true); + this.bdbLevelBasedEviction = props.getBoolean("bdb.evict.by.level", false); + this.bdbProactiveBackgroundMigration = props.getBoolean("bdb.proactive.background.migration", + false); + this.bdbCheckpointerOffForBatchWrites = props.getBoolean("bdb.checkpointer.off.batch.writes", + false); + + this.numReadOnlyVersions = props.getInt("readonly.backups", 1); this.readOnlySearchStrategy = props.getString("readonly.search.strategy", BinarySearchStrategy.class.getName()); this.readOnlyStorageDir = props.getString("readonly.data.directory", this.dataDirectory + File.separator + "read-only"); this.readOnlyDeleteBackupTimeMs = props.getInt("readonly.delete.backup.ms", 0); - this.maxBytesPerSecond = props.getBytes("fetcher.max.bytes.per.sec", 0); - this.minBytesPerSecond = props.getBytes("fetcher.min.bytes.per.sec", 0); - this.reportingIntervalBytes = props.getBytes("fetcher.reporting.interval.bytes", - REPORTING_INTERVAL_BYTES); + this.readOnlyFetcherMaxBytesPerSecond = props.getBytes("fetcher.max.bytes.per.sec", 0); + this.readOnlyFetcherMinBytesPerSecond = props.getBytes("fetcher.min.bytes.per.sec", 0); + this.readOnlyFetcherReportingIntervalBytes = props.getBytes("fetcher.reporting.interval.bytes", + REPORTING_INTERVAL_BYTES); this.fetcherBufferSize = (int) props.getBytes("hdfs.fetcher.buffer.size", DEFAULT_BUFFER_SIZE); + this.readOnlyKeytabPath = props.getString("readonly.keytab.path", + this.metadataDirectory + + VoldemortConfig.DEFAULT_KEYTAB_PATH); + this.readOnlyKerberosUser = props.getString("readonly.kerberos.user", + VoldemortConfig.DEFAULT_KERBEROS_PRINCIPAL); + this.setHadoopConfigPath(props.getString("readonly.hadoop.config.path", + this.metadataDirectory + "/hadoop-conf")); + this.setUseMlock(props.getBoolean("readonly.mlock.index", true)); this.mysqlUsername = props.getString("mysql.user", "root"); this.mysqlPassword = props.getString("mysql.password", ""); @@ -311,7 +362,7 @@ public VoldemortConfig(Props props) { Math.max(8, Runtime.getRuntime() .availableProcessors())); // a value <= 0 forces the default to be used - this.nioAcceptorBacklog = props.getInt("nio.acceptor.backlog", -1); + this.nioAcceptorBacklog = props.getInt("nio.acceptor.backlog", 
256); this.clientSelectors = props.getInt("client.selectors", 4); this.clientMaxConnectionsPerNode = props.getInt("client.max.connections.per.node", 50); @@ -356,13 +407,12 @@ public VoldemortConfig(Props props) { this.enableRepair = props.getBoolean("enable.repair", true); this.enableJmxClusterName = props.getBoolean("enable.jmx.clustername", false); - this.gossipInterval = props.getInt("gossip.interval.ms", 30 * 1000); + this.gossipIntervalMs = props.getInt("gossip.interval.ms", 30 * 1000); this.slopMaxWriteBytesPerSec = props.getBytes("slop.write.byte.per.sec", 10 * 1000 * 1000); this.slopMaxReadBytesPerSec = props.getBytes("slop.read.byte.per.sec", 10 * 1000 * 1000); this.slopStoreType = props.getString("slop.store.engine", BdbStorageConfiguration.TYPE_NAME); this.slopFrequencyMs = props.getLong("slop.frequency.ms", 5 * 60 * 1000); - this.repairStartMs = props.getLong("repair.start.ms", 24 * 60 * 60 * 1000); this.slopBatchSize = props.getInt("slop.batch.size", 100); this.pusherType = props.getString("pusher.type", StreamingSlopPusherJob.TYPE_NAME); this.slopZonesDownToTerminate = props.getInt("slop.zones.terminate", 0); @@ -382,9 +432,22 @@ public VoldemortConfig(Props props) { // start at midnight (0-23) this.retentionCleanupFirstStartTimeInHour = props.getInt("retention.cleanup.first.start.hour", 0); + // start next day by default (1=SUN, 2=MON, 3=TUE, 4=WED, 5=THU, 6=FRI, + // 7=SAT) + this.retentionCleanupFirstStartDayOfWeek = props.getInt("retention.cleanup.first.start.day", + Utils.getDayOfTheWeekFromNow(1)); // repeat every 24 hours this.retentionCleanupScheduledPeriodInHour = props.getInt("retention.cleanup.period.hours", 24); + // should the retention job always start at the 'start time' specified + this.retentionCleanupPinStartTime = props.getBoolean("retention.cleanup.pin.start.time", + true); + // should the online reads filter out stale values when reading them ? + this.enforceRetentionPolicyOnRead = props.getBoolean("enforce.retention.policy.on.read", + false); + // should the online reads issue deletes to clear out stale values when + // reading them? 
+ this.deleteExpiredValuesOnRead = props.getBoolean("delete.expired.values.on.read", false); // save props for access from plugins this.allProps = props; @@ -394,10 +457,11 @@ public VoldemortConfig(Props props) { this.requestFormatType = RequestFormatType.fromCode(requestFormatName); // rebalancing parameters - this.maxRebalancingAttempt = props.getInt("max.rebalancing.attempts", 3); this.rebalancingTimeoutSec = props.getLong("rebalancing.timeout.seconds", 10 * 24 * 60 * 60); this.maxParallelStoresRebalancing = props.getInt("max.parallel.stores.rebalancing", 3); this.rebalancingOptimization = props.getBoolean("rebalancing.optimization", true); + this.usePartitionScanForRebalance = props.getBoolean("use.partition.scan.for.rebalance", + true); this.failureDetectorImplementation = props.getString("failuredetector.implementation", FailureDetectorConfig.DEFAULT_IMPLEMENTATION_CLASS_NAME); @@ -431,6 +495,10 @@ public VoldemortConfig(Props props) { validateParams(); } + public VoldemortConfig(int nodeId, String voldemortHome) { + this(new Props().with("node.id", nodeId).with("voldemort.home", voldemortHome)); + } + private void validateParams() { if(coreThreads < 0) throw new IllegalArgumentException("core.threads cannot be less than 1"); @@ -516,145 +584,180 @@ public static VoldemortConfig loadFromVoldemortHome(String voldemortHome, return new VoldemortConfig(properties); } - /** - * The interval at which gossip is run to exchange metadata - */ - public int getGossipInterval() { - return gossipInterval; - } - - public void setGossipInterval(int gossipInterval) { - this.gossipInterval = gossipInterval; - } - - /** - * The node id given by "node.id" property default: VOLDEMORT_NODE_ID - * environment variable - */ public int getNodeId() { return nodeId; } + /** + * Id of the server within the cluster. The server matches up this id with + * the information in cluster.xml to determine what partitions belong to it + * + *
+     * <ul>
+     * <li>Property : "node.id"</li>
+     * <li>Default : VOLDEMORT_NODE_ID env variable</li>
+     * </ul>
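+     *
+     * Illustration only (the value below is hypothetical, not part of this
+     * change): a server properties line such as
+     *
+     *   node.id=1
+     *
+     * must match the id of one of the nodes declared in cluster.xml.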
    + */ public void setNodeId(int nodeId) { this.nodeId = nodeId; } - /** - * The node id given by "voldemort.home" default: VOLDEMORT_HOME environment - * variable - */ public String getVoldemortHome() { return voldemortHome; } + /** + *
+     * <ul>
+     * <li>Property : "voldemort.home"</li>
+     * <li>Default : VOLDEMORT_HOME environment variable</li>
+     * </ul>
    + */ public void setVoldemortHome(String voldemortHome) { this.voldemortHome = voldemortHome; } - /** - * The directory name given by "data.directory" default: voldemort.home/data - */ public String getDataDirectory() { return dataDirectory; } + /** + * The directory name given by "data.directory" default: voldemort.home/data + * + *
+     * <ul>
+     * <li>Property : "data.directory"</li>
+     * <li>Default : VOLDEMORT_HOME/data</li>
+     * </ul>
    + */ public void setDataDirectory(String dataDirectory) { this.dataDirectory = dataDirectory; } - /** - * The directory name given by "metadata.directory" default: - * voldemort.home/config - */ public String getMetadataDirectory() { return metadataDirectory; } + /** + * The directory name given by "metadata.directory" default: + * voldemort.home/config + * + *
+     * <ul>
+     * <li>Property : "metadata.directory"</li>
+     * <li>Default : VOLDEMORT_HOME/config</li>
+     * </ul>
    + */ public void setMetadataDirectory(String metadataDirectory) { this.metadataDirectory = metadataDirectory; } - /** - * The cache size given by "bdb.cache.size" in bytes default: 200MB - */ public long getBdbCacheSize() { return bdbCacheSize; } + /** + * The size of BDB Cache to hold portions of the BTree. + * + *
+     * <ul>
+     * <li>Property : "bdb.cache.size"</li>
+     * <li>Default : 200MB</li>
+     * </ul>
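+     *
+     * Illustration only (the value below is hypothetical, not part of this
+     * change), as a server properties line; the value is in bytes:
+     *
+     *   bdb.cache.size=1073741824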
    + */ public void setBdbCacheSize(int bdbCacheSize) { this.bdbCacheSize = bdbCacheSize; } + public boolean getBdbExposeSpaceUtilization() { + return bdbExposeSpaceUtilization; + } + /** * This parameter controls whether we expose space utilization via MBean. If * set to false, stat will always return 0; * + *
+     * <ul>
+     * <li>Property : "bdb.expose.space.utilization"</li>
+     * <li>Default : true</li>
+     * </ul>
    */ - public boolean getBdbExposeSpaceUtilization() { - return bdbExposeSpaceUtilization; - } - public void setBdbExposeSpaceUtilization(boolean bdbExposeSpaceUtilization) { this.bdbExposeSpaceUtilization = bdbExposeSpaceUtilization; } - /** - * Given by "bdb.flush.transactions". If true then sync transactions to disk - * immediately. default: false - */ public boolean isBdbFlushTransactionsEnabled() { return bdbFlushTransactions; } + /** + * If true then sync transactions to disk immediately. + * + *
+     * <ul>
+     * <li>Property : "bdb.flush.transactions"</li>
+     * <li>Default : false</li>
+     * </ul>
    + * + */ public void setBdbFlushTransactions(boolean bdbSyncTransactions) { this.bdbFlushTransactions = bdbSyncTransactions; } - /** - * The directory in which bdb data is stored. Given by "bdb.data.directory" - * default: data.directory/bdb - */ public String getBdbDataDirectory() { return bdbDataDirectory; } + /** + * The directory in which bdb data is stored. + * + *
+     * <ul>
+     * <li>Property : "bdb.data.directory"</li>
+     * <li>Default : data.directory/bdb</li>
+     * </ul>
    + */ public void setBdbDataDirectory(String bdbDataDirectory) { this.bdbDataDirectory = bdbDataDirectory; } - /** - * The maximum size of a single .jdb log file in bytes. Given by - * "bdb.max.logfile.size" default: 60MB - */ public long getBdbMaxLogFileSize() { return this.bdbMaxLogFileSize; } + /** + * The maximum size of a single .jdb log file in bytes. + * + *
+     * <ul>
+     * <li>Property : "bdb.max.logfile.size"</li>
+     * <li>Default : 60MB</li>
+     * </ul>
+ */ public void setBdbMaxLogFileSize(long bdbMaxLogFileSize) { this.bdbMaxLogFileSize = bdbMaxLogFileSize; } + public int getBdbCleanerMinFileUtilization() { + return bdbCleanerMinFileUtilization; + } + /** * A log file will be cleaned if its utilization percentage is below this - * value, irrespective of total utilization. + * value, irrespective of total utilization. In practice, setting this to a + * value greater than 0 might hurt performance if the workload generates a + * cleaning pattern with a heavily skewed utilization distribution among the + * .jdb files * *
      * <ul>
      * <li>property: "bdb.cleaner.minFileUtilization"</li>
-     * <li>default: 5</li>
+     * <li>default: 0</li>
      * <li>minimum: 0</li>
      * <li>maximum: 50</li>
      * </ul>
    */ - public int getBdbCleanerMinFileUtilization() { - return bdbCleanerMinFileUtilization; - } - public final void setBdbCleanerMinFileUtilization(int minFileUtilization) { if(minFileUtilization < 0 || minFileUtilization > 50) throw new IllegalArgumentException("minFileUtilization should be between 0 and 50 (both inclusive)"); this.bdbCleanerMinFileUtilization = minFileUtilization; } + public boolean getBdbCheckpointerHighPriority() { + return bdbCheckpointerHighPriority; + } + /** * If true, the checkpointer uses more resources in order to complete the * checkpoint in a shorter time interval. @@ -664,14 +767,14 @@ public final void setBdbCleanerMinFileUtilization(int minFileUtilization) { *
  • default: false
  • * */ - public boolean getBdbCheckpointerHighPriority() { - return bdbCheckpointerHighPriority; - } - public final void setBdbCheckpointerHighPriority(boolean bdbCheckpointerHighPriority) { this.bdbCheckpointerHighPriority = bdbCheckpointerHighPriority; } + public int getBdbCleanerMaxBatchFiles() { + return bdbCleanerMaxBatchFiles; + } + /** * The maximum number of log files in the cleaner's backlog, or zero if * there is no limit @@ -683,16 +786,16 @@ public final void setBdbCheckpointerHighPriority(boolean bdbCheckpointerHighPrio *
  • maximum: 100000
  • * */ - public int getBdbCleanerMaxBatchFiles() { - return bdbCleanerMaxBatchFiles; - } - public final void setBdbCleanerMaxBatchFiles(int bdbCleanerMaxBatchFiles) { if(bdbCleanerMaxBatchFiles < 0 || bdbCleanerMaxBatchFiles > 100000) throw new IllegalArgumentException("bdbCleanerMaxBatchFiles should be between 0 and 100000 (both inclusive)"); this.bdbCleanerMaxBatchFiles = bdbCleanerMaxBatchFiles; } + public int getBdbCleanerThreads() { + return bdbCleanerThreads; + } + /** * * The number of cleaner threads @@ -703,31 +806,59 @@ public final void setBdbCleanerMaxBatchFiles(int bdbCleanerMaxBatchFiles) { *
  • minimum: 1
  • * */ - public int getBdbCleanerThreads() { - return bdbCleanerThreads; - } - public final void setBdbCleanerThreads(int bdbCleanerThreads) { if(bdbCleanerThreads <= 0) throw new IllegalArgumentException("bdbCleanerThreads should be greater than 0"); this.bdbCleanerThreads = bdbCleanerThreads; } + public long getBdbCleanerBytesInterval() { + return bdbCleanerBytesInterval; + } + + /** + * + * Amount of bytes written before the Cleaner wakes up to check for + * utilization + * + *
+     * <ul>
+     * <li>property: "bdb.cleaner.interval.bytes"</li>
+     * <li>default: 30MB</li>
+     * </ul>
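+     *
+     * For reference, the 30MB default follows the constructor comment above:
+     * the default "bdb.max.logfile.size" (60MB) times the default
+     * "bdb.cleaner.minUtilization" (50%) gives 30MB written between cleaner
+     * wake-ups.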
    + */ + public final void setCleanerBytesInterval(long bdbCleanerBytesInterval) { + this.bdbCleanerBytesInterval = bdbCleanerBytesInterval; + } + public int getBdbCleanerLookAheadCacheSize() { return bdbCleanerLookAheadCacheSize; } + /** + * Buffer size used by cleaner to fetch BTree nodes during cleaning. + * + *
+     * <ul>
+     * <li>property: "bdb.cleaner.lookahead.cache.size"</li>
+     * <li>default: 8192</li>
+     * </ul>
    + * + */ public final void setBdbCleanerLookAheadCacheSize(int bdbCleanerLookAheadCacheSize) { if(bdbCleanerLookAheadCacheSize < 0) throw new IllegalArgumentException("bdbCleanerLookAheadCacheSize should be at least 0"); this.bdbCleanerLookAheadCacheSize = bdbCleanerLookAheadCacheSize; } + public long getBdbLockTimeoutMs() { + return bdbLockTimeoutMs; + } + /** * * The lock timeout for all transactional and non-transactional operations. * Value of zero disables lock timeouts i.e. a deadlock scenario will block - * forever + * forever. High locktimeout combined with a highly concurrent workload, + * might have adverse impact on latency for all stores * *
      *
    • property: "bdb.lock.timeout.ms"
    • @@ -736,16 +867,25 @@ public final void setBdbCleanerLookAheadCacheSize(int bdbCleanerLookAheadCacheSi *
    • maximum: 75 * 60 * 1000
    • *
    */ - public long getBdbLockTimeoutMs() { - return bdbLockTimeoutMs; - } - public final void setBdbLockTimeoutMs(long bdbLockTimeoutMs) { if(bdbLockTimeoutMs < 0) throw new IllegalArgumentException("bdbLockTimeoutMs should be greater than 0"); this.bdbLockTimeoutMs = bdbLockTimeoutMs; } + public int getBdbLockNLockTables() { + return bdbLockNLockTables; + } + + /** + * The size of the lock table used by BDB JE + * + *
+     * <ul>
+     * <li>Property : "bdb.lock.nLockTables"</li>
+     * <li>Default : 7</li>
+     * </ul>
    + * + */ public void setBdbLockNLockTables(int bdbLockNLockTables) { if(bdbLockNLockTables < 1 || bdbLockNLockTables > 32767) throw new IllegalArgumentException("bdbLockNLockTables should be greater than 0 and " @@ -753,30 +893,54 @@ public void setBdbLockNLockTables(int bdbLockNLockTables) { this.bdbLockNLockTables = bdbLockNLockTables; } - public int getBdbLockNLockTables() { - return bdbLockNLockTables; + public int getBdbLogFaultReadSize() { + return bdbLogFaultReadSize; } + /** + * Buffer for faulting in objects from disk + * + *
+     * <ul>
+     * <li>Property : "bdb.log.fault.read.size"</li>
+     * <li>Default : 2048</li>
+     * </ul>
    + * + * @return + */ public void setBdbLogFaultReadSize(int bdbLogFaultReadSize) { this.bdbLogFaultReadSize = bdbLogFaultReadSize; } - public int getBdbLogFaultReadSize() { - return bdbLogFaultReadSize; + public int getBdbLogIteratorReadSize() { + return bdbLogIteratorReadSize; } + /** + * Buffer size used by BDB JE for reading the log eg: Cleaning. + * + *
+     * <ul>
+     * <li>Property : "bdb.log.iterator.read.size"</li>
+     * <li>Default : 8192</li>
+     * </ul>
    + * + */ public void setBdbLogIteratorReadSize(int bdbLogIteratorReadSize) { this.bdbLogIteratorReadSize = bdbLogIteratorReadSize; } - public int getBdbLogIteratorReadSize() { - return bdbLogIteratorReadSize; - } - public boolean getBdbFairLatches() { return bdbFairLatches; } + /** + * Controls whether BDB JE should use latches instead of synchronized blocks + * + *
+     * <ul>
+     * <li>Property : "bdb.fair.latches"</li>
+     * <li>Default : false</li>
+     * </ul>
    + * + */ public void setBdbFairLatches(boolean bdbFairLatches) { this.bdbFairLatches = bdbFairLatches; } @@ -785,10 +949,23 @@ public boolean getBdbReadUncommitted() { return bdbReadUncommitted; } + /** + * If true, BDB JE get() will not be blocked by put() + * + *
+     * <ul>
+     * <li>Property : "bdb.lock.read_uncommitted"</li>
+     * <li>Default : true</li>
+     * </ul>
    + * + */ public void setBdbReadUncommitted(boolean bdbReadUncommitted) { this.bdbReadUncommitted = bdbReadUncommitted; } + public int getBdbCleanerMinUtilization() { + return bdbCleanerMinUtilization; + } + /** * * The cleaner will keep the total disk space utilization percentage above @@ -801,93 +978,454 @@ public void setBdbReadUncommitted(boolean bdbReadUncommitted) { *
  • maximum: 90
* </ul> */ - public int getBdbCleanerMinUtilization() { - return bdbCleanerMinUtilization; - } - public final void setBdbCleanerMinUtilization(int minUtilization) { if(minUtilization < 0 || minUtilization > 90) throw new IllegalArgumentException("minUtilization should be between 0 and 90 (both inclusive)"); this.bdbCleanerMinUtilization = minUtilization; } - /** - * - * The btree node fanout. Given by "bdb.btree.fanout". default: 512 - */ public int getBdbBtreeFanout() { return this.bdbBtreeFanout; } + /** + * The btree node fanout. Given by "bdb.btree.fanout". default: 512 + * + *
+     * <ul>
+     * <li>property: "bdb.btree.fanout"</li>
+     * <li>default: 512</li>
+     * </ul>
    + */ public void setBdbBtreeFanout(int bdbBtreeFanout) { this.bdbBtreeFanout = bdbBtreeFanout; } + public boolean getBdbCleanerLazyMigration() { + return bdbCleanerLazyMigration; + } + /** - * The comfortable number of threads the threadpool will attempt to - * maintain. Specified by "core.threads" default: max(1, floor(0.5 * - * max.threads)) + * If true, Cleaner offloads some work to application threads, to keep up + * with the write rate. Side effect is that data is staged on the JVM till + * it is flushed down by Checkpointer, hence not GC friendly (Will cause + * promotions). Use if you have lots of spare RAM but running low on + * threads/IOPS + * + *
+     * <ul>
+     * <li>property: "bdb.cleaner.lazy.migration"</li>
+     * <li>default : false</li>
+     * </ul>
    + * */ - public int getCoreThreads() { - return coreThreads; + public final void setBdbCleanerLazyMigration(boolean bdbCleanerLazyMigration) { + this.bdbCleanerLazyMigration = bdbCleanerLazyMigration; } - public void setCoreThreads(int coreThreads) { - this.coreThreads = coreThreads; + public boolean getBdbCacheModeEvictLN() { + return bdbCacheModeEvictLN; } /** - * The maximum number of threadpool threads set by "max.threads" default: - * 100 + * If true, BDB will not cache data in the JVM. This is very Java GC + * friendly, and brings a lot of predictability in performance, by greatly + * reducing constant CMS activity + * + *
+     * <ul>
+     * <li>Property : "bdb.cache.evictln"</li>
+     * <li>Default : true</li>
+     * </ul>
    + * */ - public int getMaxThreads() { - return maxThreads; + public void setBdbCacheModeEvictLN(boolean bdbCacheModeEvictLN) { + this.bdbCacheModeEvictLN = bdbCacheModeEvictLN; } - public void setMaxThreads(int maxThreads) { - this.maxThreads = maxThreads; + public boolean getBdbMinimizeScanImpact() { + return bdbMinimizeScanImpact; } - public int getAdminCoreThreads() { - return adminCoreThreads; + /** + * If true, attempts are made to minimize impact to BDB cache during scan + * jobs + * + *
+     * <ul>
+     * <li>Property : "bdb.minimize.scan.impact"</li>
+     * <li>Default : true</li>
+     * </ul>
    + * + */ + public void setBdbMinimizeScanImpact(boolean bdbMinimizeScanImpact) { + this.bdbMinimizeScanImpact = bdbMinimizeScanImpact; } - public void setAdminCoreThreads(int coreThreads) { - this.adminCoreThreads = coreThreads; + public boolean isBdbWriteTransactionsEnabled() { + return bdbWriteTransactions; } - public int getAdminMaxThreads() { - return adminMaxThreads; + /** + * Controls persistence mode for BDB JE Transaction. By default, we rely on + * the checkpointer to flush the writes + * + *
+     * <ul>
+     * <li>Property : "bdb.write.transactions"</li>
+     * <li>Default : false</li>
+     * </ul>
    + * + */ + public void setBdbWriteTransactions(boolean bdbWriteTransactions) { + this.bdbWriteTransactions = bdbWriteTransactions; } - public void setAdminMaxThreads(int maxThreads) { - this.adminMaxThreads = maxThreads; - } + /** + * If true, use separate BDB JE environment per store + * + *
+     * <ul>
+     * <li>Property : "bdb.one.env.per.store"</li>
+     * <li>Default : false</li>
+     * </ul>
+ * + */ + public void setBdbOneEnvPerStore(boolean bdbOneEnvPerStore) { + this.bdbOneEnvPerStore = bdbOneEnvPerStore; + } + + public boolean isBdbOneEnvPerStore() { + return bdbOneEnvPerStore; + } + + public boolean getBdbPrefixKeysWithPartitionId() { + return bdbPrefixKeysWithPartitionId; + } + + /** + * If true, keys will be prefixed by the partition Id on disk. This can + * dramatically speed up rebalancing and restore operations, at the cost of 2 + * bytes of extra storage per key + * + *
+ * <ul>
+ * <li>Property : "bdb.prefix.keys.with.partitionid"</li>
+ * <li>Default : true</li>
+ * </ul>
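A short sketch of enabling the per-store environment and the partition-id key prefix together; the helper method and the config variable are hypothetical, while the two setters are the ones added above.

import voldemort.server.VoldemortConfig;

class BdbLayoutSketch {

    // Sketch only: "config" is assumed to be built elsewhere, before server start.
    static void enablePartitionFriendlyLayout(VoldemortConfig config) {
        // One BDB JE environment per store isolates stores from one another.
        config.setBdbOneEnvPerStore(true);
        // Prefix each key with its 2-byte partition id to speed up restore and rebalance scans.
        config.setBdbPrefixKeysWithPartitionId(true);
    }
}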
    + * + */ + public void setBdbPrefixKeysWithPartitionId(boolean bdbPrefixKeysWithPartitionId) { + this.bdbPrefixKeysWithPartitionId = bdbPrefixKeysWithPartitionId; + } + + public long getBdbCheckpointBytes() { + return this.bdbCheckpointBytes; + } + + /** + * Checkpointer is woken up and a checkpoint is written once this many bytes + * have been logged + * + *
+ * <ul>
+ * <li>Property : "bdb.checkpoint.interval.bytes"</li>
+ * <li>Default : 200MB</li>
+ * </ul>
    + * + */ + public void setBdbCheckpointBytes(long bdbCheckpointBytes) { + this.bdbCheckpointBytes = bdbCheckpointBytes; + } + + public boolean getBdbCheckpointerOffForBatchWrites() { + return this.bdbCheckpointerOffForBatchWrites; + } + + /** + * BDB JE Checkpointer will be turned off during batch writes. This helps + * save redundant writing of index updates, as we do say large streaming + * updates + * + *
+ * <ul>
+ * <li>Property : "bdb.checkpointer.off.batch.writes"</li>
+ * <li>Default : false</li>
+ * </ul>
    + * + */ + public void setBdbCheckpointerOffForBatchWrites(boolean bdbCheckpointerOffForBulkWrites) { + this.bdbCheckpointerOffForBatchWrites = bdbCheckpointerOffForBulkWrites; + } + + public long getBdbCheckpointMs() { + return this.bdbCheckpointMs; + } + + /** + * BDB JE Checkpointer wakes up whenever this time period elapses + * + *
+ * <ul>
+ * <li>Property : "bdb.checkpoint.interval.ms"</li>
+ * <li>Default : 30s or 30000 ms</li>
+ * </ul>
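The checkpointer knobs above are usually considered together; the sketch below simply mirrors the documented defaults (200MB, 30000 ms) to make the units explicit and is not a tuning recommendation.

import voldemort.server.VoldemortConfig;

class CheckpointerTuningSketch {

    static void tune(VoldemortConfig config) {
        config.setBdbCheckpointBytes(200L * 1024 * 1024); // wake up after roughly 200MB of log
        config.setBdbCheckpointMs(30 * 1000L);            // or after 30 seconds
        // Optionally silence the checkpointer while large batch/streaming writes are running.
        config.setBdbCheckpointerOffForBatchWrites(false);
    }
}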
    + * + */ + public void setBdbCheckpointMs(long bdbCheckpointMs) { + this.bdbCheckpointMs = bdbCheckpointMs; + } + + public long getBdbStatsCacheTtlMs() { + return this.bdbStatsCacheTtlMs; + } + + /** + * Interval to reuse environment stats fetched from BDB. Once the interval + * expires, a fresh call will be made + * + *
      + *
    • Property : "bdb.stats.cache.ttl.ms"
    • + *
    • Default : 5s
    • + *
    + * + */ + public void setBdbStatsCacheTtlMs(long statsCacheTtlMs) { + this.bdbStatsCacheTtlMs = statsCacheTtlMs; + } + + public long getBdbMinimumSharedCache() { + return this.bdbMinimumSharedCache; + } + + /** + * When using partitioned caches, this parameter controls the minimum amount + * of memory reserved for the global pool. Any memory-footprint reservation + * that will break this guarantee will fail. + * + *
      + *
    • Property : "bdb.minimum.shared.cache"
    • + *
    • Default : 0
    • + *
    + * + */ + public void setBdbMinimumSharedCache(long minimumSharedCache) { + this.bdbMinimumSharedCache = minimumSharedCache; + } + + public boolean isBdbLevelBasedEviction() { + return bdbLevelBasedEviction; + } + + /** + * Controls if BDB JE cache eviction happens based on LRU or by BTree level. + * + *
      + *
    • Property : "bdb.evict.by.level"
    • + *
    • Default : false
    • + *
    + * + */ + public void setBdbLevelBasedEviction(boolean bdbLevelBasedEviction) { + this.bdbLevelBasedEviction = bdbLevelBasedEviction; + } + + public boolean getBdbProactiveBackgroundMigration() { + return bdbProactiveBackgroundMigration; + } + + /** + * Exposes BDB JE EnvironmentConfig.CLEANER_PROACTIVE_BACKGROUND_MIGRATION. + * + *
      + *
    • Property : "bdb.proactive.background.migration"
    • + *
    • Default : false
    • + *
    + * + */ + public void setBdbProactiveBackgroundMigration(boolean bdbProactiveBackgroundMigration) { + this.bdbProactiveBackgroundMigration = bdbProactiveBackgroundMigration; + } + + public int getCoreThreads() { + return coreThreads; + } + + /** + * The comfortable number of threads the threadpool will attempt to + * maintain. Not applicable with enable.nio=true and not officially + * supported anymore + * + *
      + *
    • Property : "core.threads"
    • + *
    • Default : max(1, floor(0.5 * max.threads)
    • + *
    + * + */ + @Deprecated + public void setCoreThreads(int coreThreads) { + this.coreThreads = coreThreads; + } + + public int getMaxThreads() { + return maxThreads; + } + + /** + * The maximum number of threads in the server thread pool. Not applicable + * with enable.nio.connector=true. Not officially supported anymore + * + *
      + *
    • Property : "max.threads"
    • + *
    • Default : 100
    • + *
    + * + */ + @Deprecated + public void setMaxThreads(int maxThreads) { + this.maxThreads = maxThreads; + } + + public int getAdminCoreThreads() { + return adminCoreThreads; + } + + /** + * Number of threads that the admin service thread pool will attempt to keep + * around. Not applicable with enable.nio.connector=true + * + *
      + *
    • Property : "admin.core.threads"
    • + *
    • Default : max(1, adminMaxThreads/2)
    • + *
    + * + */ + public void setAdminCoreThreads(int coreThreads) { + this.adminCoreThreads = coreThreads; + } + + public int getAdminMaxThreads() { + return adminMaxThreads; + } + + /** + * Maximum number of threads in the admin service thread pool. Not + * applicable with enable.nio=true + * + *
      + *
    • Property : "admin.max.threads"
    • + *
    • Default : 20
    • + *
    + * + */ + public void setAdminMaxThreads(int maxThreads) { + this.adminMaxThreads = maxThreads; + } + + public boolean getUseNioConnector() { + return this.useNioConnector; + } + + /** + * Determines whether the server will use NIO style selectors while handling + * requests. This is recommended over using old style BIO. + * + *
+ * <ul>
+ * <li>Property : "enable.nio.connector"</li>
+ * <li>Default : true</li>
+ * </ul>
    + * + */ + public void setUseNioConnector(boolean useNio) { + this.useNioConnector = useNio; + } + + public int getNioConnectorSelectors() { + return nioConnectorSelectors; + } + + /** + * Number of NIO server threads to use to process client requests + * + *
+ * <ul>
+ * <li>Property : nio.connector.selectors</li>
+ * <li>Default : max(8, number of available processors)</li>
+ * </ul>
    + * + * + */ + public void setNioConnectorSelectors(int nioConnectorSelectors) { + this.nioConnectorSelectors = nioConnectorSelectors; + } + + public int getNioAdminConnectorSelectors() { + return nioAdminConnectorSelectors; + } + + /** + * Number of admin NIO server threads to spin up. + * + *
+ * <ul>
+ * <li>Property : nio.admin.connector.selectors</li>
+ * <li>Default : max(8, number of available processors)</li>
+ * </ul>
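To make the documented default concrete, this sketch derives the selector count as max(8, available processors) and applies it to both the client-facing and the admin NIO services.

import voldemort.server.VoldemortConfig;

class NioSelectorSizingSketch {

    static void size(VoldemortConfig config) {
        int selectors = Math.max(8, Runtime.getRuntime().availableProcessors());
        config.setUseNioConnector(true);                 // NIO is recommended over the old BIO path
        config.setNioConnectorSelectors(selectors);      // selector threads for client requests
        config.setNioAdminConnectorSelectors(selectors); // selector threads for admin requests
    }
}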
    + * + * + */ + public void setNioAdminConnectorSelectors(int nioAdminConnectorSelectors) { + this.nioAdminConnectorSelectors = nioAdminConnectorSelectors; + } public boolean isHttpServerEnabled() { return enableHttpServer; } + /** + * Whether or not the {@link HttpService} is enabled + *
      + *
    • Property :"http.enable"
    • + *
    • Default :true
    • + *
    + * + */ public void setEnableHttpServer(boolean enableHttpServer) { this.enableHttpServer = enableHttpServer; } + /** + * Determines whether the socket server will be enabled for BIO/NIO request + * handling + * + *
      + *
    • Property :"socket.enable"
    • + *
    • Default :true
    • + *
    + * + */ public boolean isSocketServerEnabled() { return enableSocketServer; } - public void setAdminServerEnabled(boolean enableSocketServer) { - this.enableSocketServer = enableSocketServer; - } - public boolean isAdminServerEnabled() { return enableAdminServer; } + /** + * Determine whether the admin service has been enabled to perform + * maintenance operations on the server + * + *
      + *
    • Property : "admin.enable"
    • + *
    • Default : true
    • + *
    + */ + public void setAdminServerEnabled(boolean enableAdminServer) { + this.enableAdminServer = enableAdminServer; + } + public long getStreamMaxReadBytesPerSec() { return streamMaxReadBytesPerSec; } + /** + * Maximum amount of data read out of the server by streaming operations + * + *
+ * <ul>
+ * <li>Property : "stream.read.byte.per.sec"</li>
+ * <li>Default : 10MB</li>
+ * </ul>
    + * + */ public void setStreamMaxReadBytesPerSec(long streamMaxReadBytesPerSec) { this.streamMaxReadBytesPerSec = streamMaxReadBytesPerSec; } @@ -896,6 +1434,16 @@ public long getStreamMaxWriteBytesPerSec() { return streamMaxWriteBytesPerSec; } + /** + * Maximum amount of data to be written into the server by streaming + * operations + * + *
+ * <ul>
+ * <li>Property : "stream.write.byte.per.sec"</li>
+ * <li>Default : 10MB</li>
+ * </ul>
    + * + */ public void setStreamMaxWriteBytesPerSec(long streamMaxWriteBytesPerSec) { this.streamMaxWriteBytesPerSec = streamMaxWriteBytesPerSec; } @@ -904,6 +1452,16 @@ public long getSlopMaxWriteBytesPerSec() { return slopMaxWriteBytesPerSec; } + /** + * Controls the rate at which the {@link StreamingSlopPusherJob} will send + * slop writes over the wire + * + *
+ * <ul>
+ * <li>Property :"slop.write.byte.per.sec"</li>
+ * <li>Default :10MB</li>
+ * </ul>
    + * + */ public void setSlopMaxWriteBytesPerSec(long slopMaxWriteBytesPerSec) { this.slopMaxWriteBytesPerSec = slopMaxWriteBytesPerSec; } @@ -912,18 +1470,33 @@ public long getSlopMaxReadBytesPerSec() { return slopMaxReadBytesPerSec; } + /** + * Controls the rate at which the {@link StreamingSlopPusherJob} reads the + * 'slop' store and drains it off to another server + * + *
+ * <ul>
+ * <li>Property :"slop.read.byte.per.sec"</li>
+ * <li>Default :10MB</li>
+ * </ul>
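The four streaming and slop throttles above share the same 10MB default; the sketch below applies them uniformly, with the value spelled out in bytes.

import voldemort.server.VoldemortConfig;

class StreamingThrottleSketch {

    static void throttle(VoldemortConfig config) {
        long tenMb = 10L * 1024 * 1024;
        config.setStreamMaxReadBytesPerSec(tenMb);  // streaming reads out of the server
        config.setStreamMaxWriteBytesPerSec(tenMb); // streaming writes into the server
        config.setSlopMaxWriteBytesPerSec(tenMb);   // slops sent over the wire by the pusher
        config.setSlopMaxReadBytesPerSec(tenMb);    // slops drained from the local slop store
    }
}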
    + * + */ public void setSlopMaxReadBytesPerSec(long slopMaxReadBytesPerSec) { this.slopMaxReadBytesPerSec = slopMaxReadBytesPerSec; } - public void setEnableAdminServer(boolean enableAdminServer) { - this.enableAdminServer = enableAdminServer; - } - public boolean isJmxEnabled() { return enableJmx; } + /** + * Is JMX monitoring enabled on the server? + * + *
      + *
    • Property :"jmx.enable"
    • + *
    • Default : true
    • + *
    + * + */ public void setEnableJmx(boolean enableJmx) { this.enableJmx = enableJmx; } @@ -932,22 +1505,31 @@ public boolean isPipelineRoutedStoreEnabled() { return enablePipelineRoutedStore; } + /** + * {@link ClientConfig#setEnablePipelineRoutedStore(boolean)} + * + *
      + *
    • Property :"enable.pipeline.routed.store"
    • + *
    • Default :true
    • + *
    + * + */ public void setEnablePipelineRoutedStore(boolean enablePipelineRoutedStore) { this.enablePipelineRoutedStore = enablePipelineRoutedStore; } - public boolean isGuiEnabled() { - return enableGui; - } - - public void setEnableGui(boolean enableGui) { - this.enableGui = enableGui; - } - public String getMysqlUsername() { return mysqlUsername; } + /** + * user name to use with MySQL storage engine + * + *
+ * <ul>
+ * <li>Property : "mysql.user"</li>
+ * <li>Default : "root"</li>
+ * </ul>
    + */ public void setMysqlUsername(String mysqlUsername) { this.mysqlUsername = mysqlUsername; } @@ -956,6 +1538,14 @@ public String getMysqlPassword() { return mysqlPassword; } + /** + * Password to use with MySQL storage engine + * + *
+ * <ul>
+ * <li>Property :"mysql.password"</li>
+ * <li>Default :""</li>
+ * </ul>
    + */ public void setMysqlPassword(String mysqlPassword) { this.mysqlPassword = mysqlPassword; } @@ -964,6 +1554,14 @@ public String getMysqlDatabaseName() { return mysqlDatabaseName; } + /** + * MySQL database name to use + * + *
      + *
    • Property :
    • + *
    • Default :
    • + *
    + */ public void setMysqlDatabaseName(String mysqlDatabaseName) { this.mysqlDatabaseName = mysqlDatabaseName; } @@ -972,6 +1570,14 @@ public String getMysqlHost() { return mysqlHost; } + /** + * Hostname of the database server for MySQL storage engine + * + *
+ * <ul>
+ * <li>Property :"mysql.host"</li>
+ * <li>Default :"localhost"</li>
+ * </ul>
    + */ public void setMysqlHost(String mysqlHost) { this.mysqlHost = mysqlHost; } @@ -980,52 +1586,79 @@ public int getMysqlPort() { return mysqlPort; } + /** + * Port number for the MySQL database server + * + *
+ * <ul>
+ * <li>Property :"mysql.port"</li>
+ * <li>Default :3306</li>
+ * </ul>
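A sketch wiring up the MySQL storage engine settings described above; the host, credentials and port here are placeholders, not defaults.

import voldemort.server.VoldemortConfig;

class MysqlEngineSketch {

    static void configure(VoldemortConfig config) {
        config.setMysqlHost("db.example.com"); // placeholder host
        config.setMysqlPort(3306);
        config.setMysqlUsername("voldemort");  // placeholder credentials
        config.setMysqlPassword("secret");
    }
}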
    + */ public void setMysqlPort(int mysqlPort) { this.mysqlPort = mysqlPort; } - /** - * The underlying store type which will be used to store slops. Defaults to - * Bdb - */ public String getSlopStoreType() { return slopStoreType; } + /** + * The underlying store type which will be used to store slops. Defaults to + * Bdb torageConfiguration.class.getName()) + * + *
+ * <ul>
+ * <li>Property :"slop.store.engine"</li>
+ * <li>Default :BdbStorageConfiguration.TYPE_NAME</li>
+ * </ul>
    + */ public void setSlopStoreType(String slopStoreType) { this.slopStoreType = slopStoreType; } - /** - * The type of streaming job we would want to use to send hints. Defaults to - * streaming - */ public String getPusherType() { return this.pusherType; } + /** + * The type of streaming job we would want to use to send hints. Defaults to + * + *
+ * <ul>
+ * <li>Property :"pusher.type"</li>
+ * <li>Default :StreamingSlopPusherJob.TYPE_NAME</li>
+ * </ul>
    + */ public void setPusherType(String pusherType) { this.pusherType = pusherType; } - /** - * Number of zones declared down before we terminate the pusher job - */ public int getSlopZonesDownToTerminate() { return this.slopZonesDownToTerminate; } + /** + * Number of zones declared down before we terminate the pusher job + * + *
+ * <ul>
+ * <li>Property :"slop.zones.terminate"</li>
+ * <li>Default :0</li>
+ * </ul>
    + */ public void setSlopZonesDownToTerminate(int slopZonesDownToTerminate) { this.slopZonesDownToTerminate = slopZonesDownToTerminate; } - /** - * Returns the size of the batch used while streaming slops - */ public int getSlopBatchSize() { return this.slopBatchSize; } + /** + * Returns the size of the batch used while streaming slops + * + *
+ * <ul>
+ * <li>Property :"slop.batch.size"</li>
+ * <li>Default :100</li>
+ * </ul>
    + */ public void setSlopBatchSize(int slopBatchSize) { this.slopBatchSize = slopBatchSize; } @@ -1038,18 +1671,26 @@ public long getSlopFrequencyMs() { return this.slopFrequencyMs; } + /** + * Frequency at which the slop pusher attempts to push slops + * + *
+ * <ul>
+ * <li>Property :"slop.frequency.ms"</li>
+ * <li>Default :300 seconds</li>
+ * </ul>
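Putting the slop pusher settings together; the values below are the documented defaults, shown only to make the units (count, milliseconds) explicit.

import voldemort.server.VoldemortConfig;

class SlopPusherSketch {

    static void configure(VoldemortConfig config) {
        config.setSlopBatchSize(100);           // slops streamed per batch
        config.setSlopFrequencyMs(300 * 1000L); // run the pusher every 300 seconds
        config.setSlopZonesDownToTerminate(0);  // documented default
    }
}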
    + */ public void setSlopFrequencyMs(long slopFrequencyMs) { this.slopFrequencyMs = slopFrequencyMs; } - public long getRepairStartMs() { - return this.repairStartMs; - } - - public void setRepairStartMs(long repairStartMs) { - this.repairStartMs = repairStartMs; - } - + /** + * {@link ClientConfig#setSocketTimeout(int, java.util.concurrent.TimeUnit)} + * + *
      + *
    • Property :"socket.timeout.ms"
    • + *
    • Default :5000
    • + *
    + */ public void setSocketTimeoutMs(int socketTimeoutMs) { this.socketTimeoutMs = socketTimeoutMs; } @@ -1058,6 +1699,14 @@ public int getClientSelectors() { return clientSelectors; } + /** + * {@link ClientConfig#setSelectors(int)} + * + *
      + *
    • Property :"client.selectors"
    • + *
    • Default :4
    • + *
    + */ public void setClientSelectors(int clientSelectors) { this.clientSelectors = clientSelectors; } @@ -1066,10 +1715,22 @@ public int getClientRoutingTimeoutMs() { return this.clientRoutingTimeoutMs; } + /** + * {@link ClientConfig#setRoutingTimeout(int, java.util.concurrent.TimeUnit)} + * + *
      + *
    • Property :"client.routing.timeout.ms"
    • + *
    • Default :15000
    • + *
    + */ public void setClientRoutingTimeoutMs(int routingTimeoutMs) { this.clientRoutingTimeoutMs = routingTimeoutMs; } + /** + * {@link ClientConfig#setTimeoutConfig(TimeoutConfig)} + * + */ public TimeoutConfig getTimeoutConfig() { return this.clientTimeoutConfig; } @@ -1078,6 +1739,14 @@ public int getClientMaxConnectionsPerNode() { return clientMaxConnectionsPerNode; } + /** + * {@link ClientConfig#setMaxConnectionsPerNode(int)} + * + *
      + *
    • Property :"client.max.connections.per.node"
    • + *
    • Default :50
    • + *
    + */ public void setClientMaxConnectionsPerNode(int maxConnectionsPerNode) { this.clientMaxConnectionsPerNode = maxConnectionsPerNode; } @@ -1086,6 +1755,14 @@ public int getClientConnectionTimeoutMs() { return clientConnectionTimeoutMs; } + /** + * {@link ClientConfig#setConnectionTimeout(int, java.util.concurrent.TimeUnit)} + * + *
      + *
    • Property :"client.connection.timeout.ms"
    • + *
    • Default :500
    • + *
    + */ public void setClientConnectionTimeoutMs(int connectionTimeoutMs) { this.clientConnectionTimeoutMs = connectionTimeoutMs; } @@ -1112,6 +1789,14 @@ public int getClientMaxThreads() { return clientMaxThreads; } + /** + * {@link ClientConfig#setMaxThreads(int)} + * + *
      + *
    • Property :"client.max.threads"
    • + *
    • Default :500
    • + *
    + */ public void setClientMaxThreads(int clientMaxThreads) { this.clientMaxThreads = clientMaxThreads; } @@ -1120,6 +1805,14 @@ public int getClientThreadIdleMs() { return clientThreadIdleMs; } + /** + * {@link ClientConfig#setThreadIdleTime(long, java.util.concurrent.TimeUnit)} + * + *
      + *
    • Property :"client.thread.idle.ms"
    • + *
    • Default :100000
    • + *
    + */ public void setClientThreadIdleMs(int clientThreadIdleMs) { this.clientThreadIdleMs = clientThreadIdleMs; } @@ -1128,6 +1821,13 @@ public int getClientMaxQueuedRequests() { return clientMaxQueuedRequests; } + /** + * {@link ClientConfig#setMaxQueuedRequests(int)} + *
      + *
    • Property :
    • + *
    • Default :
    • + *
    + */ public void setClientMaxQueuedRequests(int clientMaxQueuedRequests) { this.clientMaxQueuedRequests = clientMaxQueuedRequests; } @@ -1136,6 +1836,14 @@ public boolean isSlopEnabled() { return this.enableSlop; } + /** + * Whether or not slop store should be created on the server. + * + *
      + *
    • Property :"slop.enable"
    • + *
    • Default :true
    • + *
    + */ public void setEnableSlop(boolean enableSlop) { this.enableSlop = enableSlop; } @@ -1144,6 +1852,16 @@ public boolean isSlopPusherJobEnabled() { return enableSlopPusherJob; } + /** + * Whether or not {@link StreamingSlopPusherJob} or + * {@link BlockingSlopPusherJob} should be enabled to asynchronous push + * slops to failed servers + * + *
      + *
    • Property :"slop.pusher.enable"
    • + *
    • Default :true
    • + *
    + */ public void setEnableSlopPusherJob(boolean enableSlopPusherJob) { this.enableSlopPusherJob = enableSlopPusherJob; } @@ -1152,6 +1870,14 @@ public boolean isRepairEnabled() { return this.enableRepair; } + /** + * Whether {@link RepairJob} will be enabled + * + *
      + *
    • Property :"enable.repair"
    • + *
    • Default :true
    • + *
    + */ public void setEnableRepair(boolean enableRepair) { this.enableRepair = enableRepair; } @@ -1160,6 +1886,15 @@ public boolean isVerboseLoggingEnabled() { return this.enableVerboseLogging; } + /** + * if enabled, {@link LoggingStore} will be enable to ouput more detailed + * trace debugging if needed + * + *
      + *
    • Property :"enable.verbose.logging"
    • + *
    • Default :true
    • + *
    + */ public void setEnableVerboseLogging(boolean enableVerboseLogging) { this.enableVerboseLogging = enableVerboseLogging; } @@ -1168,6 +1903,15 @@ public boolean isStatTrackingEnabled() { return this.enableStatTracking; } + /** + * If enabled, {@link StatTrackingStore} will be enabled to account + * performance statistics + * + *
      + *
    • Property :"enable.stat.tracking"
    • + *
    • Default :true
    • + *
    + */ public void setEnableStatTracking(boolean enableStatTracking) { this.enableStatTracking = enableStatTracking; } @@ -1176,46 +1920,32 @@ public boolean isMetadataCheckingEnabled() { return enableMetadataChecking; } + /** + * If enabled, {@link InvalidMetadataCheckingStore} will reject traffic that + * does not belong to this server with a {@link InvalidMetadataException} + * + *
      + *
    • Property :"enable.metadata.checking"
    • + *
    • Default :true
    • + *
    + */ public void setEnableMetadataChecking(boolean enableMetadataChecking) { this.enableMetadataChecking = enableMetadataChecking; } - public long getBdbCheckpointBytes() { - return this.bdbCheckpointBytes; - } - - public void setBdbCheckpointBytes(long bdbCheckpointBytes) { - this.bdbCheckpointBytes = bdbCheckpointBytes; - } - - public long getBdbCheckpointMs() { - return this.bdbCheckpointMs; - } - - public void setBdbCheckpointMs(long bdbCheckpointMs) { - this.bdbCheckpointMs = bdbCheckpointMs; - } - - public long getBdbStatsCacheTtlMs() { - return this.bdbStatsCacheTtlMs; - } - - public void setBdbStatsCacheTtlMs(long statsCacheTtlMs) { - this.bdbStatsCacheTtlMs = statsCacheTtlMs; - } - - public long getBdbMinimumSharedCache() { - return this.bdbMinimumSharedCache; - } - - public void setBdbMinimumSharedCache(long minimumSharedCache) { - this.bdbMinimumSharedCache = minimumSharedCache; - } - public int getSchedulerThreads() { return schedulerThreads; } + /** + * Number of {@link SchedulerService} threads to create that run all the + * background async jobs + * + *
+ * <ul>
+ * <li>Property :"scheduler.threads"</li>
+ * <li>Default :6</li>
+ * </ul>
    + */ public void setSchedulerThreads(int schedulerThreads) { this.schedulerThreads = schedulerThreads; } @@ -1224,6 +1954,15 @@ public boolean canInterruptService() { return mayInterruptService; } + /** + * Determines whether the scheduler can be allowed to interrupt a + * {@link AsyncOperation}, when terminating the job + * + *
      + *
    • Property :"service.interruptible"
    • + *
    • Default :true
    • + *
    + */ public void setInterruptible(boolean canInterrupt) { this.mayInterruptService = canInterrupt; } @@ -1232,61 +1971,112 @@ public String getReadOnlyDataStorageDirectory() { return this.readOnlyStorageDir; } + /** + * Directory to store the read-only data and index files in + * + *
      + *
    • Property :"readonly.data.directory"
    • + *
    • Default : DATA_DIR/read-only
    • + *
    + */ public void setReadOnlyDataStorageDirectory(String readOnlyStorageDir) { this.readOnlyStorageDir = readOnlyStorageDir; } - public int getReadOnlyBackups() { - return readOnlyBackups; - } - - public void setReadOnlyBackups(int readOnlyBackups) { - this.readOnlyBackups = readOnlyBackups; + public int getNumReadOnlyVersions() { + return numReadOnlyVersions; } /** - * Amount of time we will wait before we start deleting the backup. This - * happens during swaps when old backups need to be deleted. Some delay is - * required so that we don't cause a sudden increase of IOPs during swap. + * Number of previous versions to keep around for + * {@link ReadOnlyStorageEngine} * - * @return The start time in ms + *
      + *
    • Property :"readonly.backups"
    • + *
    • Default :1
    • + *
    */ + public void setNumReadOnlyVersions(int readOnlyBackups) { + this.numReadOnlyVersions = readOnlyBackups; + } + public int getReadOnlyDeleteBackupMs() { return readOnlyDeleteBackupTimeMs; } + /** + * Amount of time we will wait before we start deleting the backup. This + * happens during swaps when old backups need to be deleted. Some delay is + * + *
      + *
    • Property :"readonly.delete.backup.ms"
    • + *
    • Default :0
    • + *
    + */ public void setReadOnlyDeleteBackupMs(int readOnlyDeleteBackupTimeMs) { this.readOnlyDeleteBackupTimeMs = readOnlyDeleteBackupTimeMs; } - public boolean isBdbWriteTransactionsEnabled() { - return bdbWriteTransactions; + public String getReadOnlyKeytabPath() { + return readOnlyKeytabPath; } - public void setBdbWriteTransactions(boolean bdbWriteTransactions) { - this.bdbWriteTransactions = bdbWriteTransactions; + /** + * Path to keytab for principal used for kerberized Hadoop grids + * + *
      + *
    • Property :"readonly.keytab.path"
    • + *
    • Default :METADATA_DIR/voldemrt.headless.keytab
    • + *
    + */ + public void setReadOnlyKeytabPath(String readOnlyKeytabPath) { + this.readOnlyKeytabPath = readOnlyKeytabPath; } - public boolean isBdbSortedDuplicatesEnabled() { - return this.bdbSortedDuplicates; + public String getReadOnlyKerberosUser() { + return readOnlyKerberosUser; } - public void setBdbSortedDuplicates(boolean enable) { - this.bdbSortedDuplicates = enable; + /** + * Principal used in kerberized Hadoop grids + * + *
      + *
    • Property :"readonly.kerberos.user"
    • + *
    • Default :"voldemrt"
    • + *
    + */ + public void setReadOnlyKerberosUser(String readOnlyKerberosUser) { + this.readOnlyKerberosUser = readOnlyKerberosUser; } - public void setBdbOneEnvPerStore(boolean bdbOneEnvPerStore) { - this.bdbOneEnvPerStore = bdbOneEnvPerStore; + public String getHadoopConfigPath() { + return hadoopConfigPath; } - public boolean isBdbOneEnvPerStore() { - return bdbOneEnvPerStore; + /** + * Path to the hadoop config + * + *
      + *
    • Property :"readonly.hadoop.config.path"
    • + *
    • Default : METADATA_DIR/hadoop-conf
    • + *
    + */ + public void setHadoopConfigPath(String hadoopConfigPath) { + this.hadoopConfigPath = hadoopConfigPath; } public int getSocketBufferSize() { return socketBufferSize; } + /** + * {@link ClientConfig#setSocketBufferSize(int)} + * + *
      + *
    • Property :"socket.buffer.size"
    • + *
    • Default :64kb
    • + *
    + */ public void setSocketBufferSize(int socketBufferSize) { this.socketBufferSize = socketBufferSize; } @@ -1295,38 +2085,32 @@ public boolean getSocketKeepAlive() { return this.socketKeepAlive; } + /** + * {@link ClientConfig#setSocketKeepAlive(boolean)} + * + *
      + *
    • Property :"socket.keepalive"
    • + *
    • Default :false
    • + *
    + */ public void setSocketKeepAlive(boolean on) { this.socketKeepAlive = on; } - public boolean getUseNioConnector() { - return this.useNioConnector; - } - - public void setUseNioConnector(boolean useNio) { - this.useNioConnector = useNio; - } - - public int getNioConnectorSelectors() { - return nioConnectorSelectors; - } - - public void setNioConnectorSelectors(int nioConnectorSelectors) { - this.nioConnectorSelectors = nioConnectorSelectors; - } - - public int getNioAdminConnectorSelectors() { - return nioAdminConnectorSelectors; - } - - public void setNioAdminConnectorSelectors(int nioAdminConnectorSelectors) { - this.nioAdminConnectorSelectors = nioAdminConnectorSelectors; - } - public int getNioAcceptorBacklog() { return nioAcceptorBacklog; } + /** + * Determines the size of the {@link NioSocketService}'s accept backlog + * queue. A large enough backlog queue prevents connections from being + * dropped during connection bursts + * + *
      + *
    • Property :"nio.acceptor.backlog"
    • + *
    • Default : 256
    • + *
    + */ public void setNioAcceptorBacklog(int nioAcceptorBacklog) { this.nioAcceptorBacklog = nioAcceptorBacklog; } @@ -1335,6 +2119,14 @@ public int getAdminSocketBufferSize() { return adminStreamBufferSize; } + /** + * {@link ClientConfig#setSocketBufferSize(int)} to use for network + * operations during admin operations + *
      + *
    • Property :"admin.streams.buffer.size"
    • + *
    • Default :10MB
    • + *
    + */ public void setAdminSocketBufferSize(int socketBufferSize) { this.adminStreamBufferSize = socketBufferSize; } @@ -1343,6 +2135,17 @@ public List getStorageConfigurations() { return storageConfigurations; } + /** + * List of fully qualified class names of {@link StorageEngine} types to + * enable on the server + * + *
      + *
    • Property :"storage.configs"
    • + *
    • Default : {@link BdbStorageConfiguration} + * {@link MysqlStorageConfiguration} {@link InMemoryStorageConfiguration} + * {@link CacheStorageConfiguration} {@link ReadOnlyStorageConfiguration}
    • + *
        + */ public void setStorageConfigurations(List storageConfigurations) { this.storageConfigurations = storageConfigurations; } @@ -1351,6 +2154,14 @@ public Props getAllProps() { return this.allProps; } + /** + * {@link ClientConfig#setRequestFormatType(RequestFormatType)} + * + *
          + *
        • Property :"request.format"
        • + *
        • Default :"vp1"
        • + *
        + */ public void setRequestFormatType(RequestFormatType type) { this.requestFormatType = type; } @@ -1363,6 +2174,16 @@ public boolean isServerRoutingEnabled() { return this.enableServerRouting; } + /** + * If enabled, Routing may happen in the server,depending on store + * definition. Note that the Java Client {@link DefaultStoreClient} does not + * support this yet. + * + *
          + *
        • Property :"enable.server.routing"
        • + *
        • Default : true
        • + *
        + */ public void setEnableServerRouting(boolean enableServerRouting) { this.enableServerRouting = enableServerRouting; } @@ -1371,6 +2192,17 @@ public int getNumScanPermits() { return numScanPermits; } + /** + * Maximum number of background tasks to run parallely with the online + * traffic. This trades off between time to finish background work and + * impact on online performance eg: {@link DataCleanupJob} and + * {@link StreamingSlopPusherJob} + * + *
          + *
        • Property :"num.scan.permits"
        • + *
        • Default :1
        • + *
        + */ public void setNumScanPermits(int numScanPermits) { this.numScanPermits = numScanPermits; } @@ -1379,6 +2211,14 @@ public String getFailureDetectorImplementation() { return failureDetectorImplementation; } + /** + * {@link ClientConfig#setFailureDetectorImplementation(String)} + * + *
          + *
        • Property :"failuredetector.implementation"
        • + *
        • Default :FailureDetectorConfig.DEFAULT_IMPLEMENTATION_CLASS_NAME
        • + *
        + */ public void setFailureDetectorImplementation(String failureDetectorImplementation) { this.failureDetectorImplementation = failureDetectorImplementation; } @@ -1387,6 +2227,14 @@ public long getFailureDetectorBannagePeriod() { return failureDetectorBannagePeriod; } + /** + * {@link ClientConfig#setFailureDetectorBannagePeriod(long)} + * + *
          + *
        • Property :"failuredetector.bannage.period"
        • + *
        • Default :FailureDetectorConfig.DEFAULT_BANNAGE_PERIOD
        • + *
        + */ public void setFailureDetectorBannagePeriod(long failureDetectorBannagePeriod) { this.failureDetectorBannagePeriod = failureDetectorBannagePeriod; } @@ -1395,6 +2243,14 @@ public int getFailureDetectorThreshold() { return failureDetectorThreshold; } + /** + * {@link ClientConfig#setFailureDetectorThreshold(int)} + * + *
          + *
        • Property :"failuredetector.threshold"
        • + *
        • Default :FailureDetectorConfig.DEFAULT_THRESHOLD
        • + *
        + */ public void setFailureDetectorThreshold(int failureDetectorThreshold) { this.failureDetectorThreshold = failureDetectorThreshold; } @@ -1403,6 +2259,14 @@ public int getFailureDetectorThresholdCountMinimum() { return failureDetectorThresholdCountMinimum; } + /** + * {@link ClientConfig#setFailureDetectorThresholdCountMinimum(int)} + * + *
          + *
        • Property :"failuredetector.threshold.countminimum"
        • + *
        • Default :FailureDetectorConfig.DEFAULT_THRESHOLD_COUNT_MINIMUM
        • + *
        + */ public void setFailureDetectorThresholdCountMinimum(int failureDetectorThresholdCountMinimum) { this.failureDetectorThresholdCountMinimum = failureDetectorThresholdCountMinimum; } @@ -1411,6 +2275,14 @@ public long getFailureDetectorThresholdInterval() { return failureDetectorThresholdInterval; } + /** + * {@link ClientConfig#setFailureDetectorThresholdInterval(long)} + * + *
          + *
        • Property :"failuredetector.threshold.interval"
        • + *
        • Default :FailureDetectorConfig.DEFAULT_THRESHOLD_INTERVAL
        • + *
        + */ public void setFailureDetectorThresholdInterval(long failureDetectorThresholdInterval) { this.failureDetectorThresholdInterval = failureDetectorThresholdInterval; } @@ -1419,6 +2291,14 @@ public long getFailureDetectorAsyncRecoveryInterval() { return failureDetectorAsyncRecoveryInterval; } + /** + * {@link ClientConfig#setFailureDetectorAsyncRecoveryInterval(long)} + * + *
          + *
        • Property :"failuredetector.asyncrecovery.interval"
        • + *
        • Default :FailureDetectorConfig.DEFAULT_ASYNC_RECOVERY_INTERVAL
        • + *
        + */ public void setFailureDetectorAsyncRecoveryInterval(long failureDetectorAsyncRecoveryInterval) { this.failureDetectorAsyncRecoveryInterval = failureDetectorAsyncRecoveryInterval; } @@ -1427,6 +2307,14 @@ public List getFailureDetectorCatastrophicErrorTypes() { return failureDetectorCatastrophicErrorTypes; } + /** + * {@link ClientConfig#setFailureDetectorCatastrophicErrorTypes(List)} + * + *
          + *
        • Property :"failuredetector.catastrophic.error.types"
        • + *
        • Default :FailureDetectorConfig.DEFAULT_CATASTROPHIC_ERROR_TYPES
        • + *
        + */ public void setFailureDetectorCatastrophicErrorTypes(List failureDetectorCatastrophicErrorTypes) { this.failureDetectorCatastrophicErrorTypes = failureDetectorCatastrophicErrorTypes; } @@ -1435,6 +2323,14 @@ public long getFailureDetectorRequestLengthThreshold() { return failureDetectorRequestLengthThreshold; } + /** + * {@link ClientConfig#setFailureDetectorRequestLengthThreshold(long)} + * + *
          + *
        • Property :"failuredetector.request.length.threshold"
        • + *
        • Default :same as socket timeout
        • + *
        + */ public void setFailureDetectorRequestLengthThreshold(long failureDetectorRequestLengthThreshold) { this.failureDetectorRequestLengthThreshold = failureDetectorRequestLengthThreshold; } @@ -1443,22 +2339,117 @@ public int getRetentionCleanupFirstStartTimeInHour() { return retentionCleanupFirstStartTimeInHour; } + /** + * The first hour in the day, when the {@link DataCleanupJob} will start + *
+ * <ul>
+ * <li>Property :"retention.cleanup.first.start.hour"</li>
+ * <li>Default :0</li>
+ * </ul>
        + */ public void setRetentionCleanupFirstStartTimeInHour(int retentionCleanupFirstStartTimeInHour) { this.retentionCleanupFirstStartTimeInHour = retentionCleanupFirstStartTimeInHour; } + public int getRetentionCleanupFirstStartDayOfWeek() { + return retentionCleanupFirstStartDayOfWeek; + } + + /** + * First day of the week to run {@link DataCleanupJob}, after server starts + * up. From there on, it will run with the configured frequency. 1=SUN, + * 2=MON, 3=TUE, 4=WED, 5=THU, 6=FRI,7=SAT + * + *
+ * <ul>
+ * <li>Property :"retention.cleanup.first.start.day"</li>
+ * <li>Default :tomorrow</li>
+ * </ul>
        + */ + public void setRetentionCleanupFirstStartDayOfWeek(int retentionCleanupFirstStartDayOfWeek) { + this.retentionCleanupFirstStartDayOfWeek = retentionCleanupFirstStartDayOfWeek; + } + public int getRetentionCleanupScheduledPeriodInHour() { return retentionCleanupScheduledPeriodInHour; } + /** + * Frequency to run {@link DataCleanupJob} + * + *
          + *
        • Property :
        • + *
        • Default :
        • + *
        + */ public void setRetentionCleanupScheduledPeriodInHour(int retentionCleanupScheduledPeriodInHour) { this.retentionCleanupScheduledPeriodInHour = retentionCleanupScheduledPeriodInHour; } + public boolean getRetentionCleanupPinStartTime() { + return retentionCleanupPinStartTime; + } + + /** + * if enabled, {@link DataCleanupJob} will be pinned to the same time each + * run interval. Otherwise, it will slowly shift based on how long the job + * actually takes to complete. See + * {@link Timer#scheduleAtFixedRate(TimerTask, java.util.Date, long)} + * + *
+ * <ul>
+ * <li>Property :"retention.cleanup.pin.start.time"</li>
+ * <li>Default :true</li>
+ * </ul>
        + */ + public void setRetentionCleanupPinStartTime(boolean retentionCleanupFixStartTime) { + this.retentionCleanupPinStartTime = retentionCleanupFixStartTime; + } + + public boolean isEnforceRetentionPolicyOnRead() { + return enforceRetentionPolicyOnRead; + } + + /** + * If enabled, the server will perform an expiry check for get and getall + * and will not return stale entries + * + *
+ * <ul>
+ * <li>Property :"enforce.retention.policy.on.read"</li>
+ * <li>Default :false</li>
+ * </ul>
        + */ + public void setEnforceRetentionPolicyOnRead(boolean enforceRetentionPolicyOnRead) { + this.enforceRetentionPolicyOnRead = enforceRetentionPolicyOnRead; + } + + public boolean isDeleteExpiredValuesOnRead() { + return deleteExpiredValuesOnRead; + } + + /** + * If enabled, in addition to filtering stale entries, the server will also + * delete the stale value + * + *
+ * <ul>
+ * <li>Property :"delete.expired.values.on.read"</li>
+ * <li>Default :false</li>
+ * </ul>
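A sketch of a retention cleanup schedule using the setters above; the 2am Sunday start is hypothetical, while the day-of-week encoding (1=SUN ... 7=SAT) is the one documented above.

import voldemort.server.VoldemortConfig;

class RetentionCleanupSketch {

    static void schedule(VoldemortConfig config) {
        config.setRetentionCleanupFirstStartTimeInHour(2); // first run at 2am
        config.setRetentionCleanupFirstStartDayOfWeek(1);  // first run on a Sunday
        config.setRetentionCleanupPinStartTime(true);      // keep later runs pinned to that time
        config.setEnforceRetentionPolicyOnRead(false);     // documented default
        config.setDeleteExpiredValuesOnRead(false);        // documented default
    }
}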
        + */ + public void setDeleteExpiredValuesOnRead(boolean deleteExpiredValuesOnRead) { + this.deleteExpiredValuesOnRead = deleteExpiredValuesOnRead; + } + public int getAdminSocketTimeout() { return adminSocketTimeout; } + /** + * {@link ClientConfig#setSocketTimeout(int, java.util.concurrent.TimeUnit)} + * to use in AdminService + * + *
          + *
        • Property :"admin.client.socket.timeout.sec"
        • + *
        • Default :24 * 60 * 60
        • + *
        + */ public void setAdminSocketTimeout(int adminSocketTimeout) { this.adminSocketTimeout = adminSocketTimeout; } @@ -1467,34 +2458,49 @@ public int getAdminConnectionTimeout() { return adminConnectionTimeout; } + /** + * ( + * {@link ClientConfig#setConnectionTimeout(int, java.util.concurrent.TimeUnit)} + * to use in AdminService + * + *
          + *
        • Property :"admin.client.connection.timeout.sec"
        • + *
        • Default :60
        • + *
        + */ public void setAdminConnectionTimeout(int adminConnectionTimeout) { this.adminConnectionTimeout = adminConnectionTimeout; } - public void setMaxRebalancingAttempt(int maxRebalancingAttempt) { - this.maxRebalancingAttempt = maxRebalancingAttempt; - } - - public int getMaxRebalancingAttempt() { - return this.maxRebalancingAttempt; - } - public long getRebalancingTimeoutSec() { return rebalancingTimeoutSec; } + /** + * The maximum amount of time the server will wait for the remote + * rebalancing tasks to finish. + * + *
          + *
        • Property :"rebalancing.timeout.seconds"
        • + *
        • Default :10 * 24 * 60 * 60
        • + *
        + */ public void setRebalancingTimeoutSec(long rebalancingTimeoutSec) { this.rebalancingTimeoutSec = rebalancingTimeoutSec; } - public VoldemortConfig(int nodeId, String voldemortHome) { - this(new Props().with("node.id", nodeId).with("voldemort.home", voldemortHome)); - } - public boolean isGossipEnabled() { return enableGossip; } + /** + * Enabled gossip between servers, in server side routing.. Has no effect + * when using client side routing, as in {@link DefaultStoreClient} + *
+ * <ul>
+ * <li>Property :"enable.gossip"</li>
+ * <li>Default :false</li>
+ * </ul>
        + */ public void setEnableGossip(boolean enableGossip) { this.enableGossip = enableGossip; } @@ -1503,38 +2509,84 @@ public String getReadOnlySearchStrategy() { return readOnlySearchStrategy; } - public long getMaxBytesPerSecond() { - return maxBytesPerSecond; + public long getReadOnlyFetcherMaxBytesPerSecond() { + return readOnlyFetcherMaxBytesPerSecond; } - public void setMaxBytesPerSecond(long maxBytesPerSecond) { - this.maxBytesPerSecond = maxBytesPerSecond; + /** + * Global throttle limit for all hadoop fetches. New flows will dynamically + * share bandwidth with existing flows, to respect this parameter at all + * times. + * + *
+ * <ul>
+ * <li>Property :"fetcher.max.bytes.per.sec"</li>
+ * <li>Default :0, No throttling</li>
+ * </ul>
        + */ + public void setReadOnlyFetcherMaxBytesPerSecond(long maxBytesPerSecond) { + this.readOnlyFetcherMaxBytesPerSecond = maxBytesPerSecond; } - public long getMinBytesPerSecond() { - return minBytesPerSecond; + public long getReadOnlyFetcherMinBytesPerSecond() { + return readOnlyFetcherMinBytesPerSecond; } - public void setMinBytesPerSecond(long minBytesPerSecond) { - this.minBytesPerSecond = minBytesPerSecond; + /** + * Minimum amount of bandwidth that is guaranteed for any read only hadoop + * fetch.. New flows will be rejected if the server cannot guarantee this + * property for existing flows, if it accepts the new flow. + * + *
+ * <ul>
+ * <li>Property :"fetcher.min.bytes.per.sec"</li>
+ * <li>Default :0, no lower limit</li>
+ * </ul>
        + */ + public void setReadOnlyFetcherMinBytesPerSecond(long minBytesPerSecond) { + this.readOnlyFetcherMinBytesPerSecond = minBytesPerSecond; } - public long getReportingIntervalBytes() { - return reportingIntervalBytes; + public long getReadOnlyFetcherReportingIntervalBytes() { + return readOnlyFetcherReportingIntervalBytes; } - public void setReportingIntervalBytes(long reportingIntervalBytes) { - this.reportingIntervalBytes = reportingIntervalBytes; + /** + * Interval to report statistics for HDFS fetches + * + *
+ * <ul>
+ * <li>Property :"fetcher.reporting.interval.bytes"</li>
+ * <li>Default :25MB</li>
+ * </ul>
        + */ + public void setReadOnlyFetcherReportingIntervalBytes(long reportingIntervalBytes) { + this.readOnlyFetcherReportingIntervalBytes = reportingIntervalBytes; } public int getFetcherBufferSize() { return fetcherBufferSize; } + /** + * Size of buffer to be used for HdfsFetcher. Note that this does not apply + * to WebHDFS fetches + * + *
+ * <ul>
+ * <li>Property :"hdfs.fetcher.buffer.size"</li>
+ * <li>Default :64kb</li>
+ * </ul>
        + */ public void setFetcherBufferSize(int fetcherBufferSize) { this.fetcherBufferSize = fetcherBufferSize; } + /** + * Strategy to be used to search the read-only index for a given key. Either + * {@link BinarySearchStrategy} or {@link InterpolationSearchStrategy} + * + *
+ * <ul>
+ * <li>Property :"readonly.search.strategy"</li>
+ * <li>Default :BinarySearchStrategy.class.getName()</li>
+ * </ul>
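A sketch combining the read-only fetcher settings above; the 50MB/s ceiling is hypothetical, while the zero floor, 25MB reporting interval and 64kb buffer mirror the documented defaults.

import voldemort.server.VoldemortConfig;

class ReadOnlyFetcherSketch {

    static void throttle(VoldemortConfig config) {
        config.setReadOnlyFetcherMaxBytesPerSecond(50L * 1024 * 1024);      // hypothetical global ceiling
        config.setReadOnlyFetcherMinBytesPerSecond(0);                      // no guaranteed floor
        config.setReadOnlyFetcherReportingIntervalBytes(25L * 1024 * 1024); // stats reporting interval
        config.setFetcherBufferSize(64 * 1024);                             // HdfsFetcher buffer size
    }
}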
        + */ public void setReadOnlySearchStrategy(String readOnlySearchStrategy) { this.readOnlySearchStrategy = readOnlySearchStrategy; } @@ -1543,10 +2595,27 @@ public boolean isNetworkClassLoaderEnabled() { return enableNetworkClassLoader; } + /** + * Loads a class to be used as a {@link VoldemortFilter}. Note that this is + * not officially supported + * + *
          + *
        • Property :"enable.network.classloader"
        • + *
        • Default :false
        • + *
        + */ public void setEnableNetworkClassLoader(boolean enableNetworkClassLoader) { this.enableNetworkClassLoader = enableNetworkClassLoader; } + /** + * If enabled, Rebalancing is enabled on the server + * + *
          + *
        • Property :"enable.rebalancing"
        • + *
        • Default : true
        • + *
        + */ public void setEnableRebalanceService(boolean enableRebalanceService) { this.enableRebalanceService = enableRebalanceService; } @@ -1559,6 +2628,16 @@ public int getMaxParallelStoresRebalancing() { return maxParallelStoresRebalancing; } + /** + * The maximum number of stores that can be rebalancing at the same time. + * This is one of the parameters that trades off between rebalancing speed + * and impact to online traffic + * + *
+ * <ul>
+ * <li>Property :"max.parallel.stores.rebalancing"</li>
+ * <li>Default :3</li>
+ * </ul>
        + */ public void setMaxParallelStoresRebalancing(int maxParallelStoresRebalancing) { this.maxParallelStoresRebalancing = maxParallelStoresRebalancing; } @@ -1567,14 +2646,51 @@ public boolean getRebalancingOptimization() { return rebalancingOptimization; } - public void setMaxParallelStoresRebalancing(boolean rebalancingOptimization) { + /** + * Prevents the some unnecessary data movement during rebalancing. For + * example, If a secondary were to become the primary of the partition, no + * data will be copied from the old primary. + * + *
+ * <ul>
+ * <li>Property :"rebalancing.optimization"</li>
+ * <li>Default :true</li>
+ * </ul>
        + */ + public void setRebalancingOptimization(boolean rebalancingOptimization) { this.rebalancingOptimization = rebalancingOptimization; } + public boolean usePartitionScanForRebalance() { + return usePartitionScanForRebalance; + } + + /** + * Enables fast, efficient range scans to be used for rebalancing + * + * Note: Only valid if the storage engine supports partition scans + * {@link StorageEngine#isPartitionScanSupported()} + * + *
+ * <ul>
+ * <li>Property :"use.partition.scan.for.rebalance"</li>
+ * <li>Default :true</li>
+ * </ul>
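The rebalancing knobs above interact with each other; the sketch below keeps the documented defaults and calls out the partition-scan precondition.

import voldemort.server.VoldemortConfig;

class RebalanceTuningSketch {

    static void tune(VoldemortConfig config) {
        config.setMaxParallelStoresRebalancing(3); // more parallelism means more impact on online traffic
        config.setRebalancingOptimization(true);   // skip data copies the new layout makes unnecessary
        // Only effective when the storage engine supports partition scans.
        config.setUsePartitionScanForRebalance(true);
    }
}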
        + */ + public void setUsePartitionScanForRebalance(boolean usePartitionScanForRebalance) { + this.usePartitionScanForRebalance = usePartitionScanForRebalance; + } + public boolean isEnableJmxClusterName() { return enableJmxClusterName; } + /** + * If enabled, the cluster name will be used as a part of the Mbeans + * created. + *
          + *
        • Property :"enable.jmx.clustername"
        • + *
        • Default :false
        • + *
        + */ public void setEnableJmxClusterName(boolean enableJmxClusterName) { this.enableJmxClusterName = enableJmxClusterName; } @@ -1586,4 +2702,39 @@ public OpTimeMap testingGetSlowQueueingDelays() { public OpTimeMap testingGetSlowConcurrentDelays() { return this.testingSlowConcurrentDelays; } + + public boolean isUseMlock() { + return useMlock; + } + + /** + * If true, the server will mlock read-only index files and pin them to + * memory. This might help in controlling thrashing of index pages. + * + *
+ * <ul>
+ * <li>Property : "readonly.mlock.index"</li>
+ * <li>Default : true</li>
+ * </ul>
        + * + * @param useMlock + */ + public void setUseMlock(boolean useMlock) { + this.useMlock = useMlock; + } + + public int getGossipInterval() { + return gossipIntervalMs; + } + + /** + * When Gossip is enabled, time interval to exchange gossip messages between + * servers + *
+ * <ul>
+ * <li>Property :"gossip.interval.ms"</li>
+ * <li>Default :30000</li>
+ * </ul>
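Gossip only matters with server-side routing; a sketch enabling it with the documented 30000 ms interval follows.

import voldemort.server.VoldemortConfig;

class GossipSketch {

    static void enable(VoldemortConfig config) {
        config.setEnableGossip(true);        // exchange metadata with random peers
        config.setGossipInterval(30 * 1000); // milliseconds between gossip rounds
    }
}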
        + */ + public void setGossipInterval(int gossipIntervalMs) { + this.gossipIntervalMs = gossipIntervalMs; + } } diff --git a/src/java/voldemort/server/VoldemortServer.java b/src/java/voldemort/server/VoldemortServer.java index fd22f46f6f..c7471d827e 100644 --- a/src/java/voldemort/server/VoldemortServer.java +++ b/src/java/voldemort/server/VoldemortServer.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2012 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -83,7 +83,7 @@ public class VoldemortServer extends AbstractService { public VoldemortServer(VoldemortConfig config) { super(ServiceType.VOLDEMORT); this.voldemortConfig = config; - this.storeRepository = new StoreRepository(); + this.storeRepository = new StoreRepository(config.isJmxEnabled()); this.metadata = MetadataStore.readFromDirectory(new File(this.voldemortConfig.getMetadataDirectory()), voldemortConfig.getNodeId()); this.identityNode = metadata.getCluster().getNodeById(voldemortConfig.getNodeId()); @@ -97,7 +97,7 @@ public VoldemortServer(VoldemortConfig config, Cluster cluster) { this.identityNode = cluster.getNodeById(voldemortConfig.getNodeId()); this.checkHostName(); - this.storeRepository = new StoreRepository(); + this.storeRepository = new StoreRepository(config.isJmxEnabled()); // update cluster details in metaDataStore ConfigurationStorageEngine metadataInnerEngine = new ConfigurationStorageEngine("metadata-config-store", voldemortConfig.getMetadataDirectory()); @@ -358,9 +358,10 @@ public void restoreDataFromReplication(int numberOfParallelTransfers) { metadata.getCluster(), numberOfParallelTransfers * 2); try { - adminClient.restoreDataFromReplications(metadata.getNodeId(), numberOfParallelTransfers); + adminClient.restoreOps.restoreDataFromReplications(metadata.getNodeId(), + numberOfParallelTransfers); } finally { - adminClient.stop(); + adminClient.close(); } } diff --git a/src/java/voldemort/server/gossip/GossipService.java b/src/java/voldemort/server/gossip/GossipService.java index 170ecdc5ac..25c6d7d3d0 100644 --- a/src/java/voldemort/server/gossip/GossipService.java +++ b/src/java/voldemort/server/gossip/GossipService.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2010 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -56,6 +56,6 @@ protected void startInner() { @Override protected void stopInner() { gossiper.stop(); - adminClient.stop(); + adminClient.close(); } } diff --git a/src/java/voldemort/server/gossip/Gossiper.java b/src/java/voldemort/server/gossip/Gossiper.java index 47ecfa8093..25fa66f402 100644 --- a/src/java/voldemort/server/gossip/Gossiper.java +++ b/src/java/voldemort/server/gossip/Gossiper.java @@ -1,18 +1,18 @@ /* * Copyright 2008-2010 LinkedIn, Inc - * + * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the * License for the specific language governing permissions and limitations under * the License. -*/ + */ package voldemort.server.gossip; @@ -29,17 +29,18 @@ import voldemort.versioning.Versioned; /** - * Implementation of a Gossip protocol for metadata. - * Periodically, choose a random peer and Gossip for all metadata keys specified in + * Implementation of a Gossip protocol for metadata. Periodically, choose a + * random peer and Gossip for all metadata keys specified in * {@link voldemort.store.metadata.MetadataStore#GOSSIP_KEYS} with that peer. *

        - * Gossip between nodes A and B for a metadata key K starts with node A retrieving the - * key K and its vector clock from node B. The retrieved vector clock is then - * compared with the local vector clock for K. If the value at node B is determined to have - * come after the value at node A, node A will accept the value at node B. If the value - * at node B is determined to have come before the value at node A, node A will do nothing - * and allow node B to initiate Gossip. If the two vector clocks are found to be - * concurrent i.e., causally unrelated, an error is logged. + * Gossip between nodes A and B for a metadata key K starts with node A + * retrieving the key K and its vector clock from node B. The retrieved vector + * clock is then compared with the local vector clock for K. If the value at + * node B is determined to have come after the value at node A, node A will + * accept the value at node B. If the value at node B is determined to have come + * before the value at node A, node A will do nothing and allow node B to + * initiate Gossip. If the two vector clocks are found to be concurrent i.e., + * causally unrelated, an error is logged. *

        */ public class Gossiper implements Runnable { @@ -53,12 +54,13 @@ public class Gossiper implements Runnable { private final static Logger logger = Logger.getLogger(Gossiper.class); /** - * Create a Gossiper object, which implements {@link Runnable} allowing it to be - * run as a thread or be submitted to an Executor. + * Create a Gossiper object, which implements {@link Runnable} + * allowing it to be run as a thread or be submitted to an Executor. * - * @param metadataStore The instance of {@link voldemort.store.metadata.MetadataStore} for the - * local node. - * @param adminClient Instance of {@link voldemort.client.protocol.admin.AdminClient} + * @param metadataStore The instance of + * {@link voldemort.store.metadata.MetadataStore} for the local node. + * @param adminClient Instance of + * {@link voldemort.client.protocol.admin.AdminClient} * @param gossipInterval Interval in milliseconds at which we want * to gossip. */ @@ -86,8 +88,9 @@ public void stop() { /** * Perform Gossip: until receiving a shutdown signal, pick a peer at random, - * Gossip all keys listed in {@link voldemort.store.metadata.MetadataStore#GOSSIP_KEYS} - * with that peer and then sleep for a specified interval. + * Gossip all keys listed in + * {@link voldemort.store.metadata.MetadataStore#GOSSIP_KEYS} with that peer + * and then sleep for a specified interval. */ public void run() { while(running.get()) { @@ -99,7 +102,7 @@ public void run() { Node node = selectPeer(); adminClient.setAdminClientCluster(metadataStore.getCluster()); - if (logger.isDebugEnabled()) + if(logger.isDebugEnabled()) logger.debug("Starting gossip with " + node); for(String key: MetadataStore.GOSSIP_KEYS) { @@ -114,8 +117,8 @@ public void run() { /** * Randomly select a distinct peer. Method is protected rather - * than private, so that it may be overridden if - * peer selection logic is to be changed e.g., to add datacenter/rack awareness. + * than private, so that it may be overridden if peer selection + * logic is to be changed e.g., to add datacenter/rack awareness. * * @return Peer for Gossip. */ @@ -131,61 +134,71 @@ protected Node selectPeer() { } /** - * Perform Gossip on a specified metadata key with a remote node. As metadata is - * data, vector clocks are used to determine causality. - *

        - * Method is protected rather than private, so that it may - * be overridden if the behaviour for handling concurrent values of the same key was to be - * changed e.g., if two differently named stores were added during a network split, merge - * appropriate metadata to include both stores. + * Perform Gossip on a specified metadata key with a remote node. As + * metadata is data, vector clocks are used to determine causality. + *

        + * Method is protected rather than private, so + * that it may be overridden if the behaviour for handling concurrent values + * of the same key was to be changed e.g., if two differently named stores + * were added during a network split, merge appropriate metadata to include + * both stores. *

        - * + * * @param node Node to Gossip with. * @param key Metadata key to exchange by Gossip. */ protected void gossipKey(Node node, String key) { - if (logger.isDebugEnabled()) { + if(logger.isDebugEnabled()) { logger.debug("Gossiping key " + key); } /* - * Retrieve local and remote versions of the key. Uses AdminClient for remote as well - * as local operations (rather than going directly to MetadataStore for local operations), - * to avoid having to convert back and forth between byte[] and String. + * Retrieve local and remote versions of the key. Uses AdminClient for + * remote as well as local operations (rather than going directly to + * MetadataStore for local operations), to avoid having to convert back + * and forth between byte[] and String. */ - Versioned remoteVersioned = adminClient.getRemoteMetadata(node.getId(), key); - Versioned localVersioned = adminClient.getRemoteMetadata(metadataStore.getNodeId(), key); + Versioned remoteVersioned = adminClient.metadataMgmtOps.getRemoteMetadata(node.getId(), + key); + Versioned localVersioned = adminClient.metadataMgmtOps.getRemoteMetadata(metadataStore.getNodeId(), + key); switch(remoteVersioned.getVersion().compare(localVersioned.getVersion())) { - - // If remote version came after local version, update with remote version + + // If remote version came after local version, update with remote + // version case AFTER: { logger.info("Updating key " + key + " from " + node); - adminClient.updateRemoteMetadata(metadataStore.getNodeId(), key, remoteVersioned); - if (logger.isDebugEnabled()) { + adminClient.metadataMgmtOps.updateRemoteMetadata(metadataStore.getNodeId(), + key, + remoteVersioned); + if(logger.isDebugEnabled()) { logger.debug("Updated key " + key + ": " + remoteVersioned); } break; } - // If remote version came before the local version, do nothing and wait for the other - // node to gossip with us. + // If remote version came before the local version, do nothing and + // wait for the other + // node to gossip with us. case BEFORE: { - if (logger.isDebugEnabled()) { - logger.debug("Remote(" + remoteVersioned + ") came before, allowing them to initiate Gossip"); + if(logger.isDebugEnabled()) { + logger.debug("Remote(" + remoteVersioned + + ") came before, allowing them to initiate Gossip"); } break; } /* - * If we can't establish a causal relationship between two versions, there's a conflict. - * Ideally we should perform sensible reconciliation, but for simplicity's sake we will just - * log an error. + * If we can't establish a causal relationship between two versions, + * there's a conflict. Ideally we should perform sensible + * reconciliation, but for simplicity's sake we will just log an + * error. 
*/ case CONCURRENTLY: { - logger.error(key + " is concurrent between local node(" + localVersioned + ") and remote at " + node + - "(" + remoteVersioned + ")"); + logger.error(key + " is concurrent between local node(" + localVersioned + + ") and remote at " + node + "(" + remoteVersioned + ")"); break; } diff --git a/src/java/voldemort/server/niosocket/AsyncRequestHandler.java b/src/java/voldemort/server/niosocket/AsyncRequestHandler.java index d6af05b642..29f39dc339 100644 --- a/src/java/voldemort/server/niosocket/AsyncRequestHandler.java +++ b/src/java/voldemort/server/niosocket/AsyncRequestHandler.java @@ -25,18 +25,17 @@ import java.nio.channels.Selector; import java.nio.channels.SocketChannel; -import org.apache.commons.lang.mutable.MutableInt; import org.apache.log4j.Level; import voldemort.VoldemortException; import voldemort.client.protocol.RequestFormatType; +import voldemort.common.nio.SelectorManagerWorker; import voldemort.server.protocol.RequestHandler; import voldemort.server.protocol.RequestHandlerFactory; import voldemort.server.protocol.StreamRequestHandler; import voldemort.server.protocol.StreamRequestHandler.StreamRequestDirection; import voldemort.server.protocol.StreamRequestHandler.StreamRequestHandlerState; import voldemort.utils.ByteUtils; -import voldemort.utils.SelectorManagerWorker; /** * AsyncRequestHandler manages a Selector, SocketChannel, and RequestHandler @@ -60,16 +59,16 @@ public class AsyncRequestHandler extends SelectorManagerWorker { private StreamRequestHandler streamRequestHandler; - private MutableInt serverConnectionCount; + private NioSelectorManagerStats nioStats; public AsyncRequestHandler(Selector selector, SocketChannel socketChannel, RequestHandlerFactory requestHandlerFactory, int socketBufferSize, - MutableInt serverConnectionCount) { - super(selector, socketChannel, socketBufferSize); + NioSelectorManagerStats nioStats) { + super(selector, socketChannel, socketBufferSize, nioStats.getServerCommBufferStats()); this.requestHandlerFactory = requestHandlerFactory; - this.serverConnectionCount = serverConnectionCount; + this.nioStats = nioStats; } @Override @@ -130,8 +129,7 @@ protected void read(SelectionKey selectionKey) throws IOException { DataInputStream dataInputStream = new DataInputStream(inputStream); DataOutputStream dataOutputStream = new DataOutputStream(outputStream); - streamRequestHandler = requestHandler.handleRequest(dataInputStream, - dataOutputStream); + streamRequestHandler = requestHandler.handleRequest(dataInputStream, dataOutputStream); if(logger.isDebugEnabled()) { logger.debug("AsyncRequestHandler:read finished request from " @@ -386,7 +384,7 @@ public void close() { if(!isClosed.compareAndSet(false, true)) return; - serverConnectionCount.decrement(); + nioStats.removeConnection(); closeInternal(); } } diff --git a/src/java/voldemort/server/niosocket/NioSelectorManager.java b/src/java/voldemort/server/niosocket/NioSelectorManager.java index c24327336a..dd70f53f67 100644 --- a/src/java/voldemort/server/niosocket/NioSelectorManager.java +++ b/src/java/voldemort/server/niosocket/NioSelectorManager.java @@ -23,11 +23,12 @@ import java.util.Queue; import java.util.concurrent.ConcurrentLinkedQueue; -import org.apache.commons.lang.mutable.MutableInt; import org.apache.log4j.Level; +import voldemort.common.nio.CommBufferSizeStats; +import voldemort.common.nio.SelectorManager; import voldemort.server.protocol.RequestHandlerFactory; -import voldemort.utils.SelectorManager; +import voldemort.store.stats.Histogram; /** * 
SelectorManager handles the non-blocking polling of IO events using the @@ -100,7 +101,7 @@ public class NioSelectorManager extends SelectorManager { private final int socketBufferSize; - private MutableInt numActiveConnections; + private final NioSelectorManagerStats stats; public NioSelectorManager(InetSocketAddress endpoint, RequestHandlerFactory requestHandlerFactory, @@ -109,7 +110,7 @@ public NioSelectorManager(InetSocketAddress endpoint, this.socketChannelQueue = new ConcurrentLinkedQueue(); this.requestHandlerFactory = requestHandlerFactory; this.socketBufferSize = socketBufferSize; - this.numActiveConnections = new MutableInt(0); + this.stats = new NioSelectorManagerStats(); } public void accept(SocketChannel socketChannel) { @@ -123,6 +124,9 @@ public void accept(SocketChannel socketChannel) { @Override protected void processEvents() { try { + // update stats + stats.updateSelectStats(selectCount, selectTimeMs, processingTimeMs); + SocketChannel socketChannel = null; while((socketChannel = socketChannelQueue.poll()) != null) { @@ -160,11 +164,11 @@ protected void processEvents() { socketChannel, requestHandlerFactory, socketBufferSize, - numActiveConnections); + stats); if(!isClosed.get()) { socketChannel.register(selector, SelectionKey.OP_READ, attachment); - numActiveConnections.increment(); + stats.addConnection(); } } catch(ClosedSelectorException e) { if(logger.isDebugEnabled()) @@ -190,7 +194,7 @@ protected void processEvents() { * @return */ public Integer getNumActiveConnections() { - return numActiveConnections.toInteger(); + return stats.getNumActiveConnections(); } /** @@ -201,4 +205,20 @@ public Integer getNumActiveConnections() { public Integer getNumQueuedConnections() { return socketChannelQueue.size(); } + + public Histogram getSelectTimeMsHistogram() { + return stats.getSelectTimeMsHistogram(); + } + + public Histogram getSelectCountHistogram() { + return stats.getSelectCountHistogram(); + } + + public Histogram getProcessingTimeMsHistogram() { + return stats.getProcessingTimeMsHistogram(); + } + + public CommBufferSizeStats getCommBufferSizeStats() { + return stats.getServerCommBufferStats(); + } } diff --git a/src/java/voldemort/server/niosocket/NioSelectorManagerStats.java b/src/java/voldemort/server/niosocket/NioSelectorManagerStats.java new file mode 100644 index 0000000000..7ac1db4251 --- /dev/null +++ b/src/java/voldemort/server/niosocket/NioSelectorManagerStats.java @@ -0,0 +1,89 @@ +package voldemort.server.niosocket; + +import org.apache.commons.lang.mutable.MutableInt; + +import voldemort.common.nio.CommBufferSizeStats; +import voldemort.common.nio.SelectorManager; +import voldemort.store.stats.Histogram; + +/** + * Encapsulates all the statistics about various metrics in the NIO Network + * layer + * + */ +public class NioSelectorManagerStats { + + private static long SELECTOR_STATS_RESET_INTERVAL = 60000; + + private MutableInt numActiveConnections; + + private Histogram selectTimeMsHistogram; + + private Histogram selectCountHistogram; + + private Histogram processingTimeMsHistogram; + + private CommBufferSizeStats serverCommBufferStats; + + public NioSelectorManagerStats() { + this.numActiveConnections = new MutableInt(0); + this.serverCommBufferStats = new CommBufferSizeStats(); + + // Theoretically, the delay can be only upto SELECTOR_POLL_MS. + // But sometimes wallclock time can be higher + this.selectTimeMsHistogram = new Histogram(SelectorManager.SELECTOR_POLL_MS * 2, + 1, + SELECTOR_STATS_RESET_INTERVAL); + // Not a scientific limit. 
Not expecting a server thread to handle more + // than 100K connections. + this.selectCountHistogram = new Histogram(100000, 1, SELECTOR_STATS_RESET_INTERVAL); + // again not scientific. But we really don't care about any processing + // time higher than 15 seconds + this.processingTimeMsHistogram = new Histogram(15000, 1, SELECTOR_STATS_RESET_INTERVAL); + } + + public void addConnection() { + numActiveConnections.increment(); + } + + public void removeConnection() { + numActiveConnections.decrement(); + } + + public void updateSelectStats(int selectCount, long selectTimeMs, long processingTimeMs) { + // update selection statistics + if(selectCount > -1) { + selectCountHistogram.insert(selectCount); + selectTimeMsHistogram.insert(selectTimeMs); + } + // update processing time statistics only if some work was picked up + if(processingTimeMs > -1 && selectCount > 0) { + processingTimeMsHistogram.insert(processingTimeMs); + } + } + + /** + * Returns the number of active connections for this selector manager + * + * @return + */ + public Integer getNumActiveConnections() { + return numActiveConnections.toInteger(); + } + + public Histogram getSelectTimeMsHistogram() { + return selectTimeMsHistogram; + } + + public Histogram getSelectCountHistogram() { + return selectCountHistogram; + } + + public Histogram getProcessingTimeMsHistogram() { + return processingTimeMsHistogram; + } + + public CommBufferSizeStats getServerCommBufferStats() { + return serverCommBufferStats; + } +} diff --git a/src/java/voldemort/server/niosocket/NioSocketService.java b/src/java/voldemort/server/niosocket/NioSocketService.java index 9bc44ac0c8..82c6674f91 100644 --- a/src/java/voldemort/server/niosocket/NioSocketService.java +++ b/src/java/voldemort/server/niosocket/NioSocketService.java @@ -281,4 +281,75 @@ public final int getNumQueuedConnections() { return sum; } + @JmxGetter(name = "selectCountAvg", description = "average number of connections selected in each select() call") + public final double getSelectCountAvg() { + double sum = 0.0; + for(NioSelectorManager manager: selectorManagers) { + sum += manager.getSelectCountHistogram().getAverage(); + } + return sum / selectorManagers.length; + } + + @JmxGetter(name = "selectCount99th", description = "99th percentile of number of connections selected in each select() call") + public final double getSelectCount99th() { + double sum = 0; + for(NioSelectorManager manager: selectorManagers) { + sum += manager.getSelectCountHistogram().getQuantile(0.99); + } + return sum / selectorManagers.length; + } + + @JmxGetter(name = "selectTimeMsAvg", description = "average time spent in the select() call") + public final double getSelectTimeMsAvg() { + double sum = 0; + for(NioSelectorManager manager: selectorManagers) { + sum += manager.getSelectTimeMsHistogram().getAverage(); + } + return sum / selectorManagers.length; + } + + @JmxGetter(name = "selectTimeMs99th", description = "99th percentile of time spent in the select() call") + public final double getSelectTimeMs99th() { + double sum = 0; + for(NioSelectorManager manager: selectorManagers) { + sum += manager.getSelectTimeMsHistogram().getQuantile(0.99); + } + return sum / selectorManagers.length; + } + + @JmxGetter(name = "processingTimeMsAvg", description = "average time spent processing all read/write requests, in a select() loop") + public final double getProcessingTimeMsAvg() { + double sum = 0; + for(NioSelectorManager manager: selectorManagers) { + sum += manager.getProcessingTimeMsHistogram().getAverage(); + } + 
return sum / selectorManagers.length; + } + + @JmxGetter(name = "processingTimeMs99th", description = "99th percentile of time spent processing all the read/write requests, in a select() loop") + public final double getprocessingTimeMs99th() { + double sum = 0; + for(NioSelectorManager manager: selectorManagers) { + sum += manager.getProcessingTimeMsHistogram().getQuantile(0.99); + } + return sum / selectorManagers.length; + } + + @JmxGetter(name = "commReadBufferSize", description = "total amount of memory consumed by all the communication read buffers, in bytes") + public final double getCommReadBufferSize() { + long sum = 0; + for(NioSelectorManager manager: selectorManagers) { + sum += manager.getCommBufferSizeStats().getCommReadBufferSizeTracker().longValue(); + } + return sum; + } + + @JmxGetter(name = "commWriteBufferSize", description = "total amount of memory consumed by all the communication write buffers, in bytes") + public final double getCommWriteBufferSize() { + long sum = 0; + for(NioSelectorManager manager: selectorManagers) { + sum += manager.getCommBufferSizeStats().getCommWriteBufferSizeTracker().longValue(); + } + return sum; + } } diff --git a/src/java/voldemort/server/protocol/SocketRequestHandlerFactory.java b/src/java/voldemort/server/protocol/SocketRequestHandlerFactory.java index e3c746eaf7..51c1f01996 100644 --- a/src/java/voldemort/server/protocol/SocketRequestHandlerFactory.java +++ b/src/java/voldemort/server/protocol/SocketRequestHandlerFactory.java @@ -12,9 +12,6 @@ import voldemort.server.storage.StorageService; import voldemort.store.ErrorCodeMapper; import voldemort.store.metadata.MetadataStore; -import voldemort.store.stats.StreamStats; -import voldemort.store.stats.StreamStatsJmx; -import voldemort.utils.JmxUtils; /** * A factory that gets the appropriate request handler for a given @@ -30,7 +27,6 @@ public class SocketRequestHandlerFactory implements RequestHandlerFactory { private final VoldemortConfig voldemortConfig; private final AsyncOperationService asyncService; private final Rebalancer rebalancer; - private final StreamStats stats; public SocketRequestHandlerFactory(StorageService storageService, StoreRepository repository, @@ -44,17 +40,6 @@ public SocketRequestHandlerFactory(StorageService storageService, this.voldemortConfig = voldemortConfig; this.asyncService = asyncService; this.rebalancer = rebalancer; - this.stats = new StreamStats(); - if(null != voldemortConfig && voldemortConfig.isJmxEnabled()) - if(this.voldemortConfig.isEnableJmxClusterName()) - JmxUtils.registerMbean(new StreamStatsJmx(stats), - JmxUtils.createObjectName(metadata.getCluster().getName() - + ".voldemort.store.stats", - "admin-streaming")); - else - JmxUtils.registerMbean(new StreamStatsJmx(stats), - JmxUtils.createObjectName("voldemort.store.stats", - "admin-streaming")); } public RequestHandler getRequestHandler(RequestFormatType type) { @@ -76,8 +61,7 @@ public RequestHandler getRequestHandler(RequestFormatType type) { metadata, voldemortConfig, asyncService, - rebalancer, - stats); + rebalancer); default: throw new VoldemortException("Unknown wire format " + type); } diff --git a/src/java/voldemort/server/protocol/StreamRequestHandler.java b/src/java/voldemort/server/protocol/StreamRequestHandler.java index 39b1ecbe57..91f7d7eac3 100644 --- a/src/java/voldemort/server/protocol/StreamRequestHandler.java +++ b/src/java/voldemort/server/protocol/StreamRequestHandler.java @@ -15,6 +15,8 @@ public interface StreamRequestHandler { + public final static int 
STAT_RECORDS_INTERVAL = 100000; + /** * Handles a "segment" of a streaming request. * diff --git a/src/java/voldemort/server/protocol/admin/AdminServiceRequestHandler.java b/src/java/voldemort/server/protocol/admin/AdminServiceRequestHandler.java index 079088b9a5..109afc6276 100644 --- a/src/java/voldemort/server/protocol/admin/AdminServiceRequestHandler.java +++ b/src/java/voldemort/server/protocol/admin/AdminServiceRequestHandler.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -40,6 +40,7 @@ import voldemort.client.protocol.pb.VAdminProto.VoldemortAdminRequest; import voldemort.client.rebalance.RebalancePartitionsInfo; import voldemort.cluster.Cluster; +import voldemort.common.nio.ByteBufferBackedInputStream; import voldemort.server.StoreRepository; import voldemort.server.VoldemortConfig; import voldemort.server.protocol.RequestHandler; @@ -59,9 +60,7 @@ import voldemort.store.readonly.ReadOnlyStorageEngine; import voldemort.store.readonly.ReadOnlyUtils; import voldemort.store.slop.SlopStorageEngine; -import voldemort.store.stats.StreamStats; import voldemort.utils.ByteArray; -import voldemort.utils.ByteBufferBackedInputStream; import voldemort.utils.ByteUtils; import voldemort.utils.ClosableIterator; import voldemort.utils.EventThrottler; @@ -69,6 +68,7 @@ import voldemort.utils.Pair; import voldemort.utils.RebalanceUtils; import voldemort.utils.ReflectUtils; +import voldemort.utils.StoreInstance; import voldemort.utils.Utils; import voldemort.versioning.ObsoleteVersionException; import voldemort.versioning.VectorClock; @@ -97,7 +97,6 @@ public class AdminServiceRequestHandler implements RequestHandler { private final VoldemortConfig voldemortConfig; private final AsyncOperationService asyncService; private final Rebalancer rebalancer; - private final StreamStats stats; private FileFetcher fileFetcher; public AdminServiceRequestHandler(ErrorCodeMapper errorCodeMapper, @@ -106,8 +105,7 @@ public AdminServiceRequestHandler(ErrorCodeMapper errorCodeMapper, MetadataStore metadataStore, VoldemortConfig voldemortConfig, AsyncOperationService asyncService, - Rebalancer rebalancer, - StreamStats stats) { + Rebalancer rebalancer) { this.errorCodeMapper = errorCodeMapper; this.storageService = storageService; this.metadataStore = metadataStore; @@ -117,7 +115,6 @@ public AdminServiceRequestHandler(ErrorCodeMapper errorCodeMapper, .getContextClassLoader()); this.asyncService = asyncService; this.rebalancer = rebalancer; - this.stats = stats; setFetcherClass(voldemortConfig); } @@ -217,10 +214,10 @@ public StreamRequestHandler handleRequest(final DataInputStream inputStream, ProtoUtils.writeMessage(outputStream, handleDeleteStore(request.getDeleteStore())); break; case FETCH_STORE: - ProtoUtils.writeMessage(outputStream, handleFetchStore(request.getFetchStore())); + ProtoUtils.writeMessage(outputStream, handleFetchROStore(request.getFetchStore())); break; case SWAP_STORE: - ProtoUtils.writeMessage(outputStream, handleSwapStore(request.getSwapStore())); + ProtoUtils.writeMessage(outputStream, handleSwapROStore(request.getSwapStore())); break; case ROLLBACK_STORE: ProtoUtils.writeMessage(outputStream, @@ -239,12 +236,12 @@ public StreamRequestHandler handleRequest(final DataInputStream inputStream, handleGetROStorageFormat(request.getGetRoStorageFormat())); break; case 
FETCH_PARTITION_FILES: - return handleFetchPartitionFiles(request.getFetchPartitionFiles()); + return handleFetchROPartitionFiles(request.getFetchPartitionFiles()); case UPDATE_SLOP_ENTRIES: return handleUpdateSlopEntries(request.getUpdateSlopEntries()); case FAILED_FETCH_STORE: ProtoUtils.writeMessage(outputStream, - handleFailedFetch(request.getFailedFetchStore())); + handleFailedROFetch(request.getFailedFetchStore())); break; case REBALANCE_STATE_CHANGE: ProtoUtils.writeMessage(outputStream, @@ -493,7 +490,7 @@ public VAdminProto.GetROStorageFormatResponse handleGetROStorageFormat(VAdminPro return response.build(); } - public VAdminProto.FailedFetchStoreResponse handleFailedFetch(VAdminProto.FailedFetchStoreRequest request) { + public VAdminProto.FailedFetchStoreResponse handleFailedROFetch(VAdminProto.FailedFetchStoreRequest request) { final String storeDir = request.getStoreDir(); final String storeName = request.getStoreName(); VAdminProto.FailedFetchStoreResponse.Builder response = VAdminProto.FailedFetchStoreResponse.newBuilder(); @@ -528,37 +525,57 @@ public VAdminProto.FailedFetchStoreResponse handleFailedFetch(VAdminProto.Failed return response.build(); } - public StreamRequestHandler handleFetchPartitionFiles(VAdminProto.FetchPartitionFilesRequest request) { + public StreamRequestHandler handleFetchROPartitionFiles(VAdminProto.FetchPartitionFilesRequest request) { return new FetchPartitionFileStreamRequestHandler(request, metadataStore, voldemortConfig, - storeRepository, - stats); + storeRepository); } public StreamRequestHandler handleUpdateSlopEntries(VAdminProto.UpdateSlopEntriesRequest request) { - return new UpdateSlopEntriesRequestHandler(request, errorCodeMapper, storeRepository, stats); + return new UpdateSlopEntriesRequestHandler(request, + errorCodeMapper, + storeRepository, + voldemortConfig); } public StreamRequestHandler handleFetchPartitionEntries(VAdminProto.FetchPartitionEntriesRequest request) { boolean fetchValues = request.hasFetchValues() && request.getFetchValues(); + boolean fetchOrphaned = request.hasFetchOrphaned() && request.getFetchOrphaned(); + StorageEngine storageEngine = AdminServiceRequestHandler.getStorageEngine(storeRepository, + request.getStore()); if(fetchValues) { - return new FetchEntriesStreamRequestHandler(request, - metadataStore, - errorCodeMapper, - voldemortConfig, - storeRepository, - networkClassLoader, - stats); - } else - return new FetchKeysStreamRequestHandler(request, - metadataStore, - errorCodeMapper, - voldemortConfig, - storeRepository, - networkClassLoader, - stats); + if(storageEngine.isPartitionScanSupported() && !fetchOrphaned) + return new PartitionScanFetchEntriesRequestHandler(request, + metadataStore, + errorCodeMapper, + voldemortConfig, + storeRepository, + networkClassLoader); + else + return new FullScanFetchEntriesRequestHandler(request, + metadataStore, + errorCodeMapper, + voldemortConfig, + storeRepository, + networkClassLoader); + } else { + if(storageEngine.isPartitionScanSupported() && !fetchOrphaned) + return new PartitionScanFetchKeysRequestHandler(request, + metadataStore, + errorCodeMapper, + voldemortConfig, + storeRepository, + networkClassLoader); + else + return new FullScanFetchKeysRequestHandler(request, + metadataStore, + errorCodeMapper, + voldemortConfig, + storeRepository, + networkClassLoader); + } } public StreamRequestHandler handleUpdatePartitionEntries(VAdminProto.UpdatePartitionEntriesRequest request) { @@ -566,8 +583,7 @@ public StreamRequestHandler 
handleUpdatePartitionEntries(VAdminProto.UpdateParti errorCodeMapper, voldemortConfig, storeRepository, - networkClassLoader, - stats); + networkClassLoader); } public VAdminProto.AsyncOperationListResponse handleAsyncOperationList(VAdminProto.AsyncOperationListRequest request) { @@ -687,7 +703,7 @@ private String swapStore(String storeName, String directory) throws VoldemortExc return currentDirPath; } - public VAdminProto.SwapStoreResponse handleSwapStore(VAdminProto.SwapStoreRequest request) { + public VAdminProto.SwapStoreResponse handleSwapROStore(VAdminProto.SwapStoreRequest request) { final String dir = request.getStoreDir(); final String storeName = request.getStoreName(); VAdminProto.SwapStoreResponse.Builder response = VAdminProto.SwapStoreResponse.newBuilder(); @@ -713,7 +729,7 @@ public VAdminProto.SwapStoreResponse handleSwapStore(VAdminProto.SwapStoreReques } } - public VAdminProto.AsyncOperationStatusResponse handleFetchStore(VAdminProto.FetchStoreRequest request) { + public VAdminProto.AsyncOperationStatusResponse handleFetchROStore(VAdminProto.FetchStoreRequest request) { final String fetchUrl = request.getStoreDir(); final String storeName = request.getStoreName(); @@ -907,14 +923,14 @@ public void operate() { // Should not do rename only because then we won't // be able to rollback - adminClient.fetchPartitionFiles(nodeId, - storeName, - replicaToPartitionList, - destinationDir, - readOnlyStorageEngine.getChunkedFileSet() - .getChunkIdToNumChunks() - .keySet(), - running); + adminClient.readonlyOps.fetchPartitionFiles(nodeId, + storeName, + replicaToPartitionList, + destinationDir, + readOnlyStorageEngine.getChunkedFileSet() + .getChunkIdToNumChunks() + .keySet(), + running); } else { logger.info("Fetching entries for RW store '" + storeName @@ -952,13 +968,13 @@ public void operate() { } if(optimizedReplicaToPartitionList.size() > 0) { - Iterator>> entriesIterator = adminClient.fetchEntries(nodeId, - storeName, - optimizedReplicaToPartitionList, - filter, - false, - initialCluster, - 0); + Iterator>> entriesIterator = adminClient.bulkFetchOps.fetchEntries(nodeId, + storeName, + optimizedReplicaToPartitionList, + filter, + false, + initialCluster, + 0); long numTuples = 0; long startTime = System.currentTimeMillis(); while(running.get() && entriesIterator.hasNext()) { @@ -1003,7 +1019,7 @@ public void operate() { } } finally { - adminClient.stop(); + adminClient.close(); } } }); @@ -1072,12 +1088,12 @@ public VAdminProto.DeletePartitionEntriesResponse handleDeletePartitionEntries(V ByteArray key = entry.getFirst(); Versioned value = entry.getSecond(); throttler.maybeThrottle(key.length() + valueSize(value)); - if(RebalanceUtils.checkKeyBelongsToPartition(metadataStore.getNodeId(), - key.get(), - replicaToPartitionList, - request.hasInitialCluster() ? new ClusterMapper().readCluster(new StringReader(request.getInitialCluster())) - : metadataStore.getCluster(), - metadataStore.getStoreDef(storeName)) + if(StoreInstance.checkKeyBelongsToPartition(metadataStore.getNodeId(), + key.get(), + replicaToPartitionList, + request.hasInitialCluster() ? 
new ClusterMapper().readCluster(new StringReader(request.getInitialCluster())) + : metadataStore.getCluster(), + metadataStore.getStoreDef(storeName)) && filter.accept(key, value)) { if(storageEngine.delete(key, value.getVersion())) { deleteSuccess++; @@ -1187,9 +1203,10 @@ public VAdminProto.DeleteStoreResponse handleDeleteStore(VAdminProto.DeleteStore if(storeRepository.hasLocalStore(storeName)) { if(storeName.compareTo(SlopStorageEngine.SLOP_STORE_NAME) == 0) { - storageService.unregisterEngine(storeRepository.getStorageEngine(storeName), - false, - "slop"); + storageService.removeEngine(storeRepository.getStorageEngine(storeName), + false, + "slop", + true); } else { // update stores list in metadata store List oldStoreDefList = metadataStore.getStoreDefList(); @@ -1203,9 +1220,10 @@ public VAdminProto.DeleteStoreResponse handleDeleteStore(VAdminProto.DeleteStore newStoreDefList.add(storeDef); } else { logger.info("Deleting view '" + storeDef.getName() + "'"); - storageService.unregisterEngine(storeRepository.getStorageEngine(storeDef.getName()), - isReadOnly, - storeDef.getType()); + storageService.removeEngine(storeRepository.getStorageEngine(storeDef.getName()), + isReadOnly, + storeDef.getType(), + false); logger.info("Successfully deleted view '" + storeDef.getName() + "'"); } @@ -1214,9 +1232,10 @@ public VAdminProto.DeleteStoreResponse handleDeleteStore(VAdminProto.DeleteStore newStoreDefList.add(storeDef); } else { logger.info("Deleting store '" + storeDef.getName() + "'"); - storageService.unregisterEngine(storeRepository.getStorageEngine(storeDef.getName()), - isReadOnly, - storeDef.getType()); + storageService.removeEngine(storeRepository.getStorageEngine(storeDef.getName()), + isReadOnly, + storeDef.getType(), + true); logger.info("Successfully deleted store '" + storeDef.getName() + "'"); } @@ -1325,6 +1344,7 @@ public VAdminProto.AddStoreResponse handleAddStore(VAdminProto.AddStoreRequest r * returns * @return True if the buffer holds a complete request, false otherwise */ + @Override public boolean isCompleteRequest(ByteBuffer buffer) { DataInputStream inputStream = new DataInputStream(new ByteBufferBackedInputStream(buffer)); diff --git a/src/java/voldemort/server/protocol/admin/FetchKeysStreamRequestHandler.java b/src/java/voldemort/server/protocol/admin/FetchKeysStreamRequestHandler.java deleted file mode 100644 index 80bd5821a8..0000000000 --- a/src/java/voldemort/server/protocol/admin/FetchKeysStreamRequestHandler.java +++ /dev/null @@ -1,91 +0,0 @@ -package voldemort.server.protocol.admin; - -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; - -import voldemort.client.protocol.pb.ProtoUtils; -import voldemort.client.protocol.pb.VAdminProto; -import voldemort.client.protocol.pb.VAdminProto.FetchPartitionEntriesRequest; -import voldemort.server.StoreRepository; -import voldemort.server.VoldemortConfig; -import voldemort.store.ErrorCodeMapper; -import voldemort.store.metadata.MetadataStore; -import voldemort.store.stats.StreamStats; -import voldemort.store.stats.StreamStats.Operation; -import voldemort.utils.ByteArray; -import voldemort.utils.NetworkClassLoader; -import voldemort.utils.RebalanceUtils; - -import com.google.protobuf.Message; - -public class FetchKeysStreamRequestHandler extends FetchStreamRequestHandler { - - public FetchKeysStreamRequestHandler(FetchPartitionEntriesRequest request, - MetadataStore metadataStore, - ErrorCodeMapper errorCodeMapper, - VoldemortConfig voldemortConfig, - StoreRepository 
storeRepository, - NetworkClassLoader networkClassLoader, - StreamStats stats) { - super(request, - metadataStore, - errorCodeMapper, - voldemortConfig, - storeRepository, - networkClassLoader, - stats, - Operation.FETCH_KEYS); - logger.info("Starting fetch keys for store '" + storageEngine.getName() - + "' with replica to partition mapping " + replicaToPartitionList); - } - - public StreamRequestHandlerState handleRequest(DataInputStream inputStream, - DataOutputStream outputStream) - throws IOException { - if(!keyIterator.hasNext()) - return StreamRequestHandlerState.COMPLETE; - - long startNs = System.nanoTime(); - ByteArray key = keyIterator.next(); - stats.recordDiskTime(handle, System.nanoTime() - startNs); - - throttler.maybeThrottle(key.length()); - if(RebalanceUtils.checkKeyBelongsToPartition(nodeId, - key.get(), - replicaToPartitionList, - initialCluster, - storeDef) - && filter.accept(key, null) && counter % skipRecords == 0) { - VAdminProto.FetchPartitionEntriesResponse.Builder response = VAdminProto.FetchPartitionEntriesResponse.newBuilder(); - response.setKey(ProtoUtils.encodeBytes(key)); - - fetched++; - handle.incrementEntriesScanned(); - Message message = response.build(); - - startNs = System.nanoTime(); - ProtoUtils.writeMessage(outputStream, message); - stats.recordNetworkTime(handle, System.nanoTime() - startNs); - } - - // log progress - counter++; - - if(0 == counter % 100000) { - long totalTime = (System.currentTimeMillis() - startTime) / 1000; - - logger.info("Fetch keys scanned " + counter + " keys, fetched " + fetched - + " keys for store '" + storageEngine.getName() - + "' replicaToPartitionList:" + replicaToPartitionList + " in " + totalTime - + " s"); - } - - if(keyIterator.hasNext()) - return StreamRequestHandlerState.WRITING; - else { - stats.closeHandle(handle); - return StreamRequestHandlerState.COMPLETE; - } - } -} diff --git a/src/java/voldemort/server/protocol/admin/FetchPartitionFileStreamRequestHandler.java b/src/java/voldemort/server/protocol/admin/FetchPartitionFileStreamRequestHandler.java index a8b7b52238..87f74baa70 100644 --- a/src/java/voldemort/server/protocol/admin/FetchPartitionFileStreamRequestHandler.java +++ b/src/java/voldemort/server/protocol/admin/FetchPartitionFileStreamRequestHandler.java @@ -27,7 +27,8 @@ import voldemort.store.metadata.MetadataStore; import voldemort.store.readonly.ReadOnlyStorageConfiguration; import voldemort.store.readonly.ReadOnlyStorageEngine; -import voldemort.store.stats.StreamStats; +import voldemort.store.stats.StreamingStats; +import voldemort.store.stats.StreamingStats.Operation; import voldemort.utils.EventThrottler; import voldemort.utils.Pair; import voldemort.utils.RebalanceUtils; @@ -44,9 +45,7 @@ public class FetchPartitionFileStreamRequestHandler implements StreamRequestHand private final long blockSize; - private final StreamStats stats; - - private final StreamStats.Handle handle; + private final StreamingStats streamStats; private final Iterator> partitionIterator; @@ -79,8 +78,7 @@ private enum FetchStatus { protected FetchPartitionFileStreamRequestHandler(VAdminProto.FetchPartitionFilesRequest request, MetadataStore metadataStore, VoldemortConfig voldemortConfig, - StoreRepository storeRepository, - StreamStats stats) { + StoreRepository storeRepository) { this.request = request; StoreDefinition storeDef = metadataStore.getStoreDef(request.getStore()); boolean isReadOnly = storeDef.getType().compareTo(ReadOnlyStorageConfiguration.TYPE_NAME) == 0; @@ -100,8 +98,11 @@ protected 
FetchPartitionFileStreamRequestHandler(VAdminProto.FetchPartitionFiles voldemortConfig.getAdminSocketBufferSize()); this.storeDir = new File(storageEngine.getCurrentDirPath()); this.throttler = new EventThrottler(voldemortConfig.getStreamMaxReadBytesPerSec()); - this.stats = stats; - this.handle = stats.makeHandle(StreamStats.Operation.FETCH_FILE, replicaToPartitionList); + if(voldemortConfig.isJmxEnabled()) { + this.streamStats = storeRepository.getStreamingStats(storageEngine.getName()); + } else { + this.streamStats = null; + } this.partitionIterator = Collections.unmodifiableSet(replicaToPartitionTuples).iterator(); this.fetchStatus = FetchStatus.NEXT_PARTITION; this.currentChunkId = 0; @@ -158,7 +159,8 @@ private void handleSendIndexFile() throws IOException { this.chunkedFileWriter.close(); currentChunkId++; dataFile = indexFile = null; - handle.incrementEntriesScanned(); + if(streamStats != null) + streamStats.reportStreamingFetch(Operation.FETCH_FILE); if(currentChunkId >= numChunks) { fetchStatus = FetchStatus.NEXT_PARTITION; } else { @@ -237,9 +239,7 @@ private StreamRequestHandlerState handleNextPartition() { // partition list logger.info("Finished streaming files for partitions tuples " + replicaToPartitionTuples); - stats.closeHandle(handle); handlerState = StreamRequestHandlerState.COMPLETE; - } return handlerState; diff --git a/src/java/voldemort/server/protocol/admin/FetchStreamRequestHandler.java b/src/java/voldemort/server/protocol/admin/FetchStreamRequestHandler.java index 177fb5db95..4bb19ce66b 100644 --- a/src/java/voldemort/server/protocol/admin/FetchStreamRequestHandler.java +++ b/src/java/voldemort/server/protocol/admin/FetchStreamRequestHandler.java @@ -1,3 +1,19 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + package voldemort.server.protocol.admin; import java.io.DataOutputStream; @@ -21,15 +37,21 @@ import voldemort.store.StorageEngine; import voldemort.store.StoreDefinition; import voldemort.store.metadata.MetadataStore; -import voldemort.store.stats.StreamStats; -import voldemort.store.stats.StreamStats.Handle; +import voldemort.store.stats.StreamingStats; import voldemort.store.system.SystemStoreConstants; import voldemort.utils.ByteArray; -import voldemort.utils.ClosableIterator; import voldemort.utils.EventThrottler; import voldemort.utils.NetworkClassLoader; +import voldemort.utils.StoreInstance; +import voldemort.utils.Time; import voldemort.xml.ClusterMapper; +import com.google.protobuf.Message; + +/** + * Base class for all key/entry stream fetching handlers. 
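One job this base class takes over from the old per-handler code is progress accounting: it counts "scanned" (tuples read from disk) separately from "fetched" (tuples returned to the caller) and logs a progress line every STAT_RECORDS_INTERVAL scanned records. A small stand-alone sketch of that bookkeeping, with illustrative names rather than the handler's actual fields:

public class ScanProgressSketch {

    static final int STAT_RECORDS_INTERVAL = 100000;

    private long scanned = 0;   // tuples read from disk
    private long fetched = 0;   // tuples returned to the caller
    private final long startTimeMs = System.currentTimeMillis();

    void accountForScan() {
        scanned++;
        if(scanned % STAT_RECORDS_INTERVAL == 0) {
            long totalTimeS = (System.currentTimeMillis() - startTimeMs) / 1000;
            System.out.println("scanned " + scanned + ", fetched " + fetched + " in "
                               + totalTimeS + " s");
        }
    }

    void accountForFetch() {
        fetched++;
    }

    public static void main(String[] args) {
        ScanProgressSketch progress = new ScanProgressSketch();
        for(int i = 0; i < 250000; i++) {
            progress.accountForScan();
            if(i % 2 == 0)
                progress.accountForFetch();
        }
    }
}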
+ * + */ public abstract class FetchStreamRequestHandler implements StreamRequestHandler { protected final VAdminProto.FetchPartitionEntriesRequest request; @@ -40,31 +62,35 @@ public abstract class FetchStreamRequestHandler implements StreamRequestHandler protected final EventThrottler throttler; - protected final HashMap> replicaToPartitionList; + protected HashMap> replicaToPartitionList; protected final VoldemortFilter filter; protected final StorageEngine storageEngine; - protected final ClosableIterator keyIterator; + protected final StreamingStats streamStats; - protected long counter; + protected boolean isJmxEnabled; - protected long skipRecords; + protected final StreamingStats.Operation operation; - protected int fetched; + protected long scanned; // Read from disk. - protected final long startTime; + protected long fetched; // Returned to caller. - protected final Handle handle; + protected final long recordsPerPartition; - protected final StreamStats stats; + protected final long startTimeMs; protected final Logger logger = Logger.getLogger(getClass()); protected int nodeId; - protected StoreDefinition storeDef; + protected final StoreDefinition storeDef; + + protected boolean fetchOrphaned; + + protected final StoreInstance storeInstance; protected FetchStreamRequestHandler(VAdminProto.FetchPartitionEntriesRequest request, MetadataStore metadataStore, @@ -72,22 +98,29 @@ protected FetchStreamRequestHandler(VAdminProto.FetchPartitionEntriesRequest req VoldemortConfig voldemortConfig, StoreRepository storeRepository, NetworkClassLoader networkClassLoader, - StreamStats stats, - StreamStats.Operation operation) { + StreamingStats.Operation operation) { this.nodeId = metadataStore.getNodeId(); this.request = request; this.errorCodeMapper = errorCodeMapper; - this.replicaToPartitionList = ProtoUtils.decodePartitionTuple(request.getReplicaToPartitionList()); - this.stats = stats; - this.handle = stats.makeHandle(operation, replicaToPartitionList); + if(request.getReplicaToPartitionList() != null) + this.replicaToPartitionList = ProtoUtils.decodePartitionTuple(request.getReplicaToPartitionList()); this.storageEngine = AdminServiceRequestHandler.getStorageEngine(storeRepository, request.getStore()); + if(voldemortConfig.isJmxEnabled()) { + this.streamStats = storeRepository.getStreamingStats(this.storageEngine.getName()); + } else { + this.streamStats = null; + } + + this.operation = operation; this.storeDef = getStoreDef(request.getStore(), metadataStore); if(request.hasInitialCluster()) { this.initialCluster = new ClusterMapper().readCluster(new StringReader(request.getInitialCluster())); } else { this.initialCluster = metadataStore.getCluster(); } + this.storeInstance = new StoreInstance(this.initialCluster, this.storeDef); + this.throttler = new EventThrottler(voldemortConfig.getStreamMaxReadBytesPerSec()); if(request.hasFilter()) { this.filter = AdminServiceRequestHandler.getFilterFromRequest(request.getFilter(), @@ -96,14 +129,15 @@ protected FetchStreamRequestHandler(VAdminProto.FetchPartitionEntriesRequest req } else { this.filter = new DefaultVoldemortFilter(); } - this.keyIterator = storageEngine.keys(); - this.startTime = System.currentTimeMillis(); - this.counter = 0; + this.startTimeMs = System.currentTimeMillis(); + this.scanned = 0; - this.skipRecords = 1; - if(request.hasSkipRecords() && request.getSkipRecords() >= 0) { - this.skipRecords = request.getSkipRecords() + 1; + if(request.hasRecordsPerPartition() && request.getRecordsPerPartition() > 0) { + 
this.recordsPerPartition = request.getRecordsPerPartition(); + } else { + this.recordsPerPartition = 0; } + this.fetchOrphaned = request.hasFetchOrphaned() && request.getFetchOrphaned(); } private StoreDefinition getStoreDef(String store, MetadataStore metadataStore) { @@ -116,21 +150,21 @@ private StoreDefinition getStoreDef(String store, MetadataStore metadataStore) { return def; } + @Override public final StreamRequestDirection getDirection() { return StreamRequestDirection.WRITING; } - public final void close(DataOutputStream outputStream) throws IOException { - logger.info("Successfully scanned " + counter + " tuples, fetched " + fetched + @Override + public void close(DataOutputStream outputStream) throws IOException { + logger.info("Successfully scanned " + scanned + " tuples, fetched " + fetched + " tuples for store '" + storageEngine.getName() + "' in " - + ((System.currentTimeMillis() - startTime) / 1000) + " s"); - - if(null != keyIterator) - keyIterator.close(); + + ((System.currentTimeMillis() - startTimeMs) / 1000) + " s"); ProtoUtils.writeEndOfStream(outputStream); } + @Override public final void handleError(DataOutputStream outputStream, VoldemortException e) throws IOException { VAdminProto.FetchPartitionEntriesResponse response = VAdminProto.FetchPartitionEntriesResponse.newBuilder() @@ -143,4 +177,61 @@ public final void handleError(DataOutputStream outputStream, VoldemortException e); } + /** + * Progress info message + * + * @param tag Message that precedes progress info. Indicate 'keys' or + * 'entries'. + */ + protected void progressInfoMessage(final String tag) { + if(logger.isInfoEnabled()) { + long totalTimeS = (System.currentTimeMillis() - startTimeMs) / Time.MS_PER_SECOND; + + logger.info(tag + " : scanned " + scanned + " and fetched " + fetched + " for store '" + + storageEngine.getName() + "' replicaToPartitionList:" + + replicaToPartitionList + " in " + totalTimeS + " s"); + } + } + + /** + * Account for item being scanned. + * + * @param itemTag mad libs style string to insert into progress message. + * + */ + protected void accountForScanProgress(String itemTag) { + scanned++; + if(0 == scanned % STAT_RECORDS_INTERVAL) { + progressInfoMessage("Fetch " + itemTag + " (progress)"); + } + } + + /** + * Helper method to send message on outputStream and account for network + * time stats. + * + * @param outputStream + * @param message + * @throws IOException + */ + protected void sendMessage(DataOutputStream outputStream, Message message) throws IOException { + long startNs = System.nanoTime(); + ProtoUtils.writeMessage(outputStream, message); + if(streamStats != null) { + streamStats.reportNetworkTime(operation, System.nanoTime() - startNs); + } + } + + /** + * Helper method to track storage operations & time via StreamingStats. 
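Both helpers follow the same pattern: capture System.nanoTime() before the operation, then report the elapsed time to StreamingStats only when stats collection is enabled (streamStats is null when JMX is disabled). A minimal stand-alone sketch of that pattern; the StatsSink interface and timedRead() are hypothetical stand-ins for StreamingStats and the storage call:

import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;

public class OpTimingSketch {

    interface StatsSink {
        void reportStorageTime(long nanos);
    }

    private final StatsSink stats; // null when stats collection (JMX) is disabled

    OpTimingSketch(StatsSink stats) {
        this.stats = stats;
    }

    byte[] timedRead(Callable<byte[]> storageOp) throws Exception {
        long startNs = System.nanoTime();
        byte[] value = storageOp.call();  // the storage operation being measured
        if(stats != null)
            stats.reportStorageTime(System.nanoTime() - startNs);
        return value;
    }

    public static void main(String[] args) throws Exception {
        OpTimingSketch sketch = new OpTimingSketch(nanos -> System.out.println(
                "storage op took " + TimeUnit.NANOSECONDS.toMicros(nanos) + " us"));
        sketch.timedRead(() -> new byte[] { 1, 2, 3 });
    }
}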
+ * + * @param startNs + */ + protected void reportStorageOpTime(long startNs) { + if(streamStats != null) { + streamStats.reportStreamingScan(operation); + streamStats.reportStorageTime(operation, System.nanoTime() - startNs); + } + } + } diff --git a/src/java/voldemort/server/protocol/admin/FetchEntriesStreamRequestHandler.java b/src/java/voldemort/server/protocol/admin/FullScanFetchEntriesRequestHandler.java similarity index 52% rename from src/java/voldemort/server/protocol/admin/FetchEntriesStreamRequestHandler.java rename to src/java/voldemort/server/protocol/admin/FullScanFetchEntriesRequestHandler.java index fc31ef7746..4451980ade 100644 --- a/src/java/voldemort/server/protocol/admin/FetchEntriesStreamRequestHandler.java +++ b/src/java/voldemort/server/protocol/admin/FullScanFetchEntriesRequestHandler.java @@ -1,3 +1,19 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + package voldemort.server.protocol.admin; import java.io.DataInputStream; @@ -12,106 +28,89 @@ import voldemort.server.VoldemortConfig; import voldemort.store.ErrorCodeMapper; import voldemort.store.metadata.MetadataStore; -import voldemort.store.stats.StreamStats; -import voldemort.store.stats.StreamStats.Operation; +import voldemort.store.stats.StreamingStats.Operation; import voldemort.utils.ByteArray; import voldemort.utils.NetworkClassLoader; -import voldemort.utils.RebalanceUtils; import voldemort.versioning.Versioned; import com.google.protobuf.Message; /** - * FetchEntries fetches and return key/value entry. + * Fetches entries by scanning entire storage engine in storage-order. *
        * For performance reasons, use the storageEngine.keys() iterator to filter out * unwanted keys and then call storageEngine.get() for valid keys. *
        */ - -public class FetchEntriesStreamRequestHandler extends FetchStreamRequestHandler { - - public FetchEntriesStreamRequestHandler(FetchPartitionEntriesRequest request, - MetadataStore metadataStore, - ErrorCodeMapper errorCodeMapper, - VoldemortConfig voldemortConfig, - StoreRepository storeRepository, - NetworkClassLoader networkClassLoader, - StreamStats stats) { +public class FullScanFetchEntriesRequestHandler extends FullScanFetchStreamRequestHandler { + + public FullScanFetchEntriesRequestHandler(FetchPartitionEntriesRequest request, + MetadataStore metadataStore, + ErrorCodeMapper errorCodeMapper, + VoldemortConfig voldemortConfig, + StoreRepository storeRepository, + NetworkClassLoader networkClassLoader) { super(request, metadataStore, errorCodeMapper, voldemortConfig, storeRepository, networkClassLoader, - stats, Operation.FETCH_ENTRIES); logger.info("Starting fetch entries for store '" + storageEngine.getName() + "' with replica to partition mapping " + replicaToPartitionList); } + @Override public StreamRequestHandlerState handleRequest(DataInputStream inputStream, DataOutputStream outputStream) throws IOException { - if(!keyIterator.hasNext()) + if(!keyIterator.hasNext()) { return StreamRequestHandlerState.COMPLETE; + } + + // NOTE: Storage time is accounted for somewhat incorrectly because + // .hasNext() is invoked at end of method for the common case. + // Since key reading (keyIterator.next()) is done separately from entry + // fetching (storageEngine.get()), must be careful about when to invoke + // reportStorageOpTime and when to invoke maybeThrottle(). long startNs = System.nanoTime(); ByteArray key = keyIterator.next(); - if(RebalanceUtils.checkKeyBelongsToPartition(nodeId, - key.get(), - replicaToPartitionList, - initialCluster, - storeDef) + // Cannot invoke 'throttler.maybeThrottle(key.length());' here since + // that would affect timing measurements of storage operations. 
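The comment above is about measurement hygiene: the throttler is only invoked after the storage read has been timed, because a byte-rate throttler may sleep, and that sleep should not be attributed to storage latency. A simplified stand-in for what such a throttler does conceptually (this is not Voldemort's EventThrottler implementation):

public class SimpleThrottlerSketch {

    private final long maxBytesPerSec;
    private long windowStartMs = System.currentTimeMillis();
    private long bytesInWindow = 0;

    public SimpleThrottlerSketch(long maxBytesPerSec) {
        this.maxBytesPerSec = maxBytesPerSec;
    }

    public void maybeThrottle(int bytes) {
        bytesInWindow += bytes;
        long elapsedMs = System.currentTimeMillis() - windowStartMs;
        if(elapsedMs >= 1000) {
            // A new one-second window begins; carry only the current chunk forward.
            windowStartMs = System.currentTimeMillis();
            bytesInWindow = bytes;
        } else if(bytesInWindow > maxBytesPerSec) {
            try {
                Thread.sleep(1000 - elapsedMs); // sleep away the rest of the window
            } catch(InterruptedException e) {
                Thread.currentThread().interrupt();
            }
            windowStartMs = System.currentTimeMillis();
            bytesInWindow = 0;
        }
    }

    public static void main(String[] args) {
        SimpleThrottlerSketch throttler = new SimpleThrottlerSketch(1024 * 1024); // ~1 MB/s
        for(int i = 0; i < 8; i++)
            throttler.maybeThrottle(256 * 1024); // pretend 256 KB was just streamed
    }
}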
- && counter % skipRecords == 0) { + if(isItemAccepted(key.get())) { List> values = storageEngine.get(key, null); - stats.recordDiskTime(handle, System.nanoTime() - startNs); + reportStorageOpTime(startNs); + throttler.maybeThrottle(key.length()); for(Versioned value: values) { - throttler.maybeThrottle(key.length()); + if(filter.accept(key, value)) { - fetched++; - handle.incrementEntriesScanned(); - VAdminProto.FetchPartitionEntriesResponse.Builder response = VAdminProto.FetchPartitionEntriesResponse.newBuilder(); + accountForFetchedKey(key.get()); + VAdminProto.FetchPartitionEntriesResponse.Builder response = VAdminProto.FetchPartitionEntriesResponse.newBuilder(); VAdminProto.PartitionEntry partitionEntry = VAdminProto.PartitionEntry.newBuilder() .setKey(ProtoUtils.encodeBytes(key)) .setVersioned(ProtoUtils.encodeVersioned(value)) .build(); response.setPartitionEntry(partitionEntry); - Message message = response.build(); - startNs = System.nanoTime(); - ProtoUtils.writeMessage(outputStream, message); - stats.recordNetworkTime(handle, System.nanoTime() - startNs); + sendMessage(outputStream, message); throttler.maybeThrottle(AdminServiceRequestHandler.valueSize(value)); } } } else { - stats.recordDiskTime(handle, System.nanoTime() - startNs); + reportStorageOpTime(startNs); + throttler.maybeThrottle(key.length()); } - // log progress - counter++; + accountForScanProgress("entries"); - if(0 == counter % 100000) { - long totalTime = (System.currentTimeMillis() - startTime) / 1000; - - logger.info("Fetch entries scanned " + counter + " entries, fetched " + fetched - + " entries for store '" + storageEngine.getName() - + "' replicaToPartitionList:" + replicaToPartitionList + " in " + totalTime - + " s"); - } - - if(keyIterator.hasNext()) - return StreamRequestHandlerState.WRITING; - else { - stats.closeHandle(handle); - return StreamRequestHandlerState.COMPLETE; - } + return determineRequestHandlerState("entries"); } } diff --git a/src/java/voldemort/server/protocol/admin/FullScanFetchKeysRequestHandler.java b/src/java/voldemort/server/protocol/admin/FullScanFetchKeysRequestHandler.java new file mode 100644 index 0000000000..19a20ee85a --- /dev/null +++ b/src/java/voldemort/server/protocol/admin/FullScanFetchKeysRequestHandler.java @@ -0,0 +1,91 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.server.protocol.admin; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; + +import voldemort.client.protocol.pb.ProtoUtils; +import voldemort.client.protocol.pb.VAdminProto; +import voldemort.client.protocol.pb.VAdminProto.FetchPartitionEntriesRequest; +import voldemort.server.StoreRepository; +import voldemort.server.VoldemortConfig; +import voldemort.store.ErrorCodeMapper; +import voldemort.store.metadata.MetadataStore; +import voldemort.store.stats.StreamingStats.Operation; +import voldemort.utils.ByteArray; +import voldemort.utils.NetworkClassLoader; + +import com.google.protobuf.Message; + +/** + * Fetches keys by scanning entire storage engine in storage-order. + * + */ +public class FullScanFetchKeysRequestHandler extends FullScanFetchStreamRequestHandler { + + public FullScanFetchKeysRequestHandler(FetchPartitionEntriesRequest request, + MetadataStore metadataStore, + ErrorCodeMapper errorCodeMapper, + VoldemortConfig voldemortConfig, + StoreRepository storeRepository, + NetworkClassLoader networkClassLoader) { + super(request, + metadataStore, + errorCodeMapper, + voldemortConfig, + storeRepository, + networkClassLoader, + Operation.FETCH_KEYS); + logger.info("Starting fetch keys for store '" + storageEngine.getName() + + "' with replica to partition mapping " + replicaToPartitionList); + } + + @Override + public StreamRequestHandlerState handleRequest(DataInputStream inputStream, + DataOutputStream outputStream) + throws IOException { + if(!keyIterator.hasNext()) { + return StreamRequestHandlerState.COMPLETE; + } + + // NOTE: Storage time is accounted for somewhat incorrectly because + // .hasNext() is invoked at end of method for the common case. + long startNs = System.nanoTime(); + ByteArray key = keyIterator.next(); + reportStorageOpTime(startNs); + + throttler.maybeThrottle(key.length()); + + if(isItemAccepted(key.get())) { + if(filter.accept(key, null)) { + accountForFetchedKey(key.get()); + + VAdminProto.FetchPartitionEntriesResponse.Builder response = VAdminProto.FetchPartitionEntriesResponse.newBuilder(); + response.setKey(ProtoUtils.encodeBytes(key)); + Message message = response.build(); + + sendMessage(outputStream, message); + } + } + + accountForScanProgress("keys"); + + return determineRequestHandlerState("keys"); + } +} diff --git a/src/java/voldemort/server/protocol/admin/FullScanFetchStreamRequestHandler.java b/src/java/voldemort/server/protocol/admin/FullScanFetchStreamRequestHandler.java new file mode 100644 index 0000000000..7d63e1baab --- /dev/null +++ b/src/java/voldemort/server/protocol/admin/FullScanFetchStreamRequestHandler.java @@ -0,0 +1,220 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ +package voldemort.server.protocol.admin; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import voldemort.client.protocol.pb.VAdminProto.FetchPartitionEntriesRequest; +import voldemort.server.StoreRepository; +import voldemort.server.VoldemortConfig; +import voldemort.store.ErrorCodeMapper; +import voldemort.store.metadata.MetadataStore; +import voldemort.store.stats.StreamingStats; +import voldemort.utils.ByteArray; +import voldemort.utils.ClosableIterator; +import voldemort.utils.NetworkClassLoader; +import voldemort.utils.StoreInstance; +import voldemort.utils.Utils; + +/** + * Base class for key/entry stream fetching handlers that do an unordered full + * scan to fetch items. + * + */ +public abstract class FullScanFetchStreamRequestHandler extends FetchStreamRequestHandler { + + protected final ClosableIterator keyIterator; + + // PartitionId to count of fetches on that partition. + protected Map partitionFetches; + // PartitionIds of partitions that still need more fetched... + protected Set partitionsToFetch; + + public FullScanFetchStreamRequestHandler(FetchPartitionEntriesRequest request, + MetadataStore metadataStore, + ErrorCodeMapper errorCodeMapper, + VoldemortConfig voldemortConfig, + StoreRepository storeRepository, + NetworkClassLoader networkClassLoader, + StreamingStats.Operation operation) { + super(request, + metadataStore, + errorCodeMapper, + voldemortConfig, + storeRepository, + networkClassLoader, + operation); + + this.keyIterator = storageEngine.keys(); + + this.partitionFetches = new HashMap(); + for(Integer replicaType: replicaToPartitionList.keySet()) { + if(replicaToPartitionList.get(replicaType) != null) { + for(Integer partitionId: replicaToPartitionList.get(replicaType)) { + this.partitionFetches.put(partitionId, new Long(0)); + } + } + } + this.partitionsToFetch = new HashSet(partitionFetches.keySet()); + } + + /** + * Given the key, figures out which partition on the local node hosts the + * key. + * + * @param key + * @return + */ + private Integer getKeyPartitionId(byte[] key) { + Integer keyPartitionId = storeInstance.getNodesPartitionIdForKey(nodeId, key); + Utils.notNull(keyPartitionId); + return keyPartitionId; + } + + /** + * Determines if the key is needed. To be 'needed', a key must (i) belong to + * a partition being requested and (ii) be necessary to meet + * recordsPerPartition constraint, if any. + * + * @param nodeId + * @param key + * @param replicaToPartitionList + * @param cluster + * @param storeDef + * @return true iff key is needed. + */ + protected boolean isKeyNeeded(byte[] key) { + if(!StoreInstance.checkKeyBelongsToPartition(nodeId, + key, + replicaToPartitionList, + initialCluster, + storeDef)) { + return false; + } + + if(recordsPerPartition <= 0) { + return true; + } + if(partitionsToFetch.contains(getKeyPartitionId(key))) { + return true; + } + return false; + } + + /** + * Determines if entry is accepted. For normal usage, this means confirming + * that the key is needed. For orphan usage, this simply means confirming + * the key belongs to the node. 
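Distilled, the acceptance rule reads roughly as follows; the two predicates stand in for StoreInstance.checkKeyBelongsToPartition(...) and checkKeyBelongsToNode(...), and the quota flag stands in for the recordsPerPartition bookkeeping, so treat this as an approximation of the handler's logic rather than its exact code:

import java.util.function.Predicate;

public class ItemAcceptanceSketch {

    static boolean isItemAccepted(byte[] key,
                                  boolean fetchOrphaned,
                                  Predicate<byte[]> belongsToRequestedPartition,
                                  Predicate<byte[]> belongsToThisNode,
                                  boolean partitionQuotaLeft) {
        if(fetchOrphaned) {
            // Orphan mode: keep only keys this node should NOT be hosting.
            return !belongsToThisNode.test(key);
        }
        // Normal mode: the key must fall in a requested partition, and the
        // per-partition fetch quota (if one was set) must not be exhausted.
        return belongsToRequestedPartition.test(key) && partitionQuotaLeft;
    }

    public static void main(String[] args) {
        byte[] key = { 42 };
        System.out.println(isItemAccepted(key, false, k -> true, k -> true, true)); // true
        System.out.println(isItemAccepted(key, true, k -> true, k -> true, true));  // false
    }
}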
+ * + * @param key + * @return + */ + protected boolean isItemAccepted(byte[] key) { + boolean entryAccepted = false; + if(!fetchOrphaned) { + if(isKeyNeeded(key)) { + entryAccepted = true; + } + } else { + if(!StoreInstance.checkKeyBelongsToNode(key, nodeId, initialCluster, storeDef)) { + entryAccepted = true; + } + } + return entryAccepted; + } + + /** + * Account for key being fetched. + * + * @param key + */ + protected void accountForFetchedKey(byte[] key) { + fetched++; + if(streamStats != null) { + streamStats.reportStreamingFetch(operation); + } + + if(recordsPerPartition <= 0) { + return; + } + + Integer keyPartitionId = getKeyPartitionId(key); + Long partitionFetch = partitionFetches.get(keyPartitionId); + Utils.notNull(partitionFetch); + partitionFetch++; + + partitionFetches.put(keyPartitionId, partitionFetch); + if(partitionFetch == recordsPerPartition) { + if(partitionsToFetch.contains(keyPartitionId)) { + partitionsToFetch.remove(keyPartitionId); + } else { + logger.warn("Partitions to fetch did not contain expected partition ID: " + + keyPartitionId); + } + } else if(partitionFetch > recordsPerPartition) { + logger.warn("Partition fetch count larger than expected for partition ID " + + keyPartitionId + " : " + partitionFetch); + } + } + + /** + * True iff enough items have been fetched for all partitions, where + * 'enough' is relative to recordsPerPartition value. + * + * @return + */ + protected boolean fetchedEnoughForAllPartitions() { + if(recordsPerPartition <= 0) { + return false; + } + + if(partitionsToFetch.size() > 0) { + return false; + } + return true; + } + + /** + * Determines if still WRITING or COMPLETE. + * + * @param itemTag mad libs style string to insert into progress message. + * @return + */ + protected StreamRequestHandlerState determineRequestHandlerState(String itemTag) { + + if(keyIterator.hasNext() && !fetchedEnoughForAllPartitions()) { + return StreamRequestHandlerState.WRITING; + } else { + logger.info("Finished fetch " + itemTag + " for store '" + storageEngine.getName() + + "' with replica to partition mapping " + replicaToPartitionList); + progressInfoMessage("Fetch " + itemTag + " (end of scan)"); + + return StreamRequestHandlerState.COMPLETE; + } + } + + @Override + public final void close(DataOutputStream outputStream) throws IOException { + if(null != keyIterator) + keyIterator.close(); + super.close(outputStream); + } +} diff --git a/src/java/voldemort/server/protocol/admin/PartitionScanFetchEntriesRequestHandler.java b/src/java/voldemort/server/protocol/admin/PartitionScanFetchEntriesRequestHandler.java new file mode 100644 index 0000000000..0a1ff69d7b --- /dev/null +++ b/src/java/voldemort/server/protocol/admin/PartitionScanFetchEntriesRequestHandler.java @@ -0,0 +1,149 @@ +/* + * Copyright 2008-2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.server.protocol.admin; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; + +import voldemort.client.protocol.pb.ProtoUtils; +import voldemort.client.protocol.pb.VAdminProto; +import voldemort.client.protocol.pb.VAdminProto.FetchPartitionEntriesRequest; +import voldemort.server.StoreRepository; +import voldemort.server.VoldemortConfig; +import voldemort.store.ErrorCodeMapper; +import voldemort.store.metadata.MetadataStore; +import voldemort.store.stats.StreamingStats.Operation; +import voldemort.utils.ByteArray; +import voldemort.utils.ClosableIterator; +import voldemort.utils.NetworkClassLoader; +import voldemort.utils.Pair; +import voldemort.utils.StoreInstance; +import voldemort.versioning.Versioned; + +import com.google.protobuf.Message; + +/** + * Fetches entries using an efficient partition scan. Of course, only works if + * isPartitionScanSupported() is true for the storage engine to be scanned.. + * + */ +public class PartitionScanFetchEntriesRequestHandler extends PartitionScanFetchStreamRequestHandler { + + protected ClosableIterator>> entriesPartitionIterator; + + public PartitionScanFetchEntriesRequestHandler(FetchPartitionEntriesRequest request, + MetadataStore metadataStore, + ErrorCodeMapper errorCodeMapper, + VoldemortConfig voldemortConfig, + StoreRepository storeRepository, + NetworkClassLoader networkClassLoader) { + super(request, + metadataStore, + errorCodeMapper, + voldemortConfig, + storeRepository, + networkClassLoader, + Operation.FETCH_ENTRIES); + logger.info("Starting fetch entries for store '" + storageEngine.getName() + + "' with replica to partition mapping " + replicaToPartitionList); + + entriesPartitionIterator = null; + } + + @Override + public StreamRequestHandlerState handleRequest(DataInputStream inputStream, + DataOutputStream outputStream) + throws IOException { + + // process the next partition + if(entriesPartitionIterator == null) { + + if(currentIndex == partitionList.size()) { + return StreamRequestHandlerState.COMPLETE; + } + + // find the next partition to scan + boolean found = false; + while(!found && (currentIndex < partitionList.size())) { + currentPartition = new Integer(partitionList.get(currentIndex)); + currentReplicaType = new Integer(replicaTypeList.get(currentIndex)); + + // Check the current node contains the partition as the + // requested replicatype + if(!fetchedPartitions.contains(currentPartition) + && StoreInstance.checkPartitionBelongsToNode(currentPartition, + currentReplicaType, + nodeId, + initialCluster, + storeDef)) { + found = true; + completedFetchingCurrentPartition(); + entriesPartitionIterator = storageEngine.entries(currentPartition); + statusInfoMessage("Starting fetch entries"); + } + currentIndex++; + } + } else { + long startNs = System.nanoTime(); + // do a check before reading in case partition has 0 elements + if(entriesPartitionIterator.hasNext()) { + Pair> entry = entriesPartitionIterator.next(); + ByteArray key = entry.getFirst(); + Versioned value = entry.getSecond(); + reportStorageOpTime(startNs); + + throttler.maybeThrottle(key.length()); + + if(filter.accept(key, value)) { + recordFetched(); + + VAdminProto.FetchPartitionEntriesResponse.Builder response = VAdminProto.FetchPartitionEntriesResponse.newBuilder(); + VAdminProto.PartitionEntry partitionEntry = VAdminProto.PartitionEntry.newBuilder() + .setKey(ProtoUtils.encodeBytes(key)) + .setVersioned(ProtoUtils.encodeVersioned(value)) + .build(); + 
                    response.setPartitionEntry(partitionEntry);
+                    Message message = response.build();
+
+                    sendMessage(outputStream, message);
+
+                    throttler.maybeThrottle(AdminServiceRequestHandler.valueSize(value));
+                }
+
+                accountForScanProgress("entries");
+            }
+
+            if(!entriesPartitionIterator.hasNext() || fetchedEnoughForCurrentPartition()) {
+                // Finished current partition. Reset iterator. Info status.
+                entriesPartitionIterator.close();
+                entriesPartitionIterator = null;
+
+                statusInfoMessage("Finished fetch entries");
+                progressInfoMessage("Fetch entries (end of partition)");
+            }
+        }
+        return StreamRequestHandlerState.WRITING;
+    }
+
+    @Override
+    public final void close(DataOutputStream outputStream) throws IOException {
+        if(null != entriesPartitionIterator)
+            entriesPartitionIterator.close();
+        super.close(outputStream);
+    }
+}
diff --git a/src/java/voldemort/server/protocol/admin/PartitionScanFetchKeysRequestHandler.java b/src/java/voldemort/server/protocol/admin/PartitionScanFetchKeysRequestHandler.java
new file mode 100644
index 0000000000..351335b4fc
--- /dev/null
+++ b/src/java/voldemort/server/protocol/admin/PartitionScanFetchKeysRequestHandler.java
@@ -0,0 +1,139 @@
+/*
+ * Copyright 2008-2013 LinkedIn, Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package voldemort.server.protocol.admin;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import voldemort.client.protocol.pb.ProtoUtils;
+import voldemort.client.protocol.pb.VAdminProto;
+import voldemort.client.protocol.pb.VAdminProto.FetchPartitionEntriesRequest;
+import voldemort.server.StoreRepository;
+import voldemort.server.VoldemortConfig;
+import voldemort.store.ErrorCodeMapper;
+import voldemort.store.metadata.MetadataStore;
+import voldemort.store.stats.StreamingStats.Operation;
+import voldemort.utils.ByteArray;
+import voldemort.utils.ClosableIterator;
+import voldemort.utils.NetworkClassLoader;
+import voldemort.utils.StoreInstance;
+
+import com.google.protobuf.Message;
+
+/**
+ * Fetches keys using an efficient partition scan. Of course, only works if
+ * isPartitionScanSupported() is true for the storage engine to be scanned.
+ * + */ +public class PartitionScanFetchKeysRequestHandler extends PartitionScanFetchStreamRequestHandler { + + protected ClosableIterator keysPartitionIterator; + + public PartitionScanFetchKeysRequestHandler(FetchPartitionEntriesRequest request, + MetadataStore metadataStore, + ErrorCodeMapper errorCodeMapper, + VoldemortConfig voldemortConfig, + StoreRepository storeRepository, + NetworkClassLoader networkClassLoader) { + super(request, + metadataStore, + errorCodeMapper, + voldemortConfig, + storeRepository, + networkClassLoader, + Operation.FETCH_KEYS); + logger.info("Starting fetch keys for store '" + storageEngine.getName() + + "' with replica to partition mapping " + replicaToPartitionList); + + keysPartitionIterator = null; + } + + @Override + public StreamRequestHandlerState handleRequest(DataInputStream inputStream, + DataOutputStream outputStream) + throws IOException { + + // process the next partition + if(keysPartitionIterator == null) { + + if(currentIndex == partitionList.size()) { + return StreamRequestHandlerState.COMPLETE; + } + + // find the next partition to scan and set currentIndex. + boolean found = false; + while(!found && (currentIndex < partitionList.size())) { + currentPartition = partitionList.get(currentIndex); + currentReplicaType = replicaTypeList.get(currentIndex); + + // Check the current node contains the partition as the + // requested replicatype + if(!fetchedPartitions.contains(currentPartition) + && StoreInstance.checkPartitionBelongsToNode(currentPartition, + currentReplicaType, + nodeId, + initialCluster, + storeDef)) { + found = true; + completedFetchingCurrentPartition(); + keysPartitionIterator = storageEngine.keys(currentPartition); + statusInfoMessage("Starting fetch keys"); + } + currentIndex++; + } + } else { + long startNs = System.nanoTime(); + // do a check before reading in case partition has 0 elements + if(keysPartitionIterator.hasNext()) { + ByteArray key = keysPartitionIterator.next(); + reportStorageOpTime(startNs); + + throttler.maybeThrottle(key.length()); + + if(filter.accept(key, null)) { + recordFetched(); + + VAdminProto.FetchPartitionEntriesResponse.Builder response = VAdminProto.FetchPartitionEntriesResponse.newBuilder(); + response.setKey(ProtoUtils.encodeBytes(key)); + Message message = response.build(); + + sendMessage(outputStream, message); + } + + accountForScanProgress("keys"); + } + + if(!keysPartitionIterator.hasNext() || fetchedEnoughForCurrentPartition()) { + // Finished current partition. Reset iterator. Info status. + keysPartitionIterator.close(); + keysPartitionIterator = null; + + statusInfoMessage("Finished fetch keys"); + progressInfoMessage("Fetch keys (end of partition)"); + } + } + return StreamRequestHandlerState.WRITING; + } + + @Override + public final void close(DataOutputStream outputStream) throws IOException { + if(null != keysPartitionIterator) + keysPartitionIterator.close(); + super.close(outputStream); + } +} diff --git a/src/java/voldemort/server/protocol/admin/PartitionScanFetchStreamRequestHandler.java b/src/java/voldemort/server/protocol/admin/PartitionScanFetchStreamRequestHandler.java new file mode 100644 index 0000000000..b630356d28 --- /dev/null +++ b/src/java/voldemort/server/protocol/admin/PartitionScanFetchStreamRequestHandler.java @@ -0,0 +1,129 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
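
Both scan handlers above share the same control flow: when no partition iterator is open, advance to the next partition this node actually hosts and open a fresh per-partition iterator; otherwise stream one item, and reset the iterator once the partition is exhausted or its quota is reached. A stripped-down sketch of that loop, with placeholders standing in for the storage engine and the ownership check:

import java.util.Iterator;
import java.util.List;

// Sketch only: one "pump" of a partition-by-partition streaming scan.
class PartitionScanPump<T> {

    interface Source<T> {
        boolean ownedLocally(int partitionId);   // stands in for StoreInstance.checkPartitionBelongsToNode()
        Iterator<T> open(int partitionId);       // stands in for storageEngine.keys()/entries(partitionId)
    }

    private final List<Integer> partitionList;
    private final Source<T> source;
    private Iterator<T> partitionIterator = null;
    private int currentIndex = 0;

    PartitionScanPump(List<Integer> partitionList, Source<T> source) {
        this.partitionList = partitionList;
        this.source = source;
    }

    /** Returns the next item to stream out, or null once every partition is done. */
    T next() {
        while(true) {
            if(partitionIterator == null) {
                if(currentIndex == partitionList.size())
                    return null;                              // COMPLETE
                int partition = partitionList.get(currentIndex++);
                if(source.ownedLocally(partition))            // skip partitions this node does not host
                    partitionIterator = source.open(partition);
            } else if(partitionIterator.hasNext()) {
                return partitionIterator.next();              // WRITING: caller serializes and sends this
            } else {
                partitionIterator = null;                     // end of partition, move on
            }
        }
    }
}
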
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package voldemort.server.protocol.admin; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import voldemort.client.protocol.pb.VAdminProto.FetchPartitionEntriesRequest; +import voldemort.server.StoreRepository; +import voldemort.server.VoldemortConfig; +import voldemort.store.ErrorCodeMapper; +import voldemort.store.metadata.MetadataStore; +import voldemort.store.stats.StreamingStats; +import voldemort.utils.NetworkClassLoader; + +/** + * Base class for key/entry stream fetching handlers that use efficient + * partition scan (PID layout). Of course, only works if + * isPartitionScanSupported() is true for the storage engine to be scanned.. + * + */ +public abstract class PartitionScanFetchStreamRequestHandler extends FetchStreamRequestHandler { + + protected Set fetchedPartitions; + protected List replicaTypeList; + protected List partitionList; + + protected Integer currentIndex; + protected Integer currentPartition; + protected Integer currentReplicaType; + protected long currentPartitionFetched; + + public PartitionScanFetchStreamRequestHandler(FetchPartitionEntriesRequest request, + MetadataStore metadataStore, + ErrorCodeMapper errorCodeMapper, + VoldemortConfig voldemortConfig, + StoreRepository storeRepository, + NetworkClassLoader networkClassLoader, + StreamingStats.Operation operation) { + super(request, + metadataStore, + errorCodeMapper, + voldemortConfig, + storeRepository, + networkClassLoader, + operation); + + fetchedPartitions = new HashSet(); + replicaTypeList = new ArrayList(); + partitionList = new ArrayList(); + + // flatten the replicatype to partition map + for(Integer replicaType: replicaToPartitionList.keySet()) { + if(replicaToPartitionList.get(replicaType) != null) { + for(Integer partitionId: replicaToPartitionList.get(replicaType)) { + partitionList.add(partitionId); + replicaTypeList.add(replicaType); + } + } + } + + currentIndex = 0; + currentPartition = null; + currentReplicaType = null; + currentPartitionFetched = 0; + } + + /** + * Simple info message for status + * + * @param tag Message to print out at start of info message + * @param currentIndex current partition index + */ + protected void statusInfoMessage(final String tag) { + if(logger.isInfoEnabled()) { + logger.info(tag + " : [partition: " + currentPartition + ", replica type:" + + currentReplicaType + ", partitionFetched: " + currentPartitionFetched + + "] for store " + storageEngine.getName()); + } + } + + /** + * True iff enough items have been fetched for current partition + * + * @return + */ + protected boolean fetchedEnoughForCurrentPartition() { + if(recordsPerPartition <= 0) { + return false; + } + return (currentPartitionFetched >= recordsPerPartition); + } + + /** + * Account for fetch. + * + * @param key + */ + protected void recordFetched() { + fetched++; + currentPartitionFetched++; + if(streamStats != null) { + streamStats.reportStreamingFetch(operation); + } + } + + /** + * Called when current partition has been completely fetched. 
+ */ + protected void completedFetchingCurrentPartition() { + fetchedPartitions.add(currentPartition); + currentPartitionFetched = 0; + } +} diff --git a/src/java/voldemort/server/protocol/admin/UpdatePartitionEntriesStreamRequestHandler.java b/src/java/voldemort/server/protocol/admin/UpdatePartitionEntriesStreamRequestHandler.java index 5dc79b14ff..53ae284fdd 100644 --- a/src/java/voldemort/server/protocol/admin/UpdatePartitionEntriesStreamRequestHandler.java +++ b/src/java/voldemort/server/protocol/admin/UpdatePartitionEntriesStreamRequestHandler.java @@ -4,8 +4,7 @@ import java.io.DataOutputStream; import java.io.EOFException; import java.io.IOException; -import java.util.HashMap; -import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.log4j.Level; import org.apache.log4j.Logger; @@ -21,7 +20,8 @@ import voldemort.server.protocol.StreamRequestHandler; import voldemort.store.ErrorCodeMapper; import voldemort.store.StorageEngine; -import voldemort.store.stats.StreamStats; +import voldemort.store.stats.StreamingStats; +import voldemort.store.stats.StreamingStats.Operation; import voldemort.utils.ByteArray; import voldemort.utils.ByteUtils; import voldemort.utils.EventThrottler; @@ -52,18 +52,17 @@ public class UpdatePartitionEntriesStreamRequestHandler implements StreamRequest private final long startTime; - private final StreamStats stats; - - private final StreamStats.Handle handle; + private final StreamingStats streamStats; private final Logger logger = Logger.getLogger(getClass()); + private AtomicBoolean isBatchWriteOff; + public UpdatePartitionEntriesStreamRequestHandler(UpdatePartitionEntriesRequest request, ErrorCodeMapper errorCodeMapper, VoldemortConfig voldemortConfig, StoreRepository storeRepository, - NetworkClassLoader networkClassLoader, - StreamStats stats) { + NetworkClassLoader networkClassLoader) { super(); this.request = request; this.errorCodeMapper = errorCodeMapper; @@ -75,9 +74,22 @@ public UpdatePartitionEntriesStreamRequestHandler(UpdatePartitionEntriesRequest networkClassLoader) : new DefaultVoldemortFilter(); startTime = System.currentTimeMillis(); - this.stats = stats; - this.handle = stats.makeHandle(StreamStats.Operation.UPDATE, - new HashMap>()); + if(voldemortConfig.isJmxEnabled()) { + this.streamStats = storeRepository.getStreamingStats(storageEngine.getName()); + } else { + this.streamStats = null; + } + storageEngine.beginBatchModifications(); + isBatchWriteOff = new AtomicBoolean(false); + } + + @Override + protected void finalize() { + // when the object is GCed, don't forget to end the batch-write mode. + // This is ugly. 
But the cleanest way to do this, given our network code + // does not guarantee that close() will always be called + if(!isBatchWriteOff.get()) + storageEngine.endBatchModifications(); } public StreamRequestHandlerState handleRequest(DataInputStream inputStream, @@ -92,7 +104,9 @@ public StreamRequestHandlerState handleRequest(DataInputStream inputStream, } catch(EOFException e) { if(logger.isTraceEnabled()) logger.trace("Incomplete read for message size"); - stats.recordNetworkTime(handle, System.nanoTime() - startNs); + if(streamStats != null) + streamStats.reportNetworkTime(Operation.UPDATE_ENTRIES, System.nanoTime() + - startNs); return StreamRequestHandlerState.INCOMPLETE_READ; } @@ -104,8 +118,9 @@ public StreamRequestHandlerState handleRequest(DataInputStream inputStream, if(logger.isTraceEnabled()) logger.trace("Message size -1, completed partition update"); - stats.recordNetworkTime(handle, System.nanoTime() - startNs); - stats.closeHandle(handle); + if(streamStats != null) + streamStats.reportNetworkTime(Operation.UPDATE_ENTRIES, System.nanoTime() + - startNs); return StreamRequestHandlerState.COMPLETE; } @@ -122,7 +137,9 @@ public StreamRequestHandlerState handleRequest(DataInputStream inputStream, return StreamRequestHandlerState.INCOMPLETE_READ; } finally { - stats.recordNetworkTime(handle, System.nanoTime() - startNs); + if(streamStats != null) + streamStats.reportNetworkTime(Operation.UPDATE_ENTRIES, System.nanoTime() + - startNs); } VAdminProto.UpdatePartitionEntriesRequest.Builder builder = VAdminProto.UpdatePartitionEntriesRequest.newBuilder(); @@ -146,7 +163,9 @@ public StreamRequestHandlerState handleRequest(DataInputStream inputStream, if(logger.isDebugEnabled()) logger.debug("updateEntries (Streaming put) threw ObsoleteVersionException, Ignoring."); } finally { - stats.recordDiskTime(handle, System.nanoTime() - startNs); + if(streamStats != null) + streamStats.reportStorageTime(Operation.UPDATE_ENTRIES, System.nanoTime() + - startNs); } throttler.maybeThrottle(key.length() + AdminServiceRequestHandler.valueSize(value)); @@ -154,9 +173,10 @@ public StreamRequestHandlerState handleRequest(DataInputStream inputStream, // log progress counter++; - handle.incrementEntriesScanned(); + if(streamStats != null) + streamStats.reportStreamingPut(Operation.UPDATE_ENTRIES); - if(0 == counter % 100000) { + if(0 == counter % STAT_RECORDS_INTERVAL) { long totalTime = (System.currentTimeMillis() - startTime) / 1000; logger.info("Update entries updated " + counter + " entries for store '" @@ -173,13 +193,13 @@ public StreamRequestDirection getDirection() { public void close(DataOutputStream outputStream) throws IOException { ProtoUtils.writeMessage(outputStream, responseBuilder.build()); + storageEngine.endBatchModifications(); + isBatchWriteOff.compareAndSet(false, true); } public void handleError(DataOutputStream outputStream, VoldemortException e) throws IOException { responseBuilder.setError(ProtoUtils.encodeError(errorCodeMapper, e)); - if(logger.isEnabledFor(Level.ERROR)) logger.error("handleUpdatePartitionEntries failed for request(" + request + ")", e); } - } diff --git a/src/java/voldemort/server/protocol/admin/UpdateSlopEntriesRequestHandler.java b/src/java/voldemort/server/protocol/admin/UpdateSlopEntriesRequestHandler.java index 15054d3da4..fc7fa45ba6 100644 --- a/src/java/voldemort/server/protocol/admin/UpdateSlopEntriesRequestHandler.java +++ b/src/java/voldemort/server/protocol/admin/UpdateSlopEntriesRequestHandler.java @@ -4,8 +4,6 @@ import java.io.DataOutputStream; 
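
The update handler above now brackets the whole entry stream in beginBatchModifications()/endBatchModifications(), using an AtomicBoolean plus a finalize() fallback so batch mode is switched off even when close() is never reached. A minimal sketch of that guard, written against a placeholder engine interface rather than Voldemort's StorageEngine:

import java.util.concurrent.atomic.AtomicBoolean;

// Sketch only: make sure batch mode is ended exactly once, even on abnormal termination.
class BatchModeGuard {

    interface BatchCapable {
        void beginBatchModifications();
        void endBatchModifications();
    }

    private final BatchCapable engine;
    private final AtomicBoolean isBatchWriteOff = new AtomicBoolean(false);

    BatchModeGuard(BatchCapable engine) {
        this.engine = engine;
        engine.beginBatchModifications();               // entered once, up front
    }

    // Normal path, called from the stream handler's close().
    void close() {
        if(isBatchWriteOff.compareAndSet(false, true))
            engine.endBatchModifications();
    }

    // Last-resort fallback, mirroring the handler's finalize() override.
    @Override
    protected void finalize() {
        close();
    }
}
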
import java.io.EOFException; import java.io.IOException; -import java.util.HashMap; -import java.util.List; import org.apache.log4j.Level; import org.apache.log4j.Logger; @@ -15,10 +13,12 @@ import voldemort.client.protocol.pb.VAdminProto; import voldemort.client.protocol.pb.VAdminProto.UpdateSlopEntriesRequest; import voldemort.server.StoreRepository; +import voldemort.server.VoldemortConfig; import voldemort.server.protocol.StreamRequestHandler; import voldemort.store.ErrorCodeMapper; import voldemort.store.StorageEngine; -import voldemort.store.stats.StreamStats; +import voldemort.store.stats.StreamingStats; +import voldemort.store.stats.StreamingStats.Operation; import voldemort.utils.ByteArray; import voldemort.utils.ByteUtils; import voldemort.versioning.ObsoleteVersionException; @@ -37,26 +37,25 @@ public class UpdateSlopEntriesRequestHandler implements StreamRequestHandler { private final long startTime; - private long counter = 0L; + private long networkTimeNs; - private final StreamStats stats; + private boolean isJmxEnabled; - private final StreamStats.Handle handle; + private long counter = 0L; private final Logger logger = Logger.getLogger(getClass()); public UpdateSlopEntriesRequestHandler(UpdateSlopEntriesRequest request, ErrorCodeMapper errorCodeMapper, StoreRepository storeRepository, - StreamStats stats) { + VoldemortConfig voldemortConfig) { super(); this.request = request; this.errorCodeMapper = errorCodeMapper; this.storeRepository = storeRepository; - this.stats = stats; - this.handle = stats.makeHandle(StreamStats.Operation.SLOP, - new HashMap>()); startTime = System.currentTimeMillis(); + networkTimeNs = 0; + this.isJmxEnabled = voldemortConfig.isJmxEnabled(); } public StreamRequestDirection getDirection() { @@ -71,7 +70,7 @@ public void handleError(DataOutputStream outputStream, VoldemortException e) thr responseBuilder.setError(ProtoUtils.encodeError(errorCodeMapper, e)); if(logger.isEnabledFor(Level.ERROR)) - logger.error("handleUpdatePartitionEntries failed for request(" + request + ")", e); + logger.error("handleUpdateSlopEntries failed for request(" + request + ")", e); } public StreamRequestHandlerState handleRequest(DataInputStream inputStream, @@ -86,26 +85,24 @@ public StreamRequestHandlerState handleRequest(DataInputStream inputStream, } catch(EOFException e) { if(logger.isTraceEnabled()) logger.trace("Incomplete read for message size"); - stats.recordNetworkTime(handle, System.nanoTime() - startNs); + networkTimeNs += System.nanoTime() - startNs; return StreamRequestHandlerState.INCOMPLETE_READ; } if(size == -1) { if(logger.isTraceEnabled()) - logger.trace("Message size -1, completed partition update"); - stats.recordNetworkTime(handle, System.nanoTime() - startNs); - stats.closeHandle(handle); + logger.trace("Message size -1, completed slop update"); return StreamRequestHandlerState.COMPLETE; } if(logger.isTraceEnabled()) - logger.trace("UpdatePartitionEntriesRequest message size: " + size); + logger.trace("UpdateSlopEntriesRequest message size: " + size); byte[] input = new byte[size]; try { ByteUtils.read(inputStream, input); - stats.recordNetworkTime(handle, System.nanoTime() - startNs); + networkTimeNs += System.nanoTime() - startNs; } catch(EOFException e) { if(logger.isTraceEnabled()) logger.trace("Incomplete read for message"); @@ -120,6 +117,13 @@ public StreamRequestHandlerState handleRequest(DataInputStream inputStream, StorageEngine storageEngine = AdminServiceRequestHandler.getStorageEngine(storeRepository, request.getStore()); + 
StreamingStats streamStats = null; + if(isJmxEnabled) { + streamStats = storeRepository.getStreamingStats(storageEngine.getName()); + streamStats.reportNetworkTime(Operation.SLOP_UPDATE, networkTimeNs); + } + networkTimeNs = 0; + ByteArray key = ProtoUtils.decodeBytes(request.getKey()); VectorClock vectorClock = ProtoUtils.decodeClock(request.getVersion()); @@ -137,7 +141,9 @@ public StreamRequestHandlerState handleRequest(DataInputStream inputStream, byte[] value = ProtoUtils.decodeBytes(request.getValue()).get(); startNs = System.nanoTime(); storageEngine.put(key, Versioned.value(value, vectorClock), transforms); - stats.recordDiskTime(handle, System.nanoTime() - startNs); + if(isJmxEnabled) + streamStats.reportStorageTime(Operation.SLOP_UPDATE, System.nanoTime() + - startNs); if(logger.isTraceEnabled()) logger.trace("updateSlopEntries (Streaming put) successful"); } catch(ObsoleteVersionException e) { @@ -150,7 +156,9 @@ public StreamRequestHandlerState handleRequest(DataInputStream inputStream, try { startNs = System.nanoTime(); storageEngine.delete(key, vectorClock); - stats.recordDiskTime(handle, System.nanoTime() - startNs); + if(isJmxEnabled) + streamStats.reportStorageTime(Operation.SLOP_UPDATE, System.nanoTime() + - startNs); if(logger.isTraceEnabled()) logger.trace("updateSlopEntries (Streaming delete) successful"); @@ -166,7 +174,8 @@ public StreamRequestHandlerState handleRequest(DataInputStream inputStream, // log progress counter++; - handle.incrementEntriesScanned(); + if(isJmxEnabled) + streamStats.reportStreamingPut(Operation.SLOP_UPDATE); if(0 == counter % 100000) { long totalTime = (System.currentTimeMillis() - startTime) / 1000; diff --git a/src/java/voldemort/server/protocol/vold/VoldemortNativeRequestHandler.java b/src/java/voldemort/server/protocol/vold/VoldemortNativeRequestHandler.java index cadce8b8bb..fb73727dcc 100644 --- a/src/java/voldemort/server/protocol/vold/VoldemortNativeRequestHandler.java +++ b/src/java/voldemort/server/protocol/vold/VoldemortNativeRequestHandler.java @@ -13,6 +13,7 @@ import voldemort.VoldemortException; import voldemort.common.VoldemortOpCode; +import voldemort.common.nio.ByteBufferBackedInputStream; import voldemort.server.RequestRoutingType; import voldemort.server.StoreRepository; import voldemort.server.protocol.AbstractRequestHandler; @@ -21,7 +22,6 @@ import voldemort.store.ErrorCodeMapper; import voldemort.store.Store; import voldemort.utils.ByteArray; -import voldemort.utils.ByteBufferBackedInputStream; import voldemort.utils.ByteUtils; import voldemort.versioning.VectorClock; import voldemort.versioning.Version; diff --git a/src/java/voldemort/server/rebalance/Rebalancer.java b/src/java/voldemort/server/rebalance/Rebalancer.java index 13425f45c0..95c6e8df90 100644 --- a/src/java/voldemort/server/rebalance/Rebalancer.java +++ b/src/java/voldemort/server/rebalance/Rebalancer.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2010 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
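
A recurring change in these handlers: the old per-request StreamStats handle is replaced by a per-store StreamingStats that is looked up only when JMX is enabled, so every timing report is guarded by a null (or isJmxEnabled) check. A small self-contained sketch of that pattern; the class name is illustrative:

import java.util.concurrent.atomic.AtomicLong;

// Sketch only: accumulate stream timings only when monitoring is enabled.
class OptionalStreamTimer {

    private final AtomicLong networkNs = new AtomicLong();

    // Returns null when JMX/monitoring is off, mirroring "streamStats = null".
    static OptionalStreamTimer forConfig(boolean jmxEnabled) {
        return jmxEnabled ? new OptionalStreamTimer() : null;
    }

    void reportNetworkTime(long durationNs) {
        networkNs.addAndGet(durationNs);
    }

    public static void main(String[] args) {
        OptionalStreamTimer stats = forConfig(true);

        long startNs = System.nanoTime();
        // ... a network read or storage put would happen here ...
        if(stats != null)                                  // every report is guarded this way
            stats.reportNetworkTime(System.nanoTime() - startNs);
    }
}
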
You may obtain a copy of @@ -298,8 +298,8 @@ private void changeCluster(final Cluster cluster) { System.currentTimeMillis()); logger.info("Switching metadata from " + metadataStore.getCluster() + " to " + cluster + " [ " + updatedVectorClock + " ]"); - metadataStore.put(MetadataStore.CLUSTER_KEY, Versioned.value((Object) cluster, - updatedVectorClock)); + metadataStore.put(MetadataStore.CLUSTER_KEY, + Versioned.value((Object) cluster, updatedVectorClock)); } finally { metadataStore.writeLock.unlock(); } @@ -332,16 +332,18 @@ public int rebalanceNodeOnDonor(final List stealInfos) int stealerNodeId = info.getStealerId(); // Check if stealer node is in rebalancing state - if(!adminClient.getRemoteServerState(stealerNodeId) - .getValue() - .equals(VoldemortState.REBALANCING_MASTER_SERVER)) { + if(!adminClient.rebalanceOps.getRemoteServerState(stealerNodeId) + .getValue() + .equals(VoldemortState.REBALANCING_MASTER_SERVER)) { throw new VoldemortException("Stealer node " + stealerNodeId + " not in " + VoldemortState.REBALANCING_MASTER_SERVER + " state "); } // Also check if it has this plan - if(adminClient.getRemoteRebalancerState(stealerNodeId).getValue().find(donorNodeId) == null) { + if(adminClient.rebalanceOps.getRemoteRebalancerState(stealerNodeId) + .getValue() + .find(donorNodeId) == null) { throw new VoldemortException("Stealer node " + stealerNodeId + " does not have any plan for donor " + donorNodeId + ". Excepted to have " + info); @@ -368,7 +370,7 @@ public int rebalanceNodeOnDonor(final List stealInfos) } finally { if(adminClient != null) { - adminClient.stop(); + adminClient.close(); } } @@ -383,7 +385,8 @@ public int rebalanceNodeOnDonor(final List stealInfos) voldemortConfig, metadataStore, requestId, - stealInfos)); + stealInfos, + voldemortConfig.usePartitionScanForRebalance())); return requestId; } diff --git a/src/java/voldemort/server/rebalance/RebalancerState.java b/src/java/voldemort/server/rebalance/RebalancerState.java index 1be5cce63c..6eb62d2ea7 100644 --- a/src/java/voldemort/server/rebalance/RebalancerState.java +++ b/src/java/voldemort/server/rebalance/RebalancerState.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2010 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
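
A purely mechanical change that repeats throughout the rest of this patch: AdminClient calls are regrouped under purpose-specific members, and clients are released with close() rather than stop(). For example:

    adminClient.getRemoteServerState(nodeId)    ->  adminClient.rebalanceOps.getRemoteServerState(nodeId)
    adminClient.stopAsyncRequest(nodeId, id)    ->  adminClient.rpcOps.stopAsyncRequest(nodeId, id)
    adminClient.stop()                          ->  adminClient.close()
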
You may obtain a copy of @@ -26,7 +26,7 @@ import voldemort.serialization.json.JsonReader; import voldemort.serialization.json.JsonWriter; import voldemort.store.metadata.MetadataStore; -import voldemort.utils.RebalanceUtils; +import voldemort.utils.StoreInstance; import com.google.common.collect.Lists; import com.google.common.collect.Maps; @@ -103,9 +103,9 @@ public RebalancePartitionsInfo find(String storeName, // If yes, check if the key belongs to one of the partitions // being moved - if(RebalanceUtils.checkKeyBelongsToPartition(keyPartitions, - nodePartitions, - info.getReplicaToAddPartitionList(storeName))) { + if(StoreInstance.checkKeyBelongsToPartition(keyPartitions, + nodePartitions, + info.getReplicaToAddPartitionList(storeName))) { return info; } } diff --git a/src/java/voldemort/server/rebalance/async/DonorBasedRebalanceAsyncOperation.java b/src/java/voldemort/server/rebalance/async/DonorBasedRebalanceAsyncOperation.java index 8f6a595bb9..3d94150ce7 100644 --- a/src/java/voldemort/server/rebalance/async/DonorBasedRebalanceAsyncOperation.java +++ b/src/java/voldemort/server/rebalance/async/DonorBasedRebalanceAsyncOperation.java @@ -1,5 +1,5 @@ /* - * Copyright 2011 LinkedIn, Inc + * Copyright 2012-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -16,13 +16,15 @@ package voldemort.server.rebalance.async; +import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.Map.Entry; +import java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.ExecutorService; @@ -40,6 +42,7 @@ import voldemort.server.VoldemortConfig; import voldemort.server.rebalance.Rebalancer; import voldemort.server.rebalance.VoldemortRebalancingException; +import voldemort.store.PartitionListIterator; import voldemort.store.StorageEngine; import voldemort.store.StoreDefinition; import voldemort.store.metadata.MetadataStore; @@ -48,6 +51,7 @@ import voldemort.utils.ClosableIterator; import voldemort.utils.Pair; import voldemort.utils.RebalanceUtils; +import voldemort.utils.StoreInstance; import voldemort.versioning.Versioned; import com.google.common.collect.HashMultimap; @@ -78,6 +82,7 @@ public class DonorBasedRebalanceAsyncOperation extends RebalanceAsyncOperation { private final AtomicBoolean running = new AtomicBoolean(true); private final Cluster initialCluster; private final Cluster targetCluster; + private final boolean usePartitionScan; private final HashMultimap>>> storeToNodePartitionMapping; @@ -103,13 +108,15 @@ public DonorBasedRebalanceAsyncOperation(Rebalancer rebalancer, VoldemortConfig voldemortConfig, MetadataStore metadataStore, int requestId, - List stealInfos) { + List stealInfos, + boolean usePartitionScan) { super(rebalancer, voldemortConfig, metadataStore, requestId, "Donor based rebalance : " + stealInfos); this.storeRepository = storeRepository; this.stealInfos = stealInfos; this.targetCluster = metadataStore.getCluster(); this.initialCluster = stealInfos.get(0).getInitialCluster(); + this.usePartitionScan = usePartitionScan; // Group the plans by the store names this.storeToNodePartitionMapping = groupByStores(stealInfos); @@ -154,9 +161,9 @@ public void run() { // Remove the metadata from all the stealer 
nodes for(Pair>> entry: stealerNodeToMappingTuples) { - adminClient.deleteStoreRebalanceState(metadataStore.getNodeId(), - entry.getFirst(), - storeName); + adminClient.rebalanceOps.deleteStoreRebalanceState(metadataStore.getNodeId(), + entry.getFirst(), + storeName); logger.info("Removed rebalance state for store " + storeName + " : " + metadataStore.getNodeId() + " ---> " + entry.getFirst()); @@ -199,7 +206,7 @@ public void run() { + " completed successfully for all " + totalStoresCount + " stores"); } } finally { - adminClient.stop(); + adminClient.close(); adminClient = null; for(RebalancePartitionsInfo stealInfo: stealInfos) { rebalancer.releaseRebalancingPermit(stealInfo.getStealerId()); @@ -293,11 +300,18 @@ public Thread newThread(Runnable r) { logger.info("Started a thread for " + jobName); } - fetchEntriesForStealers(storageEngine, - optimizedStealerNodeToMappingTuples, - storeDef, - nodeToQueue, - storeName); + if(usePartitionScan && storageEngine.isPartitionScanSupported()) + fetchEntriesForStealersPartitionScan(storageEngine, + optimizedStealerNodeToMappingTuples, + storeDef, + nodeToQueue, + storeName); + else + fetchEntriesForStealers(storageEngine, + optimizedStealerNodeToMappingTuples, + storeDef, + nodeToQueue, + storeName); } } @@ -316,10 +330,10 @@ private void fetchEntriesForStealers(StorageEngine st while(running.get() && keys.hasNext()) { ByteArray key = keys.next(); scanned++; - List nodeIds = RebalanceUtils.checkKeyBelongsToPartition(key.get(), - optimizedStealerNodeToMappingTuples, - targetCluster, - storeDef); + List nodeIds = StoreInstance.checkKeyBelongsToPartition(key.get(), + optimizedStealerNodeToMappingTuples, + targetCluster, + storeDef); if(nodeIds.size() > 0) { List> values = storageEngine.get(key, null); @@ -340,18 +354,92 @@ private void fetchEntriesForStealers(StorageEngine st } } + private void fetchEntriesForStealersPartitionScan(StorageEngine storageEngine, + Set>>> optimizedStealerNodeToMappingTuples, + StoreDefinition storeDef, + HashMap>>> nodeToQueue, + String storeName) { + int scanned = 0; + int[] fetched = new int[targetCluster.getNumberOfNodes()]; + long startTime = System.currentTimeMillis(); + + // construct a set of all the partitions we will be fetching + Set partitionsToDonate = new HashSet(); + for(Pair>> nodePartitionMapPair: optimizedStealerNodeToMappingTuples) { + // for each of the nodes, add all the partitions requested + HashMap> replicaToPartitionMap = nodePartitionMapPair.getSecond(); + if(replicaToPartitionMap != null && replicaToPartitionMap.values() != null) { + for(List partitions: replicaToPartitionMap.values()) + if(partitions != null) + partitionsToDonate.addAll(partitions); + } + } + + // check if all the partitions being requested are present in the + // current node + for(Integer partition: partitionsToDonate) { + if(!StoreInstance.checkPartitionBelongsToNode(partition, + voldemortConfig.getNodeId(), + initialCluster, + storeDef)) { + logger.info("Node " + voldemortConfig.getNodeId() + + " does not seem to contain partition " + partition + + " as primary/secondary"); + } + } + + PartitionListIterator entries = new PartitionListIterator(storageEngine, + new ArrayList(partitionsToDonate)); + + try { + while(running.get() && entries.hasNext()) { + Pair> entry = entries.next(); + ByteArray key = entry.getFirst(); + Versioned value = entry.getSecond(); + + scanned++; + List nodeIds = StoreInstance.checkKeyBelongsToPartition(key.get(), + optimizedStealerNodeToMappingTuples, + targetCluster, + storeDef); + + 
if(nodeIds.size() > 0) { + putValue(nodeIds, key, value, nodeToQueue, fetched); + } + + // print progress for every 100k entries. + if(0 == scanned % SCAN_PROGRESS_COUNT) { + printProgress(scanned, fetched, startTime, storeName); + } + } + terminateAllSlaves(storeName); + } catch(InterruptedException e) { + logger.info("InterruptedException received while sending entries to remote nodes, the process is terminating..."); + terminateAllSlavesAsync(storeName); + } finally { + close(entries, storeName, scanned, fetched, startTime); + } + } + private void putAll(List dests, ByteArray key, List> values, HashMap>>> nodeToQueue, int[] fetched) throws InterruptedException { - for(Versioned value: values) { - for(int nodeId: dests) { - fetched[nodeId]++; - nodeToQueue.get(nodeId).put(Pair.create(key, value)); - if(0 == fetched[nodeId] % FETCHUPDATE_BATCH_SIZE) { - nodeToQueue.get(nodeId).put(BREAK); - } + for(Versioned value: values) + putValue(dests, key, value, nodeToQueue, fetched); + } + + private void putValue(List dests, + ByteArray key, + Versioned value, + HashMap>>> nodeToQueue, + int[] fetched) throws InterruptedException { + for(int nodeId: dests) { + fetched[nodeId]++; + nodeToQueue.get(nodeId).put(Pair.create(key, value)); + if(0 == fetched[nodeId] % FETCHUPDATE_BATCH_SIZE) { + nodeToQueue.get(nodeId).put(BREAK); } } } @@ -364,15 +452,15 @@ private void printProgress(int scanned, int[] fetched, long startTime, String st } } - private void close(ClosableIterator keys, + private void close(ClosableIterator storageItr, String storeName, int scanned, int[] fetched, long startTime) { printProgress(scanned, fetched, startTime, storeName); - if(null != keys) - keys.close(); + if(null != storageItr) + storageItr.close(); } private void terminateAllSlaves(String storeName) { diff --git a/src/java/voldemort/server/rebalance/async/DonorBasedRebalancePusherSlave.java b/src/java/voldemort/server/rebalance/async/DonorBasedRebalancePusherSlave.java index 240205d06c..10ba9c641d 100644 --- a/src/java/voldemort/server/rebalance/async/DonorBasedRebalancePusherSlave.java +++ b/src/java/voldemort/server/rebalance/async/DonorBasedRebalancePusherSlave.java @@ -1,3 +1,18 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
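
fetchEntriesForStealersPartitionScan() above streams donated entries and routes each one onto a per-stealer queue, pushing a BREAK marker after every FETCHUPDATE_BATCH_SIZE entries so the pusher slave can flush in batches. A JDK-only sketch of that routing step; the sentinel value, batch size, and queue element type are placeholders, not the patch's actual types:

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

// Sketch only: fan donated entries out to per-stealer-node queues in flushable batches.
class DonationRouter {

    private static final String BREAK = "__BREAK__";       // placeholder for the real sentinel pair
    private static final int BATCH_SIZE = 1000;            // stands in for FETCHUPDATE_BATCH_SIZE

    private final Map<Integer, BlockingQueue<String>> nodeToQueue = new HashMap<Integer, BlockingQueue<String>>();
    private final Map<Integer, Integer> fetched = new HashMap<Integer, Integer>();

    void putValue(List<Integer> destinationNodes, String entry) throws InterruptedException {
        for(int nodeId: destinationNodes) {
            BlockingQueue<String> queue = nodeToQueue.get(nodeId);
            if(queue == null) {
                queue = new LinkedBlockingQueue<String>();
                nodeToQueue.put(nodeId, queue);
            }
            queue.put(entry);

            Integer count = fetched.get(nodeId);
            count = (count == null) ? 1 : count + 1;
            fetched.put(nodeId, count);
            if(count % BATCH_SIZE == 0)
                queue.put(BREAK);                           // tells the consumer to flush what it has
        }
    }
}
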
+ */ package voldemort.server.rebalance.async; import java.io.IOException; @@ -44,7 +59,7 @@ public void run() throws VoldemortException { while(!nodeIterator.done) { try { nodeIterator.reset(); - adminClient.updateEntries(nodeId, storeName, nodeIterator, null); + adminClient.streamingOps.updateEntries(nodeId, storeName, nodeIterator, null); nodeIterator.purge(); } catch(VoldemortException e) { if(e.getCause() instanceof IOException) { diff --git a/src/java/voldemort/server/rebalance/async/StealerBasedRebalanceAsyncOperation.java b/src/java/voldemort/server/rebalance/async/StealerBasedRebalanceAsyncOperation.java index 7b489e4e79..3ce97089c3 100644 --- a/src/java/voldemort/server/rebalance/async/StealerBasedRebalanceAsyncOperation.java +++ b/src/java/voldemort/server/rebalance/async/StealerBasedRebalanceAsyncOperation.java @@ -1,5 +1,5 @@ /* - * Copyright 2011 LinkedIn, Inc + * Copyright 2011-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -135,7 +135,7 @@ public void run() { + stealInfo.getDonorId()); rebalancer.releaseRebalancingPermit(stealInfo.getDonorId()); - adminClient.stop(); + adminClient.close(); adminClient = null; } } @@ -145,7 +145,7 @@ public void stop() { updateStatus(getHeader(stealInfo) + "Stop called on rebalance operation"); if(null != adminClient) { for(int asyncID: rebalanceStatusList) { - adminClient.stopAsyncRequest(metadataStore.getNodeId(), asyncID); + adminClient.rpcOps.stopAsyncRequest(metadataStore.getNodeId(), asyncID); } } @@ -175,24 +175,24 @@ private void rebalanceStore(String storeName, logger.info(getHeader(stealInfo) + "Starting partitions migration for store " + storeName + " from donor node " + stealInfo.getDonorId()); - int asyncId = adminClient.migratePartitions(stealInfo.getDonorId(), - metadataStore.getNodeId(), - storeName, - stealInfo.getReplicaToAddPartitionList(storeName), - null, - stealInfo.getInitialCluster(), - true); + int asyncId = adminClient.storeMntOps.migratePartitions(stealInfo.getDonorId(), + metadataStore.getNodeId(), + storeName, + stealInfo.getReplicaToAddPartitionList(storeName), + null, + stealInfo.getInitialCluster(), + true); rebalanceStatusList.add(asyncId); if(logger.isDebugEnabled()) { logger.debug(getHeader(stealInfo) + "Waiting for completion for " + storeName + " with async id " + asyncId); } - adminClient.waitForCompletion(metadataStore.getNodeId(), - asyncId, - voldemortConfig.getRebalancingTimeoutSec(), - TimeUnit.SECONDS, - getStatus()); + adminClient.rpcOps.waitForCompletion(metadataStore.getNodeId(), + asyncId, + voldemortConfig.getRebalancingTimeoutSec(), + TimeUnit.SECONDS, + getStatus()); rebalanceStatusList.remove((Object) asyncId); @@ -207,11 +207,11 @@ private void rebalanceStore(String storeName, logger.info(getHeader(stealInfo) + "Deleting partitions for store " + storeName + " on donor node " + stealInfo.getDonorId()); - adminClient.deletePartitions(stealInfo.getDonorId(), - storeName, - stealInfo.getReplicaToDeletePartitionList(storeName), - stealInfo.getInitialCluster(), - null); + adminClient.storeMntOps.deletePartitions(stealInfo.getDonorId(), + storeName, + stealInfo.getReplicaToDeletePartitionList(storeName), + stealInfo.getInitialCluster(), + null); logger.info(getHeader(stealInfo) + "Deleted partitions for store " + storeName + " on donor node " + stealInfo.getDonorId()); diff --git a/src/java/voldemort/server/scheduler/DataCleanupJob.java 
b/src/java/voldemort/server/scheduler/DataCleanupJob.java index 5adfc4a4f5..919ca915df 100644 --- a/src/java/voldemort/server/scheduler/DataCleanupJob.java +++ b/src/java/voldemort/server/scheduler/DataCleanupJob.java @@ -63,6 +63,7 @@ public DataCleanupJob(StorageEngine store, public void run() { acquireCleanupPermit(progressThisRun); + store.beginBatchModifications(); ClosableIterator>> iterator = null; try { @@ -105,6 +106,7 @@ public void run() { totalEntriesScanned += progressThisRun.get(); progressThisRun.set(0); } + store.endBatchModifications(); } } diff --git a/src/java/voldemort/server/scheduler/slop/StreamingSlopPusherJob.java b/src/java/voldemort/server/scheduler/slop/StreamingSlopPusherJob.java index 6b5950b11d..b8cbd1804a 100644 --- a/src/java/voldemort/server/scheduler/slop/StreamingSlopPusherJob.java +++ b/src/java/voldemort/server/scheduler/slop/StreamingSlopPusherJob.java @@ -1,3 +1,18 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ package voldemort.server.scheduler.slop; import java.util.Date; @@ -18,6 +33,7 @@ import org.apache.log4j.Logger; import voldemort.VoldemortException; +import voldemort.client.ClientConfig; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; import voldemort.cluster.Cluster; @@ -114,7 +130,8 @@ public void run() { if(adminClient == null) { adminClient = new AdminClient(cluster, - new AdminClientConfig().setMaxConnectionsPerNode(1)); + new AdminClientConfig().setMaxConnectionsPerNode(1), + new ClientConfig()); } if(voldemortConfig.getSlopZonesDownToTerminate() > 0) { @@ -280,7 +297,7 @@ private void loadMetadata() { private void stopAdminClient() { if(adminClient != null) { - adminClient.stop(); + adminClient.close(); adminClient = null; } } @@ -356,7 +373,8 @@ protected Versioned computeNext() { writeThrottler.maybeThrottle(writtenLast); writtenLast = slopSize(head); - deleteBatch.add(Pair.create(head.getValue().makeKey(), head.getVersion())); + deleteBatch.add(Pair.create(head.getValue().makeKey(), + (Version) head.getVersion())); return head; } } @@ -421,7 +439,7 @@ public void run() { } this.startTime = System.currentTimeMillis(); iterator = new SlopIterator(slopQueue, current); - adminClient.updateSlopEntries(nodeId, iterator); + adminClient.streamingOps.updateSlopEntries(nodeId, iterator); } while(!iterator.isComplete()); // Clear up both previous and current diff --git a/src/java/voldemort/server/storage/RepairJob.java b/src/java/voldemort/server/storage/RepairJob.java index aa95547210..6544593729 100644 --- a/src/java/voldemort/server/storage/RepairJob.java +++ b/src/java/voldemort/server/storage/RepairJob.java @@ -18,6 +18,7 @@ import voldemort.store.StorageEngine; import voldemort.store.StoreDefinition; import voldemort.store.metadata.MetadataStore; +import voldemort.store.readonly.ReadOnlyStorageConfiguration; import voldemort.utils.ByteArray; import voldemort.utils.ClosableIterator; import voldemort.utils.Pair; @@ -31,7 
+32,8 @@ public class RepairJob implements Runnable { private final static int DELETE_BATCH_SIZE = 10000; private final static Logger logger = Logger.getLogger(RepairJob.class.getName()); - public final static List blackList = Arrays.asList("mysql", "krati", "read-only"); + public final static List blackList = Arrays.asList("krati", + ReadOnlyStorageConfiguration.TYPE_NAME); private final ScanPermitWrapper repairPermits; private final StoreRepository storeRepo; diff --git a/src/java/voldemort/server/storage/StorageService.java b/src/java/voldemort/server/storage/StorageService.java index ba13fdc5b5..0fe606b502 100644 --- a/src/java/voldemort/server/storage/StorageService.java +++ b/src/java/voldemort/server/storage/StorageService.java @@ -55,6 +55,7 @@ import voldemort.common.service.ServiceType; import voldemort.routing.RoutingStrategy; import voldemort.routing.RoutingStrategyFactory; +import voldemort.routing.RoutingStrategyType; import voldemort.serialization.SerializerDefinition; import voldemort.serialization.avro.versioned.SchemaEvolutionValidator; import voldemort.server.RequestRoutingType; @@ -78,6 +79,7 @@ import voldemort.store.readonly.ReadOnlyStorageEngine; import voldemort.store.rebalancing.RebootstrappingStore; import voldemort.store.rebalancing.RedirectingStore; +import voldemort.store.retention.RetentionEnforcingStore; import voldemort.store.routed.RoutedStore; import voldemort.store.routed.RoutedStoreFactory; import voldemort.store.slop.SlopStorageEngine; @@ -101,6 +103,7 @@ import voldemort.utils.ReflectUtils; import voldemort.utils.SystemTime; import voldemort.utils.Time; +import voldemort.utils.Utils; import voldemort.versioning.VectorClock; import voldemort.versioning.VectorClockInconsistencyResolver; import voldemort.versioning.Versioned; @@ -172,7 +175,7 @@ public StorageService(StoreRepository storeRepository, */ if(this.voldemortConfig.getStorageConfigurations() .contains(ReadOnlyStorageConfiguration.class.getName())) { - long rate = this.voldemortConfig.getMaxBytesPerSecond(); + long rate = this.voldemortConfig.getReadOnlyFetcherMaxBytesPerSecond(); this.dynThrottleLimit = new DynamicThrottleLimit(rate); } else this.dynThrottleLimit = null; @@ -236,7 +239,7 @@ private void updateRepFactor(List storesDefs) { @Override protected void startInner() { - registerEngine(metadata, false, "metadata"); + registerInternalEngine(metadata, false, "metadata"); /* Initialize storage configurations */ for(String configClassName: voldemortConfig.getStorageConfigurations()) @@ -270,7 +273,7 @@ protected void startInner() { null, null, null, - null, + RoutingStrategyType.CONSISTENT_STRATEGY, 0, null, 0, @@ -289,9 +292,11 @@ protected void startInner() { null, null, 0); - SlopStorageEngine slopEngine = new SlopStorageEngine(config.getStore(slopStoreDefinition), + SlopStorageEngine slopEngine = new SlopStorageEngine(config.getStore(slopStoreDefinition, + new RoutingStrategyFactory().updateRoutingStrategy(slopStoreDefinition, + metadata.getCluster())), metadata.getCluster()); - registerEngine(slopEngine, false, "slop"); + registerInternalEngine(slopEngine, false, "slop"); storeRepository.setSlopStore(slopEngine); if(voldemortConfig.isSlopPusherJobEnabled()) { @@ -440,7 +445,7 @@ public void openSystemStore(StoreDefinition storeDef) { + " but " + storeDef.getType() + " storage engine has not been enabled."); - final StorageEngine engine = config.getStore(storeDef); + final StorageEngine engine = config.getStore(storeDef, null); // Noted that there is no read-only processing as for user 
stores. @@ -584,13 +589,11 @@ public void openStore(StoreDefinition storeDef) { + " storage engine has not been enabled."); boolean isReadOnly = storeDef.getType().compareTo(ReadOnlyStorageConfiguration.TYPE_NAME) == 0; - if(isReadOnly) { - final RoutingStrategy routingStrategy = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, - metadata.getCluster()); - ((ReadOnlyStorageConfiguration) config).setRoutingStrategy(routingStrategy); - } + final RoutingStrategy routingStrategy = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, + metadata.getCluster()); - final StorageEngine engine = config.getStore(storeDef); + final StorageEngine engine = config.getStore(storeDef, + routingStrategy); // Update the routing strategy + add listener to metadata if(storeDef.getType().compareTo(ReadOnlyStorageConfiguration.TYPE_NAME) == 0) { metadata.addMetadataStoreListener(storeDef.getName(), new MetadataStoreListener() { @@ -598,12 +601,16 @@ public void openStore(StoreDefinition storeDef) { public void updateRoutingStrategy(RoutingStrategy updatedRoutingStrategy) { ((ReadOnlyStorageEngine) engine).setRoutingStrategy(updatedRoutingStrategy); } + + public void updateStoreDefinition(StoreDefinition storeDef) { + return; + } }); } // openStore() should have atomic semantics try { - registerEngine(engine, isReadOnly, storeDef.getType()); + registerEngine(engine, isReadOnly, storeDef.getType(), storeDef); if(voldemortConfig.isServerRoutingEnabled()) registerNodeStores(storeDef, metadata.getCluster(), voldemortConfig.getNodeId()); @@ -611,21 +618,25 @@ public void updateRoutingStrategy(RoutingStrategy updatedRoutingStrategy) { if(storeDef.hasRetentionPeriod()) scheduleCleanupJob(storeDef, engine); } catch(Exception e) { - unregisterEngine(engine, isReadOnly, storeDef.getType()); + removeEngine(engine, isReadOnly, storeDef.getType(), false); throw new VoldemortException(e); } } /** - * Unregister and remove the engine from the storage repository + * Unregister and remove the engine from the storage repository. This is + * called during deletion of stores and if there are exceptions + * adding/opening stores * * @param engine The actual engine to remove * @param isReadOnly Is this read-only? * @param storeType The storage type of the store + * @param truncate Should the store be truncated? 
*/ - public void unregisterEngine(StorageEngine engine, - boolean isReadOnly, - String storeType) { + public void removeEngine(StorageEngine engine, + boolean isReadOnly, + String storeType, + boolean truncate) { String storeName = engine.getName(); Store store = storeRepository.removeLocalStore(storeName); @@ -683,21 +694,37 @@ public void unregisterEngine(StorageEngine engine, } storeRepository.removeStorageEngine(storeName); - if(!isView) + if(truncate) engine.truncate(); engine.close(); } + /** + * Register the given internal engine (slop and metadata) with the storage + * repository + * + * @param engine Register the storage engine + * @param isReadOnly Boolean indicating if this store is read-only + * @param storeType The type of the store + */ + public void registerInternalEngine(StorageEngine engine, + boolean isReadOnly, + String storeType) { + registerEngine(engine, isReadOnly, storeType, null); + } + /** * Register the given engine with the storage repository * * @param engine Register the storage engine * @param isReadOnly Boolean indicating if this store is read-only * @param storeType The type of the store + * @param storeDef store definition for the store to be registered */ public void registerEngine(StorageEngine engine, boolean isReadOnly, - String storeType) { + String storeType, + StoreDefinition storeDef) { Cluster cluster = this.metadata.getCluster(); storeRepository.addStorageEngine(engine); @@ -713,31 +740,45 @@ public void registerEngine(StorageEngine engine, cluster.getName(), SystemTime.INSTANCE); if(!isSlop) { - if(voldemortConfig.isEnableRebalanceService() && !isReadOnly && !isMetadata && !isView) { - store = new RedirectingStore(store, - metadata, - storeRepository, - failureDetector, - storeFactory); - if(voldemortConfig.isJmxEnabled()) { - MBeanServer mbeanServer = ManagementFactory.getPlatformMBeanServer(); - ObjectName name = null; - if(this.voldemortConfig.isEnableJmxClusterName()) - name = JmxUtils.createObjectName(cluster.getName() - + "." - + JmxUtils.getPackageName(RedirectingStore.class), - store.getName()); - else - name = JmxUtils.createObjectName(JmxUtils.getPackageName(RedirectingStore.class), - store.getName()); + if(!isReadOnly && !isMetadata && !isView) { + // wrap store to enforce retention policy + if(voldemortConfig.isEnforceRetentionPolicyOnRead() && storeDef != null) { + RetentionEnforcingStore retentionEnforcingStore = new RetentionEnforcingStore(store, + storeDef, + voldemortConfig.isDeleteExpiredValuesOnRead(), + SystemTime.INSTANCE); + metadata.addMetadataStoreListener(store.getName(), retentionEnforcingStore); + store = retentionEnforcingStore; + } - synchronized(mbeanServer) { - if(mbeanServer.isRegistered(name)) - JmxUtils.unregisterMbean(mbeanServer, name); + if(voldemortConfig.isEnableRebalanceService()) { + store = new RedirectingStore(store, + metadata, + storeRepository, + failureDetector, + storeFactory); + if(voldemortConfig.isJmxEnabled()) { + MBeanServer mbeanServer = ManagementFactory.getPlatformMBeanServer(); + ObjectName name = null; + if(this.voldemortConfig.isEnableJmxClusterName()) + name = JmxUtils.createObjectName(cluster.getName() + + "." 
+ + JmxUtils.getPackageName(RedirectingStore.class), + store.getName()); + else + name = JmxUtils.createObjectName(JmxUtils.getPackageName(RedirectingStore.class), + store.getName()); + + synchronized(mbeanServer) { + if(mbeanServer.isRegistered(name)) + JmxUtils.unregisterMbean(mbeanServer, name); + + JmxUtils.registerMbean(mbeanServer, + JmxUtils.createModelMBean(store), + name); + } - JmxUtils.registerMbean(mbeanServer, JmxUtils.createModelMBean(store), name); } - } } @@ -856,13 +897,10 @@ private Store createNodeStore(String storeName, Node */ private void scheduleCleanupJob(StoreDefinition storeDef, StorageEngine engine) { - // Schedule data retention cleanup job starting next day. - GregorianCalendar cal = new GregorianCalendar(); - cal.add(Calendar.DAY_OF_YEAR, 1); - cal.set(Calendar.HOUR_OF_DAY, voldemortConfig.getRetentionCleanupFirstStartTimeInHour()); - cal.set(Calendar.MINUTE, 0); - cal.set(Calendar.SECOND, 0); - cal.set(Calendar.MILLISECOND, 0); + // Compute the start time of the job, based on current time + GregorianCalendar cal = Utils.getCalendarForNextRun(new GregorianCalendar(), + voldemortConfig.getRetentionCleanupFirstStartDayOfWeek(), + voldemortConfig.getRetentionCleanupFirstStartTimeInHour()); // allow only one cleanup job at a time Date startTime = cal.getTime(); @@ -892,7 +930,8 @@ private void scheduleCleanupJob(StoreDefinition storeDef, this.scheduler.schedule("cleanup-" + storeDef.getName(), cleanupJob, startTime, - retentionFreqHours * Time.MS_PER_HOUR); + retentionFreqHours * Time.MS_PER_HOUR, + voldemortConfig.getRetentionCleanupPinStartTime()); } @Override diff --git a/src/java/voldemort/store/AbstractStorageEngine.java b/src/java/voldemort/store/AbstractStorageEngine.java new file mode 100644 index 0000000000..16819c8507 --- /dev/null +++ b/src/java/voldemort/store/AbstractStorageEngine.java @@ -0,0 +1,57 @@ +package voldemort.store; + +import voldemort.utils.ClosableIterator; +import voldemort.utils.Pair; +import voldemort.versioning.Versioned; + +public class AbstractStorageEngine extends AbstractStore implements + StorageEngine { + + public AbstractStorageEngine(String name) { + super(name); + } + + @Override + public ClosableIterator>> entries() { + return null; + } + + @Override + public ClosableIterator keys() { + return null; + } + + @Override + public ClosableIterator>> entries(int partitionId) { + return null; + } + + @Override + public ClosableIterator keys(int partitionId) { + return null; + } + + @Override + public void truncate() {} + + @Override + public boolean isPartitionAware() { + return false; + } + + @Override + public boolean isPartitionScanSupported() { + return false; + } + + @Override + public boolean beginBatchModifications() { + return false; + } + + @Override + public boolean endBatchModifications() { + return false; + } + +} diff --git a/src/java/voldemort/store/AbstractStore.java b/src/java/voldemort/store/AbstractStore.java new file mode 100644 index 0000000000..4bbdf4343c --- /dev/null +++ b/src/java/voldemort/store/AbstractStore.java @@ -0,0 +1,75 @@ +package voldemort.store; + +import java.util.List; +import java.util.Map; + +import voldemort.VoldemortException; +import voldemort.utils.Utils; +import voldemort.versioning.Version; +import voldemort.versioning.Versioned; + +public abstract class AbstractStore implements Store { + + private final String storeName; + + public AbstractStore(String name) { + this.storeName = Utils.notNull(name); + } + + @Override + public List> get(K key, T transforms) throws VoldemortException { 
+ return null; + } + + @Override + public Map>> getAll(Iterable keys, Map transforms) + throws VoldemortException { + return null; + } + + @Override + public void put(K key, Versioned value, T transforms) throws VoldemortException {} + + @Override + public boolean delete(K key, Version version) throws VoldemortException { + return false; + } + + @Override + public String getName() { + return this.storeName; + } + + @Override + public void close() throws VoldemortException {} + + @Override + public Object getCapability(StoreCapabilityType capability) { + throw new NoSuchCapabilityException(capability, getName()); + } + + @Override + public List getVersions(K key) { + return null; + } + + @Override + public List> get(CompositeVoldemortRequest request) throws VoldemortException { + return null; + } + + @Override + public Map>> getAll(CompositeVoldemortRequest request) + throws VoldemortException { + return null; + } + + @Override + public void put(CompositeVoldemortRequest request) throws VoldemortException {} + + @Override + public boolean delete(CompositeVoldemortRequest request) throws VoldemortException { + return false; + } + +} diff --git a/src/java/voldemort/store/CompositeDeleteVoldemortRequest.java b/src/java/voldemort/store/CompositeDeleteVoldemortRequest.java new file mode 100644 index 0000000000..08c70a7927 --- /dev/null +++ b/src/java/voldemort/store/CompositeDeleteVoldemortRequest.java @@ -0,0 +1,11 @@ +package voldemort.store; + +import voldemort.common.VoldemortOpCode; +import voldemort.versioning.Version; + +public class CompositeDeleteVoldemortRequest extends CompositeVoldemortRequest { + + public CompositeDeleteVoldemortRequest(K key, Version version, long timeout) { + super(key, null, null, null, version, timeout, true, VoldemortOpCode.DELETE_OP_CODE); + } +} diff --git a/src/java/voldemort/store/CompositeGetAllVoldemortRequest.java b/src/java/voldemort/store/CompositeGetAllVoldemortRequest.java new file mode 100644 index 0000000000..2c548b9bbd --- /dev/null +++ b/src/java/voldemort/store/CompositeGetAllVoldemortRequest.java @@ -0,0 +1,18 @@ +package voldemort.store; + +import voldemort.common.VoldemortOpCode; + +public class CompositeGetAllVoldemortRequest extends CompositeVoldemortRequest { + + public CompositeGetAllVoldemortRequest(Iterable keys, long timeout, boolean resolveConflicts) { + super(null, + null, + keys, + null, + null, + timeout, + resolveConflicts, + VoldemortOpCode.GET_ALL_OP_CODE); + } + +} diff --git a/src/java/voldemort/store/CompositeGetVoldemortRequest.java b/src/java/voldemort/store/CompositeGetVoldemortRequest.java new file mode 100644 index 0000000000..3826d06760 --- /dev/null +++ b/src/java/voldemort/store/CompositeGetVoldemortRequest.java @@ -0,0 +1,10 @@ +package voldemort.store; + +import voldemort.common.VoldemortOpCode; + +public class CompositeGetVoldemortRequest extends CompositeVoldemortRequest { + + public CompositeGetVoldemortRequest(K key, long timeout, boolean resolveConflicts) { + super(key, null, null, null, null, timeout, resolveConflicts, VoldemortOpCode.GET_OP_CODE); + } +} diff --git a/src/java/voldemort/store/CompositePutVoldemortRequest.java b/src/java/voldemort/store/CompositePutVoldemortRequest.java new file mode 100644 index 0000000000..e187993390 --- /dev/null +++ b/src/java/voldemort/store/CompositePutVoldemortRequest.java @@ -0,0 +1,10 @@ +package voldemort.store; + +import voldemort.common.VoldemortOpCode; + +public class CompositePutVoldemortRequest extends CompositeVoldemortRequest { + + public 
CompositePutVoldemortRequest(K key, V rawValue, long timeout) { + super(key, rawValue, null, null, null, timeout, true, VoldemortOpCode.PUT_OP_CODE); + } +} diff --git a/src/java/voldemort/store/CompositeVersionedPutVoldemortRequest.java b/src/java/voldemort/store/CompositeVersionedPutVoldemortRequest.java new file mode 100644 index 0000000000..cc96d6e2b3 --- /dev/null +++ b/src/java/voldemort/store/CompositeVersionedPutVoldemortRequest.java @@ -0,0 +1,12 @@ +package voldemort.store; + +import voldemort.common.VoldemortOpCode; +import voldemort.versioning.Versioned; + +public class CompositeVersionedPutVoldemortRequest extends CompositeVoldemortRequest { + + public CompositeVersionedPutVoldemortRequest(K key, Versioned value, long timeout) { + super(key, null, null, value, null, timeout, true, VoldemortOpCode.PUT_OP_CODE); + } + +} diff --git a/src/java/voldemort/store/CompositeVoldemortRequest.java b/src/java/voldemort/store/CompositeVoldemortRequest.java new file mode 100644 index 0000000000..f8e834f0a2 --- /dev/null +++ b/src/java/voldemort/store/CompositeVoldemortRequest.java @@ -0,0 +1,75 @@ +package voldemort.store; + +import voldemort.versioning.Version; +import voldemort.versioning.Versioned; + +public class CompositeVoldemortRequest { + + private final K key; + private final V rawValue; + private final Iterable getAllIterableKeys; + private final Versioned value; + private Version version; + private long routingTimeout; + private final boolean resolveConflicts; + private final byte operationType; + + public CompositeVoldemortRequest(K key, + V rawValue, + Iterable keys, + Versioned value, + Version version, + long timeout, + boolean resolveConflicts, + byte operationType) { + this.key = key; + this.rawValue = rawValue; + this.getAllIterableKeys = keys; + this.routingTimeout = timeout; + this.value = value; + this.version = version; + this.resolveConflicts = resolveConflicts; + this.operationType = operationType; + } + + public K getKey() { + return key; + } + + public Versioned getValue() { + return value; + } + + public Version getVersion() { + return version; + } + + public void setVersion(Version version) { + this.version = version; + } + + public long getRoutingTimeoutInMs() { + return routingTimeout; + } + + public void setRoutingTimeoutInMs(long timeout) { + this.routingTimeout = timeout; + } + + public boolean resolveConflicts() { + return resolveConflicts; + } + + public Iterable getIterableKeys() { + return getAllIterableKeys; + } + + public V getRawValue() { + return rawValue; + } + + public byte getOperationType() { + return operationType; + } + +} diff --git a/src/java/voldemort/store/DelegatingStore.java b/src/java/voldemort/store/DelegatingStore.java index aba86bea79..6762908f2c 100644 --- a/src/java/voldemort/store/DelegatingStore.java +++ b/src/java/voldemort/store/DelegatingStore.java @@ -32,38 +32,40 @@ * * */ -public class DelegatingStore implements Store { +public class DelegatingStore extends AbstractStore { private final Store innerStore; public DelegatingStore(Store innerStore) { + super(innerStore.getName()); this.innerStore = Utils.notNull(innerStore); } + @Override public void close() throws VoldemortException { innerStore.close(); } + @Override public boolean delete(K key, Version version) throws VoldemortException { StoreUtils.assertValidKey(key); return innerStore.delete(key, version); } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { StoreUtils.assertValidKeys(keys); return innerStore.getAll(keys, 
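/*
 * Editor's illustrative sketch, not part of the patch: the new Composite*VoldemortRequest
 * classes bundle the key/value, a per-request routing timeout and an op-code so a single
 * Store.get/put/delete overload can carry them. The key, payload and timeout below are
 * made up; VectorClock's no-argument constructor is assumed to create an empty clock.
 */
import java.util.List;

import voldemort.store.CompositeGetVoldemortRequest;
import voldemort.store.CompositeVersionedPutVoldemortRequest;
import voldemort.store.Store;
import voldemort.utils.ByteArray;
import voldemort.versioning.VectorClock;
import voldemort.versioning.Versioned;

public class CompositeRequestSketch {

    // 'store' stands for any Store<ByteArray, byte[], byte[]> obtained elsewhere
    public static void example(Store<ByteArray, byte[], byte[]> store) {
        ByteArray key = new ByteArray("user:42".getBytes());

        // GET with a 100 ms routing timeout, asking the store to resolve conflicts
        List<Versioned<byte[]>> values =
                store.get(new CompositeGetVoldemortRequest<ByteArray, byte[]>(key, 100L, true));

        // PUT of an already-versioned value with the same per-request timeout
        Versioned<byte[]> toWrite = new Versioned<byte[]>("payload".getBytes(), new VectorClock());
        store.put(new CompositeVersionedPutVoldemortRequest<ByteArray, byte[]>(key, toWrite, 100L));
    }
}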
transforms); } + @Override public List> get(K key, T transform) throws VoldemortException { StoreUtils.assertValidKey(key); return innerStore.get(key, transform); } - public String getName() { - return innerStore.getName(); - } - + @Override public void put(K key, Versioned value, T transform) throws VoldemortException { StoreUtils.assertValidKey(key); innerStore.put(key, value, transform); @@ -73,6 +75,7 @@ public Store getInnerStore() { return innerStore; } + @Override public Object getCapability(StoreCapabilityType capability) { return innerStore.getCapability(capability); } @@ -82,7 +85,31 @@ public String toString() { return innerStore.toString(); } + @Override public List getVersions(K key) { return innerStore.getVersions(key); } + + @Override + public List> get(CompositeVoldemortRequest request) throws VoldemortException { + StoreUtils.assertValidKey(request.getKey()); + return innerStore.get(request); + } + + // TODO: Validate all the keys in the request object + @Override + public Map>> getAll(CompositeVoldemortRequest request) + throws VoldemortException { + return innerStore.getAll(request); + } + + @Override + public void put(CompositeVoldemortRequest request) throws VoldemortException { + innerStore.put(request); + } + + @Override + public boolean delete(CompositeVoldemortRequest request) throws VoldemortException { + return innerStore.delete(request); + } } diff --git a/src/java/voldemort/store/PartitionListIterator.java b/src/java/voldemort/store/PartitionListIterator.java new file mode 100644 index 0000000000..a3d90c1618 --- /dev/null +++ b/src/java/voldemort/store/PartitionListIterator.java @@ -0,0 +1,85 @@ +/* + * Copyright 2008-2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package voldemort.store; + +import java.util.List; +import java.util.NoSuchElementException; + +import voldemort.utils.ByteArray; +import voldemort.utils.ClosableIterator; +import voldemort.utils.Pair; +import voldemort.utils.Utils; +import voldemort.versioning.Versioned; + +/** + * Iterator that uses efficient partition scan to iterate across a list of + * supplied partitions + * + */ +public class PartitionListIterator implements ClosableIterator>> { + + StorageEngine storageEngine; + List partitionsToFetch; + ClosableIterator>> partitionIterator; + int currentIndex; + + public PartitionListIterator(StorageEngine storageEngine, + List partitionsToFetch) { + Utils.notNull(partitionsToFetch); + this.storageEngine = storageEngine; + this.partitionsToFetch = partitionsToFetch; + this.currentIndex = 0; + } + + public boolean hasNext() { + // do we have more elements in the current partition we are serving? 
+ if(this.partitionIterator != null && this.partitionIterator.hasNext()) + return true; + // if not, find the next non empty partition + while((currentIndex < partitionsToFetch.size())) { + // close the previous iterator + if(this.partitionIterator != null) + this.partitionIterator.close(); + // advance to the next partition + this.partitionIterator = storageEngine.entries(this.partitionsToFetch.get(currentIndex)); + currentIndex++; + if(this.partitionIterator.hasNext()) + return true; + } + return false; + } + + public Pair> next() { + if(!hasNext()) + throw new NoSuchElementException("End of partition entries stream"); + return this.partitionIterator.next(); + } + + public void remove() { + throw new UnsupportedOperationException("Removal not supported"); + } + + @Override + protected final void finalize() { + close(); + } + + public void close() { + if(partitionIterator != null) { + partitionIterator.close(); + } + } +} diff --git a/src/java/voldemort/store/StorageConfiguration.java b/src/java/voldemort/store/StorageConfiguration.java index a17f583730..19b8e2c565 100644 --- a/src/java/voldemort/store/StorageConfiguration.java +++ b/src/java/voldemort/store/StorageConfiguration.java @@ -16,6 +16,7 @@ package voldemort.store; +import voldemort.routing.RoutingStrategy; import voldemort.utils.ByteArray; /** @@ -37,9 +38,11 @@ public interface StorageConfiguration { * Get an initialized storage implementation * * @param storeDef store definition + * @param strategy routing strategy used for the store * @return The storage engine */ - public StorageEngine getStore(StoreDefinition storeDef); + public StorageEngine getStore(StoreDefinition storeDef, + RoutingStrategy strategy); /** * Get the type of stores returned by this configuration diff --git a/src/java/voldemort/store/StorageEngine.java b/src/java/voldemort/store/StorageEngine.java index 1c475ad7e8..4af3046e94 100644 --- a/src/java/voldemort/store/StorageEngine.java +++ b/src/java/voldemort/store/StorageEngine.java @@ -65,6 +65,30 @@ public interface StorageEngine extends Store { */ public ClosableIterator keys(); + /** + * Get an iterator over pairs of entries in a store's partition. The key is + * the first element in the pair and the versioned value is the second + * element. + * + * Note that the iterator need not be threadsafe, and that it must be + * manually closed after use. + * + * @param partition partition whose entries are to be fetched + * @return An iterator over the entries in this StorageEngine. + */ + public ClosableIterator>> entries(int partition); + + /** + * Get an iterator over keys in the store's partition + * + * Note that the iterator need not be threadsafe, and that it must be + * manually closed after use. + * + * @param partition partition whose keys are to be fetched + * @return An iterator over the keys in this StorageEngine. + */ + public ClosableIterator keys(int partition); + /** * Truncate all entries in the store */ @@ -78,4 +102,27 @@ public interface StorageEngine extends Store { */ public boolean isPartitionAware(); + /** + * Does the storage engine support efficient scanning of a single partition + * + * @return true if the storage engine implements the capability. false + * otherwise + */ + public boolean isPartitionScanSupported(); + + /** + * A lot of storage engines support efficient methods for performing large + * number of writes (puts/deletes) against the data source. 
This method puts + * the storage engine in this batch write mode + * + * @return true if the storage engine took successful action to switch to + * 'batch-write' mode + */ + public boolean beginBatchModifications(); + + /** + * + * @return true if the storage engine successfully returned to normal mode + */ + public boolean endBatchModifications(); } diff --git a/src/java/voldemort/store/Store.java b/src/java/voldemort/store/Store.java index 4283353666..8f49bbc2ed 100644 --- a/src/java/voldemort/store/Store.java +++ b/src/java/voldemort/store/Store.java @@ -100,6 +100,51 @@ public Map>> getAll(Iterable keys, Map transforms) */ public Object getCapability(StoreCapabilityType capability); + /** + * Get the versions associated with the given key. This is used in a put + * call to write a new value for this key + * + * @param key The key to retrieve the versions for + * @return List of Versions associated with this key. + */ public List getVersions(K key); + /** + * Get the value associated with the given key + * + * @param request Contains the key to check for and associated transforms + * @return The value associated with the key or an empty list if no values + * are found. + * @throws VoldemortException + */ + public List> get(CompositeVoldemortRequest request) throws VoldemortException; + + /** + * Get the values associated with the given keys and returns them in a Map + * of keys to a list of versioned values. Note that the returned map will + * only contain entries for the keys which have a value associated with + * them. + * + * @param requests Contains the keys to check for. + * @return A Map of keys to a list of versioned values. + * @throws VoldemortException + */ + public Map>> getAll(CompositeVoldemortRequest request) + throws VoldemortException; + + /** + * Associate the value with the key and version in this store + * + * @param request Contains the key to use along with the value and version + * to use. 
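/*
 * Editor's illustrative sketch, not part of the patch: a maintenance job (restore,
 * rebalance, retention) can now scan a single partition instead of the whole store,
 * and may bracket its work with begin/endBatchModifications() so engines such as BDB
 * can switch into a bulk-write friendly mode. The batch-mode calls only matter for
 * jobs that also write; they are shown here for completeness.
 */
import voldemort.store.StorageEngine;
import voldemort.utils.ByteArray;
import voldemort.utils.ClosableIterator;
import voldemort.utils.Pair;
import voldemort.versioning.Versioned;

public class PartitionScanSketch {

    public static long countEntries(StorageEngine<ByteArray, byte[], byte[]> engine, int partitionId) {
        if(!engine.isPartitionScanSupported())
            throw new UnsupportedOperationException("Engine cannot scan a single partition");

        long count = 0;
        engine.beginBatchModifications();
        ClosableIterator<Pair<ByteArray, Versioned<byte[]>>> iter = engine.entries(partitionId);
        try {
            while(iter.hasNext()) {
                iter.next();
                count++;
            }
        } finally {
            iter.close();                   // iterators must be closed manually
            engine.endBatchModifications(); // restore normal write mode
        }
        return count;
    }
}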
+ */ + public void put(CompositeVoldemortRequest request) throws VoldemortException; + + /** + * Delete all entries prior to the given version + * + * @param request: Contains the key to delete and current version of the key + * @return True if anything was deleted + */ + public boolean delete(CompositeVoldemortRequest request) throws VoldemortException; } diff --git a/src/java/voldemort/store/StoreBinaryFormat.java b/src/java/voldemort/store/StoreBinaryFormat.java new file mode 100644 index 0000000000..d2d66f65f2 --- /dev/null +++ b/src/java/voldemort/store/StoreBinaryFormat.java @@ -0,0 +1,107 @@ +package voldemort.store; + +import java.util.ArrayList; +import java.util.List; + +import voldemort.VoldemortException; +import voldemort.utils.ByteUtils; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Versioned; + +/*- + * Defines a generic on-disk binary data format for versioned voldemort data + * ----------------------------------------- + * FORMAT_VERSION : 1 byte + * Versioned value (repeating) { + * Clock (variable length) { + * NUM_CLOCK_ENTRIES : 2 bytes (short) + * VERSION_SIZE : 1 byte + * Server clock (repeating) { + * NODE_ID : 2 bytes (short) + * VERSION : VERSION_SIZE bytes + * } + * } + * Value (variable length) { + * VALUE_SIZE : 4 bytes (int) + * VALUE_BYTES : VALUE_SIZE bytes + * } + * } + * ----------------------------------------- + */ +public class StoreBinaryFormat { + + /* In the future we can use this to handle format changes */ + private static final byte VERSION = 0; + + private static final int PARTITIONID_PREFIX_SIZE = ByteUtils.SIZE_OF_SHORT; + + public static byte[] toByteArray(List> values) { + int size = 1; + for(Versioned v: values) { + size += ((VectorClock) v.getVersion()).sizeInBytes(); + size += ByteUtils.SIZE_OF_INT; + size += v.getValue().length; + } + byte[] bytes = new byte[size]; + int pos = 1; + bytes[0] = VERSION; + for(Versioned v: values) { + pos += ((VectorClock) v.getVersion()).toBytes(bytes, pos); + int len = v.getValue().length; + ByteUtils.writeInt(bytes, len, pos); + pos += ByteUtils.SIZE_OF_INT; + System.arraycopy(v.getValue(), 0, bytes, pos, len); + pos += len; + } + if(pos != bytes.length) + throw new VoldemortException((bytes.length - pos) + + " straggling bytes found in value (this should not be possible)!"); + return bytes; + } + + public static List> fromByteArray(byte[] bytes) { + if(bytes.length < 1) + throw new VoldemortException("Invalid value length: " + bytes.length); + if(bytes[0] != VERSION) + throw new VoldemortException("Unexpected version number in value: " + bytes[0]); + int pos = 1; + List> vals = new ArrayList>(2); + while(pos < bytes.length) { + VectorClock clock = new VectorClock(bytes, pos); + pos += clock.sizeInBytes(); + int valueSize = ByteUtils.readInt(bytes, pos); + pos += ByteUtils.SIZE_OF_INT; + byte[] val = new byte[valueSize]; + System.arraycopy(bytes, pos, val, 0, valueSize); + pos += valueSize; + vals.add(Versioned.value(val, clock)); + } + if(pos != bytes.length) + throw new VoldemortException((bytes.length - pos) + + " straggling bytes found in value (this should not be possible)!"); + return vals; + } + + public static byte[] makePrefixedKey(byte[] key, int partitionId) { + byte[] prefixedKey = new byte[PARTITIONID_PREFIX_SIZE + key.length]; + ByteUtils.writeUnsignedShort(prefixedKey, partitionId, 0); + System.arraycopy(key, 0, prefixedKey, PARTITIONID_PREFIX_SIZE, key.length); + return prefixedKey; + } + + public static byte[] makePartitionKey(int partitionId) { + byte[] 
partitionKey = new byte[PARTITIONID_PREFIX_SIZE]; + ByteUtils.writeUnsignedShort(partitionKey, partitionId, 0); + return partitionKey; + } + + public static int extractPartition(byte[] prefixedKeyArray) { + return ByteUtils.readUnsignedShort(prefixedKeyArray, 0); + } + + public static byte[] extractKey(byte[] prefixedKeyArray) { + byte[] key = new byte[prefixedKeyArray.length - PARTITIONID_PREFIX_SIZE]; + System.arraycopy(prefixedKeyArray, PARTITIONID_PREFIX_SIZE, key, 0, key.length); + return key; + } +} \ No newline at end of file diff --git a/src/java/voldemort/store/StoreUtils.java b/src/java/voldemort/store/StoreUtils.java index f3a7d96df9..d4122bad1f 100644 --- a/src/java/voldemort/store/StoreUtils.java +++ b/src/java/voldemort/store/StoreUtils.java @@ -18,6 +18,7 @@ import java.io.Closeable; import java.io.IOException; +import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; @@ -199,4 +200,19 @@ public static StoreDefinition getStoreDef(List list, String nam return def; return null; } + + /** + * Get the list of store names from a list of store definitions + * + * @param list + * @param ignoreViews + * @return list of store names + */ + public static List getStoreNames(List list, boolean ignoreViews) { + List storeNameSet = new ArrayList(); + for(StoreDefinition def: list) + if(!def.isView() || !ignoreViews) + storeNameSet.add(def.getName()); + return storeNameSet; + } } diff --git a/src/java/voldemort/store/bdb/BdbIterator.java b/src/java/voldemort/store/bdb/BdbIterator.java new file mode 100644 index 0000000000..6b2744a55b --- /dev/null +++ b/src/java/voldemort/store/bdb/BdbIterator.java @@ -0,0 +1,43 @@ +package voldemort.store.bdb; + +import voldemort.utils.ClosableIterator; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.DatabaseException; + +abstract class BdbIterator implements ClosableIterator { + + private volatile boolean isOpen; + final Cursor cursor; + final BdbStorageEngine bdbEngine; + + BdbIterator(Cursor cursor, BdbStorageEngine bdbEngine) { + this.cursor = cursor; + this.bdbEngine = bdbEngine; + isOpen = true; + + } + + public final void close() { + try { + if(isOpen) { + cursor.close(); + isOpen = false; + } + } catch(DatabaseException e) { + bdbEngine.getLogger().error(e); + } + } + + public final void remove() { + throw new UnsupportedOperationException("No removal"); + } + + @Override + protected final void finalize() { + if(isOpen) { + bdbEngine.getLogger().error("Failure to close cursor, will be forcibly closed."); + close(); + } + } +} \ No newline at end of file diff --git a/src/java/voldemort/store/bdb/BdbRuntimeConfig.java b/src/java/voldemort/store/bdb/BdbRuntimeConfig.java index e568a02049..a694788429 100644 --- a/src/java/voldemort/store/bdb/BdbRuntimeConfig.java +++ b/src/java/voldemort/store/bdb/BdbRuntimeConfig.java @@ -14,10 +14,14 @@ public class BdbRuntimeConfig { public static final long DEFAULT_STATS_CACHE_TTL_MS = 5 * Time.MS_PER_SECOND; public static final LockMode DEFAULT_LOCK_MODE = LockMode.READ_UNCOMMITTED; public static final boolean DEFAULT_EXPOSE_SPACE_UTIL = true; + public static final boolean DEFAULT_MINIMIZE_SCAN_IMPACT = false; + public static final boolean DEFAULT_TURNOFF_CHECKPOINTER_BATCH_WRITES = false; private long statsCacheTtlMs = DEFAULT_STATS_CACHE_TTL_MS; private LockMode lockMode = DEFAULT_LOCK_MODE; private boolean exposeSpaceUtil = DEFAULT_EXPOSE_SPACE_UTIL; + private boolean minimizeScanImpact = DEFAULT_MINIMIZE_SCAN_IMPACT; + private boolean 
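/*
 * Editor's illustrative sketch, not part of the patch: round-trips two concurrent
 * versions through the new on-disk format and shows the 2-byte partition prefix used
 * by the PidScan layout. Values and the partition id are made up; VectorClock's
 * no-argument constructor is assumed to create an empty clock.
 */
import java.util.ArrayList;
import java.util.List;

import voldemort.store.StoreBinaryFormat;
import voldemort.versioning.VectorClock;
import voldemort.versioning.Versioned;

public class StoreBinaryFormatSketch {

    public static void main(String[] args) {
        List<Versioned<byte[]>> values = new ArrayList<Versioned<byte[]>>();
        values.add(Versioned.value("v1".getBytes(), new VectorClock()));
        values.add(Versioned.value("v2".getBytes(), new VectorClock()));

        // serialize: one format byte, then (clock, length, bytes) per version
        byte[] onDisk = StoreBinaryFormat.toByteArray(values);
        List<Versioned<byte[]>> readBack = StoreBinaryFormat.fromByteArray(onDisk);
        System.out.println("versions read back: " + readBack.size()); // 2

        // PidScan layout: the key is prefixed with its 2-byte partition id
        byte[] key = "user:42".getBytes();
        byte[] prefixed = StoreBinaryFormat.makePrefixedKey(key, 13);
        System.out.println(StoreBinaryFormat.extractPartition(prefixed));        // 13
        System.out.println(new String(StoreBinaryFormat.extractKey(prefixed)));  // user:42
    }
}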
checkpointerOffForBatchWrites = DEFAULT_TURNOFF_CHECKPOINTER_BATCH_WRITES; public BdbRuntimeConfig() { @@ -29,6 +33,8 @@ public BdbRuntimeConfig(VoldemortConfig config) { setLockMode(lockMode); setStatsCacheTtlMs(config.getBdbStatsCacheTtlMs()); setExposeSpaceUtil(config.getBdbExposeSpaceUtilization()); + setMinimizeScanImpact(config.getBdbMinimizeScanImpact()); + setCheckpointerOffForBatchWrites(config.getBdbCheckpointerOffForBatchWrites()); } public long getStatsCacheTtlMs() { @@ -56,4 +62,21 @@ public void setExposeSpaceUtil(boolean expose) { public boolean getExposeSpaceUtil() { return this.exposeSpaceUtil; } + + public boolean getMinimizeScanImpact() { + return minimizeScanImpact; + } + + public void setMinimizeScanImpact(boolean minimizeScanImpact) { + this.minimizeScanImpact = minimizeScanImpact; + } + + public boolean isCheckpointerOffForBatchWrites() { + return checkpointerOffForBatchWrites; + } + + public void setCheckpointerOffForBatchWrites(boolean checkpointerOffForBulkWrites) { + this.checkpointerOffForBatchWrites = checkpointerOffForBulkWrites; + } + } diff --git a/src/java/voldemort/store/bdb/BdbStorageConfiguration.java b/src/java/voldemort/store/bdb/BdbStorageConfiguration.java index 512daf4065..aeb58cd5ba 100644 --- a/src/java/voldemort/store/bdb/BdbStorageConfiguration.java +++ b/src/java/voldemort/store/bdb/BdbStorageConfiguration.java @@ -26,6 +26,7 @@ import voldemort.VoldemortException; import voldemort.annotations.jmx.JmxOperation; +import voldemort.routing.RoutingStrategy; import voldemort.server.VoldemortConfig; import voldemort.store.StorageConfiguration; import voldemort.store.StorageEngine; @@ -37,6 +38,7 @@ import voldemort.utils.Time; import com.google.common.collect.Maps; +import com.sleepycat.je.CacheMode; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseConfig; import com.sleepycat.je.DatabaseException; @@ -107,11 +109,25 @@ public BdbStorageConfiguration(VoldemortConfig config) { Integer.toString(config.getBdbLogFaultReadSize())); environmentConfig.setConfigParam(EnvironmentConfig.LOG_ITERATOR_READ_SIZE, Integer.toString(config.getBdbLogIteratorReadSize())); + environmentConfig.setConfigParam(EnvironmentConfig.CLEANER_LAZY_MIGRATION, + Boolean.toString(config.getBdbCleanerLazyMigration())); + environmentConfig.setConfigParam(EnvironmentConfig.CLEANER_BACKGROUND_PROACTIVE_MIGRATION, + Boolean.toString(config.getBdbProactiveBackgroundMigration())); + environmentConfig.setConfigParam(EnvironmentConfig.CLEANER_BYTES_INTERVAL, + Long.toString(config.getBdbCleanerBytesInterval())); environmentConfig.setLockTimeout(config.getBdbLockTimeoutMs(), TimeUnit.MILLISECONDS); + if(config.getBdbCacheModeEvictLN()) { + environmentConfig.setCacheMode(CacheMode.EVICT_LN); + } + if(config.isBdbLevelBasedEviction()) { + environmentConfig.setConfigParam(EnvironmentConfig.EVICTOR_LRU_ONLY, + Boolean.toString(false)); + } + databaseConfig = new DatabaseConfig(); databaseConfig.setAllowCreate(true); - databaseConfig.setSortedDuplicates(config.isBdbSortedDuplicatesEnabled()); + databaseConfig.setSortedDuplicates(false); databaseConfig.setNodeMaxEntries(config.getBdbBtreeFanout()); databaseConfig.setTransactional(true); bdbMasterDir = config.getBdbDataDirectory(); @@ -119,17 +135,24 @@ public BdbStorageConfiguration(VoldemortConfig config) { unreservedStores = new HashSet(); } - public StorageEngine getStore(StoreDefinition storeDef) { + public StorageEngine getStore(StoreDefinition storeDef, + RoutingStrategy strategy) { synchronized(lock) { try { String 
storeName = storeDef.getName(); Environment environment = getEnvironment(storeDef); Database db = environment.openDatabase(null, storeName, databaseConfig); BdbRuntimeConfig runtimeConfig = new BdbRuntimeConfig(voldemortConfig); - BdbStorageEngine engine = new BdbStorageEngine(storeName, - environment, - db, - runtimeConfig); + BdbStorageEngine engine = null; + if(voldemortConfig.getBdbPrefixKeysWithPartitionId()) { + engine = new PartitionPrefixedBdbStorageEngine(storeName, + environment, + db, + runtimeConfig, + strategy); + } else { + engine = new BdbStorageEngine(storeName, environment, db, runtimeConfig); + } if(voldemortConfig.isJmxEnabled()) { // register the environment stats mbean JmxUtils.registerMbean(storeName, engine.getBdbEnvironmentStats()); @@ -294,22 +317,6 @@ public void cleanLogs() { } } - @JmxOperation(description = "Obtain the number of k-v entries in the store") - public long getEntryCount(String storeName) throws Exception { - Environment storeEnv = environments.get(storeName); - if(storeEnv != null) { - Database storeDb = null; - try { - storeDb = storeEnv.openDatabase(null, storeName, databaseConfig); - return storeDb.count(); - } finally { - if(storeDb != null) - storeDb.close(); - } - } - return 0; - } - public void close() { synchronized(lock) { try { diff --git a/src/java/voldemort/store/bdb/BdbStorageEngine.java b/src/java/voldemort/store/bdb/BdbStorageEngine.java index b74e31cfe6..ca3ade7cf9 100644 --- a/src/java/voldemort/store/bdb/BdbStorageEngine.java +++ b/src/java/voldemort/store/bdb/BdbStorageEngine.java @@ -17,8 +17,12 @@ package voldemort.store.bdb; import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.NoSuchElementException; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.commons.codec.binary.Hex; @@ -26,16 +30,12 @@ import voldemort.VoldemortException; import voldemort.annotations.jmx.JmxOperation; -import voldemort.serialization.IdentitySerializer; -import voldemort.serialization.Serializer; -import voldemort.serialization.VersionedSerializer; import voldemort.server.protocol.admin.AsyncOperationStatus; -import voldemort.store.NoSuchCapabilityException; +import voldemort.store.AbstractStorageEngine; import voldemort.store.PersistenceFailureException; -import voldemort.store.StorageEngine; import voldemort.store.StorageInitializationException; import voldemort.store.Store; -import voldemort.store.StoreCapabilityType; +import voldemort.store.StoreBinaryFormat; import voldemort.store.StoreUtils; import voldemort.store.backup.NativeBackupable; import voldemort.store.bdb.stats.BdbEnvironmentStats; @@ -46,17 +46,18 @@ import voldemort.utils.Utils; import voldemort.versioning.ObsoleteVersionException; import voldemort.versioning.Occurred; -import voldemort.versioning.VectorClock; import voldemort.versioning.Version; import voldemort.versioning.Versioned; -import com.google.common.collect.Lists; +import com.sleepycat.je.CacheMode; import com.sleepycat.je.Cursor; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseEntry; import com.sleepycat.je.DatabaseException; import com.sleepycat.je.DatabaseStats; import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; +import com.sleepycat.je.EnvironmentMutableConfig; import com.sleepycat.je.LockMode; import com.sleepycat.je.OperationStatus; import com.sleepycat.je.StatsConfig; @@ -65,72 +66,82 @@ /** * A store that uses BDB for persistence * - 
* */ -public class BdbStorageEngine implements StorageEngine, NativeBackupable { +public class BdbStorageEngine extends AbstractStorageEngine implements + NativeBackupable { private static final Logger logger = Logger.getLogger(BdbStorageEngine.class); private static final Hex hexCodec = new Hex(); - private final String name; private Database bdbDatabase; private final Environment environment; - private final VersionedSerializer versionedSerializer; private final AtomicBoolean isOpen; private final LockMode readLockMode; - private final Serializer versionSerializer; - private final BdbEnvironmentStats bdbEnvironmentStats; private final AtomicBoolean isTruncating = new AtomicBoolean(false); + protected final BdbEnvironmentStats bdbEnvironmentStats; + protected final boolean minimizeScanImpact; + protected final boolean checkpointerOffForBatchWrites; + private volatile int numOutstandingBatchWriteJobs = 0; + public BdbStorageEngine(String name, Environment environment, Database database, BdbRuntimeConfig config) { - this.name = Utils.notNull(name); + super(name); this.bdbDatabase = Utils.notNull(database); this.environment = Utils.notNull(environment); - this.versionedSerializer = new VersionedSerializer(new IdentitySerializer()); - this.versionSerializer = new Serializer() { - - public byte[] toBytes(Version object) { - return ((VectorClock) object).toBytes(); - } - - public Version toObject(byte[] bytes) { - return versionedSerializer.getVersion(bytes); - } - }; this.isOpen = new AtomicBoolean(true); this.readLockMode = config.getLockMode(); this.bdbEnvironmentStats = new BdbEnvironmentStats(environment, + database, config.getStatsCacheTtlMs(), config.getExposeSpaceUtil()); + this.minimizeScanImpact = config.getMinimizeScanImpact(); + this.checkpointerOffForBatchWrites = config.isCheckpointerOffForBatchWrites(); } - public String getName() { - return name; - } - + @Override public ClosableIterator>> entries() { try { Cursor cursor = getBdbDatabase().openCursor(null, null); - return new BdbEntriesIterator(cursor); + // evict data brought in by the cursor walk right away + if(this.minimizeScanImpact) + cursor.setCacheMode(CacheMode.EVICT_BIN); + return new BdbEntriesIterator(cursor, this); } catch(DatabaseException e) { + this.bdbEnvironmentStats.reportException(e); logger.error(e); throw new PersistenceFailureException(e); } } + @Override public ClosableIterator keys() { try { Cursor cursor = getBdbDatabase().openCursor(null, null); - return new BdbKeysIterator(cursor); + // evict data brought in by the cursor walk right away + if(this.minimizeScanImpact) + cursor.setCacheMode(CacheMode.EVICT_BIN); + return new BdbKeysIterator(cursor, this); } catch(DatabaseException e) { + this.bdbEnvironmentStats.reportException(e); logger.error(e); throw new PersistenceFailureException(e); } } + @Override + public ClosableIterator>> entries(int partition) { + throw new UnsupportedOperationException("Partition based entries scan not supported for this storage type"); + } + + @Override + public ClosableIterator keys(int partition) { + throw new UnsupportedOperationException("Partition based key scan not supported for this storage type"); + } + + @Override public void truncate() { if(isTruncating.compareAndSet(false, true)) { @@ -147,13 +158,12 @@ public void truncate() { environment.truncateDatabase(transaction, this.getName(), false); succeeded = true; } catch(DatabaseException e) { + this.bdbEnvironmentStats.reportException(e); logger.error(e); throw new VoldemortException("Failed to truncate Bdb store 
" + getName(), e); } finally { - commitOrAbort(succeeded, transaction); - // reopen the bdb database for future queries. if(reopenBdbDatabase()) { isTruncating.compareAndSet(true, false); @@ -190,214 +200,178 @@ private boolean reopenBdbDatabase() { this.bdbDatabase.getConfig()); return true; } catch(DatabaseException e) { + this.bdbEnvironmentStats.reportException(e); throw new StorageInitializationException("Failed to reinitialize BdbStorageEngine for store:" + getName() + " after truncation.", e); } } + /** + * truncate() operation mandates that all opened Database be closed before + * attempting truncation. + *
        + * This method throws an exception while truncation is happening to any + * request attempting in parallel with store truncation. + * + * @return + */ + protected Database getBdbDatabase() { + if(isTruncating.get()) { + throw new VoldemortException("Bdb Store " + getName() + + " is currently truncating cannot serve any request."); + } + return bdbDatabase; + } + + @Override public List getVersions(ByteArray key) { - return get(key, null, readLockMode, versionSerializer); + return StoreUtils.getVersions(get(key, null)); } + @Override public List> get(ByteArray key, byte[] transforms) throws PersistenceFailureException { - return get(key, transforms, readLockMode, versionedSerializer); - } - - private List get(ByteArray key, - @SuppressWarnings("unused") byte[] transforms, - LockMode lockMode, - Serializer serializer) throws PersistenceFailureException { StoreUtils.assertValidKey(key); + DatabaseEntry keyEntry = new DatabaseEntry(key.get()); + DatabaseEntry valueEntry = new DatabaseEntry(); long startTimeNs = -1; if(logger.isTraceEnabled()) startTimeNs = System.nanoTime(); - Cursor cursor = null; try { - cursor = getBdbDatabase().openCursor(null, null); - List result = get(cursor, key, lockMode, serializer); - - // If null, try again in different locking mode to - // avoid null result due to gap between delete and new write - if(result.size() == 0 && lockMode != LockMode.DEFAULT) { - return get(cursor, key, LockMode.DEFAULT, serializer); + // uncommitted reads are perfectly fine now, since we have no + // je-delete() in put() + OperationStatus status = getBdbDatabase().get(null, keyEntry, valueEntry, readLockMode); + if(OperationStatus.SUCCESS == status) { + return StoreBinaryFormat.fromByteArray(valueEntry.getData()); } else { - return result; + return Collections.emptyList(); } } catch(DatabaseException e) { + this.bdbEnvironmentStats.reportException(e); logger.error(e); throw new PersistenceFailureException(e); } finally { if(logger.isTraceEnabled()) { - logger.trace("Completed GET from key " + key + " (keyRef: " + logger.trace("Completed GET (" + getName() + ") from key " + key + " (keyRef: " + System.identityHashCode(key) + ") in " + (System.nanoTime() - startTimeNs) + " ns at " + System.currentTimeMillis()); } - - attemptClose(cursor); } } - /** - * truncate() operation mandates that all opened Database be closed before - * attempting truncation. - *
        - * This method throws an exception while truncation is happening to any - * request attempting in parallel with store truncation. - * - * @return - */ - private Database getBdbDatabase() { - if(isTruncating.get()) { - throw new VoldemortException("Bdb Store " + getName() - + " is currently truncating cannot serve any request."); - } - - return bdbDatabase; - } - + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { - + StoreUtils.assertValidKeys(keys); + Map>> results = null; long startTimeNs = -1; if(logger.isTraceEnabled()) startTimeNs = System.nanoTime(); - - StoreUtils.assertValidKeys(keys); - Map>> result = StoreUtils.newEmptyHashMap(keys); - Cursor cursor = null; - - String keyStr = ""; - try { - cursor = getBdbDatabase().openCursor(null, null); - for(ByteArray key: keys) { - - if(logger.isTraceEnabled()) - keyStr += ByteUtils.toHexString(key.get()) + " "; - - List> values = get(cursor, key, readLockMode, versionedSerializer); - if(!values.isEmpty()) - result.put(key, values); - } - } catch(DatabaseException e) { - logger.error(e); - throw new PersistenceFailureException(e); + results = StoreUtils.getAll(this, keys, transforms); + } catch(PersistenceFailureException pfe) { + throw pfe; } finally { - attemptClose(cursor); - } - - if(logger.isTraceEnabled()) - logger.trace("Completed GETALL from keys " + keyStr + " in " - + (System.nanoTime() - startTimeNs) + " ns at " - + System.currentTimeMillis()); - - return result; - } - - private static List get(Cursor cursor, - ByteArray key, - LockMode lockMode, - Serializer serializer) throws DatabaseException { - StoreUtils.assertValidKey(key); - - long startTimeNs = -1; - - if(logger.isTraceEnabled()) - startTimeNs = System.nanoTime(); - - DatabaseEntry keyEntry = new DatabaseEntry(key.get()); - DatabaseEntry valueEntry = new DatabaseEntry(); - List results = Lists.newArrayList(); - - for(OperationStatus status = cursor.getSearchKey(keyEntry, valueEntry, lockMode); status == OperationStatus.SUCCESS; status = cursor.getNextDup(keyEntry, - valueEntry, - lockMode)) { - results.add(serializer.toObject(valueEntry.getData())); - } - - if(logger.isTraceEnabled()) { - logger.trace("Completed GET from key " + ByteUtils.toHexString(key.get()) + " in " - + (System.nanoTime() - startTimeNs) + " ns at " - + System.currentTimeMillis()); + if(logger.isTraceEnabled()) { + String keyStr = ""; + for(ByteArray key: keys) + keyStr += key + " "; + logger.trace("Completed GETALL (" + getName() + ") from keys " + keyStr + " in " + + (System.nanoTime() - startTimeNs) + " ns at " + + System.currentTimeMillis()); + } } return results; } + @Override public void put(ByteArray key, Versioned value, byte[] transforms) throws PersistenceFailureException { - StoreUtils.assertValidKey(key); long startTimeNs = -1; if(logger.isTraceEnabled()) startTimeNs = System.nanoTime(); + StoreUtils.assertValidKey(key); DatabaseEntry keyEntry = new DatabaseEntry(key.get()); + DatabaseEntry valueEntry = new DatabaseEntry(); + boolean succeeded = false; Transaction transaction = null; - Cursor cursor = null; - try { - transaction = this.environment.beginTransaction(null, null); + List> vals = null; - // Check existing values - // if there is a version obsoleted by this value delete it - // if there is a version later than this one, throw an exception - DatabaseEntry valueEntry = new DatabaseEntry(); - cursor = getBdbDatabase().openCursor(transaction, null); - for(OperationStatus status = cursor.getSearchKey(keyEntry, valueEntry, LockMode.RMW); 
status == OperationStatus.SUCCESS; status = cursor.getNextDup(keyEntry, - valueEntry, - LockMode.RMW)) { - VectorClock clock = new VectorClock(valueEntry.getData()); - Occurred occurred = value.getVersion().compare(clock); - if(occurred == Occurred.BEFORE) - throw new ObsoleteVersionException("Key " - + new String(hexCodec.encode(key.get())) - + " " - + value.getVersion().toString() - + " is obsolete, it is no greater than the current version of " - + clock + "."); - else if(occurred == Occurred.AFTER) - // best effort delete of obsolete previous value! - cursor.delete(); + try { + transaction = environment.beginTransaction(null, null); + + // do a get for the existing values + OperationStatus status = getBdbDatabase().get(transaction, + keyEntry, + valueEntry, + LockMode.RMW); + if(OperationStatus.SUCCESS == status) { + // update + vals = StoreBinaryFormat.fromByteArray(valueEntry.getData()); + // compare vector clocks and throw out old ones, for updates + + Iterator> iter = vals.iterator(); + while(iter.hasNext()) { + Versioned curr = iter.next(); + Occurred occurred = value.getVersion().compare(curr.getVersion()); + if(occurred == Occurred.BEFORE) + throw new ObsoleteVersionException("Key " + + new String(hexCodec.encode(key.get())) + + " " + + value.getVersion().toString() + + " is obsolete, it is no greater than the current version of " + + curr.getVersion().toString() + "."); + else if(occurred == Occurred.AFTER) + iter.remove(); + } + } else { + // insert + vals = new ArrayList>(); } - // Okay so we cleaned up all the prior stuff, so now we are good to - // insert the new thing - valueEntry = new DatabaseEntry(versionedSerializer.toBytes(value)); - OperationStatus status = cursor.put(keyEntry, valueEntry); + // update the new value + vals.add(value); + + valueEntry.setData(StoreBinaryFormat.toByteArray(vals)); + status = getBdbDatabase().put(transaction, keyEntry, valueEntry); + if(status != OperationStatus.SUCCESS) throw new PersistenceFailureException("Put operation failed with status: " + status); succeeded = true; } catch(DatabaseException e) { + this.bdbEnvironmentStats.reportException(e); logger.error(e); throw new PersistenceFailureException(e); } finally { - attemptClose(cursor); if(succeeded) attemptCommit(transaction); else attemptAbort(transaction); - } - - if(logger.isTraceEnabled()) { - logger.trace("Completed PUT to key " + ByteUtils.toHexString(key.get()) + " (keyRef: " - + System.identityHashCode(key) + " value " + value + " in " - + (System.nanoTime() - startTimeNs) + " ns at " - + System.currentTimeMillis()); + if(logger.isTraceEnabled()) { + logger.trace("Completed PUT (" + getName() + ") to key " + key + " (keyRef: " + + System.identityHashCode(key) + " value " + value + " in " + + (System.nanoTime() - startTimeNs) + " ns at " + + System.currentTimeMillis()); + } } } + @Override public boolean delete(ByteArray key, Version version) throws PersistenceFailureException { + StoreUtils.assertValidKey(key); long startTimeNs = -1; @@ -405,53 +379,77 @@ public boolean delete(ByteArray key, Version version) throws PersistenceFailureE if(logger.isTraceEnabled()) startTimeNs = System.nanoTime(); - boolean deletedSomething = false; - Cursor cursor = null; Transaction transaction = null; try { transaction = this.environment.beginTransaction(null, null); DatabaseEntry keyEntry = new DatabaseEntry(key.get()); - DatabaseEntry valueEntry = new DatabaseEntry(); - cursor = getBdbDatabase().openCursor(transaction, null); - OperationStatus status = cursor.getSearchKey(keyEntry, - 
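/*
 * Editor's illustrative sketch, not part of the patch: with duplicates handled in the
 * storage layer, put() now merges the incoming version into the list already stored
 * under the key. This standalone method mirrors that loop: an obsolete incoming write
 * is rejected, older stored versions are dropped, and concurrent versions are kept
 * side by side.
 */
import java.util.Iterator;
import java.util.List;

import voldemort.versioning.ObsoleteVersionException;
import voldemort.versioning.Occurred;
import voldemort.versioning.Versioned;

public class PutMergeSketch {

    public static void merge(List<Versioned<byte[]>> stored, Versioned<byte[]> incoming) {
        Iterator<Versioned<byte[]>> iter = stored.iterator();
        while(iter.hasNext()) {
            Versioned<byte[]> current = iter.next();
            Occurred occurred = incoming.getVersion().compare(current.getVersion());
            if(occurred == Occurred.BEFORE)
                throw new ObsoleteVersionException("Incoming version " + incoming.getVersion()
                                                   + " is no greater than stored version "
                                                   + current.getVersion());
            else if(occurred == Occurred.AFTER)
                iter.remove(); // the incoming write supersedes this stored version
            // concurrent versions are left in place and coexist with the incoming one
        }
        stored.add(incoming);  // the merged list is what gets serialized back to BDB
    }
}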
valueEntry, - LockMode.READ_UNCOMMITTED); - while(status == OperationStatus.SUCCESS) { - // if version is null no comparison is necessary - if(new VectorClock(valueEntry.getData()).compare(version) == Occurred.BEFORE) { - cursor.delete(); - deletedSomething = true; + + if(version == null) { + // unversioned delete. Just blow away the whole thing + OperationStatus status = getBdbDatabase().delete(transaction, keyEntry); + if(OperationStatus.SUCCESS == status) + return true; + else + return false; + } else { + // versioned deletes; need to determine what to delete + DatabaseEntry valueEntry = new DatabaseEntry(); + + // do a get for the existing values + OperationStatus status = getBdbDatabase().get(transaction, + keyEntry, + valueEntry, + LockMode.RMW); + // key does not exist to begin with. + if(OperationStatus.NOTFOUND == status) + return false; + + List> vals = StoreBinaryFormat.fromByteArray(valueEntry.getData()); + Iterator> iter = vals.iterator(); + int numVersions = vals.size(); + int numDeletedVersions = 0; + + // go over the versions and remove everything before the + // supplied version + while(iter.hasNext()) { + Versioned curr = iter.next(); + Version currentVersion = curr.getVersion(); + if(currentVersion.compare(version) == Occurred.BEFORE) { + iter.remove(); + numDeletedVersions++; + } + } + + if(numDeletedVersions < numVersions) { + // we still have some valid versions + valueEntry.setData(StoreBinaryFormat.toByteArray(vals)); + getBdbDatabase().put(transaction, keyEntry, valueEntry); + } else { + // we have deleted all the versions; so get rid of the entry + // in the database + getBdbDatabase().delete(transaction, keyEntry); } - status = cursor.getNextDup(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED); + return numDeletedVersions > 0; } - return deletedSomething; } catch(DatabaseException e) { + this.bdbEnvironmentStats.reportException(e); logger.error(e); throw new PersistenceFailureException(e); } finally { - + attemptCommit(transaction); if(logger.isTraceEnabled()) { - logger.trace("Completed DELETE of key " + ByteUtils.toHexString(key.get()) - + " (keyRef: " + System.identityHashCode(key) + ") in " + logger.trace("Completed DELETE (" + getName() + ") of key " + + ByteUtils.toHexString(key.get()) + " (keyRef: " + + System.identityHashCode(key) + ") in " + (System.nanoTime() - startTimeNs) + " ns at " + System.currentTimeMillis()); } - - try { - attemptClose(cursor); - } finally { - attemptCommit(transaction); - } } } - public Object getCapability(StoreCapabilityType capability) { - throw new NoSuchCapabilityException(capability, getName()); - } - @Override public int hashCode() { - return name.hashCode(); + return getName().hashCode(); } @Override @@ -462,11 +460,13 @@ public boolean equals(Object o) { return s.getName().equals(s.getName()); } + @Override public void close() throws PersistenceFailureException { try { if(this.isOpen.compareAndSet(true, false)) this.getBdbDatabase().close(); } catch(DatabaseException e) { + this.bdbEnvironmentStats.reportException(e); logger.error(e); throw new PersistenceFailureException("Shutdown failed.", e); } @@ -476,37 +476,31 @@ private void attemptAbort(Transaction transaction) { try { if(transaction != null) transaction.abort(); - } catch(Exception e) { + } catch(DatabaseException e) { + this.bdbEnvironmentStats.reportException(e); logger.error("Abort failed!", e); } } private void attemptCommit(Transaction transaction) { try { - transaction.commit(); + if(transaction != null) + transaction.commit(); } catch(DatabaseException 
e) { + this.bdbEnvironmentStats.reportException(e); logger.error("Transaction commit failed!", e); attemptAbort(transaction); throw new PersistenceFailureException(e); } } - private static void attemptClose(Cursor cursor) { - try { - if(cursor != null) - cursor.close(); - } catch(DatabaseException e) { - logger.error("Error closing cursor.", e); - throw new PersistenceFailureException(e.getMessage(), e); - } - } - public DatabaseStats getStats(boolean setFast) { try { StatsConfig config = new StatsConfig(); config.setFast(setFast); return this.getBdbDatabase().getStats(config); } catch(DatabaseException e) { + this.bdbEnvironmentStats.reportException(e); logger.error(e); throw new VoldemortException(e); } @@ -528,134 +522,159 @@ public BdbEnvironmentStats getBdbEnvironmentStats() { return bdbEnvironmentStats; } - private static abstract class BdbIterator implements ClosableIterator { + protected Logger getLogger() { + return logger; + } - private final boolean noValues; - final Cursor cursor; + private static class BdbEntriesIterator extends BdbIterator>> { - private T current; - private volatile boolean isOpen; + private List>> cache; - public BdbIterator(Cursor cursor, boolean noValues) { - this.cursor = cursor; - isOpen = true; - this.noValues = noValues; - DatabaseEntry keyEntry = new DatabaseEntry(); - DatabaseEntry valueEntry = new DatabaseEntry(); - if(noValues) - valueEntry.setPartial(true); - try { - cursor.getFirst(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED); - } catch(DatabaseException e) { - logger.error(e); - throw new PersistenceFailureException(e); - } - if(keyEntry.getData() != null) - current = get(keyEntry, valueEntry); + public BdbEntriesIterator(Cursor cursor, BdbStorageEngine bdbEngine) { + super(cursor, bdbEngine); + this.cache = new ArrayList>>(); } - protected abstract T get(DatabaseEntry key, DatabaseEntry value); - - protected abstract void moveCursor(DatabaseEntry key, DatabaseEntry value) - throws DatabaseException; - - public final boolean hasNext() { - return current != null; + @Override + public boolean hasNext() { + // we have a next element if there is at least one cached + // element or we can make more + return cache.size() > 0 || makeMore(); } - public final T next() { - if(!isOpen) - throw new PersistenceFailureException("Call to next() on a closed iterator."); + @Override + public Pair> next() { + if(cache.size() == 0) { + if(!makeMore()) + throw new NoSuchElementException("Iterated to end."); + } + // must now have at least one thing in the cache + return cache.remove(cache.size() - 1); + } + protected boolean makeMore() { DatabaseEntry keyEntry = new DatabaseEntry(); DatabaseEntry valueEntry = new DatabaseEntry(); - if(noValues) - valueEntry.setPartial(true); try { - moveCursor(keyEntry, valueEntry); - } catch(DatabaseException e) { - logger.error(e); - throw new PersistenceFailureException(e); - } - T previous = current; - if(keyEntry.getData() == null) - current = null; - else - current = get(keyEntry, valueEntry); - - return previous; - } - - public final void remove() { - throw new UnsupportedOperationException("No removal y'all."); - } + OperationStatus status = cursor.getNext(keyEntry, + valueEntry, + LockMode.READ_UNCOMMITTED); - public final void close() { - try { - cursor.close(); - isOpen = false; + if(OperationStatus.NOTFOUND == status) { + // we have reached the end of the cursor + return false; + } + ByteArray key = null; + if(bdbEngine.isPartitionScanSupported()) + key = new 
ByteArray(StoreBinaryFormat.extractKey(keyEntry.getData())); + else + key = new ByteArray(keyEntry.getData()); + + for(Versioned val: StoreBinaryFormat.fromByteArray(valueEntry.getData())) + this.cache.add(Pair.create(key, val)); + return true; } catch(DatabaseException e) { + bdbEngine.bdbEnvironmentStats.reportException(e); logger.error(e); + throw new PersistenceFailureException(e); } } - - @Override - protected final void finalize() { - if(isOpen) { - logger.error("Failure to close cursor, will be forcably closed."); - close(); - } - - } } private static class BdbKeysIterator extends BdbIterator { - public BdbKeysIterator(Cursor cursor) { - super(cursor, true); - } + ByteArray current = null; - @Override - protected ByteArray get(DatabaseEntry key, DatabaseEntry value) { - return new ByteArray(key.getData()); + public BdbKeysIterator(Cursor cursor, BdbStorageEngine bdbEngine) { + super(cursor, bdbEngine); } @Override - protected void moveCursor(DatabaseEntry key, DatabaseEntry value) throws DatabaseException { - cursor.getNextNoDup(key, value, LockMode.READ_UNCOMMITTED); - } - - } - - private static class BdbEntriesIterator extends BdbIterator>> { - - public BdbEntriesIterator(Cursor cursor) { - super(cursor, false); + public boolean hasNext() { + return current != null || fetchNextKey(); } @Override - protected Pair> get(DatabaseEntry key, DatabaseEntry value) { - VectorClock clock = new VectorClock(value.getData()); - byte[] bytes = ByteUtils.copy(value.getData(), - clock.sizeInBytes(), - value.getData().length); - return Pair.create(new ByteArray(key.getData()), new Versioned(bytes, clock)); + public ByteArray next() { + ByteArray result = null; + if(current == null) { + if(!fetchNextKey()) + throw new NoSuchElementException("Iterated to end."); + } + result = current; + current = null; + return result; } - @Override - protected void moveCursor(DatabaseEntry key, DatabaseEntry value) throws DatabaseException { - cursor.getNext(key, value, LockMode.READ_UNCOMMITTED); - } - } + private boolean fetchNextKey() { + DatabaseEntry keyEntry = new DatabaseEntry(); + DatabaseEntry valueEntry = new DatabaseEntry(); + valueEntry.setPartial(true); + try { + OperationStatus status = cursor.getNext(keyEntry, + valueEntry, + LockMode.READ_UNCOMMITTED); + if(OperationStatus.NOTFOUND == status) { + // we have reached the end of the cursor + return false; + } - public boolean isPartitionAware() { - return false; + if(bdbEngine.isPartitionScanSupported()) + current = new ByteArray(StoreBinaryFormat.extractKey(keyEntry.getData())); + else + current = new ByteArray(keyEntry.getData()); + return true; + } catch(DatabaseException e) { + bdbEngine.bdbEnvironmentStats.reportException(e); + logger.error(e); + throw new PersistenceFailureException(e); + } + } } + @Override public void nativeBackup(File toDir, boolean verifyFiles, boolean isIncremental, AsyncOperationStatus status) { new BdbNativeBackup(environment, verifyFiles, isIncremental).performBackup(toDir, status); } + + @Override + public boolean beginBatchModifications() { + if(checkpointerOffForBatchWrites) { + synchronized(this) { + numOutstandingBatchWriteJobs++; + // turn the checkpointer off for the first job + if(numOutstandingBatchWriteJobs == 1) { + logger.info("Turning checkpointer off for batch writes"); + EnvironmentMutableConfig mConfig = environment.getMutableConfig(); + mConfig.setConfigParam(EnvironmentConfig.ENV_RUN_CHECKPOINTER, + Boolean.toString(false)); + environment.setMutableConfig(mConfig); + return true; + } + } + } + return 
false; + } + + @Override + public boolean endBatchModifications() { + if(checkpointerOffForBatchWrites) { + synchronized(this) { + numOutstandingBatchWriteJobs--; + // turn the checkpointer back on if the last job finishes + if(numOutstandingBatchWriteJobs == 0) { + logger.info("Turning checkpointer on"); + EnvironmentMutableConfig mConfig = environment.getMutableConfig(); + mConfig.setConfigParam(EnvironmentConfig.ENV_RUN_CHECKPOINTER, + Boolean.toString(true)); + environment.setMutableConfig(mConfig); + return true; + } + } + } + return false; + } } diff --git a/src/java/voldemort/store/bdb/PartitionPrefixedBdbStorageEngine.java b/src/java/voldemort/store/bdb/PartitionPrefixedBdbStorageEngine.java new file mode 100644 index 0000000000..3784775300 --- /dev/null +++ b/src/java/voldemort/store/bdb/PartitionPrefixedBdbStorageEngine.java @@ -0,0 +1,283 @@ +/* + * Copyright 2008-2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.store.bdb; + +import java.util.ArrayList; +import java.util.List; +import java.util.NoSuchElementException; + +import org.apache.log4j.Logger; + +import voldemort.routing.RoutingStrategy; +import voldemort.store.PersistenceFailureException; +import voldemort.store.StoreBinaryFormat; +import voldemort.store.StoreUtils; +import voldemort.utils.ByteArray; +import voldemort.utils.ClosableIterator; +import voldemort.utils.Pair; +import voldemort.versioning.Version; +import voldemort.versioning.Versioned; + +import com.sleepycat.je.CacheMode; +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +/** + * Extends BDB Storage Engine with capabilities to perform partition range + * scans, to speed up scan jobs, that filter on partition id + * + */ +public class PartitionPrefixedBdbStorageEngine extends BdbStorageEngine { + + private static final Logger logger = Logger.getLogger(PartitionPrefixedBdbStorageEngine.class); + private final RoutingStrategy routingStrategy; + + public PartitionPrefixedBdbStorageEngine(String name, + Environment environment, + Database database, + BdbRuntimeConfig config, + RoutingStrategy strategy) { + super(name, environment, database, config); + this.routingStrategy = strategy; + } + + @Override + public ClosableIterator>> entries(int partition) { + try { + Cursor cursor = getBdbDatabase().openCursor(null, null); + // evict data brought in by the cursor walk right away + if(this.minimizeScanImpact) + cursor.setCacheMode(CacheMode.EVICT_BIN); + return new BdbPartitionEntriesIterator(cursor, partition, this); + } catch(DatabaseException e) { + this.bdbEnvironmentStats.reportException(e); + logger.error(e); + throw new PersistenceFailureException(e); + } + } + + @Override + public ClosableIterator keys(int partition) { + try { + Cursor cursor = getBdbDatabase().openCursor(null, null); + // 
evict data brought in by the cursor walk right away + if(this.minimizeScanImpact) + cursor.setCacheMode(CacheMode.EVICT_BIN); + return new BdbPartitionKeysIterator(cursor, partition, this); + } catch(DatabaseException e) { + this.bdbEnvironmentStats.reportException(e); + logger.error(e); + throw new PersistenceFailureException(e); + } + } + + @Override + public List> get(ByteArray key, byte[] transforms) + throws PersistenceFailureException { + StoreUtils.assertValidKey(key); + int partition = routingStrategy.getMasterPartition(key.get()); + ByteArray prefixedKey = new ByteArray(StoreBinaryFormat.makePrefixedKey(key.get(), + partition)); + return super.get(prefixedKey, transforms); + } + + @Override + public void put(ByteArray key, Versioned value, byte[] transforms) + throws PersistenceFailureException { + + StoreUtils.assertValidKey(key); + int partition = routingStrategy.getMasterPartition(key.get()); + ByteArray prefixedKey = new ByteArray(StoreBinaryFormat.makePrefixedKey(key.get(), + partition)); + super.put(prefixedKey, value, transforms); + } + + @Override + public boolean delete(ByteArray key, Version version) throws PersistenceFailureException { + + StoreUtils.assertValidKey(key); + int partition = routingStrategy.getMasterPartition(key.get()); + ByteArray prefixedKey = new ByteArray(StoreBinaryFormat.makePrefixedKey(key.get(), + partition)); + return super.delete(prefixedKey, version); + } + + @Override + protected Logger getLogger() { + return logger; + } + + /** + * Implements a range scan over the partition entries + * + */ + private static class BdbPartitionEntriesIterator extends + BdbIterator>> { + + private List>> cache; + private int partition; + private boolean positioned; + + public BdbPartitionEntriesIterator(Cursor cursor, int partition, BdbStorageEngine bdbEngine) { + super(cursor, bdbEngine); + this.partition = partition; + this.cache = new ArrayList>>(); + this.positioned = false; + } + + @Override + public boolean hasNext() { + // we have a next element if there is at least one cached + // element or we can make more + return cache.size() > 0 || makeMore(); + } + + @Override + public Pair> next() { + if(cache.size() == 0) { + if(!makeMore()) + throw new NoSuchElementException("Iterated to end."); + } + // must now have at least one thing in the cache + return cache.remove(cache.size() - 1); + } + + /** + * Fetches the next entry from the DB, for the partition + * + * @return true if some new data was fetched, false if end of data + */ + private boolean makeMore() { + DatabaseEntry keyEntry = new DatabaseEntry(); + DatabaseEntry valueEntry = new DatabaseEntry(); + OperationStatus status; + try { + if(!positioned) { + positioned = true; + keyEntry.setData(StoreBinaryFormat.makePartitionKey(partition)); + status = cursor.getSearchKeyRange(keyEntry, + valueEntry, + LockMode.READ_UNCOMMITTED); + } else { + status = cursor.getNext(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED); + } + + if(OperationStatus.NOTFOUND == status) { + // we have reached the end of the cursor + return false; + } + + // check if we are still in the same partition we need + if(StoreBinaryFormat.extractPartition(keyEntry.getData()) != partition) + return false; + + ByteArray key = new ByteArray(StoreBinaryFormat.extractKey(keyEntry.getData())); + for(Versioned val: StoreBinaryFormat.fromByteArray(valueEntry.getData())) + this.cache.add(Pair.create(key, val)); + return true; + } catch(DatabaseException e) { + bdbEngine.bdbEnvironmentStats.reportException(e); + logger.error(e); + throw new 
PersistenceFailureException(e); + } + } + } + + /** + * Implements a range scan over the key entries belonging to the partition + * + */ + private static class BdbPartitionKeysIterator extends BdbIterator { + + ByteArray current = null; + private int partition; + private boolean positioned; + + public BdbPartitionKeysIterator(Cursor cursor, int partition, BdbStorageEngine bdbEngine) { + super(cursor, bdbEngine); + this.partition = partition; + positioned = false; + } + + @Override + public boolean hasNext() { + return current != null || fetchNextKey(); + } + + @Override + public ByteArray next() { + ByteArray result = null; + if(current == null) { + if(!fetchNextKey()) + throw new NoSuchElementException("Iterated to end."); + } + result = current; + current = null; + return result; + } + + /** + * Fetches the next key for the partition from the DB + * + * @return true if successfully fetched one more key, false if end of + * keys + */ + private boolean fetchNextKey() { + DatabaseEntry keyEntry = new DatabaseEntry(); + DatabaseEntry valueEntry = new DatabaseEntry(); + OperationStatus status; + valueEntry.setPartial(true); + try { + if(!positioned) { + positioned = true; + keyEntry.setData(StoreBinaryFormat.makePartitionKey(partition)); + status = cursor.getSearchKeyRange(keyEntry, + valueEntry, + LockMode.READ_UNCOMMITTED); + } else { + status = cursor.getNext(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED); + } + + if(OperationStatus.NOTFOUND == status) { + // we have reached the end of the cursor + return false; + } + + // check if we are still in the same partition we need + if(StoreBinaryFormat.extractPartition(keyEntry.getData()) != partition) + return false; + + current = new ByteArray(StoreBinaryFormat.extractKey(keyEntry.getData())); + return true; + } catch(DatabaseException e) { + bdbEngine.bdbEnvironmentStats.reportException(e); + logger.error(e); + throw new PersistenceFailureException(e); + } + } + } + + @Override + public boolean isPartitionScanSupported() { + return true; + } +} diff --git a/src/java/voldemort/store/bdb/dataconversion/AbstractBdbConversion.java b/src/java/voldemort/store/bdb/dataconversion/AbstractBdbConversion.java new file mode 100644 index 0000000000..2f529fdd11 --- /dev/null +++ b/src/java/voldemort/store/bdb/dataconversion/AbstractBdbConversion.java @@ -0,0 +1,100 @@ +package voldemort.store.bdb.dataconversion; + +import java.io.File; + +import org.apache.log4j.Logger; + +import voldemort.cluster.Cluster; +import voldemort.xml.ClusterMapper; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.Durability; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +public abstract class AbstractBdbConversion { + + String storeName; + Database srcDB; + Environment srcEnv; + + Database dstDB; + Environment dstEnv; + Cluster cluster; + + Cursor cursor; + + Logger logger = Logger.getLogger(BdbConvertData.class); + + AbstractBdbConversion(String storeName, + String clusterXmlPath, + String sourceEnvPath, + String destEnvPath, + int logFileSize, + int nodeMax) throws Exception { + this.cluster = new ClusterMapper().readCluster(new File(clusterXmlPath)); + this.storeName = storeName; + + // Configure src environment handle + EnvironmentConfig envConfig = new EnvironmentConfig(); + envConfig.setTransactional(true); + envConfig.setAllowCreate(false); + envConfig.setReadOnly(true); + envConfig.setCacheSize(1024 * 1024 * 1024); + + DatabaseConfig dbConfig = 
new DatabaseConfig(); + dbConfig.setTransactional(true); + dbConfig.setAllowCreate(false); + dbConfig.setSortedDuplicates(areDuplicatesNeededForSrc()); + dbConfig.setReadOnly(true); + + srcEnv = new Environment(new File(sourceEnvPath), envConfig); + srcDB = srcEnv.openDatabase(null, storeName, dbConfig); + + // Configure dest environment handle + File newEnvDir = new File(destEnvPath); + if(!newEnvDir.exists()) { + newEnvDir.mkdirs(); + } + + envConfig = new EnvironmentConfig(); + envConfig.setTransactional(false); + envConfig.setAllowCreate(true); + envConfig.setReadOnly(false); + envConfig.setCacheSize(1024 * 1024 * 1024); + envConfig.setConfigParam(EnvironmentConfig.LOG_FILE_MAX, + Long.toString(logFileSize * 1024L * 1024L)); + envConfig.setDurability(Durability.COMMIT_NO_SYNC); + + dbConfig = new DatabaseConfig(); + dbConfig.setTransactional(false); + dbConfig.setAllowCreate(true); + dbConfig.setSortedDuplicates(areDuplicatesNeededForDest()); + dbConfig.setDeferredWrite(true); + dbConfig.setNodeMaxEntries(nodeMax); + + dstEnv = new Environment(newEnvDir, envConfig); + dstDB = dstEnv.openDatabase(null, storeName, dbConfig); + + } + + public void close() { + if(cursor != null) + cursor.close(); + + srcDB.close(); + srcEnv.close(); + + dstDB.sync(); + dstDB.close(); + dstEnv.close(); + } + + public abstract void transfer() throws Exception; + + public abstract boolean areDuplicatesNeededForSrc(); + + public abstract boolean areDuplicatesNeededForDest(); +} diff --git a/src/java/voldemort/store/bdb/dataconversion/BdbConvertBaseToNewDup.java b/src/java/voldemort/store/bdb/dataconversion/BdbConvertBaseToNewDup.java new file mode 100644 index 0000000000..cc38f72021 --- /dev/null +++ b/src/java/voldemort/store/bdb/dataconversion/BdbConvertBaseToNewDup.java @@ -0,0 +1,96 @@ +package voldemort.store.bdb.dataconversion; + +import java.util.ArrayList; +import java.util.List; + +import voldemort.store.StoreBinaryFormat; +import voldemort.utils.ByteUtils; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Versioned; + +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +public class BdbConvertBaseToNewDup extends AbstractBdbConversion { + + BdbConvertBaseToNewDup(String storeName, + String clusterXmlPath, + String sourceEnvPath, + String destEnvPath, + int logFileSize, + int nodeMax) throws Exception { + super(storeName, clusterXmlPath, sourceEnvPath, destEnvPath, logFileSize, nodeMax); + } + + @Override + public void transfer() throws Exception { + cursor = srcDB.openCursor(null, null); + DatabaseEntry keyEntry = new DatabaseEntry(); + DatabaseEntry valueEntry = new DatabaseEntry(); + + byte[] prevKey = null; + List> vals = new ArrayList>(); + + long startTime = System.currentTimeMillis(); + int scanCount = 0; + int keyCount = 0; + while(cursor.getNext(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS) { + scanCount++; + if(scanCount % 1000000 == 0) + logger.info("Converted " + scanCount + "entries in " + + (System.currentTimeMillis() - startTime) / 1000 + " secs"); + + // read the value as a versioned Object + VectorClock clock = new VectorClock(valueEntry.getData()); + byte[] bytes = ByteUtils.copy(valueEntry.getData(), + clock.sizeInBytes(), + valueEntry.getData().length); + Versioned value = new Versioned(bytes, clock); + byte[] key = keyEntry.getData(); + + if(prevKey != null && (ByteUtils.compare(prevKey, key) != 0)) { + // there is a new key; write out the buffered values and + // 
previous key + OperationStatus putStatus = dstDB.put(null, + new DatabaseEntry(prevKey), + new DatabaseEntry(StoreBinaryFormat.toByteArray(vals))); + if(OperationStatus.SUCCESS != putStatus) { + String errorStr = "Put failed with " + putStatus + " for key" + + BdbConvertData.writeAsciiString(prevKey); + logger.error(errorStr); + throw new Exception(errorStr); + } + vals = new ArrayList>(); + keyCount++; + } + + vals.add(value); + prevKey = key; + } + if(vals.size() > 0) { + OperationStatus putStatus = dstDB.put(null, + new DatabaseEntry(prevKey), + new DatabaseEntry(StoreBinaryFormat.toByteArray(vals))); + if(OperationStatus.SUCCESS != putStatus) { + String errorStr = "Put failed with " + putStatus + " for key" + + BdbConvertData.writeAsciiString(prevKey); + logger.error(errorStr); + throw new Exception(errorStr); + } + keyCount++; + } + logger.info("Completed " + scanCount + "entries and " + keyCount + " keys in " + + (System.currentTimeMillis() - startTime) / 1000 + " secs"); + } + + @Override + public boolean areDuplicatesNeededForSrc() { + return true; + } + + @Override + public boolean areDuplicatesNeededForDest() { + return false; + } +} diff --git a/src/java/voldemort/store/bdb/dataconversion/BdbConvertBaseToPidScan.java b/src/java/voldemort/store/bdb/dataconversion/BdbConvertBaseToPidScan.java new file mode 100644 index 0000000000..3ec76da48b --- /dev/null +++ b/src/java/voldemort/store/bdb/dataconversion/BdbConvertBaseToPidScan.java @@ -0,0 +1,106 @@ +package voldemort.store.bdb.dataconversion; + +import java.util.ArrayList; +import java.util.List; + +import voldemort.store.StoreBinaryFormat; +import voldemort.utils.ByteUtils; +import voldemort.utils.FnvHashFunction; +import voldemort.utils.HashFunction; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Versioned; + +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +public class BdbConvertBaseToPidScan extends AbstractBdbConversion { + + BdbConvertBaseToPidScan(String storeName, + String clusterXmlPath, + String sourceEnvPath, + String destEnvPath, + int logFileSize, + int nodeMax) throws Exception { + super(storeName, clusterXmlPath, sourceEnvPath, destEnvPath, logFileSize, nodeMax); + } + + @Override + public void transfer() throws Exception { + cursor = srcDB.openCursor(null, null); + DatabaseEntry keyEntry = new DatabaseEntry(); + DatabaseEntry valueEntry = new DatabaseEntry(); + + byte[] prevKey = null; + List> vals = new ArrayList>(); + HashFunction hash = new FnvHashFunction(); + int totalPartitions = cluster.getNumberOfPartitions(); + + long startTime = System.currentTimeMillis(); + int scanCount = 0; + int keyCount = 0; + while(cursor.getNext(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS) { + scanCount++; + if(scanCount % 1000000 == 0) + logger.info("Converted " + scanCount + "entries in " + + (System.currentTimeMillis() - startTime) / 1000 + " secs"); + + // read the value as a versioned Object + VectorClock clock = new VectorClock(valueEntry.getData()); + byte[] bytes = ByteUtils.copy(valueEntry.getData(), + clock.sizeInBytes(), + valueEntry.getData().length); + Versioned value = new Versioned(bytes, clock); + byte[] key = keyEntry.getData(); + + if(prevKey != null && (ByteUtils.compare(prevKey, key) != 0)) { + // there is a new key; write out the buffered values and + // previous key + int partition = BdbConvertData.abs(hash.hash(prevKey)) + % (Math.max(1, totalPartitions)); + + OperationStatus 
putStatus = dstDB.put(null, + new DatabaseEntry(StoreBinaryFormat.makePrefixedKey(prevKey, + partition)), + new DatabaseEntry(StoreBinaryFormat.toByteArray(vals))); + if(OperationStatus.SUCCESS != putStatus) { + String errorStr = "Put failed with " + putStatus + " for key" + + BdbConvertData.writeAsciiString(prevKey); + logger.error(errorStr); + throw new Exception(errorStr); + } + vals = new ArrayList>(); + keyCount++; + } + + vals.add(value); + prevKey = key; + } + if(vals.size() > 0) { + int partition = BdbConvertData.abs(hash.hash(prevKey)) % (Math.max(1, totalPartitions)); + OperationStatus putStatus = dstDB.put(null, + new DatabaseEntry(StoreBinaryFormat.makePrefixedKey(prevKey, + partition)), + new DatabaseEntry(StoreBinaryFormat.toByteArray(vals))); + if(OperationStatus.SUCCESS != putStatus) { + String errorStr = "Put failed with " + putStatus + " for key" + + BdbConvertData.writeAsciiString(prevKey); + logger.error(errorStr); + throw new Exception(errorStr); + } + keyCount++; + } + logger.info("Completed " + scanCount + "entries and " + keyCount + " keys in " + + (System.currentTimeMillis() - startTime) / 1000 + " secs"); + } + + @Override + public boolean areDuplicatesNeededForSrc() { + return true; + } + + @Override + public boolean areDuplicatesNeededForDest() { + return false; + } +} diff --git a/src/java/voldemort/store/bdb/dataconversion/BdbConvertData.java b/src/java/voldemort/store/bdb/dataconversion/BdbConvertData.java new file mode 100644 index 0000000000..c445ee62ab --- /dev/null +++ b/src/java/voldemort/store/bdb/dataconversion/BdbConvertData.java @@ -0,0 +1,167 @@ +package voldemort.store.bdb.dataconversion; + +import joptsimple.OptionParser; +import joptsimple.OptionSet; + +import org.apache.commons.codec.binary.Base64; +import org.apache.log4j.Logger; + +import voldemort.utils.CmdUtils; + +/** + * + * Conversion Utility to convert to-fro between 0.96 format and release 1.x+ BDB + * data formats + * + */ +public class BdbConvertData { + + static Logger logger = Logger.getLogger(BdbConvertData.class); + + /** + * @param args + */ + public static void main(String[] args) throws Exception { + OptionParser parser = new OptionParser(); + parser.accepts("cluster-xml", "[REQUIRED] path to cluster.xml file for the server") + .withRequiredArg() + .describedAs("cluster-xml") + .ofType(String.class); + parser.accepts("src", "[REQUIRED] Source environment to be converted") + .withRequiredArg() + .describedAs("source-env") + .ofType(String.class); + parser.accepts("dest", "[REQUIRED] Destination environment to place converted data into") + .withRequiredArg() + .describedAs("destination-env") + .ofType(String.class); + parser.accepts("store", "[REQUIRED] Store/BDB database to convert") + .withRequiredArg() + .describedAs("store") + .ofType(String.class); + parser.accepts("from-format", "[REQUIRED] source format") + .withRequiredArg() + .describedAs("from-format") + .ofType(String.class); + parser.accepts("to-format", "[REQUIRED] destination format") + .withRequiredArg() + .describedAs("to-format") + .ofType(String.class); + parser.accepts("je-log-size", "[Optional] Size of the converted JE log files") + .withRequiredArg() + .describedAs("je-log-size") + .ofType(Integer.class); + parser.accepts("btree-nodemax", "[Optional] Fanout of converted Btree nodes") + .withRequiredArg() + .describedAs("btree-nodemax") + .ofType(Integer.class); + + OptionSet options = parser.parse(args); + + if(!options.has("cluster-xml") || !options.has("src") || !options.has("dest") + || 
!options.has("store") || !options.has("from-format") || !options.has("to-format")) { + parser.printHelpOn(System.err); + System.exit(0); + } + + String clusterXmlPath = CmdUtils.valueOf(options, "cluster-xml", null); + String sourceEnvPath = CmdUtils.valueOf(options, "src", null); + String destEnvPath = CmdUtils.valueOf(options, "dest", null); + String storeName = CmdUtils.valueOf(options, "store", null); + + String fromFormat = CmdUtils.valueOf(options, "from-format", null); + String toFormat = CmdUtils.valueOf(options, "to-format", null); + + if(!isValidFormat(fromFormat) || !isValidFormat(toFormat)) { + parser.printHelpOn(System.err); + System.exit(0); + } + + Integer logFileSize = CmdUtils.valueOf(options, "je-log-size", 60); + Integer nodeMax = CmdUtils.valueOf(options, "btree-nodemax", 512); + + AbstractBdbConversion conversion = null; + try { + if(fromFormat.equals("Base") && toFormat.equals("NewDup")) { + conversion = new BdbConvertBaseToNewDup(storeName, + clusterXmlPath, + sourceEnvPath, + destEnvPath, + logFileSize, + nodeMax); + } else if(fromFormat.equals("Base") && toFormat.equals("PidScan")) { + conversion = new BdbConvertBaseToPidScan(storeName, + clusterXmlPath, + sourceEnvPath, + destEnvPath, + logFileSize, + nodeMax); + + } else if(fromFormat.equals("NewDup") && toFormat.equals("PidScan")) { + conversion = new BdbConvertNewDupToPidScan(storeName, + clusterXmlPath, + sourceEnvPath, + destEnvPath, + logFileSize, + nodeMax); + + } else if(fromFormat.equals("PidScan") && toFormat.equals("NewDup")) { + conversion = new BdbRevertPidScanToNewDup(storeName, + clusterXmlPath, + sourceEnvPath, + destEnvPath, + logFileSize, + nodeMax); + + } else if(fromFormat.equals("PidScan") && toFormat.equals("Base")) { + conversion = new BdbRevertPidScanToBase(storeName, + clusterXmlPath, + sourceEnvPath, + destEnvPath, + logFileSize, + nodeMax); + + } else if(fromFormat.equals("NewDup") && toFormat.equals("Base")) { + conversion = new BdbRevertNewDupToBase(storeName, + clusterXmlPath, + sourceEnvPath, + destEnvPath, + logFileSize, + nodeMax); + } else { + throw new Exception("Invalid conversion. 
Please check READMEFIRST file"); + } + // start the actual data conversion + conversion.transfer(); + } catch(Exception e) { + logger.error("Error converting data", e); + } finally { + if(conversion != null) + conversion.close(); + } + } + + static boolean isValidFormat(String format) { + if(format == null) + return false; + return format.equals("Base") || format.equals("NewDup") || format.equals("PidScan"); + } + + /** + * Returns a Base64 encoded version of the byte array + * + * @param key + * @return + */ + static String writeAsciiString(byte[] bytes) { + return new String(Base64.encodeBase64(bytes)); + } + + static int abs(int a) { + if(a >= 0) + return a; + else if(a != Integer.MIN_VALUE) + return -a; + return Integer.MAX_VALUE; + } +} diff --git a/src/java/voldemort/store/bdb/dataconversion/BdbConvertNewDupToPidScan.java b/src/java/voldemort/store/bdb/dataconversion/BdbConvertNewDupToPidScan.java new file mode 100644 index 0000000000..cfb21077af --- /dev/null +++ b/src/java/voldemort/store/bdb/dataconversion/BdbConvertNewDupToPidScan.java @@ -0,0 +1,74 @@ +package voldemort.store.bdb.dataconversion; + +import java.util.List; + +import voldemort.store.StoreBinaryFormat; +import voldemort.utils.FnvHashFunction; +import voldemort.utils.HashFunction; +import voldemort.versioning.Versioned; + +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +public class BdbConvertNewDupToPidScan extends AbstractBdbConversion { + + BdbConvertNewDupToPidScan(String storeName, + String clusterXmlPath, + String sourceEnvPath, + String destEnvPath, + int logFileSize, + int nodeMax) throws Exception { + super(storeName, clusterXmlPath, sourceEnvPath, destEnvPath, logFileSize, nodeMax); + } + + @Override + public void transfer() throws Exception { + cursor = srcDB.openCursor(null, null); + DatabaseEntry keyEntry = new DatabaseEntry(); + DatabaseEntry valueEntry = new DatabaseEntry(); + HashFunction hash = new FnvHashFunction(); + int totalPartitions = cluster.getNumberOfPartitions(); + + List> vals; + long startTime = System.currentTimeMillis(); + int scanCount = 0; + int keyCount = 0; + while(cursor.getNext(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS) { + keyCount++; + + vals = StoreBinaryFormat.fromByteArray(valueEntry.getData()); + scanCount += vals.size(); + + int partition = BdbConvertData.abs(hash.hash(keyEntry.getData())) + % (Math.max(1, totalPartitions)); + + OperationStatus putStatus = dstDB.put(null, + new DatabaseEntry(StoreBinaryFormat.makePrefixedKey(keyEntry.getData(), + partition)), + valueEntry); + if(OperationStatus.SUCCESS != putStatus) { + String errorStr = "Put failed with " + putStatus + " for key" + + BdbConvertData.writeAsciiString(keyEntry.getData()); + logger.error(errorStr); + throw new Exception(errorStr); + } + + if(scanCount % 1000000 == 0) + logger.info("Reverted " + scanCount + "entries in " + + (System.currentTimeMillis() - startTime) / 1000 + " secs"); + } + logger.info("Converted " + scanCount + "entries and " + keyCount + " keys in " + + (System.currentTimeMillis() - startTime) / 1000 + " secs"); + } + + @Override + public boolean areDuplicatesNeededForSrc() { + return false; + } + + @Override + public boolean areDuplicatesNeededForDest() { + return false; + } +} diff --git a/src/java/voldemort/store/bdb/dataconversion/BdbRevertNewDupToBase.java b/src/java/voldemort/store/bdb/dataconversion/BdbRevertNewDupToBase.java new file mode 100644 index 0000000000..c755476ecc --- 
/dev/null +++ b/src/java/voldemort/store/bdb/dataconversion/BdbRevertNewDupToBase.java @@ -0,0 +1,71 @@ +package voldemort.store.bdb.dataconversion; + +import java.util.List; + +import voldemort.serialization.IdentitySerializer; +import voldemort.serialization.VersionedSerializer; +import voldemort.store.StoreBinaryFormat; +import voldemort.versioning.Versioned; + +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +public class BdbRevertNewDupToBase extends AbstractBdbConversion { + + BdbRevertNewDupToBase(String storeName, + String clusterXmlPath, + String sourceEnvPath, + String destEnvPath, + int logFileSize, + int nodeMax) throws Exception { + super(storeName, clusterXmlPath, sourceEnvPath, destEnvPath, logFileSize, nodeMax); + } + + @Override + public void transfer() throws Exception { + cursor = srcDB.openCursor(null, null); + DatabaseEntry keyEntry = new DatabaseEntry(); + DatabaseEntry valueEntry = new DatabaseEntry(); + VersionedSerializer versionedSerializer = new VersionedSerializer(new IdentitySerializer()); + + List> vals; + long startTime = System.currentTimeMillis(); + + int scanCount = 0; + int keyCount = 0; + while(cursor.getNext(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS) { + keyCount++; + + vals = StoreBinaryFormat.fromByteArray(valueEntry.getData()); + + for(Versioned val: vals) { + OperationStatus putStatus = dstDB.put(null, + keyEntry, + new DatabaseEntry(versionedSerializer.toBytes(val))); + if(OperationStatus.SUCCESS != putStatus) { + String errorStr = "Put failed with " + putStatus + " for key" + + BdbConvertData.writeAsciiString(keyEntry.getData()); + logger.error(errorStr); + throw new Exception(errorStr); + } + scanCount++; + } + if(scanCount % 1000000 == 0) + logger.info("Reverted " + scanCount + "entries in " + + (System.currentTimeMillis() - startTime) / 1000 + " secs"); + } + logger.info("Reverted " + scanCount + "entries and " + keyCount + " keys in " + + (System.currentTimeMillis() - startTime) / 1000 + " secs"); + } + + @Override + public boolean areDuplicatesNeededForSrc() { + return false; + } + + @Override + public boolean areDuplicatesNeededForDest() { + return true; + } +} diff --git a/src/java/voldemort/store/bdb/dataconversion/BdbRevertPidScanToBase.java b/src/java/voldemort/store/bdb/dataconversion/BdbRevertPidScanToBase.java new file mode 100644 index 0000000000..c391899112 --- /dev/null +++ b/src/java/voldemort/store/bdb/dataconversion/BdbRevertPidScanToBase.java @@ -0,0 +1,73 @@ +package voldemort.store.bdb.dataconversion; + +import java.util.List; + +import voldemort.serialization.IdentitySerializer; +import voldemort.serialization.VersionedSerializer; +import voldemort.store.StoreBinaryFormat; +import voldemort.versioning.Versioned; + +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +public class BdbRevertPidScanToBase extends AbstractBdbConversion { + + BdbRevertPidScanToBase(String storeName, + String clusterXmlPath, + String sourceEnvPath, + String destEnvPath, + int logFileSize, + int nodeMax) throws Exception { + super(storeName, clusterXmlPath, sourceEnvPath, destEnvPath, logFileSize, nodeMax); + } + + @Override + public void transfer() throws Exception { + cursor = srcDB.openCursor(null, null); + DatabaseEntry keyEntry = new DatabaseEntry(); + DatabaseEntry valueEntry = new DatabaseEntry(); + VersionedSerializer versionedSerializer = new VersionedSerializer(new 
IdentitySerializer()); + + List> vals; + long startTime = System.currentTimeMillis(); + + int scanCount = 0; + int keyCount = 0; + while(cursor.getNext(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS) { + keyCount++; + + vals = StoreBinaryFormat.fromByteArray(valueEntry.getData()); + // pull out the real key + byte[] stripedKey = StoreBinaryFormat.extractKey(keyEntry.getData()); + + for(Versioned val: vals) { + OperationStatus putStatus = dstDB.put(null, + new DatabaseEntry(stripedKey), + new DatabaseEntry(versionedSerializer.toBytes(val))); + if(OperationStatus.SUCCESS != putStatus) { + String errorStr = "Put failed with " + putStatus + " for key" + + BdbConvertData.writeAsciiString(stripedKey); + logger.error(errorStr); + throw new Exception(errorStr); + } + scanCount++; + } + if(scanCount % 1000000 == 0) + logger.info("Reverted " + scanCount + "entries in " + + (System.currentTimeMillis() - startTime) / 1000 + " secs"); + } + logger.info("Reverted " + scanCount + "entries and " + keyCount + " keys in " + + (System.currentTimeMillis() - startTime) / 1000 + " secs"); + } + + @Override + public boolean areDuplicatesNeededForSrc() { + return false; + } + + @Override + public boolean areDuplicatesNeededForDest() { + return true; + } +} diff --git a/src/java/voldemort/store/bdb/dataconversion/BdbRevertPidScanToNewDup.java b/src/java/voldemort/store/bdb/dataconversion/BdbRevertPidScanToNewDup.java new file mode 100644 index 0000000000..624abee439 --- /dev/null +++ b/src/java/voldemort/store/bdb/dataconversion/BdbRevertPidScanToNewDup.java @@ -0,0 +1,66 @@ +package voldemort.store.bdb.dataconversion; + +import java.util.List; + +import voldemort.store.StoreBinaryFormat; +import voldemort.versioning.Versioned; + +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +public class BdbRevertPidScanToNewDup extends AbstractBdbConversion { + + BdbRevertPidScanToNewDup(String storeName, + String clusterXmlPath, + String sourceEnvPath, + String destEnvPath, + int logFileSize, + int nodeMax) throws Exception { + super(storeName, clusterXmlPath, sourceEnvPath, destEnvPath, logFileSize, nodeMax); + } + + @Override + public void transfer() throws Exception { + cursor = srcDB.openCursor(null, null); + DatabaseEntry keyEntry = new DatabaseEntry(); + DatabaseEntry valueEntry = new DatabaseEntry(); + + List> vals; + long startTime = System.currentTimeMillis(); + int scanCount = 0; + int keyCount = 0; + while(cursor.getNext(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS) { + keyCount++; + + vals = StoreBinaryFormat.fromByteArray(valueEntry.getData()); + scanCount += vals.size(); + + // pull out the real key + byte[] stripedKey = StoreBinaryFormat.extractKey(keyEntry.getData()); + OperationStatus putStatus = dstDB.put(null, new DatabaseEntry(stripedKey), valueEntry); + if(OperationStatus.SUCCESS != putStatus) { + String errorStr = "Put failed with " + putStatus + " for key" + + BdbConvertData.writeAsciiString(keyEntry.getData()); + logger.error(errorStr); + throw new Exception(errorStr); + } + + if(scanCount % 1000000 == 0) + logger.info("Reverted " + scanCount + "entries in " + + (System.currentTimeMillis() - startTime) / 1000 + " secs"); + } + logger.info("Reverted " + scanCount + "entries and " + keyCount + " keys in " + + (System.currentTimeMillis() - startTime) / 1000 + " secs"); + } + + @Override + public boolean areDuplicatesNeededForSrc() { + return false; + } + + @Override + public 
boolean areDuplicatesNeededForDest() { + return false; + } +} \ No newline at end of file diff --git a/src/java/voldemort/store/bdb/stats/BdbEnvironmentStats.java b/src/java/voldemort/store/bdb/stats/BdbEnvironmentStats.java index 1b97dbd5d3..248607fa4e 100644 --- a/src/java/voldemort/store/bdb/stats/BdbEnvironmentStats.java +++ b/src/java/voldemort/store/bdb/stats/BdbEnvironmentStats.java @@ -1,26 +1,46 @@ package voldemort.store.bdb.stats; import java.util.concurrent.Callable; +import java.util.concurrent.atomic.AtomicLong; import voldemort.VoldemortException; import voldemort.annotations.Experimental; import voldemort.annotations.jmx.JmxGetter; import voldemort.utils.CachedCallable; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.DatabaseStats; import com.sleepycat.je.Environment; import com.sleepycat.je.EnvironmentConfig; +import com.sleepycat.je.EnvironmentFailureException; import com.sleepycat.je.EnvironmentStats; +import com.sleepycat.je.LockTimeoutException; import com.sleepycat.je.StatsConfig; public class BdbEnvironmentStats { + // Don't fetch entry count/btree stats more than twice a day + private final static long INVASIVE_STATS_TTL_MS = 12 * 3600 * 1000; + private final Environment environment; + private final Database database; private final CachedCallable fastStats; private final CachedCallable fastSpaceStats; + private final CachedCallable entryCount; + private final CachedCallable btreeStats; private final boolean exposeSpaceStats; - public BdbEnvironmentStats(Environment environment, long ttlMs, boolean exposeSpaceUtil) { + private final AtomicLong numExceptions; + private final AtomicLong numLockTimeoutExceptions; + private final AtomicLong numEnvironmentFailureExceptions; + + public BdbEnvironmentStats(Environment environment, + Database database, + long ttlMs, + boolean exposeSpaceUtil) { this.environment = environment; + this.database = database; this.exposeSpaceStats = exposeSpaceUtil; Callable fastStatsCallable = new Callable() { @@ -37,6 +57,26 @@ public SpaceUtilizationStats call() throws Exception { } }; fastSpaceStats = new CachedCallable(fastDbStatsCallable, ttlMs); + + Callable entryCountCallable = new Callable() { + + public Long call() throws Exception { + return getEntryCountUncached(); + } + }; + entryCount = new CachedCallable(entryCountCallable, INVASIVE_STATS_TTL_MS); + + Callable btreeStatsCallable = new Callable() { + + public DatabaseStats call() throws Exception { + return getBtreeStatsUncached(); + } + }; + btreeStats = new CachedCallable(btreeStatsCallable, INVASIVE_STATS_TTL_MS); + + numExceptions = new AtomicLong(0); + numLockTimeoutExceptions = new AtomicLong(0); + numEnvironmentFailureExceptions = new AtomicLong(0); } private EnvironmentStats getEnvironmentStats(boolean fast) { @@ -65,6 +105,25 @@ private EnvironmentStats getFastStats() { } } + private Long getEntryCountUncached() { + return database.count(); + } + + public DatabaseStats getBtreeStatsUncached() throws Exception { + // fast stats does not provide detailed Btree structure. + // This is invasive and will affect performance. 
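+        // The full-stats result is cached by the CachedCallable above with INVASIVE_STATS_TTL_MS (12 hours), so this Btree walk runs at most about twice a day per database.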
+ return database.getStats(new StatsConfig().setFast(false)); + } + + public void reportException(DatabaseException de) { + numExceptions.incrementAndGet(); + if(de instanceof LockTimeoutException) { + numLockTimeoutExceptions.incrementAndGet(); + } else if(de instanceof EnvironmentFailureException) { + numEnvironmentFailureExceptions.incrementAndGet(); + } + } + @JmxGetter(name = "FastStatsAsString") public String getFastStatsAsString() { return getFastStats().toString(); @@ -97,6 +156,60 @@ public long getEvictedLNs() { return getFastStats().getNEvictPasses(); } + @JmxGetter(name = "BINFetches") + public long getBINFetches() { + return getFastStats().getNBINsFetch(); + } + + @JmxGetter(name = "BINFetchMisses") + public long getBINFetchMisses() { + return getFastStats().getNBINsFetchMiss(); + } + + @JmxGetter(name = "INFetches") + public long getINFetches() { + return getFastStats().getNUpperINsFetch(); + } + + @JmxGetter(name = "INFetchMisses") + public long getINFetchMisses() { + return getFastStats().getNUpperINsFetchMiss(); + } + + @JmxGetter(name = "LNFetches") + public long getLNFetches() { + return getFastStats().getNLNsFetch(); + } + + @JmxGetter(name = "LNFetchMisses") + public long getLNFetchMisses() { + return getFastStats().getNLNsFetchMiss(); + } + + @JmxGetter(name = "CachedBINs") + public long getCachedBINs() { + return getFastStats().getNCachedBINs(); + } + + @JmxGetter(name = "CachedINs") + public long getCachedUpperINs() { + return getFastStats().getNCachedUpperINs(); + } + + @JmxGetter(name = "EvictedBINs") + public long getEvictedBINs() { + EnvironmentStats stats = getFastStats(); + return stats.getNBINsEvictedCacheMode() + stats.getNBINsEvictedCritical() + + stats.getNBINsEvictedDaemon() + stats.getNBINsEvictedManual(); + } + + @JmxGetter(name = "EvictedINs") + public long getEvictedINs() { + EnvironmentStats stats = getFastStats(); + return stats.getNUpperINsEvictedCacheMode() + stats.getNUpperINsEvictedCritical() + + stats.getNUpperINsEvictedDaemon() + stats.getNUpperINsEvictedManual(); + } + // 2. IO @JmxGetter(name = "NumRandomWrites") public long getNumRandomWrites() { @@ -221,6 +334,32 @@ public long getNumAcquiresNoWaiters() { return getFastStats().getNAcquiresNoWaiters(); } + // 5. 
Exceptions & general statistics + @JmxGetter(name = "numExceptions") + public long getNumExceptions() { + return numExceptions.longValue(); + } + + @JmxGetter(name = "numLockTimeoutExceptions") + public long getNumLockTimeoutExceptions() { + return numLockTimeoutExceptions.longValue(); + } + + @JmxGetter(name = "numEnvironmentFailureExceptions") + public long getNumEnvironmentFailureExceptions() { + return numEnvironmentFailureExceptions.longValue(); + } + + @JmxGetter(name = "getEntryCount", description = "Obtain the number of k-v entries in the store") + public long getEntryCount() throws Exception { + return entryCount.call(); + } + + @JmxGetter(name = "getBtreeStats", description = "Obtain statistics about the BTree Index for a store") + public String getBtreeStats() throws Exception { + return btreeStats.call().toString(); + } + // Compound statistics derived from raw statistics @JmxGetter(name = "NumWritesTotal") @@ -299,6 +438,21 @@ public double getPercentageUtilization() { return safeGetPercentage(getTotalSpaceUtilized(), getTotalSpace()); } + @JmxGetter(name = "PercentageBINMiss") + public double getPercentageBINMiss() { + return safeGetPercentage(getBINFetchMisses(), getBINFetches()); + } + + @JmxGetter(name = "PercentageINMiss") + public double getPercentageINMiss() { + return safeGetPercentage(getINFetchMisses(), getINFetches()); + } + + @JmxGetter(name = "PercentageLNMiss") + public double getPercentageLNMiss() { + return safeGetPercentage(getLNFetchMisses(), getLNFetches()); + } + public static double safeGetPercentage(long rawNum, long total) { return total == 0 ? 0.0d : rawNum / (float) total; } diff --git a/src/java/voldemort/store/compress/CompressingStore.java b/src/java/voldemort/store/compress/CompressingStore.java index b1cf571908..aae39b3a2e 100644 --- a/src/java/voldemort/store/compress/CompressingStore.java +++ b/src/java/voldemort/store/compress/CompressingStore.java @@ -22,6 +22,7 @@ import java.util.Map; import voldemort.VoldemortException; +import voldemort.store.AbstractStore; import voldemort.store.Store; import voldemort.store.StoreCapabilityType; import voldemort.store.StoreUtils; @@ -45,7 +46,7 @@ * @see NoopCompressionStrategy * @see GzipCompressionStrategy */ -public class CompressingStore implements Store { +public class CompressingStore extends AbstractStore { private final Store innerStore; private final CompressionStrategy keysCompressionStrategy; @@ -54,11 +55,13 @@ public class CompressingStore implements Store { public CompressingStore(Store innerStore, CompressionStrategy keysCompressionStrategy, CompressionStrategy valuesCompressionStrategy) { + super(innerStore.getName()); this.keysCompressionStrategy = Utils.notNull(keysCompressionStrategy); this.valuesCompressionStrategy = Utils.notNull(valuesCompressionStrategy); this.innerStore = Utils.notNull(innerStore); } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { @@ -130,11 +133,13 @@ private byte[] deflate(CompressionStrategy compressionStrategy, byte[] data) } } + @Override public List> get(ByteArray key, byte[] transforms) throws VoldemortException { StoreUtils.assertValidKey(key); return inflateValues(innerStore.get(deflateKey(key), transforms)); } + @Override public List getVersions(ByteArray key) { return innerStore.getVersions(deflateKey(key)); } @@ -147,24 +152,24 @@ private List> inflateValues(List> result) { return inflated; } + @Override public void put(ByteArray key, Versioned value, byte[] transforms) throws VoldemortException { 
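+        // Compress both the key and the value before delegating the put to the wrapped inner store.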
StoreUtils.assertValidKey(key); innerStore.put(deflateKey(key), deflateValue(value), transforms); } + @Override public void close() throws VoldemortException { innerStore.close(); } + @Override public Object getCapability(StoreCapabilityType capability) { return innerStore.getCapability(capability); } - public String getName() { - return innerStore.getName(); - } - + @Override public boolean delete(ByteArray key, Version version) throws VoldemortException { StoreUtils.assertValidKey(key); return innerStore.delete(deflateKey(key), version); diff --git a/src/java/voldemort/store/configuration/ConfigurationStorageEngine.java b/src/java/voldemort/store/configuration/ConfigurationStorageEngine.java index c342fb1ad9..dd35a0cb2d 100644 --- a/src/java/voldemort/store/configuration/ConfigurationStorageEngine.java +++ b/src/java/voldemort/store/configuration/ConfigurationStorageEngine.java @@ -27,7 +27,7 @@ import org.apache.log4j.Logger; import voldemort.VoldemortException; -import voldemort.store.StorageEngine; +import voldemort.store.AbstractStorageEngine; import voldemort.store.StoreCapabilityType; import voldemort.store.StoreUtils; import voldemort.store.metadata.MetadataStore; @@ -45,28 +45,35 @@ * * */ -public class ConfigurationStorageEngine implements StorageEngine { +public class ConfigurationStorageEngine extends AbstractStorageEngine { private final static Logger logger = Logger.getLogger(ConfigurationStorageEngine.class); - private final String name; private final File directory; public ConfigurationStorageEngine(String name, String directory) { - this.name = name; + super(name); this.directory = new File(directory); if(!this.directory.exists() && this.directory.canRead()) throw new IllegalArgumentException("Directory " + this.directory.getAbsolutePath() + " does not exist or can not be read."); } + @Override public ClosableIterator>> entries() { throw new VoldemortException("Iteration not supported in ConfigurationStorageEngine"); } - public void close() throws VoldemortException { + @Override + public ClosableIterator>> entries(int partition) { + throw new UnsupportedOperationException("Partition based entries scan not supported for this storage type"); + } + @Override + public ClosableIterator keys(int partition) { + throw new UnsupportedOperationException("Partition based key scan not supported for this storage type"); } + @Override public synchronized boolean delete(String key, Version version) throws VoldemortException { StoreUtils.assertValidKey(key); for(File file: getDirectory(key).listFiles()) { @@ -84,12 +91,14 @@ public synchronized boolean delete(String key, Version version) throws Voldemort return false; } + @Override public synchronized List> get(String key, String transforms) throws VoldemortException { StoreUtils.assertValidKey(key); return get(key, getDirectory(key).listFiles()); } + @Override public List getVersions(String key) { List> values = get(key, (String) null); List versions = new ArrayList(values.size()); @@ -99,6 +108,7 @@ public List getVersions(String key) { return versions; } + @Override public synchronized Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { @@ -112,10 +122,7 @@ public synchronized Map>> getAll(Iterable return result; } - public String getName() { - return name; - } - + @Override public synchronized void put(String key, Versioned value, String transforms) throws VoldemortException { StoreUtils.assertValidKey(key); @@ -228,19 +235,18 @@ private File getTempDirectory() { return tempDir; } + @Override public Object 
getCapability(StoreCapabilityType capability) { throw new VoldemortException("No extra capability."); } + @Override public ClosableIterator keys() { throw new VoldemortException("keys iteration not supported."); } + @Override public void truncate() { throw new VoldemortException("Truncate not supported in ConfigurationStorageEngine"); } - - public boolean isPartitionAware() { - return false; - } } diff --git a/src/java/voldemort/store/configuration/FileBackedCachingStorageConfiguration.java b/src/java/voldemort/store/configuration/FileBackedCachingStorageConfiguration.java index 8cd93aa82c..6af41190b5 100644 --- a/src/java/voldemort/store/configuration/FileBackedCachingStorageConfiguration.java +++ b/src/java/voldemort/store/configuration/FileBackedCachingStorageConfiguration.java @@ -16,6 +16,7 @@ package voldemort.store.configuration; +import voldemort.routing.RoutingStrategy; import voldemort.server.VoldemortConfig; import voldemort.store.StorageConfiguration; import voldemort.store.StorageEngine; @@ -38,7 +39,8 @@ public FileBackedCachingStorageConfiguration(VoldemortConfig config) { this.inputPath = config.getMetadataDirectory(); } - public StorageEngine getStore(StoreDefinition storeDef) { + public StorageEngine getStore(StoreDefinition storeDef, + RoutingStrategy strategy) { return new FileBackedCachingStorageEngine(storeDef.getName(), inputPath); } @@ -51,5 +53,4 @@ public void close() {} public void update(StoreDefinition storeDef) { } - } diff --git a/src/java/voldemort/store/configuration/FileBackedCachingStorageEngine.java b/src/java/voldemort/store/configuration/FileBackedCachingStorageEngine.java index 38817ce272..4dce7d6023 100644 --- a/src/java/voldemort/store/configuration/FileBackedCachingStorageEngine.java +++ b/src/java/voldemort/store/configuration/FileBackedCachingStorageEngine.java @@ -35,7 +35,7 @@ import org.apache.log4j.Logger; import voldemort.VoldemortException; -import voldemort.store.StorageEngine; +import voldemort.store.AbstractStorageEngine; import voldemort.store.StoreCapabilityType; import voldemort.store.StoreUtils; import voldemort.utils.ByteArray; @@ -59,7 +59,8 @@ * @author csoman * */ -public class FileBackedCachingStorageEngine implements StorageEngine { +public class FileBackedCachingStorageEngine extends + AbstractStorageEngine { private final static Logger logger = Logger.getLogger(FileBackedCachingStorageEngine.class); private static final CharSequence NEW_PROPERTY_SEPARATOR = "[name="; @@ -67,12 +68,11 @@ public class FileBackedCachingStorageEngine implements StorageEngine metadataMap; private VectorClock cachedVersion = null; public FileBackedCachingStorageEngine(String name, String inputDirectory) { - this.name = name; + super(name); this.inputDirectory = inputDirectory; File directory = new File(this.inputDirectory); if(!directory.exists() && directory.canRead()) { @@ -89,7 +89,7 @@ public FileBackedCachingStorageEngine(String name, String inputDirectory) { } private File getVersionFile() { - return new File(this.inputDirectory, this.name + ".version"); + return new File(this.inputDirectory, getName() + ".version"); } // Read the Vector clock stored in '${name}.version' file @@ -106,7 +106,7 @@ private VectorClock readVersion() { } return this.cachedVersion; } catch(Exception e) { - throw new VoldemortException("Failed to read Version for file :" + this.name, e); + throw new VoldemortException("Failed to read Version for file :" + getName(), e); } } @@ -121,7 +121,7 @@ private void writeVersion(VectorClock newClock) { } } catch(Exception e) 
{ throw new VoldemortException("Failed to write Version for the current file :" - + this.name, e); + + getName(), e); } } @@ -190,33 +190,28 @@ private synchronized void flushData() { } } - public String getName() { - return this.name; - } - - public void close() throws VoldemortException {} - + @Override public Object getCapability(StoreCapabilityType capability) { throw new VoldemortException("No extra capability."); } + @Override public ClosableIterator>> entries() { return new FileBackedStorageIterator(this.metadataMap, this); } + @Override public ClosableIterator keys() { return StoreUtils.keys(entries()); } + @Override public void truncate() { throw new VoldemortException("Truncate not supported in FileBackedCachingStorageEngine"); } - public boolean isPartitionAware() { - return false; - } - // Assigning new Vector clock here: TODO: Decide what vector clock to use ? + @Override public List> get(ByteArray key, byte[] transforms) throws VoldemortException { StoreUtils.assertValidKey(key); String keyString = new String(key.get()); @@ -230,6 +225,7 @@ public List> get(ByteArray key, byte[] transforms) throws Vold return found; } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { @@ -243,6 +239,7 @@ public Map>> getAll(Iterable keys, return result; } + @Override public List getVersions(ByteArray key) { List> values = get(key, null); List versions = new ArrayList(values.size()); @@ -252,6 +249,7 @@ public List getVersions(ByteArray key) { return versions; } + @Override public void put(ByteArray key, Versioned value, byte[] transforms) throws VoldemortException { StoreUtils.assertValidKey(key); @@ -262,7 +260,7 @@ public void put(ByteArray key, Versioned value, byte[] transforms) if(value.getVersion().compare(clock) == Occurred.BEFORE) { throw new ObsoleteVersionException("A successor version " + clock + " to this " + value.getVersion() - + " exists for the current file : " + this.name); + + " exists for the current file : " + getName()); } else if(value.getVersion().compare(clock) == Occurred.CONCURRENTLY) { throw new ObsoleteVersionException("Concurrent Operation not allowed on Metadata."); } @@ -278,6 +276,7 @@ public void put(ByteArray key, Versioned value, byte[] transforms) writeVersion((VectorClock) value.getVersion()); } + @Override public boolean delete(ByteArray key, Version version) throws VoldemortException { boolean deleteSuccessful = false; StoreUtils.assertValidKey(key); @@ -307,10 +306,12 @@ public FileBackedStorageIterator(Map metadataMap, storageEngineRef = storageEngine; } + @Override public boolean hasNext() { return iterator.hasNext(); } + @Override public Pair> next() { Entry entry = iterator.next(); Pair> nextValue = null; @@ -325,12 +326,23 @@ public Pair> next() { return nextValue; } + @Override public void remove() { throw new UnsupportedOperationException("No removal y'all."); } + @Override public void close() {} } + @Override + public ClosableIterator>> entries(int partition) { + throw new UnsupportedOperationException("Partition based entries scan not supported for this storage type"); + } + + @Override + public ClosableIterator keys(int partition) { + throw new UnsupportedOperationException("Partition based keys scan not supported for this storage type"); + } } diff --git a/src/java/voldemort/store/gzip/GzipStore.java b/src/java/voldemort/store/gzip/GzipStore.java index dcdc3ad465..6a36f4c97e 100644 --- a/src/java/voldemort/store/gzip/GzipStore.java +++ b/src/java/voldemort/store/gzip/GzipStore.java @@ -34,8 +34,7 @@ * * 
*/ -public class GzipStore extends DelegatingStore implements - Store { +public class GzipStore extends DelegatingStore { public GzipStore(Store innerStore) { super(innerStore); diff --git a/src/java/voldemort/store/http/HttpStore.java b/src/java/voldemort/store/http/HttpStore.java index 9c476178df..a67aa56e91 100644 --- a/src/java/voldemort/store/http/HttpStore.java +++ b/src/java/voldemort/store/http/HttpStore.java @@ -33,9 +33,7 @@ import voldemort.VoldemortException; import voldemort.client.protocol.RequestFormat; import voldemort.server.RequestRoutingType; -import voldemort.store.NoSuchCapabilityException; -import voldemort.store.Store; -import voldemort.store.StoreCapabilityType; +import voldemort.store.AbstractStore; import voldemort.store.StoreUtils; import voldemort.store.UnreachableStoreException; import voldemort.utils.ByteArray; @@ -49,9 +47,8 @@ * the VoldemortHttpServer. * */ -public class HttpStore implements Store { +public class HttpStore extends AbstractStore { - private final String storeName; private final HttpClient httpClient; private final RequestFormat requestFormat; private final RequestRoutingType reroute; @@ -63,13 +60,14 @@ public HttpStore(String storeName, HttpClient client, RequestFormat format, boolean reroute) { - this.storeName = storeName; + super(storeName); this.httpClient = client; this.requestFormat = format; this.reroute = RequestRoutingType.getRequestRoutingType(reroute, false); this.storeUrl = "http://" + host + ":" + port + "/stores"; } + @Override public boolean delete(ByteArray key, Version version) throws VoldemortException { StoreUtils.assertValidKey(key); DataInputStream input = null; @@ -77,7 +75,7 @@ public boolean delete(ByteArray key, Version version) throws VoldemortException HttpPost method = new HttpPost(this.storeUrl); ByteArrayOutputStream outputBytes = new ByteArrayOutputStream(); requestFormat.writeDeleteRequest(new DataOutputStream(outputBytes), - storeName, + getName(), key, (VectorClock) version, reroute); @@ -85,12 +83,13 @@ public boolean delete(ByteArray key, Version version) throws VoldemortException return requestFormat.readDeleteResponse(input); } catch(IOException e) { throw new UnreachableStoreException("Could not connect to " + storeUrl + " for " - + storeName, e); + + getName(), e); } finally { IOUtils.closeQuietly(input); } } + @Override public List> get(ByteArray key, byte[] transforms) throws VoldemortException { StoreUtils.assertValidKey(key); DataInputStream input = null; @@ -98,7 +97,7 @@ public List> get(ByteArray key, byte[] transforms) throws Vold HttpPost method = new HttpPost(this.storeUrl); ByteArrayOutputStream outputBytes = new ByteArrayOutputStream(); requestFormat.writeGetRequest(new DataOutputStream(outputBytes), - storeName, + getName(), key, transforms, reroute); @@ -106,12 +105,13 @@ public List> get(ByteArray key, byte[] transforms) throws Vold return requestFormat.readGetResponse(input); } catch(IOException e) { throw new UnreachableStoreException("Could not connect to " + storeUrl + " for " - + storeName, e); + + getName(), e); } finally { IOUtils.closeQuietly(input); } } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { @@ -121,7 +121,7 @@ public Map>> getAll(Iterable keys, HttpPost method = new HttpPost(this.storeUrl); ByteArrayOutputStream outputBytes = new ByteArrayOutputStream(); requestFormat.writeGetAllRequest(new DataOutputStream(outputBytes), - storeName, + getName(), keys, transforms, reroute); @@ -129,12 +129,13 @@ public Map>> getAll(Iterable 
keys, return requestFormat.readGetAllResponse(input); } catch(IOException e) { throw new UnreachableStoreException("Could not connect to " + storeUrl + " for " - + storeName, e); + + getName(), e); } finally { IOUtils.closeQuietly(input); } } + @Override public void put(ByteArray key, Versioned versioned, byte[] transforms) throws VoldemortException { StoreUtils.assertValidKey(key); @@ -143,7 +144,7 @@ public void put(ByteArray key, Versioned versioned, byte[] transforms) HttpPost method = new HttpPost(this.storeUrl); ByteArrayOutputStream outputBytes = new ByteArrayOutputStream(); requestFormat.writePutRequest(new DataOutputStream(outputBytes), - storeName, + getName(), key, versioned.getValue(), transforms, @@ -153,7 +154,7 @@ public void put(ByteArray key, Versioned versioned, byte[] transforms) requestFormat.readPutResponse(input); } catch(IOException e) { throw new UnreachableStoreException("Could not connect to " + storeUrl + " for " - + storeName, e); + + getName(), e); } finally { IOUtils.closeQuietly(input); } @@ -168,7 +169,7 @@ private DataInputStream executeRequest(HttpPost method, ByteArrayOutputStream ou if(statusCode != HttpURLConnection.HTTP_OK) { String message = response.getStatusLine().getReasonPhrase(); VoldemortIOUtils.closeQuietly(response); - throw new UnreachableStoreException("HTTP request to store " + storeName + throw new UnreachableStoreException("HTTP request to store " + getName() + " returned status code " + statusCode + " " + message); } @@ -176,20 +177,11 @@ private DataInputStream executeRequest(HttpPost method, ByteArrayOutputStream ou } catch(IOException e) { VoldemortIOUtils.closeQuietly(response); throw new UnreachableStoreException("Could not connect to " + storeUrl + " for " - + storeName, e); + + getName(), e); } } - public void close() {} - - public String getName() { - return storeName; - } - - public Object getCapability(StoreCapabilityType capability) { - throw new NoSuchCapabilityException(capability, getName()); - } - + @Override public List getVersions(ByteArray key) { StoreUtils.assertValidKey(key); DataInputStream input = null; @@ -197,14 +189,14 @@ public List getVersions(ByteArray key) { HttpPost method = new HttpPost(this.storeUrl); ByteArrayOutputStream outputBytes = new ByteArrayOutputStream(); requestFormat.writeGetVersionRequest(new DataOutputStream(outputBytes), - storeName, + getName(), key, reroute); input = executeRequest(method, outputBytes); return requestFormat.readGetVersionResponse(input); } catch(IOException e) { throw new UnreachableStoreException("Could not connect to " + storeUrl + " for " - + storeName, e); + + getName(), e); } finally { IOUtils.closeQuietly(input); } diff --git a/src/java/voldemort/store/logging/LoggingStore.java b/src/java/voldemort/store/logging/LoggingStore.java index cac0acebb9..eb29fcb2a7 100644 --- a/src/java/voldemort/store/logging/LoggingStore.java +++ b/src/java/voldemort/store/logging/LoggingStore.java @@ -24,6 +24,7 @@ import voldemort.store.DelegatingStore; import voldemort.store.Store; import voldemort.store.StoreCapabilityType; +import voldemort.store.CompositeVoldemortRequest; import voldemort.utils.SystemTime; import voldemort.utils.Time; import voldemort.versioning.Version; @@ -141,4 +142,49 @@ public Object getCapability(StoreCapabilityType capability) { return getInnerStore().getCapability(capability); } + @Override + public List> get(CompositeVoldemortRequest request) throws VoldemortException { + long startTimeNs = 0; + boolean succeeded = false; + if(logger.isDebugEnabled()) + 
startTimeNs = time.getNanoseconds(); + try { + List> l = getInnerStore().get(request); + succeeded = true; + return l; + } finally { + printTimedMessage("GET", succeeded, startTimeNs); + } + } + + @Override + public void put(CompositeVoldemortRequest request) throws VoldemortException { + long startTimeNs = 0; + boolean succeeded = false; + if(logger.isDebugEnabled()) { + startTimeNs = time.getNanoseconds(); + } + try { + getInnerStore().put(request); + succeeded = true; + } finally { + printTimedMessage("PUT", succeeded, startTimeNs); + } + } + + @Override + public boolean delete(CompositeVoldemortRequest request) throws VoldemortException { + long startTimeNs = 0; + boolean succeeded = false; + if(logger.isDebugEnabled()) + startTimeNs = time.getNanoseconds(); + try { + boolean deletedSomething = getInnerStore().delete(request); + succeeded = true; + return deletedSomething; + } finally { + printTimedMessage("DELETE", succeeded, startTimeNs); + } + } + } diff --git a/src/java/voldemort/store/memory/CacheStorageConfiguration.java b/src/java/voldemort/store/memory/CacheStorageConfiguration.java index 431c0a849a..a408b4b203 100644 --- a/src/java/voldemort/store/memory/CacheStorageConfiguration.java +++ b/src/java/voldemort/store/memory/CacheStorageConfiguration.java @@ -20,6 +20,7 @@ import java.util.concurrent.ConcurrentMap; import voldemort.VoldemortException; +import voldemort.routing.RoutingStrategy; import voldemort.server.VoldemortConfig; import voldemort.store.StorageConfiguration; import voldemort.store.StorageEngine; @@ -47,7 +48,8 @@ public CacheStorageConfiguration(VoldemortConfig config) {} public void close() {} - public StorageEngine getStore(StoreDefinition storeDef) { + public StorageEngine getStore(StoreDefinition storeDef, + RoutingStrategy strategy) { ConcurrentMap>> backingMap = new MapMaker().softValues() .makeMap(); return new InMemoryStorageEngine(storeDef.getName(), backingMap); diff --git a/src/java/voldemort/store/memory/InMemoryStorageConfiguration.java b/src/java/voldemort/store/memory/InMemoryStorageConfiguration.java index df40f6ae37..fc29e2dabb 100644 --- a/src/java/voldemort/store/memory/InMemoryStorageConfiguration.java +++ b/src/java/voldemort/store/memory/InMemoryStorageConfiguration.java @@ -20,6 +20,7 @@ import java.util.concurrent.ConcurrentHashMap; import voldemort.VoldemortException; +import voldemort.routing.RoutingStrategy; import voldemort.server.VoldemortConfig; import voldemort.store.StorageConfiguration; import voldemort.store.StorageEngine; @@ -41,7 +42,8 @@ public InMemoryStorageConfiguration() {} @SuppressWarnings("unused") public InMemoryStorageConfiguration(VoldemortConfig config) {} - public StorageEngine getStore(StoreDefinition storeDef) { + public StorageEngine getStore(StoreDefinition storeDef, + RoutingStrategy strategy) { return new InMemoryStorageEngine(storeDef.getName(), new ConcurrentHashMap>>()); } diff --git a/src/java/voldemort/store/memory/InMemoryStorageEngine.java b/src/java/voldemort/store/memory/InMemoryStorageEngine.java index a561e100f1..f53d0ef60b 100644 --- a/src/java/voldemort/store/memory/InMemoryStorageEngine.java +++ b/src/java/voldemort/store/memory/InMemoryStorageEngine.java @@ -26,9 +26,7 @@ import voldemort.VoldemortException; import voldemort.annotations.concurrency.NotThreadsafe; -import voldemort.store.NoSuchCapabilityException; -import voldemort.store.StorageEngine; -import voldemort.store.StoreCapabilityType; +import voldemort.store.AbstractStorageEngine; import voldemort.store.StoreUtils; import 
voldemort.utils.ClosableIterator; import voldemort.utils.Pair; @@ -43,23 +41,20 @@ * * */ -public class InMemoryStorageEngine implements StorageEngine { +public class InMemoryStorageEngine extends AbstractStorageEngine { private final ConcurrentMap>> map; - private final String name; public InMemoryStorageEngine(String name) { - this.name = Utils.notNull(name); + super(name); this.map = new ConcurrentHashMap>>(); } public InMemoryStorageEngine(String name, ConcurrentMap>> map) { - this.name = Utils.notNull(name); + super(name); this.map = Utils.notNull(map); } - public void close() {} - public void deleteAll() { this.map.clear(); } @@ -68,6 +63,7 @@ public boolean delete(K key) { return delete(key, null); } + @Override public boolean delete(K key, Version version) { StoreUtils.assertValidKey(key); @@ -99,10 +95,12 @@ public boolean delete(K key, Version version) { } } + @Override public List getVersions(K key) { return StoreUtils.getVersions(get(key, null)); } + @Override public List> get(K key, T transform) throws VoldemortException { StoreUtils.assertValidKey(key); List> results = map.get(key); @@ -114,12 +112,14 @@ public List> get(K key, T transform) throws VoldemortException { } } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { StoreUtils.assertValidKeys(keys); return StoreUtils.getAll(this, keys, transforms); } + @Override public void put(K key, Versioned value, T transforms) throws VoldemortException { StoreUtils.assertValidKey(key); @@ -159,25 +159,30 @@ public void put(K key, Versioned value, T transforms) throws VoldemortExcepti } } - public Object getCapability(StoreCapabilityType capability) { - throw new NoSuchCapabilityException(capability, getName()); - } - + @Override public ClosableIterator>> entries() { return new InMemoryIterator(map); } + @Override public ClosableIterator keys() { // TODO Implement more efficient version. 
return StoreUtils.keys(entries()); } - public void truncate() { - map.clear(); + @Override + public ClosableIterator>> entries(int partition) { + throw new UnsupportedOperationException("Partition based entries scan not supported for this storage type"); + } + + @Override + public ClosableIterator keys(int partition) { + throw new UnsupportedOperationException("Partition based key scan not supported for this storage type"); } - public String getName() { - return name; + @Override + public void truncate() { + map.clear(); } @Override @@ -214,6 +219,7 @@ public InMemoryIterator(ConcurrentMap>> map) { this.iterator = map.entrySet().iterator(); } + @Override public boolean hasNext() { return hasNextInCurrentValues() || iterator.hasNext(); } @@ -227,6 +233,7 @@ private Pair> nextInCurrentValues() { return Pair.create(currentKey, item); } + @Override public Pair> next() { if(hasNextInCurrentValues()) { return nextInCurrentValues(); @@ -253,17 +260,15 @@ public Pair> next() { } } + @Override public void remove() { throw new UnsupportedOperationException("No removal y'all."); } + @Override public void close() { - // nothing to do here + // nothing to do here } } - - public boolean isPartitionAware() { - return false; - } } diff --git a/src/java/voldemort/store/metadata/MetadataStore.java b/src/java/voldemort/store/metadata/MetadataStore.java index 97a31aba64..c6f1f1b46f 100644 --- a/src/java/voldemort/store/metadata/MetadataStore.java +++ b/src/java/voldemort/store/metadata/MetadataStore.java @@ -42,7 +42,7 @@ import voldemort.routing.RoutingStrategy; import voldemort.routing.RoutingStrategyFactory; import voldemort.server.rebalance.RebalancerState; -import voldemort.store.StorageEngine; +import voldemort.store.AbstractStorageEngine; import voldemort.store.Store; import voldemort.store.StoreCapabilityType; import voldemort.store.StoreDefinition; @@ -68,7 +68,7 @@ * Metadata is persisted as strings in inner store for ease of readability.
        * Metadata Store keeps an in memory write-through-cache for performance. */ -public class MetadataStore implements StorageEngine { +public class MetadataStore extends AbstractStorageEngine { public static final String METADATA_STORE_NAME = "metadata"; @@ -114,14 +114,15 @@ public static enum VoldemortState { public final Lock readLock = lock.readLock(); public final Lock writeLock = lock.writeLock(); - private final ConcurrentHashMap storeNameTolisteners; + private final ConcurrentHashMap> storeNameTolisteners; private static final Logger logger = Logger.getLogger(MetadataStore.class); public MetadataStore(Store innerStore, int nodeId) { + super(innerStore.getName()); this.innerStore = innerStore; this.metadataCache = new HashMap>(); - this.storeNameTolisteners = new ConcurrentHashMap(); + this.storeNameTolisteners = new ConcurrentHashMap>(); init(nodeId); } @@ -130,10 +131,12 @@ public void addMetadataStoreListener(String storeName, MetadataStoreListener lis if(this.storeNameTolisteners == null) throw new VoldemortException("MetadataStoreListener must be non-null"); - this.storeNameTolisteners.put(storeName, listener); + if(!this.storeNameTolisteners.containsKey(storeName)) + this.storeNameTolisteners.put(storeName, new ArrayList(2)); + this.storeNameTolisteners.get(storeName).add(listener); } - public void remoteMetadataStoreListener(String storeName) { + public void removeMetadataStoreListener(String storeName) { if(this.storeNameTolisteners == null) throw new VoldemortException("MetadataStoreListener must be non-null"); @@ -152,6 +155,7 @@ public static MetadataStore readFromDirectory(File dir, int nodeId) { return new MetadataStore(innerStore, nodeId); } + @Override public String getName() { return METADATA_STORE_NAME; } @@ -211,6 +215,7 @@ public void put(String key, Object value) { * definitions * @throws VoldemortException */ + @Override public synchronized void put(ByteArray keyBytes, Versioned valueBytes, byte[] transforms) throws VoldemortException { String key = ByteUtils.getString(keyBytes.get(), "UTF-8"); @@ -223,10 +228,12 @@ public synchronized void put(ByteArray keyBytes, Versioned valueBytes, b this.put(key, valueObject); } + @Override public void close() throws VoldemortException { innerStore.close(); } + @Override public Object getCapability(StoreCapabilityType capability) { return innerStore.getCapability(capability); } @@ -237,6 +244,7 @@ public Object getCapability(StoreCapabilityType capability) { * bytes for cluster xml definitions * @throws VoldemortException */ + @Override public synchronized List> get(ByteArray keyBytes, byte[] transforms) throws VoldemortException { try { @@ -284,6 +292,7 @@ public synchronized void cleanAllRebalancingState() { init(getNodeId()); } + @Override public List getVersions(ByteArray key) { List> values = get(key, null); List versions = new ArrayList(values.size()); @@ -351,6 +360,19 @@ public RoutingStrategy getRoutingStrategy(String storeName) { return strategy; } + /** + * Returns the list of store defs as a map + * + * @param storeDefs + * @return + */ + private HashMap makeStoreDefinitionMap(List storeDefs) { + HashMap storeDefMap = new HashMap(); + for(StoreDefinition storeDef: storeDefs) + storeDefMap.put(storeDef.getName(), storeDef); + return storeDefMap; + } + /** * Changes to cluster OR store definition metadata results in routing * strategies changing. 
These changes need to be propagated to all the @@ -365,8 +387,9 @@ private void updateRoutingStrategies(Cluster cluster, List stor clock = (VectorClock) metadataCache.get(ROUTING_STRATEGY_KEY).getVersion(); logger.info("Updating routing strategy for all stores"); + HashMap storeDefMap = makeStoreDefinitionMap(storeDefs); HashMap routingStrategyMap = createRoutingStrategyMap(cluster, - storeDefs); + storeDefMap); this.metadataCache.put(ROUTING_STRATEGY_KEY, new Versioned(routingStrategyMap, clock.incremented(getNodeId(), @@ -376,8 +399,10 @@ private void updateRoutingStrategies(Cluster cluster, List stor RoutingStrategy updatedRoutingStrategy = routingStrategyMap.get(storeName); if(updatedRoutingStrategy != null) { try { - storeNameTolisteners.get(storeName) - .updateRoutingStrategy(updatedRoutingStrategy); + for(MetadataStoreListener listener: storeNameTolisteners.get(storeName)) { + listener.updateRoutingStrategy(updatedRoutingStrategy); + listener.updateStoreDefinition(storeDefMap.get(storeName)); + } } catch(Exception e) { if(logger.isEnabledFor(Level.WARN)) logger.warn(e, e); @@ -393,7 +418,7 @@ private void updateRoutingStrategies(Cluster cluster, List stor */ private void initSystemRoutingStrategies(Cluster cluster) { HashMap routingStrategyMap = createRoutingStrategyMap(cluster, - getSystemStoreDefList()); + makeStoreDefinitionMap(getSystemStoreDefList())); this.metadataCache.put(SYSTEM_ROUTING_STRATEGY_KEY, new Versioned(routingStrategyMap)); } @@ -455,22 +480,37 @@ public void deleteRebalancingState(RebalancePartitionsInfo stealInfo) { } } + @Override public ClosableIterator>> entries() { throw new VoldemortException("You cannot iterate over all entries in Metadata"); } + @Override public ClosableIterator keys() { throw new VoldemortException("You cannot iterate over all keys in Metadata"); } + @Override + public ClosableIterator>> entries(int partition) { + throw new UnsupportedOperationException("Partition based entries scan not supported for this storage type"); + } + + @Override + public ClosableIterator keys(int partition) { + throw new UnsupportedOperationException("Partition based key scan not supported for this storage type"); + } + + @Override public void truncate() { throw new VoldemortException("You cannot truncate entries in Metadata"); } + @Override public boolean delete(ByteArray key, Version version) throws VoldemortException { throw new VoldemortException("You cannot delete your metadata fool !!"); } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { @@ -529,10 +569,10 @@ private void initCache(String key, Object defaultValue) { } private HashMap createRoutingStrategyMap(Cluster cluster, - List storeDefs) { + HashMap storeDefs) { HashMap map = new HashMap(); - for(StoreDefinition store: storeDefs) { + for(StoreDefinition store: storeDefs.values()) { map.put(store.getName(), routingFactory.updateRoutingStrategy(store, cluster)); } @@ -624,8 +664,4 @@ private Versioned getInnerValue(String key) throws VoldemortException { throw new VoldemortException("No metadata found for required key:" + key); } - - public boolean isPartitionAware() { - return false; - } } diff --git a/src/java/voldemort/store/metadata/MetadataStoreListener.java b/src/java/voldemort/store/metadata/MetadataStoreListener.java index 67e2b6792a..1def439d88 100644 --- a/src/java/voldemort/store/metadata/MetadataStoreListener.java +++ b/src/java/voldemort/store/metadata/MetadataStoreListener.java @@ -1,8 +1,11 @@ package voldemort.store.metadata; import 
voldemort.routing.RoutingStrategy; +import voldemort.store.StoreDefinition; public interface MetadataStoreListener { void updateRoutingStrategy(RoutingStrategy routingStrategyMap); + + void updateStoreDefinition(StoreDefinition storeDef); } diff --git a/src/java/voldemort/store/mysql/MysqlStorageConfiguration.java b/src/java/voldemort/store/mysql/MysqlStorageConfiguration.java index 57e5bce400..dc2e84b3a4 100644 --- a/src/java/voldemort/store/mysql/MysqlStorageConfiguration.java +++ b/src/java/voldemort/store/mysql/MysqlStorageConfiguration.java @@ -21,6 +21,7 @@ import org.apache.commons.dbcp.BasicDataSource; import voldemort.VoldemortException; +import voldemort.routing.RoutingStrategy; import voldemort.server.VoldemortConfig; import voldemort.store.StorageConfiguration; import voldemort.store.StorageEngine; @@ -43,7 +44,8 @@ public MysqlStorageConfiguration(VoldemortConfig config) { this.dataSource = ds; } - public StorageEngine getStore(StoreDefinition storeDef) { + public StorageEngine getStore(StoreDefinition storeDef, + RoutingStrategy strategy) { return new MysqlStorageEngine(storeDef.getName(), dataSource); } diff --git a/src/java/voldemort/store/mysql/MysqlStorageEngine.java b/src/java/voldemort/store/mysql/MysqlStorageEngine.java index 06aae081a8..bf26b4be36 100644 --- a/src/java/voldemort/store/mysql/MysqlStorageEngine.java +++ b/src/java/voldemort/store/mysql/MysqlStorageEngine.java @@ -28,10 +28,8 @@ import org.apache.log4j.Logger; import voldemort.VoldemortException; -import voldemort.store.NoSuchCapabilityException; +import voldemort.store.AbstractStorageEngine; import voldemort.store.PersistenceFailureException; -import voldemort.store.StorageEngine; -import voldemort.store.StoreCapabilityType; import voldemort.store.StoreUtils; import voldemort.utils.ByteArray; import voldemort.utils.ClosableIterator; @@ -49,17 +47,16 @@ * * */ -public class MysqlStorageEngine implements StorageEngine { +public class MysqlStorageEngine extends AbstractStorageEngine { private static final Logger logger = Logger.getLogger(MysqlStorageEngine.class); private static int MYSQL_ERR_DUP_KEY = 1022; private static int MYSQL_ERR_DUP_ENTRY = 1062; - private final String name; private final DataSource datasource; public MysqlStorageEngine(String name, DataSource datasource) { - this.name = name; + super(name); this.datasource = datasource; if(!tableExists()) { @@ -112,14 +109,16 @@ public void execute(String query) { } } + @Override public ClosableIterator keys() { return StoreUtils.keys(entries()); } + @Override public void truncate() { Connection conn = null; PreparedStatement stmt = null; - String select = "delete from " + name; + String select = "delete from " + getName(); try { conn = datasource.getConnection(); stmt = conn.prepareStatement(select); @@ -132,11 +131,12 @@ public void truncate() { } } + @Override public ClosableIterator>> entries() { Connection conn = null; PreparedStatement stmt = null; ResultSet rs = null; - String select = "select key_, version_, value_ from " + name; + String select = "select key_, version_, value_ from " + getName(); try { conn = datasource.getConnection(); stmt = conn.prepareStatement(select); @@ -147,20 +147,28 @@ public ClosableIterator>> entries() { } } - public void close() throws PersistenceFailureException { - // don't close datasource cause others could be using it + @Override + public ClosableIterator>> entries(int partition) { + throw new UnsupportedOperationException("Partition based entries scan not supported for this storage type"); + } + + 
@Override + public ClosableIterator keys(int partition) { + throw new UnsupportedOperationException("Partition based key scan not supported for this storage type"); } - public Object getCapability(StoreCapabilityType capability) { - throw new NoSuchCapabilityException(capability, getName()); + @Override + public void close() throws PersistenceFailureException { + // don't close datasource cause others could be using it } + @Override public boolean delete(ByteArray key, Version maxVersion) throws PersistenceFailureException { StoreUtils.assertValidKey(key); Connection conn = null; PreparedStatement selectStmt = null; ResultSet rs = null; - String select = "select key_, version_ from " + name + " where key_ = ? for update"; + String select = "select key_, version_ from " + getName() + " where key_ = ? for update"; try { conn = datasource.getConnection(); @@ -188,7 +196,7 @@ public boolean delete(ByteArray key, Version maxVersion) throws PersistenceFailu } private void delete(Connection connection, byte[] key, byte[] version) throws SQLException { - String delete = "delete from " + name + " where key_ = ? and version_ = ?"; + String delete = "delete from " + getName() + " where key_ = ? and version_ = ?"; PreparedStatement deleteStmt = null; try { @@ -201,6 +209,7 @@ private void delete(Connection connection, byte[] key, byte[] version) throws SQ } } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { @@ -208,7 +217,7 @@ public Map>> getAll(Iterable keys, Connection conn = null; PreparedStatement stmt = null; ResultSet rs = null; - String select = "select version_, value_ from " + name + " where key_ = ?"; + String select = "select version_, value_ from " + getName() + " where key_ = ?"; try { conn = datasource.getConnection(); stmt = conn.prepareStatement(select); @@ -235,16 +244,14 @@ public Map>> getAll(Iterable keys, } } + @Override public List> get(ByteArray key, byte[] transforms) throws PersistenceFailureException { StoreUtils.assertValidKey(key); return StoreUtils.get(this, key, transforms); } - public String getName() { - return name; - } - + @Override public void put(ByteArray key, Versioned value, byte[] transforms) throws PersistenceFailureException { StoreUtils.assertValidKey(key); @@ -253,8 +260,9 @@ public void put(ByteArray key, Versioned value, byte[] transforms) PreparedStatement insert = null; PreparedStatement select = null; ResultSet results = null; - String insertSql = "insert into " + name + " (key_, version_, value_) values (?, ?, ?)"; - String selectSql = "select key_, version_ from " + name + " where key_ = ?"; + String insertSql = "insert into " + getName() + + " (key_, version_, value_) values (?, ?, ?)"; + String selectSql = "select key_, version_ from " + getName() + " where key_ = ?"; try { conn = datasource.getConnection(); conn.setAutoCommit(false); @@ -355,16 +363,19 @@ public MysqlClosableIterator(Connection connection, this.statement = statement; } + @Override public void close() { tryClose(rs); tryClose(statement); tryClose(connection); } + @Override public boolean hasNext() { return this.hasMore; } + @Override public Pair> next() { try { if(!this.hasMore) @@ -379,6 +390,7 @@ public Pair> next() { } } + @Override public void remove() { try { rs.deleteRow(); @@ -389,11 +401,8 @@ public void remove() { } + @Override public List getVersions(ByteArray key) { return StoreUtils.getVersions(get(key, null)); } - - public boolean isPartitionAware() { - return false; - } } diff --git 
a/src/java/voldemort/store/readonly/JsonStoreBuilder.java b/src/java/voldemort/store/readonly/JsonStoreBuilder.java index 46fcea0ebc..88e8e69d24 100644 --- a/src/java/voldemort/store/readonly/JsonStoreBuilder.java +++ b/src/java/voldemort/store/readonly/JsonStoreBuilder.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -32,9 +32,9 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Random; import java.util.Set; -import java.util.Map.Entry; import joptsimple.OptionParser; import joptsimple.OptionSet; @@ -56,9 +56,9 @@ import voldemort.store.compress.CompressionStrategy; import voldemort.store.compress.CompressionStrategyFactory; import voldemort.utils.ByteUtils; +import voldemort.utils.ClusterUtils; import voldemort.utils.CmdUtils; import voldemort.utils.Pair; -import voldemort.utils.RebalanceUtils; import voldemort.utils.Utils; import voldemort.xml.ClusterMapper; import voldemort.xml.StoreDefinitionsMapper; @@ -516,8 +516,9 @@ public void buildVersion2() throws IOException { valueStream.flush(); - previousElements.put(key, Pair.create(previousElement.getFirst(), - stream.toByteArray())); + previousElements.put(key, + Pair.create(previousElement.getFirst(), + stream.toByteArray())); } else { // ...else, flush the previous element to disk @@ -588,7 +589,7 @@ public void buildVersion2() throws IOException { // Start moving files over to their correct node RoutingStrategy strategy = new RoutingStrategyFactory().updateRoutingStrategy(storeDefinition, cluster); - Map replicaMapping = RebalanceUtils.getCurrentPartitionMapping(cluster); + Map replicaMapping = ClusterUtils.getCurrentPartitionMapping(cluster); for(File file: tempDirectory.listFiles()) { String fileName = file.getName(); if(fileName.matches("^[\\d]+_[\\d]+_[\\d]+\\.(data|index)")) { diff --git a/src/java/voldemort/store/readonly/ReadOnlyStorageConfiguration.java b/src/java/voldemort/store/readonly/ReadOnlyStorageConfiguration.java index ce6759e3c0..1229d7f8a5 100644 --- a/src/java/voldemort/store/readonly/ReadOnlyStorageConfiguration.java +++ b/src/java/voldemort/store/readonly/ReadOnlyStorageConfiguration.java @@ -46,15 +46,17 @@ public class ReadOnlyStorageConfiguration implements StorageConfiguration { private final int nodeId; private RoutingStrategy routingStrategy = null; private final int deleteBackupMs; + private boolean enforceMlock = false; public ReadOnlyStorageConfiguration(VoldemortConfig config) { this.storageDir = new File(config.getReadOnlyDataStorageDirectory()); - this.numBackups = config.getReadOnlyBackups(); + this.numBackups = config.getNumReadOnlyVersions(); this.registeredBeans = Collections.synchronizedSet(new HashSet()); this.searcher = (SearchStrategy) ReflectUtils.callConstructor(ReflectUtils.loadClass(config.getReadOnlySearchStrategy() .trim())); this.nodeId = config.getNodeId(); this.deleteBackupMs = config.getReadOnlyDeleteBackupMs(); + this.enforceMlock = config.isUseMlock(); } public void close() { @@ -67,7 +69,9 @@ public void setRoutingStrategy(RoutingStrategy routingStrategy) { this.routingStrategy = routingStrategy; } - public StorageEngine getStore(StoreDefinition storeDef) { + public StorageEngine getStore(StoreDefinition storeDef, + RoutingStrategy strategy) { + this.setRoutingStrategy(strategy); ReadOnlyStorageEngine 
store = new ReadOnlyStorageEngine(storeDef.getName(), this.searcher, this.routingStrategy, @@ -75,7 +79,8 @@ public StorageEngine getStore(StoreDefinition storeDe new File(storageDir, storeDef.getName()), numBackups, - deleteBackupMs); + deleteBackupMs, + enforceMlock); ObjectName objName = JmxUtils.createObjectName(JmxUtils.getPackageName(store.getClass()), storeDef.getName() + nodeId); JmxUtils.registerMbean(ManagementFactory.getPlatformMBeanServer(), diff --git a/src/java/voldemort/store/readonly/ReadOnlyStorageEngine.java b/src/java/voldemort/store/readonly/ReadOnlyStorageEngine.java index 71faa3e40a..dece869483 100644 --- a/src/java/voldemort/store/readonly/ReadOnlyStorageEngine.java +++ b/src/java/voldemort/store/readonly/ReadOnlyStorageEngine.java @@ -34,9 +34,7 @@ import voldemort.annotations.jmx.JmxGetter; import voldemort.annotations.jmx.JmxOperation; import voldemort.routing.RoutingStrategy; -import voldemort.store.NoSuchCapabilityException; -import voldemort.store.StorageEngine; -import voldemort.store.StoreCapabilityType; +import voldemort.store.AbstractStorageEngine; import voldemort.store.StoreUtils; import voldemort.store.readonly.chunk.ChunkedFileSet; import voldemort.utils.ByteArray; @@ -54,11 +52,10 @@ * * */ -public class ReadOnlyStorageEngine implements StorageEngine { +public class ReadOnlyStorageEngine extends AbstractStorageEngine { private static Logger logger = Logger.getLogger(ReadOnlyStorageEngine.class); - private final String name; private final int numBackups, nodeId; private long currentVersionId; private final File storeDir; @@ -69,6 +66,7 @@ public class ReadOnlyStorageEngine implements StorageEngine keys() { if(!(fileSet.getReadOnlyStorageFormat().compareTo(ReadOnlyStorageFormat.READONLY_V2) == 0)) throw new UnsupportedOperationException("Iteration is not supported for " @@ -458,6 +475,7 @@ public ClosableIterator keys() { return new ChunkedFileSet.ROKeyIterator(fileSet, fileModificationLock); } + @Override public ClosableIterator>> entries() { if(!(fileSet.getReadOnlyStorageFormat().compareTo(ReadOnlyStorageFormat.READONLY_V2) == 0)) throw new UnsupportedOperationException("Iteration is not supported for " @@ -467,6 +485,17 @@ public ClosableIterator>> entries() { return new ChunkedFileSet.ROEntriesIterator(fileSet, fileModificationLock); } + @Override + public ClosableIterator>> entries(int partition) { + throw new UnsupportedOperationException("Partition based entries scan not supported for this storage type"); + } + + @Override + public ClosableIterator keys(int partition) { + throw new UnsupportedOperationException("Partition based key scan not supported for this storage type"); + } + + @Override public void truncate() { if(isOpen) close(); @@ -474,6 +503,7 @@ public void truncate() { logger.debug("Truncate successful for read-only store "); } + @Override public List> get(ByteArray key, byte[] transforms) throws VoldemortException { StoreUtils.assertValidKey(key); try { @@ -501,6 +531,7 @@ public List> get(ByteArray key, byte[] transforms) throws Vold } } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { @@ -535,6 +566,7 @@ public Map>> getAll(Iterable keys, /** * Not supported, throws UnsupportedOperationException if called */ + @Override public boolean delete(ByteArray key, Version version) throws VoldemortException { throw new UnsupportedOperationException("Delete is not supported on this store, it is read-only."); } @@ -542,18 +574,16 @@ public boolean delete(ByteArray key, Version version) throws 
VoldemortException /** * Not supported, throws UnsupportedOperationException if called */ + @Override public void put(ByteArray key, Versioned value, byte[] transforms) throws VoldemortException { throw new VoldemortUnsupportedOperationalException("Put is not supported on this store, it is read-only."); } @JmxGetter(name = "name", description = "The name of the store.") + @Override public String getName() { - return name; - } - - public Object getCapability(StoreCapabilityType capability) { - throw new NoSuchCapabilityException(capability, getName()); + return super.getName(); } private final static class KeyValueLocation implements Comparable { @@ -581,6 +611,7 @@ public int getValueLocation() { return valueLocation; } + @Override public int compareTo(KeyValueLocation kvl) { if(chunk == kvl.getChunk()) { if(valueLocation == kvl.getValueLocation()) @@ -593,10 +624,12 @@ public int compareTo(KeyValueLocation kvl) { } } + @Override public List getVersions(ByteArray key) { return StoreUtils.getVersions(get(key, null)); } + @Override public boolean isPartitionAware() { return true; } diff --git a/src/java/voldemort/store/readonly/chunk/ChunkedFileSet.java b/src/java/voldemort/store/readonly/chunk/ChunkedFileSet.java index a66f94a2f1..c3aa3d78e0 100644 --- a/src/java/voldemort/store/readonly/chunk/ChunkedFileSet.java +++ b/src/java/voldemort/store/readonly/chunk/ChunkedFileSet.java @@ -22,6 +22,7 @@ import voldemort.store.readonly.ReadOnlyStorageFormat; import voldemort.store.readonly.ReadOnlyStorageMetadata; import voldemort.store.readonly.ReadOnlyUtils; +import voldemort.store.readonly.io.MappedFileReader; import voldemort.utils.ByteArray; import voldemort.utils.ByteUtils; import voldemort.utils.Pair; @@ -45,6 +46,8 @@ public class ChunkedFileSet { private final List indexFileSizes; private final List dataFileSizes; private final List indexFiles; + + private List mappedIndexFileReader; private final List dataFiles; private final HashMap chunkIdToChunkStart; private final HashMap chunkIdToNumChunks; @@ -52,7 +55,14 @@ public class ChunkedFileSet { private RoutingStrategy routingStrategy; private ReadOnlyStorageFormat storageFormat; - public ChunkedFileSet(File directory, RoutingStrategy routingStrategy, int nodeId) { + private boolean enforceMlock = false; + + public ChunkedFileSet(File directory, + RoutingStrategy routingStrategy, + int nodeId, + boolean enforceMlock) { + + this.enforceMlock = enforceMlock; this.baseDir = directory; if(!Utils.isReadableDir(directory)) throw new VoldemortException(directory.getAbsolutePath() @@ -76,6 +86,8 @@ public ChunkedFileSet(File directory, RoutingStrategy routingStrategy, int nodeI this.indexFileSizes = new ArrayList(); this.dataFileSizes = new ArrayList(); this.indexFiles = new ArrayList(); + this.mappedIndexFileReader = new ArrayList(); + this.dataFiles = new ArrayList(); this.chunkIdToChunkStart = new HashMap(); this.chunkIdToNumChunks = new HashMap(); @@ -101,6 +113,11 @@ public ChunkedFileSet(File directory, RoutingStrategy routingStrategy, int nodeI + " chunks and format " + storageFormat); } + public ChunkedFileSet(File directory, RoutingStrategy routingStrategy, int nodeId) { + this(directory, routingStrategy, nodeId, false); + + } + public DataFileChunkSet toDataFileChunkSet() { // Convert the index file into chunk set @@ -148,7 +165,18 @@ else if(index.exists() ^ data.exists()) /* Add the file channel for data */ dataFiles.add(openChannel(data)); - indexFiles.add(mapFile(index)); + + MappedFileReader idxFileReader = null; + try { + 
idxFileReader = new MappedFileReader(index); + mappedIndexFileReader.add(idxFileReader); + indexFiles.add(idxFileReader.map(enforceMlock)); + } catch(IOException e) { + + logger.error("Error in mlock", e); + } + + // indexFiles.add(mapFile(index)); chunkId++; } if(chunkId == 0) @@ -200,7 +228,17 @@ public void initVersion1() { /* Add the file channel for data */ dataFiles.add(openChannel(data)); - indexFiles.add(mapFile(index)); + + MappedFileReader idxFileReader = null; + try { + idxFileReader = new MappedFileReader(index); + mappedIndexFileReader.add(idxFileReader); + indexFiles.add(idxFileReader.map(enforceMlock)); + } catch(IOException e) { + logger.error("Error in mlock", e); + } + + // indexFiles.add(mapFile(index)); chunkId++; globalChunkId++; } @@ -282,7 +320,17 @@ public void initVersion2() { /* Add the file channel for data */ dataFiles.add(openChannel(data)); - indexFiles.add(mapFile(index)); + + MappedFileReader idxFileReader = null; + try { + idxFileReader = new MappedFileReader(index); + mappedIndexFileReader.add(idxFileReader); + indexFiles.add(idxFileReader.map(enforceMlock)); + } catch(IOException e) { + logger.error("Error in mlock", e); + } + + // indexFiles.add(mapFile(index)); chunkId++; globalChunkId++; } @@ -348,6 +396,14 @@ public void close() { } catch(IOException e) { logger.error("Error while closing file.", e); } + + MappedFileReader idxFileReader = mappedIndexFileReader.get(chunk); + try { + idxFileReader.close(); + } catch(IOException e) { + + logger.error("Error while closing file.", e); + } } } diff --git a/src/java/voldemort/store/readonly/io/BaseCloser.java b/src/java/voldemort/store/readonly/io/BaseCloser.java new file mode 100644 index 0000000000..9efa6cadbf --- /dev/null +++ b/src/java/voldemort/store/readonly/io/BaseCloser.java @@ -0,0 +1,83 @@ +package voldemort.store.readonly.io; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public abstract class BaseCloser { + + protected List delegates = new ArrayList(); + + private Throwable cause = null; + + private boolean executed = false; + + public BaseCloser() {} + + public BaseCloser(List delegates) { + this.delegates = delegates; + } + + public BaseCloser(T... delegates) { + add(delegates); + } + + public void add(T delegate) { + delegates.add(delegate); + } + + public void add(T... 
delegates) { + for(T current: delegates) { + this.delegates.add(current); + } + } + + public void setCause(Throwable cause) { + this.cause = cause; + } + + protected boolean executed() { + return executed; + } + + protected void exec() throws GroupIOException { + + if(executed) + return; + + GroupIOException exc = null; + + if(cause != null) + exc = new GroupIOException(cause); + + for(T current: delegates) { + + if(current == null) + continue; + + try { + + onDelegate(current); + + } catch(Throwable t) { + + if(exc == null) { + exc = new GroupIOException(t); + } else { + exc.addSuppressed(t); + } + + } + + } + + executed = true; + + if(exc != null) + throw exc; + + } + + protected abstract void onDelegate(T delegate) throws IOException; + +} diff --git a/src/java/voldemort/store/readonly/io/BaseMappedFile.java b/src/java/voldemort/store/readonly/io/BaseMappedFile.java new file mode 100644 index 0000000000..5dc8cb2018 --- /dev/null +++ b/src/java/voldemort/store/readonly/io/BaseMappedFile.java @@ -0,0 +1,42 @@ +package voldemort.store.readonly.io; + +import java.io.File; +import java.nio.channels.FileChannel; + +/** + * + */ +public class BaseMappedFile { + + protected FileChannel channel; + + protected long offset = 0; + + protected long length = 0; + + protected Closer closer = new Closer(); + + protected File file; + + protected int fd; + + protected boolean fadvise = true; + + public File getFile() { + return file; + } + + public int getFd() { + return fd; + } + + public boolean isClosed() { + return closer.isClosed(); + } + + @Override + public String toString() { + return file.toString(); + } + +} diff --git a/src/java/voldemort/store/readonly/io/ByteBufferCloser.java b/src/java/voldemort/store/readonly/io/ByteBufferCloser.java new file mode 100644 index 0000000000..73a0bc3d07 --- /dev/null +++ b/src/java/voldemort/store/readonly/io/ByteBufferCloser.java @@ -0,0 +1,29 @@ +package voldemort.store.readonly.io; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * A closeable which is smart enough to work on byte buffers. + */ +public class ByteBufferCloser implements Closeable { + + private ByteBuffer buff; + + public ByteBufferCloser(ByteBuffer buff) { + this.buff = buff; + } + + @Override + public void close() throws IOException { + + sun.misc.Cleaner cl = ((sun.nio.ch.DirectBuffer) buff).cleaner(); + + if(cl != null) { + cl.clean(); + } + + } + +} diff --git a/src/java/voldemort/store/readonly/io/Closer.java b/src/java/voldemort/store/readonly/io/Closer.java new file mode 100644 index 0000000000..2ba2c874ad --- /dev/null +++ b/src/java/voldemort/store/readonly/io/Closer.java @@ -0,0 +1,36 @@ +package voldemort.store.readonly.io; + +import java.io.Closeable; +import java.io.IOException; +import java.util.List; + +public class Closer extends BaseCloser implements Closeable { + + public Closer() {} + + public Closer(List delegates) { + this.delegates = (List) delegates; + } + + public Closer(Closeable... 
delegates) { + add(delegates); + } + + @Override + public void close() throws IOException { + exec(); + } + + public boolean closed() { + return executed(); + } + + public boolean isClosed() { + return executed(); + } + + protected void onDelegate(Closeable delegate) throws IOException { + delegate.close(); + } + +} diff --git a/src/java/voldemort/store/readonly/io/GroupIOException.java b/src/java/voldemort/store/readonly/io/GroupIOException.java new file mode 100644 index 0000000000..27bd34a6bc --- /dev/null +++ b/src/java/voldemort/store/readonly/io/GroupIOException.java @@ -0,0 +1,46 @@ +package voldemort.store.readonly.io; + +import java.io.IOException; +import java.io.PrintStream; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.List; + +public class GroupIOException extends IOException { + + private static final long serialVersionUID = 1L; + List suppressed = new ArrayList(); + + public GroupIOException(Throwable cause) { + suppressed.add(cause); + } + + public void addSuppressed(Throwable t) { + suppressed.add(t); + } + + @Override + public void printStackTrace(PrintStream out) { + + for(Throwable current: suppressed) { + current.printStackTrace(out); + } + + // this will print ourselves AND the cause... + super.printStackTrace(out); + + } + + @Override + public void printStackTrace(PrintWriter out) { + + for(Throwable current: suppressed) { + current.printStackTrace(out); + } + + // this will print ourselves AND the cause... + super.printStackTrace(out); + + } + +} diff --git a/src/java/voldemort/store/readonly/io/MappedFileReader.java b/src/java/voldemort/store/readonly/io/MappedFileReader.java new file mode 100644 index 0000000000..e2e42711b6 --- /dev/null +++ b/src/java/voldemort/store/readonly/io/MappedFileReader.java @@ -0,0 +1,116 @@ +package voldemort.store.readonly.io; + +import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; + +import org.apache.log4j.Logger; + +/** + * Facade around a MappedByteBuffer but we also support mlock on the mapped + * pages, and closing all dependent resources. + * + */ +public class MappedFileReader extends BaseMappedFile implements Closeable { + + private static final Logger log = Logger.getLogger(MappedFileReader.class); + + protected FileInputStream in; + + protected MappedByteBuffer mappedByteBuffer = null; + + public MappedFileReader(String path) throws IOException { + this(new File(path)); + } + + public MappedFileReader(File file) throws IOException { + + init(file); + + } + + private void init(File file) throws IOException { + + this.file = file; + + this.in = new FileInputStream(file); + this.channel = in.getChannel(); + this.fd = Native.getFd(in.getFD()); + + this.length = file.length(); + + } + + /** + * Read from this mapped file. 
+ */ + public MappedByteBuffer map(boolean setAutoLock) throws IOException { + + try { + + if(mappedByteBuffer == null) { + + if(setAutoLock) { + closer.add(new MemLock(file, in.getFD(), offset, length)); + } + + mappedByteBuffer = channel.map(FileChannel.MapMode.READ_ONLY, offset, length); + + closer.add(new MappedByteBufferCloser(mappedByteBuffer)); + + } + + return mappedByteBuffer; + + } catch(IOException e) { + + log.error(String.format("Failed to map %s of length %,d at %,d", + file.getPath(), + length, + offset), e); + + throw new IOException(String.format("Failed to map %s of length %,d at %,d", + file.getPath(), + length, + offset), e); + + } + + } + + @Override + public void close() throws IOException { + + if(closer.isClosed()) + return; + + closer.add(channel); + closer.add(in); + + closer.close(); + + } + + /** + * A closeable which is smart enough to work on mapped byte buffers. + */ + class MappedByteBufferCloser extends ByteBufferCloser { + + public MappedByteBufferCloser(ByteBuffer buff) { + super(buff); + } + + @Override + public void close() throws IOException { + + super.close(); + + } + + } + +} diff --git a/src/java/voldemort/store/readonly/io/MemLock.java b/src/java/voldemort/store/readonly/io/MemLock.java new file mode 100644 index 0000000000..c514151612 --- /dev/null +++ b/src/java/voldemort/store/readonly/io/MemLock.java @@ -0,0 +1,85 @@ +package voldemort.store.readonly.io; + +import java.io.Closeable; +import java.io.File; +import java.io.FileDescriptor; +import java.io.IOException; + +import org.apache.log4j.Logger; + +import voldemort.store.readonly.io.jna.mman; + +import com.sun.jna.Pointer; + +public class MemLock implements Closeable { + + private static final Logger logger = Logger.getLogger(MemLock.class); + + private Pointer pa; + private long length; + private File file; + private FileDescriptor descriptor; + + /** + * Call mmap a file descriptor, then lock the pages with MAP_LOCKED. This + * will then prevent the pages from being swapped out due to VFS cache + * pressure. + * + * @param descriptor The file we should mmap and MAP_LOCKED + * @param offset + * @param length + * @see #close() + * @throws IOException + */ + public MemLock(File file, FileDescriptor descriptor, long offset, long length) + throws IOException { + if(logger.isDebugEnabled()) + logger.debug("mlocking " + file + " with length " + length); + + this.setFile(file); + this.setDescriptor(descriptor); + this.length = length; + + int fd = voldemort.store.readonly.io.Native.getFd(descriptor); + + pa = mman.mmap(length, mman.PROT_READ, mman.MAP_SHARED | mman.MAP_ALIGN, fd, offset); + + // even though technically we have specified MAP_LOCKED this isn't + // supported on OpenSolaris or older Linux kernels (or OS X). + + mman.mlock(pa, length); + + } + + /** + * Release this lock so that the memory can be returned to the OS if it + * wants to us it. 
+ */ + @Override + public void close() throws IOException { + + mman.munlock(pa, length); + mman.munmap(pa, length); + + if(logger.isDebugEnabled()) + logger.debug("munlocking " + file + " with length " + length); + + } + + public File getFile() { + return file; + } + + public void setFile(File file) { + this.file = file; + } + + public FileDescriptor getDescriptor() { + return descriptor; + } + + public void setDescriptor(FileDescriptor descriptor) { + this.descriptor = descriptor; + } + +} diff --git a/src/java/voldemort/store/readonly/io/Native.java b/src/java/voldemort/store/readonly/io/Native.java new file mode 100644 index 0000000000..854fa5f62c --- /dev/null +++ b/src/java/voldemort/store/readonly/io/Native.java @@ -0,0 +1,51 @@ +package voldemort.store.readonly.io; + +import java.io.FileDescriptor; +import java.lang.reflect.Field; + +import org.apache.log4j.Logger; + +public class Native { + + private static final Logger log = Logger.getLogger(Native.class); + + /** + * Used to get access to protected/private field of the specified class + * + * @param klass - name of the class + * @param fieldName - name of the field + * @return Field or null on error + */ + @SuppressWarnings("rawtypes") + public static Field getProtectedField(Class className, String fieldName) { + + Field field; + + try { + field = className.getDeclaredField(fieldName); + field.setAccessible(true); + } catch(Exception e) { + throw new AssertionError(e); + } + + return field; + } + + public static int getFd(FileDescriptor descriptor) { + + Field field = getProtectedField(descriptor.getClass(), "fd"); + + if(field == null) + return -1; + + try { + return field.getInt(descriptor); + } catch(Exception e) { + log.warn("unable to read fd field from FileDescriptor"); + } + + return -1; + + } + +} diff --git a/src/java/voldemort/store/readonly/io/jna/errno.java b/src/java/voldemort/store/readonly/io/jna/errno.java new file mode 100644 index 0000000000..64388d581e --- /dev/null +++ b/src/java/voldemort/store/readonly/io/jna/errno.java @@ -0,0 +1,115 @@ +package voldemort.store.readonly.io.jna; + +import com.sun.jna.Library; +import com.sun.jna.Native; + +public class errno { + + private static InterfaceDelegate delegate = (InterfaceDelegate) Native.loadLibrary("c", + InterfaceDelegate.class); + + /** + * The routine perror() produces a message on the standard error output, + * describing the last error encountered during a call to a system or + * library function. First (if s is not NULL and *s is not a null byte + * ('\0')) the argument string s is printed, followed by a colon and a + * blank. Then the message and a new-line. + * + * To be of most use, the argument string should include the name of the + * function that incurred the error. The error number is taken from the + * external variable errno, which is set when errors occur but not cleared + * when non-erroneous calls are made. + * + * The global error list sys_errlist[] indexed by errno can be used to + * obtain the error message without the newline. The largest message number + * provided in the table is sys_nerr -1. Be careful when directly accessing + * this list because new error values may not have been added to + * sys_errlist[]. + * + * When a system call fails, it usually returns -1 and sets the variable + * errno to a value describing what went wrong. (These values can be found + * in .) Many library functions do likewise. The function perror() + * serves to translate this error code into human-readable form. 
Note that + * errno is undefined after a successful library call: this call may well + * change this variable, even though it succeeds, for example because it + * internally used some other library function that failed. Thus, if a + * failing call is not immediately followed by a call to perror(), the value + * of errno should be saved. + */ + public static int perror(String s) { + return delegate.perror(s); + } + + /** + * The strerror() function returns a string describing the error code passed + * in the argument errnum, possibly using the LC_MESSAGES part of the + * current locale to select the appropriate language. This string must not + * be modified by the application, but may be modified by a subsequent call + * to perror() or strerror(). No library function will modify this string. + * + * The strerror_r() function is similar to strerror(), but is thread safe. + * This function is available in two versions: an XSI-compliant version + * specified in POSIX.1-2001, and a GNU-specific version (available since + * glibc 2.0). If _XOPEN_SOURCE is defined with the value 600, then the + * XSI-compliant version is provided, otherwise the GNU-specific version is + * provided. + * + * The XSI-compliant strerror_r() is preferred for portable applications. It + * returns the error string in the user-supplied buffer buf of length + * buflen. + * + * The GNU-specific strerror_r() returns a pointer to a string containing + * the error message. This may be either a pointer to a string that the + * function stores in buf, or a pointer to some (immutable) static string + * (in which case buf is unused). If the function stores a string in buf, + * then at most buflen bytes are stored (the string may be truncated if + * buflen is too small) and the string always includes a terminating null + * byte. + * + */ + public static String strerror(int errnum) { + return delegate.strerror(errnum); + } + + public static String strerror() { + return strerror(errno()); + } + + /** + * The header file defines the integer variable errno, which is + * set by system calls and some library functions in the event of an error + * to indicate what went wrong. Its value is significant only when the call + * returned an error (usually -1), and a function that does succeed is + * allowed to change errno. + * + * Sometimes, when -1 is also a valid successful return value one has to + * zero errno before the call in order to detect possible errors. + * + * errno is defined by the ISO C standard to be a modifiable lvalue of type + * int, and must not be explicitly declared; errno may be a macro. errno is + * thread-local; setting it in one thread does not affect its value in any + * other thread. + * + * Valid error numbers are all non-zero; errno is never set to zero by any + * library function. All the error names specified by POSIX.1 must have + * distinct values, with the exception of EAGAIN and EWOULDBLOCK, which may + * be the same. + * + * Below is a list of the symbolic error names that are defined on Linux. + * Some of these are marked POSIX.1, indicating that the name is defined by + * POSIX.1-2001, or C99, indicating that the name is defined by C99. 
+ * + */ + public static int errno() { + return Native.getLastError(); + } + + interface InterfaceDelegate extends Library { + + int perror(String s); + + String strerror(int errnum); + + } + +} diff --git a/src/java/voldemort/store/readonly/io/jna/fcntl.java b/src/java/voldemort/store/readonly/io/jna/fcntl.java new file mode 100644 index 0000000000..eee65850e4 --- /dev/null +++ b/src/java/voldemort/store/readonly/io/jna/fcntl.java @@ -0,0 +1,171 @@ +package voldemort.store.readonly.io.jna; + +import java.io.IOException; + +import org.apache.log4j.Logger; + +import com.sun.jna.Native; + +public class fcntl { + + public static final int POSIX_FADV_NORMAL = 0; /* fadvise.h */ + public static final int POSIX_FADV_RANDOM = 1; /* fadvise.h */ + public static final int POSIX_FADV_SEQUENTIAL = 2; /* fadvise.h */ + public static final int POSIX_FADV_WILLNEED = 3; /* fadvise.h */ + public static final int POSIX_FADV_DONTNEED = 4; /* fadvise.h */ + public static final int POSIX_FADV_NOREUSE = 5; /* fadvise.h */ + + private static final Logger logger = Logger.getLogger(fcntl.class); + + /** + * posix documentation + * + * Actual Linux implementation resides here: + * + * http://lxr.linux.no/linux+v3.0.3/mm/fadvise.c#L77 + * + *
        + * posix_fadvise - predeclare an access pattern for file data + * + *
        + * Synopsis + * + *
        + * #include + * + *
        + * int posix_fadvise(int fd, off_t offset, off_t len, int advice); + * + *
        + * Feature Test Macro Requirements for glibc (see feature_test_macros(7)): + * posix_fadvise(): _XOPEN_SOURCE >= 600 || _POSIX_C_SOURCE >= 200112L + * + *
        + * Description + * + *
        + * Programs can use posix_fadvise() to announce an intention to access file + * data in a specific pattern in the future, thus allowing the kernel to + * perform appropriate optimizations. The advice applies to a (not + * necessarily existent) region starting at offset and extending for len + * bytes (or until the end of the file if len is 0) within the file referred + * to by fd. The advice is not binding; it merely constitutes an expectation + * on behalf of the application. + * + *
        + * Permissible values for advice include: + * + *
        + * POSIX_FADV_NORMAL + * + *
        + * Indicates that the application has no advice to give about its access + * pattern for the specified data. If no advice is given for an open file, + * this is the default assumption. + * + *
        + * POSIX_FADV_SEQUENTIAL + * + *
        + * The application expects to access the specified data sequentially (with + * lower offsets read before higher ones). + * + *
        + * POSIX_FADV_RANDOM + * + *
        + * The specified data will be accessed in random order. + * + *
        + * POSIX_FADV_NOREUSE + * + *
        + * The specified data will be accessed only once. + * + *
        + * POSIX_FADV_WILLNEED + * + *
        + * The specified data will be accessed in the near future. + * + *
        + * POSIX_FADV_DONTNEED + * + *
        + * The specified data will not be accessed in the near future. + * + *
        + * Return Value + * + * On success, zero is returned. On error, an error number is returned. + * + * java documentation + * + * We do not return -1 if we fail but instead throw an IOException + * + * @throws IOException if this call fails. + */ + public static int posix_fadvise(int fd, long offset, long len, int advice) throws IOException { + + int result = Delegate.posix_fadvise(fd, offset, len, advice); + + if(result != 0) + throw new IOException(errno.strerror(result)); + + return result; + + } + + /** + * posix documentation + * + *
        + * The function posix_fallocate() ensures that disk space is allocated for + * the file referred to by the descriptor fd for the bytes in the range + * starting at offset and continuing for len bytes. After a successful call + * to posix_fallocate(), subsequent writes to bytes in the specified range + * are guaranteed not to fail because of lack of disk space. + * + *
        + * If the size of the file is less than offset+len, then the file is + * increased to this size; otherwise the file size is left unchanged. + * + *
        + * Return Value + * + *
        + * posix_fallocate() returns zero on success, or an error number on failure. + * Note that errno is not set. + * + * java documentation + * + * We do not return -1 if we fail but instead throw an IOException + * + * @throws IOException if this call fails. + * + */ + public static int posix_fallocate(int fd, long offset, long len) throws IOException { + + int result = Delegate.posix_fallocate(fd, offset, len); + + if(result != 0) { + logger.warn(errno.strerror(result)); + } + + return result; + + } + + static class Delegate { + + public static native int posix_fadvise(int fd, long offset, long len, int advice); + + public static native int posix_fallocate(int fd, long offset, long len); + + static { + Native.register("c"); + } + + } + +} diff --git a/src/java/voldemort/store/readonly/io/jna/mman.java b/src/java/voldemort/store/readonly/io/jna/mman.java new file mode 100644 index 0000000000..f59f8efaf8 --- /dev/null +++ b/src/java/voldemort/store/readonly/io/jna/mman.java @@ -0,0 +1,143 @@ +package voldemort.store.readonly.io.jna; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; + +import org.apache.log4j.Logger; + +import com.sun.jna.Native; +import com.sun.jna.NativeLong; +import com.sun.jna.Pointer; + +public class mman { + + private static final Logger logger = Logger.getLogger(mman.class); + public static final int PROT_READ = 0x1; /* Page can be read. */ + public static final int PROT_WRITE = 0x2; /* Page can be written. */ + public static final int PROT_EXEC = 0x4; /* Page can be executed. */ + public static final int PROT_NONE = 0x0; /* Page can not be accessed. */ + + public static final int MAP_SHARED = 0x01; /* Share changes. */ + public static final int MAP_PRIVATE = 0x02; /* Changes are private. */ + + public static final int MAP_ALIGN = 0x200; /* addr specifies alignment */ + + public static final int MAP_LOCKED = 0x02000; /* Lock the mapping. */ + + // http://linux.die.net/man/2/mmap + // http://www.opengroup.org/sud/sud1/xsh/mmap.htm + // http://linux.die.net/include/sys/mman.h + // http://linux.die.net/include/bits/mman.h + + // off_t = 8 + // size_t = 8 + public static Pointer mmap(long len, int prot, int flags, int fildes, long off) + throws IOException { + + // we don't really have a need to change the recommended pointer. + Pointer addr = new Pointer(0); + + Pointer result = Delegate.mmap(addr, + new NativeLong(len), + prot, + flags, + fildes, + new NativeLong(off)); + + if(Pointer.nativeValue(result) == -1) { + if(logger.isDebugEnabled()) + logger.debug(errno.strerror()); + } + + return result; + + } + + public static int munmap(Pointer addr, long len) throws IOException { + + int result = Delegate.munmap(addr, new NativeLong(len)); + + if(result != 0) { + if(logger.isDebugEnabled()) + logger.debug(errno.strerror()); + } + + return result; + + } + + public static void mlock(Pointer addr, long len) throws IOException { + + int res = Delegate.mlock(addr, new NativeLong(len)); + if(res != 0) { + if(logger.isDebugEnabled()) { + logger.debug("Mlock failed probably because of insufficient privileges, errno:" + + errno.strerror() + ", return value:" + res); + } + } else { + if(logger.isDebugEnabled()) + logger.debug("Mlock successfull"); + + } + + } + + /** + * Unlock the given region, throw an IOException if we fail. 
+ */ + public static void munlock(Pointer addr, long len) throws IOException { + + if(Delegate.munlock(addr, new NativeLong(len)) != 0) { + if(logger.isDebugEnabled()) + logger.debug("munlocking failed with errno:" + errno.strerror()); + } else { + if(logger.isDebugEnabled()) + logger.debug("munlocking region"); + } + } + + static class Delegate { + + public static native Pointer mmap(Pointer addr, + NativeLong len, + int prot, + int flags, + int fildes, + NativeLong off); + + public static native int munmap(Pointer addr, NativeLong len); + + public static native int mlock(Pointer addr, NativeLong len); + + public static native int munlock(Pointer addr, NativeLong len); + + static { + Native.register("c"); + } + + } + + public static void main(String[] args) throws Exception { + + String path = args[0]; + + File file = new File(path); + FileInputStream in = new FileInputStream(file); + int fd = voldemort.store.readonly.io.Native.getFd(in.getFD()); + if(logger.isDebugEnabled()) + logger.debug("File descriptor is: " + fd); + + // mmap a large file... + Pointer addr = mmap(file.length(), PROT_READ, mman.MAP_SHARED | mman.MAP_ALIGN, fd, 0L); + if(logger.isDebugEnabled()) + logger.debug("mmap address is: " + Pointer.nativeValue(addr)); + + // try to mlock it directly + mlock(addr, file.length()); + munlock(addr, file.length()); + + munmap(addr, file.length()); + + } +} diff --git a/src/java/voldemort/store/readonly/swapper/AdminStoreSwapper.java b/src/java/voldemort/store/readonly/swapper/AdminStoreSwapper.java index 51f41f1ff3..6954f6f5ed 100644 --- a/src/java/voldemort/store/readonly/swapper/AdminStoreSwapper.java +++ b/src/java/voldemort/store/readonly/swapper/AdminStoreSwapper.java @@ -77,7 +77,7 @@ public void invokeRollback(final String storeName, final long pushVersion) { try { logger.info("Attempting rollback for node " + node.getId() + " storeName = " + storeName); - adminClient.rollbackStore(node.getId(), storeName, pushVersion); + adminClient.readonlyOps.rollbackStore(node.getId(), storeName, pushVersion); logger.info("Rollback succeeded for node " + node.getId()); } catch(Exception e) { exception = e; @@ -103,11 +103,11 @@ public List invokeFetch(final String storeName, public String call() throws Exception { String storeDir = basePath + "/node-" + node.getId(); logger.info("Invoking fetch for node " + node.getId() + " for " + storeDir); - String response = adminClient.fetchStore(node.getId(), - storeName, - storeDir, - pushVersion, - timeoutMs); + String response = adminClient.readonlyOps.fetchStore(node.getId(), + storeName, + storeDir, + pushVersion, + timeoutMs); if(response == null) throw new VoldemortException("Fetch request on node " + node.getId() + " (" + node.getHost() + ") failed"); @@ -139,9 +139,9 @@ public String call() throws Exception { try { logger.info("Deleting fetched data from node " + successfulNodeId); - adminClient.failedFetchStore(successfulNodeId, - storeName, - results.get(successfulNodeId)); + adminClient.readonlyOps.failedFetchStore(successfulNodeId, + storeName, + results.get(successfulNodeId)); } catch(Exception e) { logger.error("Exception thrown during delete operation on node " + successfulNodeId + " : ", e); @@ -172,7 +172,7 @@ public void invokeSwap(final String storeName, final List fetchFiles) { try { String dir = fetchFiles.get(nodeId); logger.info("Attempting swap for node " + nodeId + " dir = " + dir); - previousDirs.put(nodeId, adminClient.swapStore(nodeId, storeName, dir)); + previousDirs.put(nodeId, 
adminClient.readonlyOps.swapStore(nodeId, storeName, dir)); logger.info("Swap succeeded for node " + nodeId); } catch(Exception e) { exceptions.put(nodeId, e); @@ -186,9 +186,9 @@ public void invokeSwap(final String storeName, final List fetchFiles) { for(int successfulNodeId: previousDirs.keySet()) { try { logger.info("Rolling back data on successful node " + successfulNodeId); - adminClient.rollbackStore(successfulNodeId, - storeName, - ReadOnlyUtils.getVersionId(new File(previousDirs.get(successfulNodeId)))); + adminClient.readonlyOps.rollbackStore(successfulNodeId, + storeName, + ReadOnlyUtils.getVersionId(new File(previousDirs.get(successfulNodeId)))); logger.info("Rollback succeeded for node " + successfulNodeId); } catch(Exception e) { logger.error("Exception thrown during rollback ( after swap ) operation on node " diff --git a/src/java/voldemort/store/readonly/swapper/StoreSwapper.java b/src/java/voldemort/store/readonly/swapper/StoreSwapper.java index 9ff5f159d1..790f87fd6e 100644 --- a/src/java/voldemort/store/readonly/swapper/StoreSwapper.java +++ b/src/java/voldemort/store/readonly/swapper/StoreSwapper.java @@ -1,3 +1,18 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ package voldemort.store.readonly.swapper; import java.io.File; @@ -18,6 +33,7 @@ import org.apache.http.params.HttpParams; import org.apache.log4j.Logger; +import voldemort.client.ClientConfig; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; import voldemort.cluster.Cluster; @@ -118,7 +134,7 @@ public static void main(String[] args) throws Exception { DefaultHttpClient httpClient = null; if(useAdminServices) { - adminClient = new AdminClient(cluster, new AdminClientConfig()); + adminClient = new AdminClient(cluster, new AdminClientConfig(), new ClientConfig()); swapper = new AdminStoreSwapper(cluster, executor, adminClient, timeoutMs); } else { int numConnections = cluster.getNumberOfNodes() + 3; @@ -146,7 +162,7 @@ public static void main(String[] args) throws Exception { + " seconds."); } finally { if(useAdminServices && adminClient != null) - adminClient.stop(); + adminClient.close(); executor.shutdownNow(); executor.awaitTermination(1, TimeUnit.SECONDS); VoldemortIOUtils.closeQuietly(httpClient); diff --git a/src/java/voldemort/store/rebalancing/RebootstrappingStore.java b/src/java/voldemort/store/rebalancing/RebootstrappingStore.java index 139122777d..4f73c00fba 100644 --- a/src/java/voldemort/store/rebalancing/RebootstrappingStore.java +++ b/src/java/voldemort/store/rebalancing/RebootstrappingStore.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2010 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -82,7 +82,7 @@ private void reinit() { routedStore.updateRoutingStrategy(metadata.getRoutingStrategy(getName())); } finally { - adminClient.stop(); + adminClient.close(); } } diff --git a/src/java/voldemort/store/retention/RetentionEnforcingStore.java b/src/java/voldemort/store/retention/RetentionEnforcingStore.java new file mode 100644 index 0000000000..9ab151d00e --- /dev/null +++ b/src/java/voldemort/store/retention/RetentionEnforcingStore.java @@ -0,0 +1,112 @@ +package voldemort.store.retention; + +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import voldemort.VoldemortException; +import voldemort.routing.RoutingStrategy; +import voldemort.store.DelegatingStore; +import voldemort.store.Store; +import voldemort.store.StoreDefinition; +import voldemort.store.StoreUtils; +import voldemort.store.metadata.MetadataStoreListener; +import voldemort.utils.ByteArray; +import voldemort.utils.Time; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Versioned; + +/** + * Wraps the storage layer and ensures we don't return any values that are + * stale. Optionally, deletes the expired versions. + * + */ +public class RetentionEnforcingStore extends DelegatingStore implements + MetadataStoreListener { + + private volatile StoreDefinition storeDef; + private boolean deleteExpiredEntries; + private volatile long retentionTimeMs; + private Time time; + + public RetentionEnforcingStore(Store innerStore, + StoreDefinition storeDef, + boolean deleteExpiredEntries, + Time time) { + super(innerStore); + updateStoreDefinition(storeDef); + this.deleteExpiredEntries = deleteExpiredEntries; + this.time = time; + } + + @Override + public void updateRoutingStrategy(RoutingStrategy routingStrategyMap) { + return; // no-op + } + + /** + * Updates the store definition object and the retention time based on the + * updated store definition + */ + @Override + public void updateStoreDefinition(StoreDefinition storeDef) { + this.storeDef = storeDef; + if(storeDef.hasRetentionPeriod()) + this.retentionTimeMs = storeDef.getRetentionDays() * Time.MS_PER_DAY; + } + + /** + * Performs the filtering of the expired entries based on retention time. 
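Note: the expiry test applied by the filtering code below reduces to comparing the write timestamp recorded in a value's VectorClock against now minus the configured retention window. A standalone sketch of that predicate, using the same Time.MS_PER_DAY arithmetic as the store; the class and method names here are illustrative, not part of the patch.

import voldemort.utils.Time;
import voldemort.versioning.VectorClock;
import voldemort.versioning.Versioned;

public class RetentionCheckSketch {

    // True when the timestamp in the value's VectorClock is older than
    // (now - retentionDays * MS_PER_DAY), i.e. the value has expired.
    public static boolean isExpired(Versioned<byte[]> val, int retentionDays, long nowMs) {
        long retentionTimeMs = retentionDays * Time.MS_PER_DAY;
        VectorClock clock = (VectorClock) val.getVersion();
        return clock.getTimestamp() < (nowMs - retentionTimeMs);
    }
}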
+ * Optionally, deletes them also + * + * @param key the key whose value is to be deleted if needed + * @param vals set of values to be filtered out + * @return filtered list of values which are currently valid + */ + private List> filterExpiredEntries(ByteArray key, List> vals) { + Iterator> valsIterator = vals.iterator(); + while(valsIterator.hasNext()) { + Versioned val = valsIterator.next(); + VectorClock clock = (VectorClock) val.getVersion(); + // omit if expired + if(clock.getTimestamp() < (time.getMilliseconds() - this.retentionTimeMs)) { + valsIterator.remove(); + // delete stale value if configured + if(deleteExpiredEntries) { + getInnerStore().delete(key, clock); + } + } + } + return vals; + } + + @Override + public Map>> getAll(Iterable keys, + Map transforms) + throws VoldemortException { + StoreUtils.assertValidKeys(keys); + Map>> results = getInnerStore().getAll(keys, transforms); + if(!storeDef.hasRetentionPeriod()) + return results; + + for(ByteArray key: results.keySet()) { + List> filteredVals = filterExpiredEntries(key, results.get(key)); + // remove/update the entry for the key, depending on how much is + // filtered + if(!filteredVals.isEmpty()) + results.put(key, filteredVals); + else + results.remove(key); + } + return results; + } + + @Override + public List> get(ByteArray key, byte[] transforms) throws VoldemortException { + StoreUtils.assertValidKey(key); + List> vals = getInnerStore().get(key, transforms); + if(!storeDef.hasRetentionPeriod()) + return vals; + return filterExpiredEntries(key, vals); + } +} diff --git a/src/java/voldemort/store/routed/NodeValue.java b/src/java/voldemort/store/routed/NodeValue.java index 01dda95a6a..4bcd61c75b 100644 --- a/src/java/voldemort/store/routed/NodeValue.java +++ b/src/java/voldemort/store/routed/NodeValue.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -35,6 +35,8 @@ */ public final class NodeValue implements Serializable, Cloneable { + // TODO: (refactor) Rename NodeValue to NodeKeyValue + private static final long serialVersionUID = 1; private final int nodeId; diff --git a/src/java/voldemort/store/routed/PipelineRoutedStore.java b/src/java/voldemort/store/routed/PipelineRoutedStore.java index 968ff5a89e..11b794aab5 100644 --- a/src/java/voldemort/store/routed/PipelineRoutedStore.java +++ b/src/java/voldemort/store/routed/PipelineRoutedStore.java @@ -33,6 +33,7 @@ import voldemort.store.StoreDefinition; import voldemort.store.StoreRequest; import voldemort.store.StoreUtils; +import voldemort.store.CompositeVoldemortRequest; import voldemort.store.nonblockingstore.NonblockingStore; import voldemort.store.routed.Pipeline.Event; import voldemort.store.routed.Pipeline.Operation; @@ -212,7 +213,14 @@ private AbstractConfigureNodes>, BasicPipeline } + @Override public List> get(final ByteArray key, final byte[] transforms) { + return get(key, transforms, timeoutConfig.getOperationTimeout(VoldemortOpCode.GET_OP_CODE)); + } + + public List> get(final ByteArray key, + final byte[] transforms, + final long getOpTimeout) { StoreUtils.assertValidKey(key); long startTimeMs = -1; @@ -230,13 +238,12 @@ public List> get(final ByteArray key, final byte[] transforms) pipelineData.setZonesRequired(null); pipelineData.setStats(stats); - final Pipeline pipeline = new Pipeline(Operation.GET, - timeoutConfig.getOperationTimeout(VoldemortOpCode.GET_OP_CODE), - TimeUnit.MILLISECONDS); + final Pipeline pipeline = new Pipeline(Operation.GET, getOpTimeout, TimeUnit.MILLISECONDS); boolean allowReadRepair = repairReads && transforms == null; StoreRequest>> blockingStoreRequest = new StoreRequest>>() { + @Override public List> request(Store store) { return store.get(key, transforms); } @@ -259,7 +266,7 @@ public List> request(Store store) { failureDetector, storeDef.getPreferredReads(), storeDef.getRequiredReads(), - timeoutConfig.getOperationTimeout(VoldemortOpCode.GET_OP_CODE), + getOpTimeout, nonblockingStores, Event.INSUFFICIENT_SUCCESSES, Event.INSUFFICIENT_ZONES)); @@ -280,7 +287,7 @@ public List> request(Store store) { new ReadRepair>>>(pipelineData, Event.COMPLETED, storeDef.getPreferredReads(), - timeoutConfig.getOperationTimeout(VoldemortOpCode.GET_OP_CODE), + getOpTimeout, nonblockingStores, readRepairer)); @@ -344,9 +351,19 @@ private String formatNodeValuesFromGet(List>> getAll(Iterable keys, Map transforms) throws VoldemortException { + return getAll(keys, + transforms, + timeoutConfig.getOperationTimeout(VoldemortOpCode.GET_ALL_OP_CODE)); + } + + public Map>> getAll(Iterable keys, + Map transforms, + long getAllOpTimeoutInMs) + throws VoldemortException { StoreUtils.assertValidKeys(keys); long startTimeMs = -1; @@ -367,7 +384,7 @@ public Map>> getAll(Iterable keys, pipelineData.setStats(stats); Pipeline pipeline = new Pipeline(Operation.GET_ALL, - timeoutConfig.getOperationTimeout(VoldemortOpCode.GET_ALL_OP_CODE), + getAllOpTimeoutInMs, TimeUnit.MILLISECONDS); pipeline.addEventAction(Event.STARTED, new GetAllConfigureNodes(pipelineData, @@ -383,7 +400,7 @@ public Map>> getAll(Iterable keys, new PerformParallelGetAllRequests(pipelineData, Event.INSUFFICIENT_SUCCESSES, failureDetector, - timeoutConfig.getOperationTimeout(VoldemortOpCode.GET_ALL_OP_CODE), + getAllOpTimeoutInMs, nonblockingStores)); pipeline.addEventAction(Event.INSUFFICIENT_SUCCESSES, new PerformSerialGetAllRequests(pipelineData, @@ -401,7 +418,7 @@ public 
Map>> getAll(Iterable keys, new GetAllReadRepair(pipelineData, Event.COMPLETED, storeDef.getPreferredReads(), - timeoutConfig.getOperationTimeout(VoldemortOpCode.GET_ALL_OP_CODE), + getAllOpTimeoutInMs, nonblockingStores, readRepairer)); @@ -451,6 +468,7 @@ private String formatNodeValuesFromGetAll(List, Map return builder.toString(); } + @Override public List getVersions(final ByteArray key) { StoreUtils.assertValidKey(key); @@ -474,6 +492,7 @@ public List getVersions(final ByteArray key) { StoreRequest> blockingStoreRequest = new StoreRequest>() { + @Override public List request(Store store) { return store.getVersions(key); } @@ -566,7 +585,15 @@ private String formatNodeValuesFromGetVersions(List>(pipelineData, @@ -616,7 +641,7 @@ public boolean delete(final ByteArray key, final Version version) throws Voldemo failureDetector, storeDef.getPreferredWrites(), storeDef.getRequiredWrites(), - timeoutConfig.getOperationTimeout(VoldemortOpCode.DELETE_OP_CODE), + deleteOpTimeout, nonblockingStores, hintedHandoff, version)); @@ -709,8 +734,19 @@ private AbstractConfigureNodes makeNodeConfigu } + @Override public void put(ByteArray key, Versioned versioned, byte[] transforms) throws VoldemortException { + put(key, + versioned, + transforms, + timeoutConfig.getOperationTimeout(VoldemortOpCode.PUT_OP_CODE)); + } + + public void put(ByteArray key, + Versioned versioned, + byte[] transforms, + long putOpTimeoutInMs) throws VoldemortException { long startTimeMs = -1; long startTimeNs = -1; @@ -727,12 +763,10 @@ public void put(ByteArray key, Versioned versioned, byte[] transforms) else pipelineData.setZonesRequired(null); pipelineData.setStartTimeNs(System.nanoTime()); - pipelineData.setStoreName(name); + pipelineData.setStoreName(getName()); pipelineData.setStats(stats); - Pipeline pipeline = new Pipeline(Operation.PUT, - timeoutConfig.getOperationTimeout(VoldemortOpCode.PUT_OP_CODE), - TimeUnit.MILLISECONDS); + Pipeline pipeline = new Pipeline(Operation.PUT, putOpTimeoutInMs, TimeUnit.MILLISECONDS); pipeline.setEnableHintedHandoff(isHintedHandoffEnabled()); HintedHandoff hintedHandoff = null; @@ -748,7 +782,7 @@ public void put(ByteArray key, Versioned versioned, byte[] transforms) nonblockingSlopStores, handoffStrategy, pipelineData.getFailedNodes(), - timeoutConfig.getOperationTimeout(VoldemortOpCode.PUT_OP_CODE)); + putOpTimeoutInMs); pipeline.addEventAction(Event.STARTED, configureNodes); @@ -772,7 +806,7 @@ public void put(ByteArray key, Versioned versioned, byte[] transforms) failureDetector, storeDef.getPreferredWrites(), storeDef.getRequiredWrites(), - timeoutConfig.getOperationTimeout(VoldemortOpCode.PUT_OP_CODE), + putOpTimeoutInMs, nonblockingStores, hintedHandoff)); if(isHintedHandoffEnabled()) { @@ -847,4 +881,27 @@ public void close() { super.close(); } + + @Override + public List> get(CompositeVoldemortRequest request) + throws VoldemortException { + return get(request.getKey(), null, request.getRoutingTimeoutInMs()); + } + + @Override + public Map>> getAll(CompositeVoldemortRequest request) + throws VoldemortException { + return getAll(request.getIterableKeys(), null, request.getRoutingTimeoutInMs()); + } + + @Override + public void put(CompositeVoldemortRequest request) throws VoldemortException { + put(request.getKey(), request.getValue(), null, request.getRoutingTimeoutInMs()); + } + + @Override + public boolean delete(CompositeVoldemortRequest request) + throws VoldemortException { + return delete(request.getKey(), request.getVersion(), request.getRoutingTimeoutInMs()); + } } 
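Note: the overloads added above let a caller supply an explicit routing timeout instead of always falling back to the store-wide TimeoutConfig. A sketch of what a latency-sensitive caller could look like; the 500 ms budget and the helper names are assumptions for illustration, not values taken from this patch.

import java.util.List;

import voldemort.store.routed.PipelineRoutedStore;
import voldemort.utils.ByteArray;
import voldemort.versioning.Versioned;

public class PerRequestTimeoutSketch {

    // Caps the routing timeout for this one read instead of relying on the
    // store-wide TimeoutConfig value for GET.
    public static List<Versioned<byte[]>> latencySensitiveGet(PipelineRoutedStore store,
                                                              ByteArray key) {
        long getOpTimeoutMs = 500; // illustrative budget
        return store.get(key, null, getOpTimeoutMs);
    }
}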
diff --git a/src/java/voldemort/store/routed/RoutedStore.java b/src/java/voldemort/store/routed/RoutedStore.java index a634b16447..3c4156c510 100644 --- a/src/java/voldemort/store/routed/RoutedStore.java +++ b/src/java/voldemort/store/routed/RoutedStore.java @@ -27,6 +27,7 @@ import voldemort.cluster.failuredetector.FailureDetector; import voldemort.routing.RoutingStrategy; import voldemort.routing.RoutingStrategyFactory; +import voldemort.store.AbstractStore; import voldemort.store.NoSuchCapabilityException; import voldemort.store.Store; import voldemort.store.StoreCapabilityType; @@ -40,9 +41,8 @@ * * */ -public abstract class RoutedStore implements Store { +public abstract class RoutedStore extends AbstractStore { - protected final String name; protected final Map> innerStores; protected final boolean repairReads; protected final ReadRepairer readRepairer; @@ -61,6 +61,7 @@ protected RoutedStore(String name, TimeoutConfig timeoutConfig, FailureDetector failureDetector, Time time) { + super(name); if(storeDef.getRequiredReads() < 1) throw new IllegalArgumentException("Cannot have a storeDef.getRequiredReads() number less than 1."); if(storeDef.getRequiredWrites() < 1) @@ -74,7 +75,6 @@ protected RoutedStore(String name, if(storeDef.getPreferredWrites() > innerStores.size()) throw new IllegalArgumentException("storeDef.getPreferredWrites() is larger than the total number of nodes!"); - this.name = name; this.innerStores = new ConcurrentHashMap>(innerStores); this.repairReads = repairReads; this.readRepairer = new ReadRepairer(); @@ -90,10 +90,7 @@ public void updateRoutingStrategy(RoutingStrategy routingStrategy) { this.routingStrategy = routingStrategy; } - public String getName() { - return this.name; - } - + @Override public void close() { VoldemortException exception = null; @@ -113,6 +110,7 @@ public Map> getInnerStores() { return this.innerStores; } + @Override public Object getCapability(StoreCapabilityType capability) { switch(capability) { case ROUTING_STRATEGY: diff --git a/src/java/voldemort/store/routed/ThreadPoolRoutedStore.java b/src/java/voldemort/store/routed/ThreadPoolRoutedStore.java index 947b44f42b..e1c3c8ee90 100644 --- a/src/java/voldemort/store/routed/ThreadPoolRoutedStore.java +++ b/src/java/voldemort/store/routed/ThreadPoolRoutedStore.java @@ -68,6 +68,7 @@ public class ThreadPoolRoutedStore extends RoutedStore { private final static StoreOp> VERSIONED_OP = new StoreOp>() { + @Override public List> execute(Store store, ByteArray key, byte[] transforms) { @@ -77,6 +78,7 @@ public List> execute(Store store, private final static StoreOp VERSION_OP = new StoreOp() { + @Override public List execute(Store store, ByteArray key, byte[] transforms) { @@ -150,6 +152,7 @@ public ThreadPoolRoutedStore(String name, this.executor = threadPool; } + @Override public boolean delete(final ByteArray key, final Version version) throws VoldemortException { StoreUtils.assertValidKey(key); final List nodes = availableNodes(routingStrategy.routeRequest(key.get())); @@ -178,6 +181,7 @@ public boolean delete(final ByteArray key, final Version version) throws Voldemo for(final Node node: nodes) { this.executor.execute(new Runnable() { + @Override public void run() { long startNs = System.nanoTime(); try { @@ -238,6 +242,7 @@ public void run() { return deletedSomething.get(); } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { @@ -414,9 +419,11 @@ public Map>> getAll(Iterable keys, return result; } + @Override public List> get(ByteArray key, final 
byte[] transforms) { Function>>, Void> readRepairFunction = new Function>>, Void>() { + @Override public Void apply(List>> nodeResults) { List> nodeValues = Lists.newArrayListWithExpectedSize(nodeResults.size()); for(GetResult> getResult: nodeResults) @@ -581,6 +588,7 @@ private void repairReads(List> nodeValues, boolean this.executor.execute(new Runnable() { + @Override public void run() { for(NodeValue v: toReadRepair) { try { @@ -631,6 +639,7 @@ private String formatNodeValues(List> results) { return builder.toString(); } + @Override public void put(final ByteArray key, final Versioned versioned, final byte[] transforms) throws VoldemortException { long startNs = System.nanoTime(); @@ -697,6 +706,7 @@ public void put(final ByteArray key, final Versioned versioned, final by final Node node = nodes.get(currentNode); this.executor.execute(new Runnable() { + @Override public void run() { long startNsLocal = System.nanoTime(); try { @@ -806,6 +816,7 @@ private List availableNodes(List list) { return available; } + @Override public List getVersions(ByteArray key) { return get(key, null, VERSION_OP, null); } @@ -832,6 +843,7 @@ public GetCallable(Node node, ByteArray key, byte[] transforms, StoreOp fetch this.fetcher = fetcher; } + @Override public GetResult call() throws Exception { List fetched = Collections.emptyList(); Throwable exception = null; @@ -885,6 +897,7 @@ private GetAllCallable(Node node, this.transforms = transforms; } + @Override public GetAllResult call() { Map>> retrieved = Collections.emptyMap(); Throwable exception = null; diff --git a/src/java/voldemort/store/routed/action/AbstractAction.java b/src/java/voldemort/store/routed/action/AbstractAction.java index bdd457dc27..906db19e6b 100644 --- a/src/java/voldemort/store/routed/action/AbstractAction.java +++ b/src/java/voldemort/store/routed/action/AbstractAction.java @@ -1,5 +1,5 @@ /* - * Copyright 2010 LinkedIn, Inc + * Copyright 2010-2012 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -29,6 +29,7 @@ import voldemort.store.routed.PipelineData; import voldemort.store.routed.Response; import voldemort.utils.Utils; +import voldemort.versioning.ObsoleteVersionException; public abstract class AbstractAction> implements Action { @@ -58,13 +59,18 @@ protected boolean handleResponseError(Exception e, long requestTime, Pipeline pipeline, FailureDetector failureDetector) { - if(logger.isEnabledFor(Level.WARN)) { - if(e instanceof StoreTimeoutException) - logger.warn("Error in " + pipeline.getOperation().getSimpleName() + " on node " - + node.getId() + "(" + node.getHost() + ") : " + e.getMessage()); - else + if(e instanceof StoreTimeoutException || e instanceof ObsoleteVersionException + || e instanceof UnreachableStoreException) { + // Quietly mask all errors that are "expected" regularly. 
+ if(logger.isEnabledFor(Level.DEBUG)) { + logger.debug("Error in " + pipeline.getOperation().getSimpleName() + " on node " + + node.getId() + " (" + node.getHost() + ") : " + e.getMessage()); + } + } else { + if(logger.isEnabledFor(Level.WARN)) { logger.warn("Error in " + pipeline.getOperation().getSimpleName() + " on node " - + node.getId() + "(" + node.getHost() + ")", e); + + node.getId() + " (" + node.getHost() + ")", e); + } } if(e instanceof UnreachableStoreException) { diff --git a/src/java/voldemort/store/routed/action/PerformParallelDeleteRequests.java b/src/java/voldemort/store/routed/action/PerformParallelDeleteRequests.java index 634b470805..2eb799e630 100644 --- a/src/java/voldemort/store/routed/action/PerformParallelDeleteRequests.java +++ b/src/java/voldemort/store/routed/action/PerformParallelDeleteRequests.java @@ -1,3 +1,19 @@ +/* + * Copyright 2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + package voldemort.store.routed.action; import java.util.ArrayList; @@ -286,5 +302,4 @@ public void requestComplete(Object result, long requestTime) { } } } - } diff --git a/src/java/voldemort/store/routed/action/PerformParallelRequests.java b/src/java/voldemort/store/routed/action/PerformParallelRequests.java index 18385a5f7f..f0a6c4f1e1 100644 --- a/src/java/voldemort/store/routed/action/PerformParallelRequests.java +++ b/src/java/voldemort/store/routed/action/PerformParallelRequests.java @@ -167,6 +167,7 @@ else if(pipeline.getOperation() == Operation.GET_VERSIONS) return; } else { pipelineData.incrementSuccesses(); + Response rCast = Utils.uncheckedCast(response); pipelineData.getResponses().add(rCast); failureDetector.recordSuccess(response.getNode(), response.getRequestTime()); @@ -232,5 +233,4 @@ else if(pipeline.getOperation() == Operation.GET_VERSIONS) } } } - } diff --git a/src/java/voldemort/store/serialized/SerializingStorageEngine.java b/src/java/voldemort/store/serialized/SerializingStorageEngine.java index 949e485e54..8ab9843b4b 100644 --- a/src/java/voldemort/store/serialized/SerializingStorageEngine.java +++ b/src/java/voldemort/store/serialized/SerializingStorageEngine.java @@ -53,14 +53,27 @@ public static SerializingStorageEngine wrap(StorageEngi return new SerializingStorageEngine(s, k, v, t); } + @Override public ClosableIterator>> entries() { return new EntriesIterator(storageEngine.entries()); } + @Override public ClosableIterator keys() { return new KeysIterator(storageEngine.keys()); } + @Override + public ClosableIterator>> entries(int partition) { + return new EntriesIterator(storageEngine.entries(partition)); + } + + @Override + public ClosableIterator keys(int partition) { + return new KeysIterator(storageEngine.keys(partition)); + } + + @Override public void truncate() { storageEngine.truncate(); } @@ -73,10 +86,12 @@ public KeysIterator(ClosableIterator iterator) { this.iterator = iterator; } + @Override public boolean hasNext() { return iterator.hasNext(); } + @Override public K next() { ByteArray key = 
iterator.next(); if(key == null) @@ -84,10 +99,12 @@ public K next() { return getKeySerializer().toObject(key.get()); } + @Override public void remove() { iterator.remove(); } + @Override public void close() { iterator.close(); } @@ -101,10 +118,12 @@ public EntriesIterator(ClosableIterator>> iter this.iterator = iterator; } + @Override public boolean hasNext() { return iterator.hasNext(); } + @Override public Pair> next() { Pair> keyAndVal = iterator.next(); if(keyAndVal == null) { @@ -118,16 +137,34 @@ public Pair> next() { } + @Override public void remove() { iterator.remove(); } + @Override public void close() { iterator.close(); } } + @Override public boolean isPartitionAware() { return storageEngine.isPartitionAware(); } + + @Override + public boolean isPartitionScanSupported() { + return storageEngine.isPartitionScanSupported(); + } + + @Override + public boolean beginBatchModifications() { + return false; + } + + @Override + public boolean endBatchModifications() { + return false; + } } diff --git a/src/java/voldemort/store/serialized/SerializingStore.java b/src/java/voldemort/store/serialized/SerializingStore.java index ffc5065d94..9ed67f1df4 100644 --- a/src/java/voldemort/store/serialized/SerializingStore.java +++ b/src/java/voldemort/store/serialized/SerializingStore.java @@ -22,6 +22,7 @@ import voldemort.VoldemortException; import voldemort.serialization.Serializer; +import voldemort.store.AbstractStore; import voldemort.store.Store; import voldemort.store.StoreCapabilityType; import voldemort.store.StoreUtils; @@ -42,7 +43,7 @@ * @param The type of the value being stored * @param The type of transform */ -public class SerializingStore implements Store { +public class SerializingStore extends AbstractStore { private final Store store; private final Serializer keySerializer; @@ -53,6 +54,7 @@ public SerializingStore(Store store, Serializer keySerializer, Serializer valueSerializer, Serializer transformsSerializer) { + super(store.getName()); this.store = Utils.notNull(store); this.keySerializer = Utils.notNull(keySerializer); this.valueSerializer = Utils.notNull(valueSerializer); @@ -66,6 +68,7 @@ public static SerializingStore wrap(Store(s, k, v, t); } + @Override public boolean delete(K key, Version version) throws VoldemortException { return store.delete(keyToBytes(key), version); } @@ -99,6 +102,7 @@ private Map transformsToBytes(Map transforms) { return result; } + @Override public List> get(K key, T transforms) throws VoldemortException { List> found = store.get(keyToBytes(key), (transformsSerializer != null && transforms != null) ? 
transformsSerializer.toBytes(transforms) @@ -110,6 +114,7 @@ public List> get(K key, T transforms) throws VoldemortException { return results; } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { StoreUtils.assertValidKeys(keys); @@ -129,10 +134,7 @@ public Map>> getAll(Iterable keys, Map transforms) return result; } - public String getName() { - return store.getName(); - } - + @Override public void put(K key, Versioned value, T transforms) throws VoldemortException { store.put(keyToBytes(key), new Versioned(valueSerializer.toBytes(value.getValue()), @@ -140,10 +142,12 @@ public void put(K key, Versioned value, T transforms) throws VoldemortExcepti transformToBytes(transforms)); } + @Override public List getVersions(K key) { return store.getVersions(keyToBytes(key)); } + @Override public void close() { store.close(); } @@ -156,6 +160,7 @@ protected Serializer getKeySerializer() { return keySerializer; } + @Override public Object getCapability(StoreCapabilityType capability) { switch(capability) { case KEY_SERIALIZER: @@ -166,5 +171,4 @@ public Object getCapability(StoreCapabilityType capability) { return store.getCapability(capability); } } - } diff --git a/src/java/voldemort/store/slop/HintedHandoff.java b/src/java/voldemort/store/slop/HintedHandoff.java index 5e506a85e2..00182e4250 100644 --- a/src/java/voldemort/store/slop/HintedHandoff.java +++ b/src/java/voldemort/store/slop/HintedHandoff.java @@ -1,5 +1,5 @@ /* - * Copyright 2010 LinkedIn, Inc + * Copyright 2010-2012 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -27,10 +27,10 @@ import voldemort.serialization.SlopSerializer; import voldemort.store.Store; import voldemort.store.UnreachableStoreException; -import voldemort.store.slop.strategy.HintedHandoffStrategy; import voldemort.store.nonblockingstore.NonblockingStore; import voldemort.store.nonblockingstore.NonblockingStoreCallback; import voldemort.store.routed.Response; +import voldemort.store.slop.strategy.HintedHandoffStrategy; import voldemort.utils.ByteArray; import voldemort.utils.Time; import voldemort.utils.Utils; @@ -67,7 +67,8 @@ public class HintedHandoff { * Create a Hinted Handoff object * * @param failureDetector The failure detector - * @param nonblockingSlopStores A map of node ids to nonb-locking slop stores + * @param nonblockingSlopStores A map of node ids to nonb-locking slop + * stores * @param slopStores A map of node ids to blocking slop stores * @param handoffStrategy The {@link HintedHandoffStrategy} implementation * @param failedNodes A list of nodes in the original preflist for the @@ -89,15 +90,18 @@ public HintedHandoff(FailureDetector failureDetector, } /** - * Like {@link #sendHintSerial(voldemort.cluster.Node, voldemort.versioning.Version, Slop)}, - * but doesn't block the pipeline. Intended for handling prolonged failures without - * incurring a performance cost. - * - * @see #sendHintSerial(voldemort.cluster.Node, voldemort.versioning.Version, Slop) + * Like + * {@link #sendHintSerial(voldemort.cluster.Node, voldemort.versioning.Version, Slop)} + * , but doesn't block the pipeline. Intended for handling prolonged + * failures without incurring a performance cost. 
+ * + * @see #sendHintSerial(voldemort.cluster.Node, + * voldemort.versioning.Version, Slop) */ public void sendHintParallel(final Node failedNode, final Version version, final Slop slop) { final ByteArray slopKey = slop.makeKey(); - Versioned slopVersioned = new Versioned(slopSerializer.toBytes(slop), version); + Versioned slopVersioned = new Versioned(slopSerializer.toBytes(slop), + version); for(final Node node: handoffStrategy.routeHint(failedNode)) { int nodeId = node.getId(); @@ -115,6 +119,7 @@ public void sendHintParallel(final Node failedNode, final Version version, final + " to node " + node); NonblockingStoreCallback callback = new NonblockingStoreCallback() { + public void requestComplete(Object result, long requestTime) { Response response = new Response(node, slopKey, @@ -123,6 +128,11 @@ public void requestComplete(Object result, long requestTime) { if(response.getValue() instanceof Exception) { if(response.getValue() instanceof ObsoleteVersionException) { // Ignore + + // TODO: Treating ObsoleteVersionException as + // "success", but there is no logger.debug to + // note that the slop was written, nor is there + // a failureDetector.recordSuccess invocation. } else { // Use the blocking approach if(!failedNodes.contains(node)) @@ -130,15 +140,16 @@ public void requestComplete(Object result, long requestTime) { if(response.getValue() instanceof UnreachableStoreException) { UnreachableStoreException use = (UnreachableStoreException) response.getValue(); - if(logger.isDebugEnabled()) - logger.debug("Write of key " + slop.getKey() + " for " - + failedNode + " to node " + node - + " failed due to unreachable: " - + use.getMessage()); + if(logger.isDebugEnabled()) { + logger.debug("Write of key " + slop.getKey() + " for " + + failedNode + " to node " + node + + " failed due to unreachable: " + + use.getMessage()); + } failureDetector.recordException(node, (System.nanoTime() - startNs) - / Time.NS_PER_MS, + / Time.NS_PER_MS, use); } sendHintSerial(failedNode, version, slop); @@ -157,16 +168,12 @@ public void requestComplete(Object result, long requestTime) { } }; - nonblockingStore.submitPutRequest(slopKey, - slopVersioned, - null, - callback, - timeoutMs); + nonblockingStore.submitPutRequest(slopKey, slopVersioned, null, callback, timeoutMs); break; } } } - + /** * Send a hint of a request originally meant for the failed node to another * node in the ring, as selected by the {@link HintedHandoffStrategy} @@ -215,12 +222,17 @@ public boolean sendHintSerial(Node failedNode, Version version, Slop slop) { if(logger.isDebugEnabled()) logger.debug("Slop write of key " + slop.getKey() + " (keyRef: " - + System.identityHashCode(slop.getKey()) + " for " + failedNode + + System.identityHashCode(slop.getKey()) + ") for " + failedNode + " to node " + node + " succeeded in " + (System.nanoTime() - startNs) + " ns"); } } + if(!persisted) { + logger.error("Slop write of key " + slop.getKey() + " (keyRef: " + + System.identityHashCode(slop.getKey()) + ") for " + failedNode + + " was not written."); + } return persisted; } } diff --git a/src/java/voldemort/store/slop/SlopStorageEngine.java b/src/java/voldemort/store/slop/SlopStorageEngine.java index 2f17aaa2e8..7f95b0b03a 100644 --- a/src/java/voldemort/store/slop/SlopStorageEngine.java +++ b/src/java/voldemort/store/slop/SlopStorageEngine.java @@ -25,6 +25,7 @@ import voldemort.serialization.ByteArraySerializer; import voldemort.serialization.IdentitySerializer; import voldemort.serialization.SlopSerializer; +import 
voldemort.store.AbstractStorageEngine; import voldemort.store.StorageEngine; import voldemort.store.StoreCapabilityType; import voldemort.store.serialized.SerializingStorageEngine; @@ -41,7 +42,7 @@ * last run * */ -public class SlopStorageEngine implements StorageEngine { +public class SlopStorageEngine extends AbstractStorageEngine { public static final String SLOP_STORE_NAME = "slop"; @@ -50,6 +51,7 @@ public class SlopStorageEngine implements StorageEngine slopEngine, Cluster cluster) { + super(slopEngine.getName()); this.slopEngine = slopEngine; this.slopSerializer = new SlopSerializer(); this.slopStats = new SlopStats(cluster); @@ -81,54 +83,86 @@ public StorageEngine asSlopStore() { new IdentitySerializer()); } + @Override public ClosableIterator>> entries() { return slopEngine.entries(); } + @Override public ClosableIterator keys() { return slopEngine.keys(); } + @Override + public ClosableIterator>> entries(int partition) { + return slopEngine.entries(partition); + } + + @Override + public ClosableIterator keys(int partition) { + return slopEngine.keys(partition); + } + + @Override public void truncate() { slopEngine.truncate(); } + @Override public List> get(ByteArray key, byte[] transforms) throws VoldemortException { return slopEngine.get(key, transforms); } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { return slopEngine.getAll(keys, transforms); } + @Override public void put(ByteArray key, Versioned value, byte[] transforms) throws VoldemortException { slopEngine.put(key, value, transforms); } + @Override public boolean delete(ByteArray key, Version version) throws VoldemortException { return slopEngine.delete(key, version); } - public String getName() { - return slopEngine.getName(); - } - + @Override public void close() throws VoldemortException { slopEngine.close(); } + @Override public Object getCapability(StoreCapabilityType capability) { return slopEngine.getCapability(capability); } + @Override public List getVersions(ByteArray key) { return slopEngine.getVersions(key); } + @Override public boolean isPartitionAware() { return slopEngine.isPartitionAware(); } + + @Override + public boolean isPartitionScanSupported() { + return slopEngine.isPartitionScanSupported(); + } + + @Override + public boolean beginBatchModifications() { + return slopEngine.beginBatchModifications(); + } + + @Override + public boolean endBatchModifications() { + return slopEngine.endBatchModifications(); + } } diff --git a/src/java/voldemort/store/socket/SocketStore.java b/src/java/voldemort/store/socket/SocketStore.java index 06433b7c24..a65673aa05 100644 --- a/src/java/voldemort/store/socket/SocketStore.java +++ b/src/java/voldemort/store/socket/SocketStore.java @@ -26,8 +26,8 @@ import voldemort.client.protocol.RequestFormat; import voldemort.client.protocol.RequestFormatFactory; import voldemort.server.RequestRoutingType; +import voldemort.store.AbstractStore; import voldemort.store.NoSuchCapabilityException; -import voldemort.store.Store; import voldemort.store.StoreCapabilityType; import voldemort.store.StoreUtils; import voldemort.store.UnreachableStoreException; @@ -59,11 +59,11 @@ * {@link ClientRequestExecutorPool pool} and adds an appropriate * {@link ClientRequest request} to be processed by the NIO thread. 
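Note: RoutedStore, SerializingStore, SlopStorageEngine, and SocketStore all switch in this patch from implementing Store directly to extending AbstractStore, dropping their private name fields and getName() methods. A minimal sketch of the shape such a base class is assumed to have; the real AbstractStore in the tree may carry additional default behaviour.

import voldemort.VoldemortException;

// Assumed shape of the base class the converted stores extend: it owns the
// store name and provides default behaviour so subclasses only override what
// they need. Type parameters mirror Store's key/value/transform types.
public abstract class AbstractStoreSketch<K, V, T> {

    private final String storeName;

    protected AbstractStoreSketch(String storeName) {
        if(storeName == null)
            throw new IllegalArgumentException("storeName cannot be null");
        this.storeName = storeName;
    }

    public String getName() {
        return storeName;
    }

    public void close() throws VoldemortException {
        // no-op by default; subclasses holding resources override this
    }
}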
*/ -public class SocketStore implements Store, NonblockingStore { +public class SocketStore extends AbstractStore implements + NonblockingStore { private final RequestFormatFactory requestFormatFactory = new RequestFormatFactory(); - private final String storeName; private final long timeoutMs; private final ClientRequestExecutorPool pool; private final SocketDestination destination; @@ -76,7 +76,7 @@ public SocketStore(String storeName, SocketDestination dest, ClientRequestExecutorPool pool, RequestRoutingType requestRoutingType) { - this.storeName = Utils.notNull(storeName); + super(storeName); this.timeoutMs = timeoutMs; this.pool = Utils.notNull(pool); this.destination = dest; @@ -84,12 +84,13 @@ public SocketStore(String storeName, this.requestRoutingType = requestRoutingType; } + @Override public void submitDeleteRequest(ByteArray key, Version version, NonblockingStoreCallback callback, long timeoutMs) { StoreUtils.assertValidKey(key); - DeleteClientRequest clientRequest = new DeleteClientRequest(storeName, + DeleteClientRequest clientRequest = new DeleteClientRequest(getName(), requestFormat, requestRoutingType, key, @@ -100,12 +101,13 @@ public void submitDeleteRequest(ByteArray key, requestAsync(clientRequest, callback, timeoutMs, "delete"); } + @Override public void submitGetRequest(ByteArray key, byte[] transforms, NonblockingStoreCallback callback, long timeoutMs) { StoreUtils.assertValidKey(key); - GetClientRequest clientRequest = new GetClientRequest(storeName, + GetClientRequest clientRequest = new GetClientRequest(getName(), requestFormat, requestRoutingType, key, @@ -116,12 +118,13 @@ public void submitGetRequest(ByteArray key, requestAsync(clientRequest, callback, timeoutMs, "get"); } + @Override public void submitGetAllRequest(Iterable keys, Map transforms, NonblockingStoreCallback callback, long timeoutMs) { StoreUtils.assertValidKeys(keys); - GetAllClientRequest clientRequest = new GetAllClientRequest(storeName, + GetAllClientRequest clientRequest = new GetAllClientRequest(getName(), requestFormat, requestRoutingType, keys, @@ -132,11 +135,12 @@ public void submitGetAllRequest(Iterable keys, requestAsync(clientRequest, callback, timeoutMs, "get all"); } + @Override public void submitGetVersionsRequest(ByteArray key, NonblockingStoreCallback callback, long timeoutMs) { StoreUtils.assertValidKey(key); - GetVersionsClientRequest clientRequest = new GetVersionsClientRequest(storeName, + GetVersionsClientRequest clientRequest = new GetVersionsClientRequest(getName(), requestFormat, requestRoutingType, key); @@ -146,13 +150,14 @@ public void submitGetVersionsRequest(ByteArray key, requestAsync(clientRequest, callback, timeoutMs, "get versions"); } + @Override public void submitPutRequest(ByteArray key, Versioned value, byte[] transforms, NonblockingStoreCallback callback, long timeoutMs) { StoreUtils.assertValidKey(key); - PutClientRequest clientRequest = new PutClientRequest(storeName, + PutClientRequest clientRequest = new PutClientRequest(getName(), requestFormat, requestRoutingType, key, @@ -164,9 +169,10 @@ public void submitPutRequest(ByteArray key, requestAsync(clientRequest, callback, timeoutMs, "put"); } + @Override public boolean delete(ByteArray key, Version version) throws VoldemortException { StoreUtils.assertValidKey(key); - DeleteClientRequest clientRequest = new DeleteClientRequest(storeName, + DeleteClientRequest clientRequest = new DeleteClientRequest(getName(), requestFormat, requestRoutingType, key, @@ -177,9 +183,10 @@ public boolean delete(ByteArray 
key, Version version) throws VoldemortException return request(clientRequest, "delete"); } + @Override public List> get(ByteArray key, byte[] transforms) throws VoldemortException { StoreUtils.assertValidKey(key); - GetClientRequest clientRequest = new GetClientRequest(storeName, + GetClientRequest clientRequest = new GetClientRequest(getName(), requestFormat, requestRoutingType, key, @@ -190,11 +197,12 @@ public List> get(ByteArray key, byte[] transforms) throws Vold return request(clientRequest, "get"); } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { StoreUtils.assertValidKeys(keys); - GetAllClientRequest clientRequest = new GetAllClientRequest(storeName, + GetAllClientRequest clientRequest = new GetAllClientRequest(getName(), requestFormat, requestRoutingType, keys, @@ -205,9 +213,10 @@ public Map>> getAll(Iterable keys, return request(clientRequest, "getAll"); } + @Override public List getVersions(ByteArray key) { StoreUtils.assertValidKey(key); - GetVersionsClientRequest clientRequest = new GetVersionsClientRequest(storeName, + GetVersionsClientRequest clientRequest = new GetVersionsClientRequest(getName(), requestFormat, requestRoutingType, key); @@ -217,10 +226,11 @@ public List getVersions(ByteArray key) { return request(clientRequest, "getVersions"); } + @Override public void put(ByteArray key, Versioned versioned, byte[] transforms) throws VoldemortException { StoreUtils.assertValidKey(key); - PutClientRequest clientRequest = new PutClientRequest(storeName, + PutClientRequest clientRequest = new PutClientRequest(getName(), requestFormat, requestRoutingType, key, @@ -232,6 +242,7 @@ public void put(ByteArray key, Versioned versioned, byte[] transforms) request(clientRequest, "put"); } + @Override public Object getCapability(StoreCapabilityType capability) { if(StoreCapabilityType.SOCKET_POOL.equals(capability)) return this.pool; @@ -239,14 +250,6 @@ public Object getCapability(StoreCapabilityType capability) { throw new NoSuchCapabilityException(capability, getName()); } - public String getName() { - return storeName; - } - - public void close() throws VoldemortException { - // don't close the socket pool, it is shared - } - /** * This method handles submitting and then waiting for the request from the * server. 
It uses the ClientRequest API to actually write the request and @@ -355,5 +358,4 @@ private void requestAsync(ClientRequest delegate, String operationName) { pool.submitAsync(this.destination, delegate, callback, timeoutMs, operationName); } - } diff --git a/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutor.java b/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutor.java index 3dcdc76bdf..1111b57b0c 100644 --- a/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutor.java +++ b/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutor.java @@ -28,7 +28,8 @@ import org.apache.log4j.Level; -import voldemort.utils.SelectorManagerWorker; +import voldemort.common.nio.CommBufferSizeStats; +import voldemort.common.nio.SelectorManagerWorker; import voldemort.utils.Time; /** @@ -55,7 +56,8 @@ public class ClientRequestExecutor extends SelectorManagerWorker { public ClientRequestExecutor(Selector selector, SocketChannel socketChannel, int socketBufferSize) { - super(selector, socketChannel, socketBufferSize); + // Not tracking or exposing the comm buffer statistics for now + super(selector, socketChannel, socketBufferSize, new CommBufferSizeStats()); isExpired = false; } @@ -106,7 +108,7 @@ public synchronized void addClientRequest(ClientRequest clientRequest, if(timeoutMs == -1) { this.expiration = -1; } else { - if (elapsedNs > (Time.NS_PER_MS * timeoutMs)) { + if(elapsedNs > (Time.NS_PER_MS * timeoutMs)) { this.expiration = System.nanoTime(); } else { this.expiration = System.nanoTime() + (Time.NS_PER_MS * timeoutMs) - elapsedNs; diff --git a/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutorFactory.java b/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutorFactory.java index ce95df84c2..212b290d9c 100644 --- a/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutorFactory.java +++ b/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutorFactory.java @@ -36,10 +36,10 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; +import voldemort.common.nio.SelectorManager; import voldemort.store.socket.SocketDestination; import voldemort.store.stats.ClientSocketStats; import voldemort.utils.DaemonThreadFactory; -import voldemort.utils.SelectorManager; import voldemort.utils.Time; import voldemort.utils.pool.ResourceFactory; @@ -120,104 +120,107 @@ public ClientRequestExecutor create(SocketDestination dest) throws Exception { + dest.getPort() + " using protocol " + dest.getRequestFormatType().getCode()); - SocketChannel socketChannel = SocketChannel.open(); - socketChannel.socket().setReceiveBufferSize(this.socketBufferSize); - socketChannel.socket().setSendBufferSize(this.socketBufferSize); - socketChannel.socket().setTcpNoDelay(true); - socketChannel.socket().setSoTimeout(soTimeoutMs); - socketChannel.socket().setKeepAlive(this.socketKeepAlive); - socketChannel.configureBlocking(false); - socketChannel.connect(new InetSocketAddress(dest.getHost(), dest.getPort())); - - long startTime = System.currentTimeMillis(); - long duration = 0; - long currWaitTime = 1; - long prevWaitTime = 1; - - // Since we're non-blocking and it takes a non-zero amount of time - // to connect, invoke finishConnect and loop. 
- while(!socketChannel.finishConnect()) { - duration = System.currentTimeMillis() - startTime; - long remaining = this.connectTimeoutMs - duration; - - if(remaining < 0) { - // Don't forget to close the socket before we throw our - // exception or they'll leak :( + SocketChannel socketChannel = null; + ClientRequestExecutor clientRequestExecutor = null; + + try { + socketChannel = SocketChannel.open(); + socketChannel.socket().setReceiveBufferSize(this.socketBufferSize); + socketChannel.socket().setSendBufferSize(this.socketBufferSize); + socketChannel.socket().setTcpNoDelay(true); + socketChannel.socket().setSoTimeout(soTimeoutMs); + socketChannel.socket().setKeepAlive(this.socketKeepAlive); + socketChannel.configureBlocking(false); + socketChannel.connect(new InetSocketAddress(dest.getHost(), dest.getPort())); + + long startTimeMs = System.currentTimeMillis(); + long durationMs = 0; + long currWaitTimeMs = 1; + long prevWaitTimeMS = 1; + + // Since we're non-blocking and it takes a non-zero amount of time + // to connect, invoke finishConnect and loop. + while(!socketChannel.finishConnect()) { + durationMs = System.currentTimeMillis() - startTimeMs; + long remaining = this.connectTimeoutMs - durationMs; + + if(remaining < 0) { + throw new ConnectException("Cannot connect socket " + numCreated + " for " + + dest.getHost() + ":" + dest.getPort() + " after " + + durationMs + " ms"); + } + + if(logger.isTraceEnabled()) + logger.trace("Still creating socket " + numCreated + " for " + dest.getHost() + + ":" + dest.getPort() + ", " + remaining + + " ms. remaining to connect"); + try { - socketChannel.close(); - } catch(Exception e) { + // Break up the connection timeout into smaller units, + // employing a Fibonacci-style back-off (1, 2, 3, 5, 8, ...) + Thread.sleep(Math.min(remaining, currWaitTimeMs)); + currWaitTimeMs = Math.min(currWaitTimeMs + prevWaitTimeMS, 50); + prevWaitTimeMS = currWaitTimeMs - prevWaitTimeMS; + } catch(InterruptedException e) { if(logger.isEnabledFor(Level.WARN)) logger.warn(e, e); } - - throw new ConnectException("Cannot connect socket " + numCreated + " for " - + dest.getHost() + ":" + dest.getPort() + " after " - + duration + " ms"); } + durationMs = System.currentTimeMillis() - startTimeMs; - if(logger.isTraceEnabled()) - logger.trace("Still creating socket " + numCreated + " for " + dest.getHost() + ":" - + dest.getPort() + ", " + remaining + " ms. remaining to connect"); - - try { - // Break up the connection timeout into smaller units, - // employing a Fibonacci-style back-off (1, 2, 3, 5, 8, ...) - Thread.sleep(Math.min(remaining, currWaitTime)); - currWaitTime = Math.min(currWaitTime + prevWaitTime, 50); - prevWaitTime = currWaitTime - prevWaitTime; - } catch(InterruptedException e) { - if(logger.isEnabledFor(Level.WARN)) - logger.warn(e, e); - } - } - - if(logger.isDebugEnabled()) - logger.debug("Created socket " + numCreated + " for " + dest.getHost() + ":" - + dest.getPort() + " using protocol " - + dest.getRequestFormatType().getCode() + " after " + duration + " ms."); - - // check buffer sizes--you often don't get out what you put in! 
- if(socketChannel.socket().getReceiveBufferSize() != this.socketBufferSize) - logger.debug("Requested socket receive buffer size was " + this.socketBufferSize - + " bytes but actual size is " - + socketChannel.socket().getReceiveBufferSize() + " bytes."); - - if(socketChannel.socket().getSendBufferSize() != this.socketBufferSize) - logger.debug("Requested socket send buffer size was " + this.socketBufferSize - + " bytes but actual size is " - + socketChannel.socket().getSendBufferSize() + " bytes."); - - ClientRequestSelectorManager selectorManager = selectorManagers[counter.getAndIncrement() - % selectorManagers.length]; - - Selector selector = selectorManager.getSelector(); - ClientRequestExecutor clientRequestExecutor = new ClientRequestExecutor(selector, - socketChannel, - socketBufferSize); - BlockingClientRequest clientRequest = new BlockingClientRequest(new ProtocolNegotiatorClientRequest(dest.getRequestFormatType()), - this.getTimeout()); - clientRequestExecutor.addClientRequest(clientRequest); - - selectorManager.registrationQueue.add(clientRequestExecutor); - selector.wakeup(); - - // Block while we wait for the protocol negotiation to complete. - clientRequest.await(); - - try { - // This will throw an error if the result of the protocol - // negotiation failed, otherwise it returns an uninteresting token - // we can safely ignore. + if(logger.isDebugEnabled()) + logger.debug("Created socket " + numCreated + " for " + dest.getHost() + ":" + + dest.getPort() + " using protocol " + + dest.getRequestFormatType().getCode() + " after " + durationMs + + " ms."); + + // check buffer sizes--you often don't get out what you put in! + if(socketChannel.socket().getReceiveBufferSize() != this.socketBufferSize) + logger.debug("Requested socket receive buffer size was " + this.socketBufferSize + + " bytes but actual size is " + + socketChannel.socket().getReceiveBufferSize() + " bytes."); + + if(socketChannel.socket().getSendBufferSize() != this.socketBufferSize) + logger.debug("Requested socket send buffer size was " + this.socketBufferSize + + " bytes but actual size is " + + socketChannel.socket().getSendBufferSize() + " bytes."); + + ClientRequestSelectorManager selectorManager = selectorManagers[counter.getAndIncrement() + % selectorManagers.length]; + + Selector selector = selectorManager.getSelector(); + clientRequestExecutor = new ClientRequestExecutor(selector, + socketChannel, + socketBufferSize); + BlockingClientRequest clientRequest = new BlockingClientRequest(new ProtocolNegotiatorClientRequest(dest.getRequestFormatType()), + this.getTimeout()); + clientRequestExecutor.addClientRequest(clientRequest); + + selectorManager.registrationQueue.add(clientRequestExecutor); + selector.wakeup(); + + // Block while we wait for protocol negotiation to complete. May + // throw interrupted exception + clientRequest.await(); + + // Either returns uninteresting token, or throws exception if + // protocol negotiation failed. clientRequest.getResult(); } catch(Exception e) { - // Don't forget to close the socket before we throw our exception or - // they'll leak :( - try { - socketChannel.close(); - } catch(Exception ex) { - if(logger.isEnabledFor(Level.WARN)) - logger.warn(ex, ex); + // Make sure not to leak socketChannels + if(socketChannel != null) { + try { + socketChannel.close(); + } catch(Exception ex) { + if(logger.isEnabledFor(Level.WARN)) + logger.warn(ex, ex); + } } + // If clientRequestExector is not null, some additional clean up may + // be warranted. 
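Note: the rewritten connect loop above polls finishConnect() and sleeps in Fibonacci-style steps (1, 2, 3, 5, 8, ... ms), capped at 50 ms per step and at the remaining connect budget. A standalone sketch of just that wait schedule; the 500 ms budget is illustrative.

public class ConnectBackoffSketch {

    public static void main(String[] args) throws InterruptedException {
        long connectTimeoutMs = 500; // illustrative budget
        long startMs = System.currentTimeMillis();
        long currWaitMs = 1;
        long prevWaitMs = 1;

        while(true) {
            long remaining = connectTimeoutMs - (System.currentTimeMillis() - startMs);
            if(remaining <= 0)
                break; // in the factory this is where ConnectException is thrown

            // Wait 1, 2, 3, 5, 8, ... ms, never longer than the remaining budget.
            Thread.sleep(Math.min(remaining, currWaitMs));

            currWaitMs = Math.min(currWaitMs + prevWaitMs, 50);
            prevWaitMs = currWaitMs - prevWaitMs;
        }
    }
}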
However, clientRequestExecutor.close(), the + // "obvious" clean up, is not safe to call here. This is because + // .close() checks in a resource to the KeyedResourcePool that was + // never checked out. throw e; } diff --git a/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutorPool.java b/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutorPool.java index dd78813346..7d0e75c251 100644 --- a/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutorPool.java +++ b/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutorPool.java @@ -55,7 +55,6 @@ * Upon successful construction of this object, a new Thread is started. It is * terminated upon calling {@link #close()}. */ - public class ClientRequestExecutorPool implements SocketStoreFactory { private final QueuedKeyedResourcePool queuedPool; @@ -156,18 +155,29 @@ public SocketStore create(String storeName, */ public ClientRequestExecutor checkout(SocketDestination destination) { - // time checkout - long start = System.nanoTime(); + // timing instrumentation (stats only) + long startTimeNs = 0; + if(stats != null) { + startTimeNs = System.nanoTime(); + } + ClientRequestExecutor clientRequestExecutor; try { clientRequestExecutor = queuedPool.checkout(destination); } catch(Exception e) { + // If this exception caught here is from the nonBlockingPut call + // within KeyedResourcePool.attemptGrow(), then there is the chance + // a ClientRequestExector resource will be leaked. Cannot safely + // deal with this here since clientRequestExecutor is not assigned + // in this catch. Even if it was, clientRequestExecutore.close() + // checks in the SocketDestination resource and so is not safe to + // call. throw new UnreachableStoreException("Failure while checking out socket for " + destination + ": ", e); } finally { - long end = System.nanoTime(); if(stats != null) { - stats.recordCheckoutTimeUs(destination, (end - start) / Time.NS_PER_US); + stats.recordCheckoutTimeUs(destination, (System.nanoTime() - startTimeNs) + / Time.NS_PER_US); stats.recordCheckoutQueueLength(destination, queuedPool.getBlockingGetsCount(destination)); } @@ -190,6 +200,12 @@ public void checkin(SocketDestination destination, ClientRequestExecutor clientR } } + /** + * Reset the pool of resources for a specific destination. Idle resources + * will be destroyed. Checked out resources that are subsequently checked in + * will be destroyed. Newly created resources can be checked in to + * reestablish resources for the specific destination. + */ @Override public void close(SocketDestination destination) { factory.setLastClosedTimestamp(destination); @@ -197,7 +213,8 @@ public void close(SocketDestination destination) { } /** - * Close the socket pool + * Permanently close the ClientRequestExecutor pool. Resources subsequently + * checked in will be destroyed. 
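Note: for reference, a caller-side sketch of the checkout/checkin contract whose latency the instrumentation above records; obtaining the pool and destination is left abstract, and the helper name is illustrative.

import voldemort.store.socket.SocketDestination;
import voldemort.store.socket.clientrequest.ClientRequestExecutor;
import voldemort.store.socket.clientrequest.ClientRequestExecutorPool;

public class PoolUsageSketch {

    // checkout() may throw UnreachableStoreException; anything successfully
    // checked out must be checked back in so the pool's accounting stays correct.
    public static void withExecutor(ClientRequestExecutorPool pool, SocketDestination dest) {
        ClientRequestExecutor executor = null;
        try {
            executor = pool.checkout(dest);
            // ... submit a ClientRequest through the executor here ...
        } finally {
            if(executor != null)
                pool.checkin(dest, executor);
        }
    }
}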
*/ @Override public void close() { diff --git a/src/java/voldemort/store/stats/ClientSocketStats.java b/src/java/voldemort/store/stats/ClientSocketStats.java index d9f7584ce6..19955788cf 100644 --- a/src/java/voldemort/store/stats/ClientSocketStats.java +++ b/src/java/voldemort/store/stats/ClientSocketStats.java @@ -22,6 +22,8 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import org.apache.log4j.Logger; + import voldemort.store.socket.SocketDestination; import voldemort.store.socket.clientrequest.ClientRequestExecutor; import voldemort.utils.JmxUtils; @@ -57,7 +59,9 @@ public class ClientSocketStats { private QueuedKeyedResourcePool pool; // monitoringInterval <= connectionCheckouts + resourceRequests - private final AtomicInteger monitoringInterval = new AtomicInteger(10000); + // 1 qps => monitoring interval of just over a day (~27 hours) + // 1000 qps => monitoring interval of 1 minute and 40 seconds + private final AtomicInteger monitoringInterval = new AtomicInteger(100000); // Connection lifecycle private final AtomicInteger connectionsCreated = new AtomicInteger(0); private final AtomicInteger connectionsDestroyed = new AtomicInteger(0); @@ -73,6 +77,7 @@ public class ClientSocketStats { private final Histogram resourceRequestQueueLengthHistogram = new Histogram(250, 1); private final int jmxId; + private static final Logger logger = Logger.getLogger(ClientSocketStats.class.getName()); /** * To construct a per node stats object @@ -91,6 +96,12 @@ public ClientSocketStats(ClientSocketStats parent, this.destination = destination; this.pool = pool; this.jmxId = jmxId; + + if(logger.isDebugEnabled()) { + logger.debug("Constructed ClientSocketStatsStats object (" + + System.identityHashCode(this) + ") with parent object(" + + System.identityHashCode(parent) + ")"); + } } /** @@ -104,6 +115,12 @@ public ClientSocketStats(int jmxId) { this.destination = null; this.pool = null; this.jmxId = jmxId; + + if(logger.isDebugEnabled()) { + logger.debug("Constructed ClientSocketStatsStats object (" + + System.identityHashCode(this) + ") with parent object(" + + System.identityHashCode(parent) + ")"); + } } /* get per node stats, create one if not exist */ @@ -315,6 +332,12 @@ protected void checkMonitoringInterval() { if(parent == null && statsMap != null) { int monitoringInterval = this.monitoringInterval.get(); if(monitoringCount % (monitoringInterval + 1) == monitoringInterval) { + // timing instrumentation (debug only) + long startTimeNs = 0; + if(logger.isDebugEnabled()) { + startTimeNs = System.nanoTime(); + } + // reset all children Iterator it = statsMap.keySet().iterator(); while(it.hasNext()) { @@ -323,6 +346,13 @@ protected void checkMonitoringInterval() { } // reset itself resetForInterval(); + + // timing instrumentation (debug only) + if(logger.isDebugEnabled()) { + logger.debug("ClientSocketStats(" + System.identityHashCode(this) + + ")::checkMonitoringInterval: reset self and all children in " + + (System.nanoTime() - startTimeNs) + " ns."); + } } } } diff --git a/src/java/voldemort/store/stats/Histogram.java b/src/java/voldemort/store/stats/Histogram.java index 88a9a7b768..f93182aa7f 100644 --- a/src/java/voldemort/store/stats/Histogram.java +++ b/src/java/voldemort/store/stats/Histogram.java @@ -1,3 +1,19 @@ +/* + * Copyright 2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + package voldemort.store.stats; import java.util.Arrays; @@ -7,10 +23,15 @@ import voldemort.annotations.concurrency.Threadsafe; /** - * A class for computing percentiles based on a histogram. Values are bucketed - * by a configurable bound (e.g., 0-1, 1-2, 2-3). When a value is inserted, - * perform a binary search to find the correct bucket. + * A class for computing percentiles based on a simple histogram. + * + * The histogram starts at 0 and then has uniformly sized buckets. The number of + * buckets and width of each bucket is specified upon construction. Each bucket + * in the histogram "counts" the number of values inserted into the histogram + * that fall into the bucket's range. * + * All interfaces for adding data to the histogram or querying the histogram for + * quantiles are synchronized to make this object threadsafe. * */ @Threadsafe @@ -19,30 +40,43 @@ public class Histogram { private final int nBuckets; private final int step; private final int[] buckets; - private final int[] bounds; + private final long upperBound; + private int size; + private long sum; private static final Logger logger = Logger.getLogger(Histogram.class); + private long resetIntervalMs = -1; + private long lastResetTimeMs; + /** * Initialize an empty histogram * * @param nBuckets The number of buckets to use * @param step The size of each bucket */ + public Histogram(int nBuckets, int step, long resetIntervalMs) { + this(nBuckets, step); + this.resetIntervalMs = resetIntervalMs; + this.lastResetTimeMs = System.currentTimeMillis(); + } + + /** + * Initialize an empty histogram + * + * @param nBuckets The number of buckets to use + * @param step The size (width) of each bucket + */ public Histogram(int nBuckets, int step) { this.nBuckets = nBuckets; this.step = step; + this.upperBound = step * nBuckets; this.buckets = new int[nBuckets]; - this.bounds = new int[nBuckets]; - init(); - } + reset(); - protected void init() { - int bound = 0; - for(int i = 0; i < nBuckets; i++, bound += step) { - bounds[i] = bound; + if(logger.isDebugEnabled()) { + logger.debug("Constructed a histogram with " + nBuckets + " buckets."); } - reset(); } /** @@ -51,73 +85,79 @@ protected void init() { public synchronized void reset() { Arrays.fill(buckets, 0); size = 0; + sum = 0; + this.lastResetTimeMs = System.currentTimeMillis(); } /** * Insert a value into the right bucket of the histogram. If the value is - * larger than any bound, insert into the last bucket + * larger than any bound, insert into the last bucket. If the value is less + * than zero, then ignore it. * * @param data The value to insert into the histogram */ public synchronized void insert(long data) { - int index = findBucket(data); - if(index == -1) { - logger.error(data + " can't be bucketed, is invalid!"); + resetIfNeeded(); + long index = 0; + if(data >= this.upperBound) { + index = nBuckets - 1; + } else if(data < 0) { + logger.error(data + " can't be bucketed because it is negative!"); + return; + } else { + index = data / step; + } + if(index < 0 || index >= nBuckets) { + // This should be dead code. 
Defending against code changes in + // future. + logger.error(data + " can't be bucketed because index is not in range [0,nBuckets)."); return; } - buckets[index]++; + buckets[(int) index]++; + sum += data; size++; } /** * Find the a value n such that the percentile falls within [ - * n, n + step) + * n, n + step). This method does a LINEAR probe + * of the histogram. I.e., this method is O(nBuckets). * * @param quantile The percentile to find * @return Lower bound associated with the percentile */ - public synchronized int getQuantile(double quantile) { + public synchronized long getQuantile(double quantile) { + resetIfNeeded(); int total = 0; for(int i = 0; i < nBuckets; i++) { total += buckets[i]; double currQuantile = ((double) total) / ((double) size); if(currQuantile >= quantile) { - return bounds[i]; + return i * step; } } return 0; } - private int findBucket(long needle) { - long max = step * nBuckets; - if(needle > max) { - return nBuckets - 1; - } - int low = 0; - int high = nBuckets - 1; - while(low <= high) { - int mid = (low + high) / 2; - int cmp = compareToBucket(mid, needle); - if(cmp == 0) { - return mid; - } else if(cmp > 0) { - high = mid - 1; - } else if(cmp < 0) { - low = mid + 1; - } - } - return -1; + /** + * Obtain the average of the data in the histogram + * + * Note: Caller is responsible for making sure 'sum' does not overflow + * within the reset interval + * + * @return the average over the current samples + */ + public synchronized double getAverage() { + if(size == 0) + return 0.0; + return (sum * 1.0) / size; } - private int compareToBucket(int bucket, long needle) { - int low = bounds[bucket]; - int high = low + step; - if(low <= needle && high > needle) { - return 0; - } else if(low > needle) { - return 1; - } else { - return -1; + private void resetIfNeeded() { + if(resetIntervalMs > -1) { + if((System.currentTimeMillis() - lastResetTimeMs) >= this.resetIntervalMs) { + this.reset(); + } } } } diff --git a/src/java/voldemort/store/stats/RequestCounter.java b/src/java/voldemort/store/stats/RequestCounter.java index 96c17bbf17..c0645bd189 100644 --- a/src/java/voldemort/store/stats/RequestCounter.java +++ b/src/java/voldemort/store/stats/RequestCounter.java @@ -1,7 +1,25 @@ +/* + * Copyright 2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
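Since the reworked Histogram now buckets by direct index arithmetic (index = value / step, clamped to the last bucket), a brief usage sketch may help; the bucket count, width, and inserted values below are illustrative only:

// 10 buckets of width 5 covering [0, 50); values >= 50 fall into the last bucket,
// negative values are ignored. new Histogram(10, 5, 60000) would additionally
// self-reset every 60 s.
Histogram latencyHistogram = new Histogram(10, 5);
latencyHistogram.insert(2);   // bucket 0, range [0, 5)
latencyHistogram.insert(12);  // bucket 2, range [10, 15)
latencyHistogram.insert(60);  // clamped into the last bucket, range [45, 50)
long q95 = latencyHistogram.getQuantile(0.95); // lower bound of the bucket holding the 95th percentile
double avg = latencyHistogram.getAverage();    // (2 + 12 + 60) / 3, since sum tracks raw values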
+ */ + package voldemort.store.stats; import java.util.concurrent.atomic.AtomicReference; +import org.apache.log4j.Logger; + import voldemort.utils.SystemTime; import voldemort.utils.Time; @@ -14,41 +32,43 @@ public class RequestCounter { private final AtomicReference values; - private final int durationMS; + private final long durationMs; private final Time time; private final Histogram histogram; - private volatile int q95LatencyMs; - private volatile int q99LatencyMs; + private volatile long q95LatencyMs; + private volatile long q99LatencyMs; private boolean useHistogram; + private static final Logger logger = Logger.getLogger(RequestCounter.class.getName()); + /** - * @param durationMS specifies for how long you want to maintain this + * @param durationMs specifies for how long you want to maintain this * counter (in milliseconds). */ - public RequestCounter(int durationMS) { - this(durationMS, SystemTime.INSTANCE, false); + public RequestCounter(long durationMs) { + this(durationMs, SystemTime.INSTANCE, false); } /** - * @param durationMS specifies for how long you want to maintain this + * @param durationMs specifies for how long you want to maintain this * counter (in milliseconds). useHistogram indicates that this * counter should also use a histogram. */ - public RequestCounter(int durationMS, boolean useHistogram) { - this(durationMS, SystemTime.INSTANCE, useHistogram); + public RequestCounter(long durationMs, boolean useHistogram) { + this(durationMs, SystemTime.INSTANCE, useHistogram); } /** * For testing request expiration via an injected time provider */ - RequestCounter(int durationMS, Time time) { - this(durationMS, time, false); + RequestCounter(long durationMs, Time time) { + this(durationMs, time, false); } - RequestCounter(int durationMS, Time time, boolean useHistogram) { + RequestCounter(long durationMs, Time time, boolean useHistogram) { this.time = time; this.values = new AtomicReference(new Accumulator()); - this.durationMS = durationMS; + this.durationMs = durationMs; this.q95LatencyMs = 0; this.q99LatencyMs = 0; this.useHistogram = useHistogram; @@ -98,8 +118,8 @@ public String getDisplayAverageTimeInMs() { return String.format("%.4f", getAverageTimeInMs()); } - public int getDuration() { - return durationMS; + public long getDuration() { + return durationMs; } public long getMaxLatencyInMs() { @@ -111,11 +131,24 @@ private void maybeResetHistogram() { return; Accumulator accum = values.get(); long now = time.getMilliseconds(); - if(now - accum.startTimeMS > durationMS) { + if(now - accum.startTimeMS > durationMs) { + // timing instrumentation (debug only) + long startTimeNs = 0; + if(logger.isDebugEnabled()) { + startTimeNs = System.nanoTime(); + } + // Reset the histogram q95LatencyMs = histogram.getQuantile(0.95); q99LatencyMs = histogram.getQuantile(0.99); histogram.reset(); + + // timing instrumentation (debug only) + if(logger.isDebugEnabled()) { + logger.debug("Histogram (" + System.identityHashCode(histogram) + + ") : reset, Q95, & Q99 took " + (System.nanoTime() - startTimeNs) + + " ns."); + } } } @@ -127,7 +160,7 @@ private Accumulator getValidAccumulator() { /* * if still in the window, just return it */ - if(now - accum.startTimeMS <= durationMS) { + if(now - accum.startTimeMS <= durationMs) { return accum; } @@ -171,6 +204,12 @@ public void addRequest(long timeNS, long numEmptyResponses, long bytes, long getAllAggregatedCount) { + // timing instrumentation (trace only) + long startTimeNs = 0; + if(logger.isTraceEnabled()) { + startTimeNs = 
System.nanoTime(); + } + long timeMs = timeNS / Time.NS_PER_MS; if(this.useHistogram) { histogram.insert(timeMs); @@ -189,8 +228,22 @@ public void addRequest(long timeNS, oldv.getAllAggregatedCount + getAllAggregatedCount, getAllAggregatedCount > oldv.getAllMaxCount ? getAllAggregatedCount : oldv.getAllMaxCount); - if(values.compareAndSet(oldv, newv)) + if(values.compareAndSet(oldv, newv)) { + // timing instrumentation (trace only) + if(logger.isTraceEnabled()) { + logger.trace("addRequest (histogram.insert and accumulator update) took " + + (System.nanoTime() - startTimeNs) + " ns."); + } + // Return since data has been accumulated return; + } + } + logger.info("addRequest lost timing instrumentation data because three retries was insufficient to update the accumulator."); + + // timing instrumentation (trace only) + if(logger.isTraceEnabled()) { + logger.trace("addRequest (histogram.insert and accumulator update) took " + + (System.nanoTime() - startTimeNs) + " ns."); } } @@ -233,11 +286,11 @@ public long getGetAllMaxCount() { return getValidAccumulator().getAllMaxCount; } - public int getQ95LatencyMs() { + public long getQ95LatencyMs() { return q95LatencyMs; } - public int getQ99LatencyMs() { + public long getQ99LatencyMs() { return q99LatencyMs; } @@ -262,6 +315,11 @@ public Accumulator() { this(RequestCounter.this.time.getMilliseconds(), 0, 0, 0, 0, 0, 0, 0, 0, 0); } + /** + * This method resets startTimeMS. + * + * @return + */ public Accumulator newWithTotal() { return new Accumulator(RequestCounter.this.time.getMilliseconds(), 0, diff --git a/src/java/voldemort/store/stats/SimpleCounter.java b/src/java/voldemort/store/stats/SimpleCounter.java new file mode 100644 index 0000000000..0e2720c61f --- /dev/null +++ b/src/java/voldemort/store/stats/SimpleCounter.java @@ -0,0 +1,136 @@ +package voldemort.store.stats; + +import java.util.concurrent.atomic.AtomicLong; + +import voldemort.utils.Time; + +/** + * A simple concurrent, non-blocking event counter that resets itself every + * interval, and provides eventRate and average event value metrics over the + * last interval + * + */ +public class SimpleCounter { + + /** + * Count of total number of events in current interval + */ + AtomicLong eventsCounter; + /** + * Sum of all the event values in the current interval + */ + AtomicLong eventsValueCounter; + /** + * Last time when the counter was reset + */ + AtomicLong lastResetTimeMs; + + /** + * Number of events that occurred in the last interval + */ + long numEventsLastInterval; + + /** + * Sum of all the event values in the the last interval + */ + long totalEventValueLastInterval; + + // We need additional tracking for the end of the second last or penultimate + // interval, since resetting the atomicLong counters would mean we might + // miss some event updates + + /** + * Number of events that occurred in the second last interval. + */ + long numEventsLastLastInterval; + + /** + * Sum of all the event values in the the second last interval. 
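The accumulator update in addRequest above is the usual bounded compare-and-set retry on an AtomicReference holding an immutable snapshot. The following self-contained illustration uses made-up names (CasCounter, Snapshot) and is only a sketch of the pattern, not the RequestCounter implementation:

import java.util.concurrent.atomic.AtomicReference;

final class CasCounter {

    // immutable snapshot published atomically, analogous to Accumulator
    private static final class Snapshot {
        final long count;
        final long totalNs;
        Snapshot(long count, long totalNs) { this.count = count; this.totalNs = totalNs; }
    }

    private final AtomicReference<Snapshot> values = new AtomicReference<Snapshot>(new Snapshot(0, 0));

    void add(long timeNs) {
        for(int attempt = 0; attempt < 3; attempt++) {
            Snapshot old = values.get();
            Snapshot updated = new Snapshot(old.count + 1, old.totalNs + timeNs);
            if(values.compareAndSet(old, updated))
                return; // published without locking
            // another thread won the race; retry against a fresh snapshot
        }
        // retries exhausted: the sample is dropped, mirroring the
        // "lost timing instrumentation data" log message above
    }
}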
+ */ + long totalEventValueLastLastInterval; + + /** + * The counter will be reset once this many ms + */ + final long resetIntervalMs; + + public SimpleCounter(long resetIntervalMs) { + if(resetIntervalMs < 1) { + throw new IllegalArgumentException("Reset interval must be positive"); + } + this.resetIntervalMs = resetIntervalMs; + this.lastResetTimeMs = new AtomicLong(System.currentTimeMillis()); + this.eventsValueCounter = new AtomicLong(0); + this.eventsCounter = new AtomicLong(0); + this.numEventsLastInterval = 0; + this.totalEventValueLastInterval = 0; + this.numEventsLastLastInterval = 0; + this.totalEventValueLastLastInterval = 0; + } + + public void count() { + this.count(0); + } + + public void count(long eventValue) { + resetIfNeeded(); + eventsCounter.incrementAndGet(); + eventsValueCounter.addAndGet(eventValue); + } + + private void resetIfNeeded() { + long currentLastResetTimeMs = lastResetTimeMs.longValue(); + long now = System.currentTimeMillis(); + + // check if interval might have expired + if((now - currentLastResetTimeMs) >= resetIntervalMs) { + long numEvents = eventsCounter.longValue(); + long totalEventValue = eventsValueCounter.longValue(); + // more than one thread can get here concurrently. But exactly one + // will pass the check below + if(lastResetTimeMs.compareAndSet(currentLastResetTimeMs, now)) { + // the synchronization is for any monitoring thread to read a + // consistent state for reporting + synchronized(this) { + // reseting this counters here might be problematic since + // another application thread can go ahead and update the + // counters and we will miss those data points. instead we + // simply update the delta from the current interval. This + // guarantees correctness + numEventsLastLastInterval = numEventsLastInterval; + totalEventValueLastLastInterval = totalEventValueLastInterval; + numEventsLastInterval = numEvents; + totalEventValueLastInterval = totalEventValue; + } + } + } + } + + /** + * Returns the events per second in the current interval + * + * @return + */ + public Double getEventRate() { + resetIfNeeded(); + synchronized(this) { + return (numEventsLastInterval - numEventsLastLastInterval) + / ((1.0 * resetIntervalMs) / Time.MS_PER_SECOND); + } + } + + /** + * Returns the average event value in the current interval + */ + public Double getAvgEventValue() { + resetIfNeeded(); + synchronized(this) { + long eventsLastInterval = numEventsLastInterval - numEventsLastLastInterval; + if(eventsLastInterval > 0) + return ((totalEventValueLastInterval - totalEventValueLastLastInterval) * 1.0) + / eventsLastInterval; + else + return 0.0; + } + } +} diff --git a/src/java/voldemort/store/stats/StatTrackingStore.java b/src/java/voldemort/store/stats/StatTrackingStore.java index 0f595ef168..ace3c866bb 100644 --- a/src/java/voldemort/store/stats/StatTrackingStore.java +++ b/src/java/voldemort/store/stats/StatTrackingStore.java @@ -26,6 +26,7 @@ import voldemort.store.DelegatingStore; import voldemort.store.Store; import voldemort.store.StoreCapabilityType; +import voldemort.store.CompositeVoldemortRequest; import voldemort.utils.ByteArray; import voldemort.versioning.ObsoleteVersionException; import voldemort.versioning.Version; @@ -153,4 +154,99 @@ public StoreStats getStats() { public void resetStatistics() { this.stats = new StoreStats(); } + + @Override + public List> get(CompositeVoldemortRequest request) + throws VoldemortException { + List> result = null; + long start = System.nanoTime(); + try { + result = super.get(request); + return 
result; + } catch(VoldemortException e) { + stats.recordTime(Tracked.EXCEPTION, System.nanoTime() - start); + throw e; + } finally { + long duration = System.nanoTime() - start; + long totalBytes = 0; + boolean returningEmpty = true; + if(result != null) { + returningEmpty = result.size() == 0; + for(Versioned bytes: result) { + totalBytes += bytes.getValue().length; + } + } + stats.recordGetTime(duration, returningEmpty, totalBytes); + } + } + + @Override + // TODO: Validate all the keys in the request object + public Map>> getAll(CompositeVoldemortRequest request) + throws VoldemortException { + Map>> result = null; + long start = System.nanoTime(); + try { + result = super.getAll(request); + return result; + } catch(VoldemortException e) { + stats.recordTime(Tracked.EXCEPTION, System.nanoTime() - start); + throw e; + } finally { + long duration = System.nanoTime() - start; + long totalBytes = 0; + int requestedValues = 0; + int returnedValues = 0; + + // Determine how many values were requested + for(ByteArray k: request.getIterableKeys()) { + requestedValues++; + } + + if(result != null) { + // Determine the number of values being returned + returnedValues = result.keySet().size(); + // Determine the total size of the response + for(List> value: result.values()) { + for(Versioned bytes: value) { + totalBytes += bytes.getValue().length; + } + } + } + + stats.recordGetAllTime(duration, requestedValues, returnedValues, totalBytes); + } + } + + @Override + public void put(CompositeVoldemortRequest request) throws VoldemortException { + long start = System.nanoTime(); + try { + super.put(request); + } catch(ObsoleteVersionException e) { + stats.recordTime(Tracked.OBSOLETE, System.nanoTime() - start); + throw e; + } catch(VoldemortException e) { + stats.recordTime(Tracked.EXCEPTION, System.nanoTime() - start); + throw e; + } finally { + stats.recordPutTimeAndSize(System.nanoTime() - start, + request.getValue().getValue().length); + } + + } + + @Override + public boolean delete(CompositeVoldemortRequest request) + throws VoldemortException { + long start = System.nanoTime(); + try { + return super.delete(request); + } catch(VoldemortException e) { + stats.recordTime(Tracked.EXCEPTION, System.nanoTime() - start); + throw e; + } finally { + stats.recordTime(Tracked.DELETE, System.nanoTime() - start); + } + } } diff --git a/src/java/voldemort/store/stats/StoreStats.java b/src/java/voldemort/store/stats/StoreStats.java index e6cf34c99b..13f8d22807 100644 --- a/src/java/voldemort/store/stats/StoreStats.java +++ b/src/java/voldemort/store/stats/StoreStats.java @@ -1,9 +1,27 @@ +/* + * Copyright 2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
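SimpleCounter, introduced above, keeps cumulative totals and snapshots them at each interval boundary, so the reported rate and average cover the most recently completed interval. A minimal usage sketch, with an illustrative 60 s interval and event values:

// counts events (e.g. puts) along with a per-event value (e.g. payload bytes)
SimpleCounter putCounter = new SimpleCounter(60 * 1000); // reset every 60,000 ms
putCounter.count(1024);
putCounter.count(2048);
double eventsPerSecond = putCounter.getEventRate();    // events/s over the last completed interval
double avgEventValue = putCounter.getAvgEventValue();  // average event value over that interval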
+ */ + package voldemort.store.stats; import java.util.Collections; import java.util.EnumMap; import java.util.Map; +import org.apache.log4j.Logger; + /** * Some convenient statistics to track about the store * @@ -14,6 +32,8 @@ public class StoreStats { private final StoreStats parent; private final Map counters; + private static final Logger logger = Logger.getLogger(StoreStats.class.getName()); + public StoreStats() { this(null); } @@ -29,6 +49,11 @@ public StoreStats(StoreStats parent) { counters.put(tracked, new RequestCounter(300000, true)); } this.parent = parent; + + if(logger.isDebugEnabled()) { + logger.debug("Constructed StoreStats object (" + System.identityHashCode(this) + + ") with parent object (" + System.identityHashCode(parent) + ")"); + } } /** diff --git a/src/java/voldemort/store/stats/StreamStats.java b/src/java/voldemort/store/stats/StreamStats.java deleted file mode 100644 index 436c0993df..0000000000 --- a/src/java/voldemort/store/stats/StreamStats.java +++ /dev/null @@ -1,220 +0,0 @@ -package voldemort.store.stats; - -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentMap; -import java.util.concurrent.atomic.AtomicLong; - -import voldemort.utils.Time; - -import com.google.common.collect.ImmutableList; - -public class StreamStats { - - private static final int MAX_ENTRIES = 64; - - private final Map handles; - private final AtomicLong handleIdGenerator; - private final ConcurrentMap networkCounter; - private final ConcurrentMap diskCounter; - - public StreamStats() { - this.handles = Collections.synchronizedMap(new Cache(MAX_ENTRIES)); - this.handleIdGenerator = new AtomicLong(0L); - this.networkCounter = new ConcurrentHashMap(); - this.diskCounter = new ConcurrentHashMap(); - - for(Operation operation: Operation.values()) { - networkCounter.put(operation, new RequestCounter(300000)); - diskCounter.put(operation, new RequestCounter(30000)); - } - } - - public Handle makeHandle(Operation operation, - HashMap> replicaToPartitionList) { - Handle handle = new Handle(handleIdGenerator.getAndIncrement(), - operation, - System.currentTimeMillis(), - replicaToPartitionList); - handles.put(handle.getId(), handle); - return handle; - } - - public void closeHandle(Handle handle) { - handle.setFinished(true); - } - - public void clearFinished() { - for(long handleId: getHandleIds()) { - if(getHandle(handleId).isFinished()) - handles.remove(handleId); - } - } - - protected Handle getHandle(long handleId) { - if(!handles.containsKey(handleId)) - throw new IllegalArgumentException("No handle with id " + handleId); - - return handles.get(handleId); - } - - public Collection getHandleIds() { - return ImmutableList.copyOf(handles.keySet()); - } - - public Collection getHandles() { - return ImmutableList.copyOf(handles.values()); - } - - public void recordNetworkTime(Handle handle, long timeNs) { - networkCounter.get(handle.getOperation()).addRequest(timeNs); - } - - public void recordDiskTime(Handle handle, long timeNs) { - diskCounter.get(handle.getOperation()).addRequest(timeNs); - } - - public RequestCounter getNetworkCounter(Operation operation) { - return networkCounter.get(operation); - } - - public RequestCounter getDiskCounter(Operation operation) { - return diskCounter.get(operation); - } - - public enum Operation { - FETCH_KEYS, - FETCH_ENTRIES, - FETCH_FILE, - UPDATE, - 
SLOP, - DELETE, - } - - private static class Cache extends LinkedHashMap { - - private static final long serialVersionUID = 1L; - - private final int maxEntries; - - public Cache(int maxEntries) { - super(); - this.maxEntries = maxEntries; - } - - @Override - protected boolean removeEldestEntry(Map.Entry eldest) { - return eldest.getValue().isFinished() && size() > maxEntries; - } - } - - public static class Handle { - - private final long id; - private final Operation operation; - private final long startedMs; - private final HashMap> replicaToPartitionList; - private final AtomicLong entriesScanned; - private final AtomicLong timeNetworkNs; - private final AtomicLong timeDiskNs; - private volatile boolean finished; - - private Handle(long id, - Operation operation, - long startedMs, - HashMap> replicaToPartitionList) { - this.id = id; - this.operation = operation; - this.startedMs = startedMs; - this.replicaToPartitionList = replicaToPartitionList; - this.entriesScanned = new AtomicLong(0L); - this.timeNetworkNs = new AtomicLong(0L); - this.timeDiskNs = new AtomicLong(0L); - this.finished = false; - } - - public long getId() { - return id; - } - - public long getStartedMs() { - return startedMs; - } - - public Operation getOperation() { - return operation; - } - - public long getEntriesScanned() { - return entriesScanned.get(); - } - - public long incrementEntriesScanned() { - return entriesScanned.incrementAndGet(); - } - - public void setEntriesScanned(long newVal) { - entriesScanned.set(newVal); - } - - public long getEntriesPerSecond() { - long elapsedSecs = System.currentTimeMillis() - startedMs; - if(elapsedSecs == 0L) - return 0L; - return getEntriesScanned() / elapsedSecs; - } - - public boolean isFinished() { - return finished; - } - - public void setFinished(boolean finished) { - this.finished = finished; - } - - public HashMap> getReplicaToPartitionList() { - return replicaToPartitionList; - } - - public void recordTimeNetwork(long deltaNs) { - timeNetworkNs.addAndGet(deltaNs); - } - - public long getTimeNetworkNs() { - return timeNetworkNs.get(); - } - - public void recordTimeDisk(long deltaNs) { - timeDiskNs.addAndGet(deltaNs); - } - - public long getTimeDiskNs() { - return timeDiskNs.get(); - } - - public double getPercentDisk() { - long timeDiskMs = getTimeDiskNs() / Time.NS_PER_MS; - return (timeDiskMs * 100.0) / (System.currentTimeMillis() - startedMs); - } - - public double getPercentNetwork() { - long timeNetworkMs = getTimeNetworkNs() / Time.NS_PER_MS; - return (timeNetworkMs * 100.0) / (System.currentTimeMillis() - startedMs); - } - - @Override - public String toString() { - return "Handle{" + "id=" + id + ", operation=" + operation + ", startedMs=" + startedMs - + ", replicaToPartitionList=" + getReplicaToPartitionList() - + ", entriesScanned=" + getEntriesScanned() + ", finished=" + finished - + ", entriesPerSecond=" + getEntriesPerSecond() + ", timeDiskNs=" - + getTimeDiskNs() + ", timeNetworkNs=" + getTimeNetworkNs() + ", percentDisk=" - + getPercentDisk() + ", percentNetwork=" + getPercentNetwork() + '}'; - } - } -} diff --git a/src/java/voldemort/store/stats/StreamStatsJmx.java b/src/java/voldemort/store/stats/StreamStatsJmx.java deleted file mode 100644 index b483b476c7..0000000000 --- a/src/java/voldemort/store/stats/StreamStatsJmx.java +++ /dev/null @@ -1,107 +0,0 @@ -package voldemort.store.stats; - -import org.apache.log4j.Logger; -import voldemort.annotations.jmx.JmxGetter; -import voldemort.annotations.jmx.JmxManaged; -import 
voldemort.annotations.jmx.JmxOperation; - -@JmxManaged(description = "Streaming related statistics") -public class StreamStatsJmx { - private final static Logger logger = Logger.getLogger(StreamStatsJmx.class); - - private final StreamStats stats; - - public StreamStatsJmx(StreamStats stats) { - this.stats = stats; - } - - @JmxGetter(name = "streamOperationIds", description = "Get a list of all stream operations.") - public String getStreamOperationIds() { - try { - return stats.getHandleIds().toString(); - } catch(Exception e) { - logger.error("Exception in JMX call", e); - return e.getMessage(); - } - } - - @JmxGetter(name = "allStreamOperations", description = "Get status of all stream operations.") - public String getAllStreamOperations() { - try { - return stats.getHandles().toString(); - } catch(Exception e) { - logger.error("Exception in JMX call", e); - return e.getMessage(); - } - } - - @JmxOperation(description = "Get the status of a stream operation with specified id.") - public String getStreamOperation(long handleId) { - try { - return stats.getHandle(handleId).toString(); - } catch(Exception e) { - logger.error("Exception in JMX call", e); - return e.getMessage(); - } - } - - @JmxOperation(description = "Clear out finished tasks.") - public void clearFinished() { - stats.clearFinished(); - } - - // Disk statistics - - @JmxGetter(name = "averageFetchKeysDiskTimeMs", description = "The avg. disk time in ms per FETCH_KEYS operation.") - public double getAvgFetchKeysDiskTimeMs() { - return stats.getDiskCounter(StreamStats.Operation.FETCH_KEYS).getAverageTimeInMs(); - } - - @JmxGetter(name = "averageFetchEntriesDiskTimeMs", description = "The avg. disk time in ms per FETCH_ENTRIES operation.") - public double getAvgFetchEntriesDiskTimeMs() { - return stats.getDiskCounter(StreamStats.Operation.FETCH_ENTRIES).getAverageTimeInMs(); - } - - @JmxGetter(name = "averageFetchFileDiskTimeMs", description = "The avg. disk time in ms per FETCH_FILE operation.") - public double getAvgFetchFileDiskTimeMs() { - return stats.getDiskCounter(StreamStats.Operation.FETCH_FILE).getAverageTimeInMs(); - } - - @JmxGetter(name = "averageUpdateDiskTimeMs", description = "The avg. disk time in ms per UPDATE operation.") - public double getAvgUpdateDiskTimeMs() { - return stats.getDiskCounter(StreamStats.Operation.UPDATE).getAverageTimeInMs(); - } - - @JmxGetter(name = "averageSlopDiskTimeMs", description = "The avg. disk time in ms per UPDATE_SLOP operation.") - public double getAvgSlopDiskTimeMs() { - return stats.getDiskCounter(StreamStats.Operation.SLOP).getAverageTimeInMs(); - } - - - // Network statistics - - @JmxGetter(name = "averageFetchKeysNetworkTimeMs", description = "The avg. network time in ms per FETCH_KEYS operation.") - public double getAvgFetchKeysNetworkTimeMs() { - return stats.getNetworkCounter(StreamStats.Operation.FETCH_KEYS).getAverageTimeInMs(); - } - - @JmxGetter(name = "averageFetchEntriesNetworkTimeMs", description = "The avg. network time in ms per FETCH_ENTRIES operation.") - public double getAvgFetchEntriesNetworkTimeMs() { - return stats.getNetworkCounter(StreamStats.Operation.FETCH_ENTRIES).getAverageTimeInMs(); - } - - @JmxGetter(name = "averageFetchFileNetworkTimeMs", description = "The avg. network time in ms per FETCH_FILE operation.") - public double getAvgFetchFileNetworkTimeMs() { - return stats.getNetworkCounter(StreamStats.Operation.FETCH_FILE).getAverageTimeInMs(); - } - - @JmxGetter(name = "averageUpdateNetworkTimeMs", description = "The avg. 
network time in ms per UPDATE operation.") - public double getAvgUpdateNetworkTimeMs() { - return stats.getNetworkCounter(StreamStats.Operation.UPDATE).getAverageTimeInMs(); - } - - @JmxGetter(name = "averageSlopNetworkTimeMs", description = "The avg. network time in ms per UPDATE_SLOP operation.") - public double getAvgSlopNetworkTimeMs() { - return stats.getNetworkCounter(StreamStats.Operation.SLOP).getAverageTimeInMs(); - } -} diff --git a/src/java/voldemort/store/stats/StreamingStats.java b/src/java/voldemort/store/stats/StreamingStats.java new file mode 100644 index 0000000000..439069cbb7 --- /dev/null +++ b/src/java/voldemort/store/stats/StreamingStats.java @@ -0,0 +1,188 @@ +package voldemort.store.stats; + +import java.util.HashMap; + +import voldemort.annotations.jmx.JmxGetter; +import voldemort.utils.Time; + +public class StreamingStats { + + public enum Operation { + FETCH_KEYS, + FETCH_ENTRIES, + FETCH_FILE, + UPDATE_ENTRIES, + SLOP_UPDATE + } + + private static final int STREAMING_STATS_RESET_INTERVAL_MS = 60000; + private StreamingStats parent; + private HashMap networkTimeCounterMap; + private HashMap storageTimeCounterMap; + private HashMap streamingPutCounterMap; + private HashMap streamingFetchCounterMap; + private HashMap streamingScanCounterMap; + + public StreamingStats() { + networkTimeCounterMap = new HashMap(); + storageTimeCounterMap = new HashMap(); + streamingPutCounterMap = new HashMap(); + streamingFetchCounterMap = new HashMap(); + streamingScanCounterMap = new HashMap(); + + // create the counters for each operation + networkTimeCounterMap.put(Operation.FETCH_KEYS, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + networkTimeCounterMap.put(Operation.FETCH_ENTRIES, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + networkTimeCounterMap.put(Operation.UPDATE_ENTRIES, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + networkTimeCounterMap.put(Operation.SLOP_UPDATE, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + + storageTimeCounterMap.put(Operation.FETCH_KEYS, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + storageTimeCounterMap.put(Operation.FETCH_ENTRIES, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + storageTimeCounterMap.put(Operation.UPDATE_ENTRIES, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + storageTimeCounterMap.put(Operation.SLOP_UPDATE, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + + streamingPutCounterMap.put(Operation.SLOP_UPDATE, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + streamingPutCounterMap.put(Operation.UPDATE_ENTRIES, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + + streamingFetchCounterMap.put(Operation.FETCH_KEYS, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + streamingFetchCounterMap.put(Operation.FETCH_ENTRIES, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + streamingFetchCounterMap.put(Operation.FETCH_FILE, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + + streamingScanCounterMap.put(Operation.FETCH_KEYS, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + streamingScanCounterMap.put(Operation.FETCH_ENTRIES, + new SimpleCounter(STREAMING_STATS_RESET_INTERVAL_MS)); + } + + public StreamingStats(StreamingStats parent) { + this(); + this.parent = parent; + } + + public void reportNetworkTime(Operation op, long networkTimeMs) { + networkTimeCounterMap.get(op).count(networkTimeMs); + if(parent != null) + parent.reportNetworkTime(op, networkTimeMs); + } + + public void 
reportStorageTime(Operation op, long storageTimeMs) { + storageTimeCounterMap.get(op).count(storageTimeMs); + if(parent != null) + parent.reportStorageTime(op, storageTimeMs); + } + + public void reportStreamingFetch(Operation op) { + streamingFetchCounterMap.get(op).count(); + if(parent != null) + parent.reportStreamingFetch(op); + } + + public void reportStreamingScan(Operation op) { + streamingScanCounterMap.get(op).count(); + if(parent != null) + parent.reportStreamingScan(op); + } + + public void reportStreamingPut(Operation op) { + streamingPutCounterMap.get(op).count(); + if(parent != null) + parent.reportStreamingPut(op); + } + + // Mbeans for FETCH_KEYS + @JmxGetter(name = "avgFetchKeysNetworkTimeMs", description = "average time spent on network, for fetch keys") + public double getAvgFetchKeysNetworkTimeMs() { + return networkTimeCounterMap.get(Operation.FETCH_KEYS).getAvgEventValue() / Time.NS_PER_MS; + } + + @JmxGetter(name = "avgFetchKeysStorageTimeMs", description = "average time spent on storage, for fetch keys") + public double getAvgFetchKeysStorageTimeMs() { + return storageTimeCounterMap.get(Operation.FETCH_KEYS).getAvgEventValue() / Time.NS_PER_MS; + } + + @JmxGetter(name = "getFetchKeysFetchRate", description = "rate at which keys are fetched per second") + public double getFetchKeysFetchRate() { + return streamingFetchCounterMap.get(Operation.FETCH_KEYS).getEventRate(); + } + + @JmxGetter(name = "getFetchKeysScanRate", description = "rate at which keys are scanned per second") + public double getFetchKeysScanRate() { + return streamingScanCounterMap.get(Operation.FETCH_KEYS).getEventRate(); + } + + // Mbeans for FETCH_ENTRIES + @JmxGetter(name = "avgFetchEntriesNetworkTimeMs", description = "average time spent on network, for streaming operations") + public double getAvgFetchEntriesNetworkTimeMs() { + return networkTimeCounterMap.get(Operation.FETCH_ENTRIES).getAvgEventValue() + / Time.NS_PER_MS; + } + + @JmxGetter(name = "avgFetchEntriesStorageTimeMs", description = "average time spent on storage, for streaming operations") + public double getAvgFetchEntriesStorageTimeMs() { + return storageTimeCounterMap.get(Operation.FETCH_ENTRIES).getAvgEventValue() + / Time.NS_PER_MS; + } + + @JmxGetter(name = "getFetchEntriesFetchRate", description = "rate at which entries are fetched per second") + public double getFetchEntriesFetchRate() { + return streamingFetchCounterMap.get(Operation.FETCH_ENTRIES).getEventRate(); + } + + @JmxGetter(name = "getFetchEntriesScanRate", description = "rate at which entries are scanned per second") + public double getFetchEntriesScanRate() { + return streamingScanCounterMap.get(Operation.FETCH_ENTRIES).getEventRate(); + } + + // Mbeans for FETCH_FILE + @JmxGetter(name = "getFetchFileFetchRate", description = "rate at which RO files are fetched per second") + public double getFetchFileFetchRate() { + return streamingFetchCounterMap.get(Operation.FETCH_FILE).getEventRate(); + } + + // Mbeans for UPDATE_ENTRIES + @JmxGetter(name = "avgUpdateEntriesNetworkTimeMs", description = "average time spent on network, for streaming operations") + public double getAvgUpdateEntriesNetworkTimeMs() { + return networkTimeCounterMap.get(Operation.UPDATE_ENTRIES).getAvgEventValue() + / Time.NS_PER_MS; + } + + @JmxGetter(name = "avgUpdateEntriesStorageTimeMs", description = "average time spent on storage, for streaming operations") + public double getAvgUpdateEntriesStorageTimeMs() { + return storageTimeCounterMap.get(Operation.UPDATE_ENTRIES).getAvgEventValue() + / 
Time.NS_PER_MS; + } + + @JmxGetter(name = "getUpdateEntriesPutRate", description = "rate at which entries are streaming in per second") + public double getUpdateEntriesPutRate() { + return streamingPutCounterMap.get(Operation.UPDATE_ENTRIES).getEventRate(); + } + + // Mbeans for SLOP_UPDATE + @JmxGetter(name = "avgSlopUpdateNetworkTimeMs", description = "average time spent on network, for streaming operations") + public double getAvgSlopUpdateNetworkTimeMs() { + return networkTimeCounterMap.get(Operation.SLOP_UPDATE).getAvgEventValue() / Time.NS_PER_MS; + } + + @JmxGetter(name = "avgSlopUpdateStorageTimeMs", description = "average time spent on storage, for streaming operations") + public double getAvgSlopUpdateStorageTimeMs() { + return storageTimeCounterMap.get(Operation.SLOP_UPDATE).getAvgEventValue() / Time.NS_PER_MS; + } + + @JmxGetter(name = "getSlopUpdatePutRate", description = "Rate at which slop entries are written to the server per second") + public double getSlopUpdatePutRate() { + return streamingPutCounterMap.get(Operation.SLOP_UPDATE).getEventRate(); + } +} diff --git a/src/java/voldemort/store/system/SystemStoreConstants.java b/src/java/voldemort/store/system/SystemStoreConstants.java index 0013b47d7a..0bd7dfe416 100644 --- a/src/java/voldemort/store/system/SystemStoreConstants.java +++ b/src/java/voldemort/store/system/SystemStoreConstants.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2012 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -20,7 +20,7 @@ import java.util.List; import voldemort.store.StoreDefinition; -import voldemort.utils.RebalanceUtils; +import voldemort.utils.StoreDefinitionUtils; import voldemort.xml.StoreDefinitionsMapper; /** @@ -88,6 +88,6 @@ public static List getAllSystemStoreDefs() { public static StoreDefinition getSystemStoreDef(String name) { List allDefs = getAllSystemStoreDefs(); - return RebalanceUtils.getStoreDefinitionWithName(allDefs, name); + return StoreDefinitionUtils.getStoreDefinitionWithName(allDefs, name); } } \ No newline at end of file diff --git a/src/java/voldemort/store/versioned/InconsistencyResolvingStore.java b/src/java/voldemort/store/versioned/InconsistencyResolvingStore.java index 5a1805902d..d303713461 100644 --- a/src/java/voldemort/store/versioned/InconsistencyResolvingStore.java +++ b/src/java/voldemort/store/versioned/InconsistencyResolvingStore.java @@ -23,6 +23,7 @@ import voldemort.store.DelegatingStore; import voldemort.store.Store; import voldemort.store.StoreCapabilityType; +import voldemort.store.CompositeVoldemortRequest; import voldemort.versioning.InconsistencyResolver; import voldemort.versioning.Versioned; @@ -67,4 +68,25 @@ public Object getCapability(StoreCapabilityType capability) { return super.getCapability(capability); } + @Override + public List> get(CompositeVoldemortRequest request) throws VoldemortException { + if(request.resolveConflicts()) { + return resolver.resolveConflicts(super.get(request)); + } + return super.get(request); + } + + @Override + public Map>> getAll(CompositeVoldemortRequest request) + throws VoldemortException { + Map>> m = super.getAll(request); + if(request.resolveConflicts()) { + for(Map.Entry>> entry: m.entrySet()) { + m.put(entry.getKey(), resolver.resolveConflicts(entry.getValue())); + } + } + + return m; + } + } diff --git a/src/java/voldemort/store/versioned/VersionIncrementingStore.java 
b/src/java/voldemort/store/versioned/VersionIncrementingStore.java index 89f4a5ffe3..eb15aa3b35 100644 --- a/src/java/voldemort/store/versioned/VersionIncrementingStore.java +++ b/src/java/voldemort/store/versioned/VersionIncrementingStore.java @@ -33,8 +33,7 @@ * @param The value type * @param The transforms type */ -public class VersionIncrementingStore extends DelegatingStore implements - Store { +public class VersionIncrementingStore extends DelegatingStore { private final short nodeId; private final Time time; diff --git a/src/java/voldemort/store/views/ViewStorageConfiguration.java b/src/java/voldemort/store/views/ViewStorageConfiguration.java index 35480ca9fa..9c25d00dfc 100644 --- a/src/java/voldemort/store/views/ViewStorageConfiguration.java +++ b/src/java/voldemort/store/views/ViewStorageConfiguration.java @@ -3,6 +3,7 @@ import java.util.List; import voldemort.VoldemortException; +import voldemort.routing.RoutingStrategy; import voldemort.serialization.DefaultSerializerFactory; import voldemort.serialization.SerializerFactory; import voldemort.server.StoreRepository; @@ -34,7 +35,8 @@ public ViewStorageConfiguration(VoldemortConfig config, public void close() {} - public StorageEngine getStore(StoreDefinition storeDef) { + public StorageEngine getStore(StoreDefinition storeDef, + RoutingStrategy strategy) { String name = storeDef.getName(); StoreDefinition def = StoreUtils.getStoreDef(storeDefs, name); String targetName = def.getViewTargetStoreName(); diff --git a/src/java/voldemort/store/views/ViewStorageEngine.java b/src/java/voldemort/store/views/ViewStorageEngine.java index 556cdbb991..205f3246d5 100644 --- a/src/java/voldemort/store/views/ViewStorageEngine.java +++ b/src/java/voldemort/store/views/ViewStorageEngine.java @@ -8,6 +8,7 @@ import voldemort.VoldemortException; import voldemort.annotations.Experimental; import voldemort.serialization.Serializer; +import voldemort.store.AbstractStorageEngine; import voldemort.store.StorageEngine; import voldemort.store.Store; import voldemort.store.StoreCapabilityType; @@ -29,9 +30,8 @@ * */ @Experimental -public class ViewStorageEngine implements StorageEngine { +public class ViewStorageEngine extends AbstractStorageEngine { - private final String name; private final Store serializingStore; private final StorageEngine target; private final Serializer valSerializer; @@ -50,7 +50,7 @@ public ViewStorageEngine(String name, Serializer targetValSerializer, CompressionStrategy valueCompressionStrategy, View valueTrans) { - this.name = name; + super(name); this.target = Utils.notNull(target); this.serializingStore = new SerializingStore(target, targetKeySerializer, @@ -103,10 +103,12 @@ private Versioned inflateValue(Versioned versioned) throws Volde return new Versioned(inflatedData, versioned.getVersion()); } + @Override public boolean delete(ByteArray key, Version version) throws VoldemortException { return target.delete(key, version); } + @Override public List> get(ByteArray key, byte[] transforms) throws VoldemortException { List> values = target.get(key, null); @@ -126,20 +128,19 @@ public List> get(ByteArray key, byte[] transforms) throws Vold return results; } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { return StoreUtils.getAll(this, keys, transforms); } - public String getName() { - return name; - } - + @Override public List getVersions(ByteArray key) { return target.getVersions(key); } + @Override public void put(ByteArray key, Versioned value, byte[] transforms) throws 
VoldemortException { if(valueCompressionStrategy != null) @@ -153,14 +154,27 @@ public void put(ByteArray key, Versioned value, byte[] transforms) target.put(key, result, null); } + @Override public ClosableIterator>> entries() { return new ViewIterator(target.entries()); } + @Override public ClosableIterator keys() { return StoreUtils.keys(entries()); } + @Override + public ClosableIterator>> entries(int partition) { + return new ViewIterator(target.entries(partition)); + } + + @Override + public ClosableIterator keys(int partition) { + return StoreUtils.keys(entries(partition)); + } + + @Override public void truncate() { ViewIterator iterator = new ViewIterator(target.entries()); while(iterator.hasNext()) { @@ -169,6 +183,7 @@ public void truncate() { } } + @Override public Object getCapability(StoreCapabilityType capability) { if(capability == StoreCapabilityType.VIEW_TARGET) return this.target; @@ -176,7 +191,7 @@ public Object getCapability(StoreCapabilityType capability) { return null; } - public void close() throws VoldemortException {} + // public void close() throws VoldemortException {} private byte[] valueFromViewSchema(ByteArray key, byte[] value, byte[] transforms) { return this.targetValSerializer.toBytes(this.view.viewToStore(this.serializingStore, @@ -203,6 +218,7 @@ public ViewIterator(ClosableIterator>> inner) this.inner = inner; } + @Override public void close() { this.inner.close(); } @@ -212,13 +228,19 @@ protected Pair> computeNext() { Pair> p = inner.next(); Versioned newVal = Versioned.value(valueToViewSchema(p.getFirst(), p.getSecond().getValue(), - null), p.getSecond() - .getVersion()); + null), + p.getSecond().getVersion()); return Pair.create(p.getFirst(), newVal); } } + @Override public boolean isPartitionAware() { return target.isPartitionAware(); } + + @Override + public boolean isPartitionScanSupported() { + return target.isPartitionScanSupported(); + } } diff --git a/src/java/voldemort/utils/ByteUtils.java b/src/java/voldemort/utils/ByteUtils.java index 4fb375458a..618559b772 100644 --- a/src/java/voldemort/utils/ByteUtils.java +++ b/src/java/voldemort/utils/ByteUtils.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -25,6 +25,9 @@ import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.binary.Hex; + /** * Utility functions for munging on bytes * @@ -71,17 +74,11 @@ public static MessageDigest getDigest(String algorithm) { * @return The string */ public static String toHexString(byte[] bytes) { - StringBuilder buffer = new StringBuilder(); + return Hex.encodeHexString(bytes); + } - for(byte b: bytes) { - String hex = Integer.toHexString(b & 0xff); - hex = hex.substring(0, Math.min(hex.length(), 2)); - if(hex.length() == 1) { - buffer.append("0"); - } - buffer.append(hex); - } - return buffer.toString(); + public static byte[] fromHexString(String hexString) throws DecoderException { + return Hex.decodeHex(hexString.toCharArray()); } /** @@ -158,6 +155,17 @@ public static short readShort(byte[] bytes, int offset) { return (short) ((bytes[offset] << 8) | (bytes[offset + 1] & 0xff)); } + /** + * Read an unsigned short from the byte array starting at the given offset + * + * @param bytes The byte array to read from + * @param offset The offset to start reading at + * @return The short read + */ + public static int readUnsignedShort(byte[] bytes, int offset) { + return (((bytes[offset] & 0xff) << 8) | (bytes[offset + 1] & 0xff)); + } + /** * Read an int from the byte array starting at the given offset * @@ -229,6 +237,18 @@ public static void writeShort(byte[] bytes, short value, int offset) { bytes[offset + 1] = (byte) (0xFF & value); } + /** + * Write an unsigned short to the byte array starting at the given offset + * + * @param bytes The byte array + * @param value The short to write + * @param offset The offset to begin writing at + */ + public static void writeUnsignedShort(byte[] bytes, int value, int offset) { + bytes[offset] = (byte) (0xFF & (value >> 8)); + bytes[offset + 1] = (byte) (0xFF & value); + } + /** * Write an int to the byte array starting at the given offset * diff --git a/src/java/voldemort/utils/ClusterInstance.java b/src/java/voldemort/utils/ClusterInstance.java new file mode 100644 index 0000000000..c2808250e8 --- /dev/null +++ b/src/java/voldemort/utils/ClusterInstance.java @@ -0,0 +1,271 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.utils; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import voldemort.cluster.Cluster; +import voldemort.store.StoreDefinition; + +import com.google.common.collect.Maps; + +// TODO: Add ClusterInstanceTest unit test for these helper methods. + +/** + * This class wraps up a Cluster object and a List. The methods + * are effectively helper or util style methods for analyzing partitions and so + * on which are a function of both Cluster and List. + */ +public class ClusterInstance { + + // TODO: (refactor) Improve upon the name "ClusterInstance". 
Object-oriented + // meaning of 'instance' is too easily confused with system notion of an + // "instance of a cluster" (the intended usage in this class name). + + private final Cluster cluster; + private final List storeDefs; + + public ClusterInstance(Cluster cluster, List storeDefs) { + this.cluster = cluster; + this.storeDefs = storeDefs; + } + + public Cluster getCluster() { + return cluster; + } + + public List getStoreDefs() { + return storeDefs; + } + + /** + * Wrapper that just returns the max/min ratio metric and throws away the + * verbose string. + */ + public double analyzeBalance() { + Pair analysis = analyzeBalanceVerbose(); + return analysis.getFirst(); + } + + /** + * + * @param nodeIdToPartitionCount + * @param title + * @return + */ + public Pair summarizeBalance(final Map nodeIdToPartitionCount, + String title) { + StringBuilder builder = new StringBuilder(); + Set nodeIds = cluster.getNodeIds(); + + builder.append("\n" + title + "\n"); + int minVal = Integer.MAX_VALUE; + int maxVal = Integer.MIN_VALUE; + int aggCount = 0; + for(Integer nodeId: nodeIds) { + int curCount = nodeIdToPartitionCount.get(nodeId); + builder.append("\tNode ID: " + nodeId + " : " + curCount + " (" + + cluster.getNodeById(nodeId).getHost() + ")\n"); + aggCount += curCount; + if(curCount > maxVal) + maxVal = curCount; + if(curCount < minVal) + minVal = curCount; + } + int avgVal = aggCount / nodeIdToPartitionCount.size(); + double maxAvgRatio = maxVal * 1.0 / avgVal; + if(avgVal == 0) { + maxAvgRatio = maxVal; + } + double maxMinRatio = maxVal * 1.0 / minVal; + if(minVal == 0) { + maxMinRatio = maxVal; + } + builder.append("\tMin: " + minVal + "\n"); + builder.append("\tAvg: " + avgVal + "\n"); + builder.append("\tMax: " + maxVal + "\n"); + builder.append("\t\tMax/Avg: " + maxAvgRatio + "\n"); + builder.append("\t\tMax/Min: " + maxMinRatio + "\n"); + + return Pair.create(maxMinRatio, builder.toString()); + } + + /** + * Outputs an analysis of how balanced the cluster is given the store + * definitions. The metric max/min ratio is used to describe balance. The + * max/min ratio is the ratio of largest number of store-partitions to + * smallest number of store-partitions). If the minimum number of + * store-partitions is zero, then the max/min ratio is set to max rather + * than to infinite. + * + * @return First element of pair is the max/min ratio. Second element of + * pair is a string that can be printed to dump all the gory details + * of the analysis. 
+ */ + public Pair analyzeBalanceVerbose() { + StringBuilder builder = new StringBuilder(); + builder.append(ClusterUtils.verboseClusterDump(cluster)); + + HashMap uniqueStores = KeyDistributionGenerator.getUniqueStoreDefinitionsWithCounts(storeDefs); + List keys = KeyDistributionGenerator.generateKeys(KeyDistributionGenerator.DEFAULT_NUM_KEYS); + Set nodeIds = cluster.getNodeIds(); + Set zoneIds = cluster.getZoneIds(); + + builder.append("PARTITION DUMP\n"); + Map primaryAggNodeIdToPartitionCount = Maps.newHashMap(); + for(Integer nodeId: nodeIds) { + primaryAggNodeIdToPartitionCount.put(nodeId, 0); + } + + Map aggNodeIdToZonePrimaryCount = Maps.newHashMap(); + for(Integer nodeId: nodeIds) { + aggNodeIdToZonePrimaryCount.put(nodeId, 0); + } + + Map allAggNodeIdToPartitionCount = Maps.newHashMap(); + for(Integer nodeId: nodeIds) { + allAggNodeIdToPartitionCount.put(nodeId, 0); + } + + for(StoreDefinition storeDefinition: uniqueStores.keySet()) { + StoreInstance storeInstance = new StoreInstance(cluster, storeDefinition); + + builder.append("\n"); + builder.append("Store exemplar: " + storeDefinition.getName() + "\n"); + builder.append("\tReplication factor: " + storeDefinition.getReplicationFactor() + "\n"); + builder.append("\tRouting strategy: " + storeDefinition.getRoutingStrategyType() + "\n"); + builder.append("\tThere are " + uniqueStores.get(storeDefinition) + + " other similar stores.\n"); + + // Map of node Id to Sets of pairs. Pairs of Integers are of + // + Map>> nodeIdToAllPartitions = RebalanceUtils.getNodeIdToAllPartitions(cluster, + storeDefinition, + true); + Map primaryNodeIdToPartitionCount = Maps.newHashMap(); + Map nodeIdToZonePrimaryCount = Maps.newHashMap(); + Map allNodeIdToPartitionCount = Maps.newHashMap(); + + // Print out all partitions, by replica type, per node + builder.append("\n"); + builder.append("\tDetailed Dump:\n"); + for(Integer nodeId: nodeIds) { + builder.append("\tNode ID: " + nodeId + "in zone " + + cluster.getNodeById(nodeId).getZoneId() + "\n"); + primaryNodeIdToPartitionCount.put(nodeId, 0); + nodeIdToZonePrimaryCount.put(nodeId, 0); + allNodeIdToPartitionCount.put(nodeId, 0); + Set> partitionPairs = nodeIdToAllPartitions.get(nodeId); + + int replicaType = 0; + while(partitionPairs.size() > 0) { + List> replicaPairs = new ArrayList>(); + for(Pair pair: partitionPairs) { + if(pair.getFirst() == replicaType) { + replicaPairs.add(pair); + } + } + List partitions = new ArrayList(); + for(Pair pair: replicaPairs) { + partitionPairs.remove(pair); + partitions.add(pair.getSecond()); + } + java.util.Collections.sort(partitions); + + builder.append("\t\t" + replicaType); + for(int zoneId: zoneIds) { + builder.append(" : z" + zoneId + " : "); + List zonePartitions = new ArrayList(); + for(int partitionId: partitions) { + if(cluster.getPartitionIdsInZone(zoneId).contains(partitionId)) { + zonePartitions.add(partitionId); + } + } + builder.append(zonePartitions.toString()); + + } + builder.append("\n"); + if(replicaType == 0) { + primaryNodeIdToPartitionCount.put(nodeId, + primaryNodeIdToPartitionCount.get(nodeId) + + partitions.size()); + } + + allNodeIdToPartitionCount.put(nodeId, allNodeIdToPartitionCount.get(nodeId) + + partitions.size()); + replicaType++; + } + } + + // Go through all partition IDs and determine which node is "first" + // in the replicating node list for every zone. This determines the + // number of "zone primaries" each node hosts. 
+ for(int partitionId = 0; partitionId < cluster.getNumberOfPartitions(); partitionId++) { + for(int zoneId: zoneIds) { + for(int nodeId: storeInstance.getReplicationNodeList(partitionId)) { + if(cluster.getNodeById(nodeId).getZoneId() == zoneId) { + nodeIdToZonePrimaryCount.put(nodeId, + nodeIdToZonePrimaryCount.get(nodeId) + 1); + break; + } + } + } + } + + builder.append("\n"); + builder.append("\tSummary Dump:\n"); + for(Integer nodeId: nodeIds) { + builder.append("\tNode ID: " + nodeId + " : " + + allNodeIdToPartitionCount.get(nodeId) + "\n"); + primaryAggNodeIdToPartitionCount.put(nodeId, + primaryAggNodeIdToPartitionCount.get(nodeId) + + (primaryNodeIdToPartitionCount.get(nodeId) * uniqueStores.get(storeDefinition))); + aggNodeIdToZonePrimaryCount.put(nodeId, aggNodeIdToZonePrimaryCount.get(nodeId) + + nodeIdToZonePrimaryCount.get(nodeId) + * uniqueStores.get(storeDefinition)); + allAggNodeIdToPartitionCount.put(nodeId, + allAggNodeIdToPartitionCount.get(nodeId) + + (allNodeIdToPartitionCount.get(nodeId) * uniqueStores.get(storeDefinition))); + } + } + + builder.append("\n"); + builder.append("STD DEV ANALYSIS\n"); + builder.append("\n"); + builder.append(KeyDistributionGenerator.printOverallDistribution(cluster, storeDefs, keys)); + builder.append("\n"); + builder.append("\n"); + + Pair summary = summarizeBalance(primaryAggNodeIdToPartitionCount, + "AGGREGATE PRIMARY-PARTITION COUNT (across all stores)"); + builder.append(summary.getSecond()); + + summary = summarizeBalance(aggNodeIdToZonePrimaryCount, + "AGGREGATE ZONEPRIMARY-PARTITION COUNT (across all stores)"); + builder.append(summary.getSecond()); + + summary = summarizeBalance(allAggNodeIdToPartitionCount, + "AGGREGATE NARY-PARTITION COUNT (across all stores)"); + builder.append(summary.getSecond()); + + return new Pair(summary.getFirst(), builder.toString()); + } +} diff --git a/src/java/voldemort/utils/ClusterUtils.java b/src/java/voldemort/utils/ClusterUtils.java new file mode 100644 index 0000000000..54ad791a18 --- /dev/null +++ b/src/java/voldemort/utils/ClusterUtils.java @@ -0,0 +1,400 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.utils; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.log4j.Logger; + +import voldemort.VoldemortException; +import voldemort.cluster.Cluster; +import voldemort.cluster.Node; +import voldemort.cluster.Zone; + +import com.google.common.collect.Maps; + +// TODO: (refactor) Move all of the static "util" methods for which Cluster is +// the only complex type that the method operates on to be members of the +// Cluster class. Unclear whether 'nodeid' and 'partitionid' should be treated +// as complex types since they are proxies for complicated concepts. +/** + * ClusterUtils provides basic tools for manipulating and inspecting a cluster. 
+ * + * Methods in this util module should take exactly one Cluster object, and + * possibly some other minor, simple arguments. A method that takes other + * complicated types such as StoreDefs or RebalancePlans should not be included + * in this module. + */ +public class ClusterUtils { + + private static Logger logger = Logger.getLogger(ClusterUtils.class); + + /** + * Creates a new cluster object that is a copy of currentCluster. + * + * @param currentCluster The current cluster metadata + * @return New cluster metadata which is copy of currentCluster + */ + public static Cluster copyCluster(Cluster currentCluster) { + return new Cluster(currentCluster.getName(), + new ArrayList(currentCluster.getNodes()), + new ArrayList(currentCluster.getZones())); + } + + /** + * Given a cluster and a node id checks if the node exists + * + * @param cluster The cluster metadata to check in + * @param nodeId The node id to search for + * @return True if cluster contains the node id, else false + */ + public static boolean containsNode(Cluster cluster, int nodeId) { + try { + cluster.getNodeById(nodeId); + return true; + } catch(VoldemortException e) { + return false; + } + } + + /** + * Given a preference list and a node id, check if any one of the partitions + * is on the node in picture + * + * @param cluster Cluster metadata + * @param preferenceList Preference list of partition ids + * @param nodeId Node id which we are checking for + * @return True if the preference list contains a node whose id = nodeId + */ + public static boolean containsPreferenceList(Cluster cluster, + List preferenceList, + int nodeId) { + + for(int partition: preferenceList) { + if(getNodeByPartitionId(cluster, partition).getId() == nodeId) { + return true; + } + } + return false; + } + + /** + * Given the cluster metadata returns a mapping of partition to node + * + * @param currentCluster Cluster metadata + * @return Map of partition id to node id + */ + public static Map getCurrentPartitionMapping(Cluster currentCluster) { + + Map partitionToNode = new LinkedHashMap(); + + for(Node node: currentCluster.getNodes()) { + for(Integer partition: node.getPartitionIds()) { + // Check if partition is on another node + Integer previousRegisteredNodeId = partitionToNode.get(partition); + if(previousRegisteredNodeId != null) { + throw new IllegalArgumentException("Partition id " + partition + + " found on two nodes : " + node.getId() + + " and " + previousRegisteredNodeId); + } + + partitionToNode.put(partition, node.getId()); + } + } + + return partitionToNode; + } + + /** + * Returns the Node associated to the provided partition. + * + * @param cluster The cluster in which to find the node + * @param partitionId Partition id for which we want the corresponding node + * @return Node that owns the partition + */ + public static Node getNodeByPartitionId(Cluster cluster, int partitionId) { + for(Node node: cluster.getNodes()) { + if(node.getPartitionIds().contains(partitionId)) { + return node; + } + } + return null; + } + + /** + * Compress contiguous partitions into format "e-i" instead of + * "e, f, g, h, i". This helps illustrate contiguous partitions within a + * zone. 
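+     * 
+     * Illustrative example (hypothetical ids, assuming the zone's partition ids
+     * are iterated in sorted order): partitions {0, 1, 2, 5, 7, 8} are rendered
+     * as "[0-2, 5, 7-8]".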
+ * + * @param cluster + * @param zoneId + * @return + */ + public static String compressedListOfPartitionsInZone(final Cluster cluster, int zoneId) { + Set partitionIds = cluster.getPartitionIdsInZone(zoneId); + if(partitionIds.size() == 0) { + return "[]"; + } + int curLastPartitionId = -1; + int curInitPartitionId = -1; + + String compressedList = "["; + for(int partitionId: partitionIds) { + // Handle initial condition + if(curInitPartitionId == -1) { + curInitPartitionId = partitionId; + curLastPartitionId = partitionId; + continue; + } + // Contiguous partition Id + if(partitionId == curLastPartitionId + 1) { + curLastPartitionId = partitionId; + continue; + } + + // End of (possibly) contiguous partition Ids + if(curInitPartitionId == curLastPartitionId) { + compressedList += curLastPartitionId + ", "; + } else { + compressedList += curInitPartitionId + "-" + curLastPartitionId + ", "; + } + curInitPartitionId = partitionId; + curLastPartitionId = partitionId; + } + // Handle end condition + if(curInitPartitionId == curLastPartitionId) { + compressedList += curLastPartitionId + "]"; + } else { + compressedList += curInitPartitionId + "-" + curLastPartitionId + "]"; + } + + return compressedList; + } + + /** + * Determines a histogram of contiguous runs of partitions within a zone. + * I.e., for each run length of contiguous partitions, how many such runs + * are there. + * + * @param cluster + * @param zoneId + * @return map of length of contiguous run of partitions to count of number + * of such runs. + */ + public static Map getMapOfContiguousPartitionRunLengths(final Cluster cluster, + int zoneId) { + List partitionIds = new ArrayList(cluster.getPartitionIdsInZone(zoneId)); + Map runLengthToCount = Maps.newHashMap(); + + if(partitionIds.isEmpty()) { + return runLengthToCount; + } + + int lastPartitionId = partitionIds.get(0); + int initPartitionId = lastPartitionId; + + for(int offset = 1; offset < partitionIds.size(); offset++) { + int partitionId = partitionIds.get(offset); + if(partitionId == lastPartitionId + 1) { + lastPartitionId = partitionId; + continue; + } + int runLength = lastPartitionId - initPartitionId + 1; + if(!runLengthToCount.containsKey(runLength)) { + runLengthToCount.put(runLength, 0); + } + runLengthToCount.put(runLength, runLengthToCount.get(runLength) + 1); + + initPartitionId = partitionId; + lastPartitionId = initPartitionId; + } + + int runLength = lastPartitionId - initPartitionId; + if(!runLengthToCount.containsKey(runLength)) { + runLengthToCount.put(runLength, 0); + } + runLengthToCount.put(runLength, runLengthToCount.get(runLength) + 1); + + return runLengthToCount; + } + + /** + * Pretty prints the output of getMapOfContiguousPartitionRunLengths + * + * @param cluster + * @param zoneId + * @return + */ + public static String getPrettyMapOfContiguousPartitionRunLengths(final Cluster cluster, + int zoneId) { + Map runLengthToCount = getMapOfContiguousPartitionRunLengths(cluster, + zoneId); + String prettyHistogram = "["; + boolean first = true; + Set runLengths = new TreeSet(runLengthToCount.keySet()); + for(int runLength: runLengths) { + if(first) { + first = false; + } else { + prettyHistogram += ", "; + } + prettyHistogram += "{" + runLength + " : " + runLengthToCount.get(runLength) + "}"; + } + prettyHistogram += "]"; + return prettyHistogram; + } + + /** + * Returns a pretty printed string of nodes that host specific "hot" + * partitions, where hot is defined as following a contiguous run of + * partitions of some length in another zone. 
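+     * 
+     * For example (hypothetical ids, cutoff of 5): if partitions 10-16 form a
+     * contiguous run in some zone and are followed by a gap, partition 17 is
+     * flagged as hot and the node hosting it is reported.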
+ * + * @param cluster The cluster to analyze + * @param hotContiguityCutoff cutoff below which a contiguous run is not + * hot. + * @return + */ + public static String getHotPartitionsDueToContiguity(final Cluster cluster, + int hotContiguityCutoff) { + + StringBuilder sb = new StringBuilder(); + + for(Integer zoneId: cluster.getZoneIds()) { + List partitionIds = new ArrayList(cluster.getPartitionIdsInZone(zoneId)); + + int lastPartitionId = partitionIds.get(0); + int initPartitionId = lastPartitionId; + + for(int offset = 1; offset < partitionIds.size(); offset++) { + int partitionId = partitionIds.get(offset); + if(partitionId == lastPartitionId + 1) { + lastPartitionId = partitionId; + continue; + } + int runLength = lastPartitionId - initPartitionId + 1; + if(runLength > hotContiguityCutoff) { + int hotPartitionId = lastPartitionId + 1; + for(Node node: cluster.getNodes()) { + if(node.getPartitionIds().contains(hotPartitionId)) { + sb.append("\tNode " + node.getId() + " (" + node.getHost() + + ") has hot primary partition " + hotPartitionId + + " that follows contiguous run of length " + runLength + + "\n"); + } + } + } + + initPartitionId = partitionId; + lastPartitionId = initPartitionId; + } + } + + return sb.toString(); + } + + /** + * Prints the details of cluster xml in various formats. Some information is + * repeated in different forms. This is intentional so that it is easy to + * find the specific view of the cluster xml that you want. + * + * @param cluster + * @return + */ + public static String verboseClusterDump(final Cluster cluster) { + StringBuilder builder = new StringBuilder(); + + builder.append("CLUSTER XML SUMMARY\n"); + Map zoneIdToPartitionCount = Maps.newHashMap(); + Map zoneIdToNodeCount = Maps.newHashMap(); + for(Zone zone: cluster.getZones()) { + zoneIdToPartitionCount.put(zone.getId(), 0); + zoneIdToNodeCount.put(zone.getId(), 0); + } + for(Node node: cluster.getNodes()) { + zoneIdToPartitionCount.put(node.getZoneId(), + zoneIdToPartitionCount.get(node.getZoneId()) + + node.getNumberOfPartitions()); + zoneIdToNodeCount.put(node.getZoneId(), zoneIdToNodeCount.get(node.getZoneId()) + 1); + } + builder.append("\n"); + + builder.append("Number of partitions per zone:\n"); + for(Zone zone: cluster.getZones()) { + builder.append("\tZone: " + zone.getId() + " - " + + zoneIdToPartitionCount.get(zone.getId()) + "\n"); + } + builder.append("\n"); + + builder.append("Number of nodes per zone:\n"); + for(Zone zone: cluster.getZones()) { + builder.append("\tZone: " + zone.getId() + " - " + zoneIdToNodeCount.get(zone.getId()) + + "\n"); + } + builder.append("\n"); + + builder.append("Nodes in each zone:\n"); + for(Zone zone: cluster.getZones()) { + builder.append("\tZone: " + zone.getId() + " - " + + cluster.getNodeIdsInZone(zone.getId()) + "\n"); + } + builder.append("\n"); + + builder.append("Number of partitions per node:\n"); + for(Node node: cluster.getNodes()) { + builder.append("\tNode ID: " + node.getId() + " - " + node.getNumberOfPartitions() + + " (" + node.getHost() + ")\n"); + } + builder.append("\n"); + + if(cluster.getZones().size() > 1) { + builder.append("ZONE-PARTITION SUMMARY:\n"); + builder.append("\n"); + + builder.append("Partitions in each zone:\n"); + for(Zone zone: cluster.getZones()) { + builder.append("\tZone: " + + zone.getId() + + " - " + + ClusterUtils.compressedListOfPartitionsInZone(cluster, + zone.getId()) + "\n"); + } + builder.append("\n"); + + builder.append("Contiguous partition run lengths in each zone ('{run length : count}'):\n"); 
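+            // Example of the histogram format (hypothetical counts):
+            // "[{1 : 12}, {2 : 3}, {5 : 1}]" means twelve isolated partitions,
+            // three runs of length two, and one run of length five in that zone.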
+ for(Zone zone: cluster.getZones()) { + builder.append("\tZone: " + + zone.getId() + + " - " + + ClusterUtils.getPrettyMapOfContiguousPartitionRunLengths(cluster, + zone.getId()) + + "\n"); + } + builder.append("\n"); + + builder.append("The following nodes have hot partitions:\n"); + builder.append(ClusterUtils.getHotPartitionsDueToContiguity(cluster, 5)); + builder.append("\n"); + } + + return builder.toString(); + } +} diff --git a/src/java/voldemort/utils/CmdUtils.java b/src/java/voldemort/utils/CmdUtils.java index 9006e9a944..f5a2c2425e 100644 --- a/src/java/voldemort/utils/CmdUtils.java +++ b/src/java/voldemort/utils/CmdUtils.java @@ -36,14 +36,14 @@ public static Set> missing(OptionSet options, OptionSpec... req @SuppressWarnings("unchecked") public static T valueOf(OptionSet options, String opt, T defaultValue) { - if(options.has(opt)) + if(options.has(opt) && options.valueOf(opt) != null) return (T) options.valueOf(opt); else return defaultValue; } public static T valueOf(OptionSet options, OptionSpec opt, T defaultValue) { - if(options.has(opt)) + if(options.has(opt) && options.valueOf(opt) != null) return options.valueOf(opt); else return defaultValue; diff --git a/src/java/voldemort/utils/ConsistencyCheck.java b/src/java/voldemort/utils/ConsistencyCheck.java new file mode 100644 index 0000000000..a039dbfe93 --- /dev/null +++ b/src/java/voldemort/utils/ConsistencyCheck.java @@ -0,0 +1,889 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ +package voldemort.utils; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +import joptsimple.OptionParser; +import joptsimple.OptionSet; + +import org.apache.log4j.Logger; + +import voldemort.VoldemortException; +import voldemort.client.ClientConfig; +import voldemort.client.protocol.admin.AdminClient; +import voldemort.client.protocol.admin.AdminClientConfig; +import voldemort.cluster.Cluster; +import voldemort.cluster.Node; +import voldemort.routing.RoutingStrategyFactory; +import voldemort.store.StoreDefinition; +import voldemort.versioning.Occurred; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Version; +import voldemort.versioning.Versioned; + +public class ConsistencyCheck { + + private static Logger logger = Logger.getLogger(ConsistencyCheck.class); + private final List urls; + private final String storeName; + private final Integer partitionId; + private final Reporter reporter; + + private Integer retentionDays = null; + private Integer replicationFactor = 0; + private Integer requiredWrites = 0; + + private List adminClients; + private List clusterNodeList = new ArrayList(); + private final Map>> keyVersionNodeSetMap = new HashMap>>(); + private RetentionChecker retentionChecker; + private KeyFetchTracker keyFetchTracker; + + public ConsistencyCheck(List urls, + String storeName, + int partitionId, + BufferedWriter badKeyWriter) { + this.urls = urls; + this.storeName = storeName; + this.partitionId = partitionId; + this.reporter = new Reporter(badKeyWriter); + } + + /** + * Connect to the clusters using given urls and start fetching process on + * correct nodes + * + * @throws Exception When no such store is found + */ + + public void connect() throws Exception { + adminClients = new ArrayList(urls.size()); + // bootstrap from two urls + Map clusterMap = new HashMap(urls.size()); + Map storeDefinitionMap = new HashMap(urls.size()); + + for(String url: urls) { + /* connect to cluster through admin port */ + if(logger.isInfoEnabled()) { + logger.info("Connecting to bootstrap server: " + url); + } + AdminClient adminClient = new AdminClient(url, + new AdminClientConfig(), + new ClientConfig(), + 0); + adminClients.add(adminClient); + /* get Cluster */ + Cluster cluster = adminClient.getAdminClientCluster(); + clusterMap.put(url, cluster); + /* get StoreDefinition */ + Versioned> storeDefinitions = adminClient.metadataMgmtOps.getRemoteStoreDefList(0); + StoreDefinition storeDefinition = StoreDefinitionUtils.getStoreDefinitionWithName(storeDefinitions.getValue(), + storeName); + storeDefinitionMap.put(url, storeDefinition); + } + + /* confirm same number of partitions in all clusters. 
*/ + Integer partitionCount = null; + for(Entry entry: clusterMap.entrySet()) { + Integer currentPartitionCount = entry.getValue().getNumberOfPartitions(); + if(partitionCount == null) { + partitionCount = currentPartitionCount; + } + if(partitionCount != currentPartitionCount) { + logger.error("Partition count of different clusters is not the same: " + + partitionCount + " vs " + currentPartitionCount); + throw new VoldemortException("Will not connect because partition counts differ among clusters."); + } + } + + /* calculate nodes to scan */ + for(String url: urls) { + StoreDefinition storeDefinition = storeDefinitionMap.get(url); + Cluster cluster = clusterMap.get(url); + Map partitionToNodeMap = ClusterUtils.getCurrentPartitionMapping(cluster); + + /* find list of nodeId hosting partition */ + List partitionList = new RoutingStrategyFactory().updateRoutingStrategy(storeDefinition, + cluster) + .getReplicatingPartitionList(partitionId); + for(int partition: partitionList) { + Integer nodeId = partitionToNodeMap.get(partition); + Node node = cluster.getNodeById(nodeId); + clusterNodeList.add(new ClusterNode(urls.indexOf(url), node)); + } + } + + /* print config info */ + if(logger.isInfoEnabled()) { + StringBuilder configInfo = new StringBuilder(); + configInfo.append("TYPE,Store,PartitionId,Node,ZoneId,BootstrapUrl\n"); + for(ClusterNode clusterNode: clusterNodeList) { + configInfo.append("CONFIG,"); + configInfo.append(storeName + ","); + configInfo.append(partitionId + ","); + configInfo.append(clusterNode.getNode().getId() + ","); + configInfo.append(clusterNode.getNode().getZoneId() + ","); + configInfo.append(urls.get(clusterNode.getPrefixId()) + "\n"); + } + for(String line: configInfo.toString().split("\n")) { + logger.info(line); + } + } + + /* calculate retention days and more */ + for(String url: urls) { + StoreDefinition storeDefinition = storeDefinitionMap.get(url); + /* retention */ + int storeRetentionDays = 0; + if(storeDefinition.getRetentionDays() != null) { + storeRetentionDays = storeDefinition.getRetentionDays().intValue(); + } + if(retentionDays == null) { + retentionDays = storeRetentionDays; + } + if(retentionDays != storeRetentionDays) { + if(storeRetentionDays != 0 && (storeRetentionDays < retentionDays)) { + retentionDays = storeRetentionDays; + } + logger.warn("Retention-days is not consistent between clusters by urls. 
Will use the shorter."); + } + + /* replication writes */ + replicationFactor += storeDefinition.getReplicationFactor(); + + /* required writes */ + requiredWrites += storeDefinition.getRequiredWrites(); + } + if(replicationFactor != clusterNodeList.size()) { + logger.error("Replication factor is not consistent with number of nodes routed to."); + throw new VoldemortException("Will not connect because replication factor does not accord with number of nodes routed to."); + } + retentionChecker = new RetentionChecker(retentionDays); + } + + /** + * Run consistency check on connected key-value iterators + * + * @return Results in form of ConsistencyCheckStats + */ + public Reporter execute() throws IOException { + Map>>> nodeFetchIteratorMap; + nodeFetchIteratorMap = new HashMap>>>(); + /* start fetch from each node */ + for(ClusterNode clusterNode: clusterNodeList) { + AdminClient adminClient = adminClients.get(clusterNode.getPrefixId()); + List singlePartition = new ArrayList(); + singlePartition.add(partitionId); + if(logger.isDebugEnabled()) { + logger.debug("Start fetch request to Node[" + clusterNode.toString() + + "] for partition[" + partitionId + "] of store[" + storeName + "]"); + } + + Iterator>> fetchIterator; + fetchIterator = adminClient.bulkFetchOps.fetchEntries(clusterNode.getNode().getId(), + storeName, + singlePartition, + null, + false); + nodeFetchIteratorMap.put(clusterNode, fetchIterator); + } + keyFetchTracker = new KeyFetchTracker(clusterNodeList.size()); + + /* start to fetch */ + boolean fetchFinished; + do { + fetchFinished = true; + for(Map.Entry>>> nodeFetchIteratorMapEntry: nodeFetchIteratorMap.entrySet()) { + ClusterNode clusterNode = nodeFetchIteratorMapEntry.getKey(); + Iterator>> fetchIterator = nodeFetchIteratorMapEntry.getValue(); + if(fetchIterator.hasNext()) { + fetchFinished = false; + reporter.recordScans(1); + + Pair> fetchedEntry = fetchIterator.next(); + ByteArray key = fetchedEntry.getFirst(); + Versioned versioned = fetchedEntry.getSecond(); + + // record fetch + recordFetch(clusterNode, key, versioned); + + // try sweep last key fetched by this iterator + keyFetchTracker.recordFetch(clusterNode, key); + if(logger.isTraceEnabled()) { + logger.trace("fetched " + new String(key.get())); + logger.trace("map has keys: " + keyVersionNodeSetMap.size()); + } + trySweepAll(); + if(logger.isTraceEnabled()) { + logger.trace("sweeped; keys left: " + keyVersionNodeSetMap.size()); + } + } + } + + // stats reporting + if(logger.isInfoEnabled()) { + String report = reporter.tryProgressReport(); + if(report != null) { + for(String line: report.split("\n")) { + logger.info(line); + } + } + } + } while(!fetchFinished); + + /* adminClient shutdown */ + for(AdminClient adminClient: adminClients) { + if(adminClient != null) { + adminClient.close(); + } + } + + // clean keys not sufficient for write + cleanIneligibleKeys(keyVersionNodeSetMap, requiredWrites); + + keyFetchTracker.finishAll(); + trySweepAll(); + + reporter.processInconsistentKeys(storeName, partitionId, keyVersionNodeSetMap); + + return reporter; + } + + public void trySweepAll() { + for(ByteArray finishedKey = keyFetchTracker.nextFinished(); finishedKey != null; finishedKey = keyFetchTracker.nextFinished()) { + if(keyVersionNodeSetMap.containsKey(finishedKey)) { + ConsistencyLevel level = determineConsistency(keyVersionNodeSetMap.get(finishedKey), + replicationFactor); + if(level == ConsistencyLevel.FULL || level == ConsistencyLevel.LATEST_CONSISTENT) { + keyVersionNodeSetMap.remove(finishedKey); + 
reporter.recordGoodKey(1); + } + } + } + } + + public void recordFetch(ClusterNode clusterNode, ByteArray key, Versioned versioned) { + Version version; + if(urls.size() == 1) { + version = versioned.getVersion(); + } else { + version = new HashedValue(versioned); + } + + // skip version if expired + if(retentionChecker.isExpired(version)) { + reporter.recordExpired(1); + return; + } + + // initialize key -> Map> + if(!keyVersionNodeSetMap.containsKey(key)) { + keyVersionNodeSetMap.put(key, new HashMap>()); + } + Map> versionNodeSetMap = keyVersionNodeSetMap.get(key); + + // check existing version + if(!versionNodeSetMap.containsKey(version) && versionNodeSetMap.size() != 0) { + // if this version is new, sweep old version + // if this version is old, ignore this version + Version oneExistingVersion = versionNodeSetMap.keySet().iterator().next(); + if(version.compare(oneExistingVersion) == Occurred.AFTER) { + versionNodeSetMap.clear(); + } else if(oneExistingVersion.compare(version) == Occurred.AFTER) { + return; + } + } + + if(!versionNodeSetMap.containsKey(version)) { + // insert nodeSet into the map + versionNodeSetMap.put(version, new HashSet()); + } + + // add node to set + versionNodeSetMap.get(version).add(clusterNode); + } + + /** + * A class to track what keys have been fetched and what keys will not + * appear any more. It is used to detect keys that will not show up any more + * so that existing versions can be processed. + */ + protected static class KeyFetchTracker { + + private final Integer fetcherCount; + Map> fullyFetchedKeyMap = new HashMap>(); + Map lastFetchedKey = new HashMap(); + List fullyFetchedKeys = new LinkedList(); + + public KeyFetchTracker(Integer fetcherCount) { + this.fetcherCount = fetcherCount; + } + + /** + * Record a fetched result + * + * @param clusterNode The clusterNode from which the key has been + * fetched + * @param key The key itself + */ + public void recordFetch(ClusterNode clusterNode, ByteArray key) { + if(lastFetchedKey.containsKey(clusterNode)) { + ByteArray lastKey = lastFetchedKey.get(clusterNode); + if(!key.equals(lastKey)) { + if(!fullyFetchedKeyMap.containsKey(lastKey)) { + fullyFetchedKeyMap.put(lastKey, new HashSet()); + } + Set lastKeyIterSet = fullyFetchedKeyMap.get(lastKey); + lastKeyIterSet.add(clusterNode); + + // sweep if fully fetched by all iterators + if(lastKeyIterSet.size() == fetcherCount) { + fullyFetchedKeys.add(lastKey); + fullyFetchedKeyMap.remove(lastKey); + } + } + } + // remember key fetch states + lastFetchedKey.put(clusterNode, key); + } + + /** + * mark all keys appeared as finished So that they are all in the + * finished keys queue + */ + public void finishAll() { + Set keySet = new HashSet(); + keySet.addAll(fullyFetchedKeyMap.keySet()); + keySet.addAll(lastFetchedKey.values()); + fullyFetchedKeys.addAll(keySet); + fullyFetchedKeyMap.clear(); + } + + /** + * Get a key that are completed in fetching + * + * @return key considered finished; otherwise null + */ + public ByteArray nextFinished() { + if(fullyFetchedKeys.size() > 0) { + return fullyFetchedKeys.remove(0); + } else { + return null; + } + } + } + + protected enum ConsistencyLevel { + FULL, + LATEST_CONSISTENT, + INCONSISTENT, + EXPIRED, + INSUFFICIENT_WRITE + } + + /** + * Used to track nodes that may share the same nodeId in different clusters + * + */ + protected static class ClusterNode { + + private final Integer clusterId; + private final Node node; + + /** + * @param clusterId a prefix to be associated different clusters + * @param node the 
real node + */ + public ClusterNode(Integer clusterId, Node node) { + this.clusterId = clusterId; + this.node = node; + } + + public Integer getPrefixId() { + return clusterId; + } + + public Node getNode() { + return node; + } + + @Override + public boolean equals(Object o) { + if(this == o) + return true; + if(!(o instanceof ClusterNode)) + return false; + + ClusterNode n = (ClusterNode) o; + return clusterId.equals(n.getPrefixId()) && node.equals(n.getNode()); + } + + @Override + public String toString() { + return clusterId + "." + node.getId(); + } + + } + + /** + * A class to save version and value hash It is used to compare versions by + * the value hash + * + */ + protected static class HashedValue implements Version { + + final private Version innerVersion; + final private Integer valueHash; + + /** + * @param versioned Versioned value with version information and value + * itself + */ + public HashedValue(Versioned versioned) { + innerVersion = versioned.getVersion(); + valueHash = new FnvHashFunction().hash(versioned.getValue()); + } + + public int getValueHash() { + return valueHash; + } + + public Version getInner() { + return innerVersion; + } + + @Override + public boolean equals(Object object) { + if(this == object) + return true; + if(object == null) + return false; + if(!object.getClass().equals(HashedValue.class)) + return false; + HashedValue hash = (HashedValue) object; + boolean result = valueHash.equals(hash.getValueHash()); + return result; + } + + @Override + public int hashCode() { + return valueHash; + } + + @Override + public Occurred compare(Version v) { + return Occurred.CONCURRENTLY; // always regard as conflict + } + } + + /** + * A checker to determine if a key is to be cleaned according to retention + * policy + * + */ + protected static class RetentionChecker { + + final private long bufferTimeSeconds = 600; // expire N seconds earlier + final private long expiredTimeMs; + + /** + * @param days number of days ago from now to retain keys + */ + public RetentionChecker(int days) { + if(days <= 0) { + expiredTimeMs = 0; + } else { + long nowMs = System.currentTimeMillis(); + long expirationTimeS = TimeUnit.DAYS.toSeconds(days) - bufferTimeSeconds; + expiredTimeMs = nowMs - TimeUnit.SECONDS.toMillis(expirationTimeS); + } + } + + /** + * Determine if a version is expired + * + * @param v version to be checked + * @return if the version is expired according to retention policy + */ + public boolean isExpired(Version v) { + if(v instanceof VectorClock) { + return ((VectorClock) v).getTimestamp() < expiredTimeMs; + } else if(v instanceof HashedValue) { + return false; + } else { + logger.error("Version type is not supported for checking expiration"); + throw new VoldemortException("Version type is not supported for checking expiration" + + v.getClass().getCanonicalName()); + } + } + } + + /** + * Used to report bad keys, progress, and statistics + * + */ + protected static class Reporter { + + final BufferedWriter badKeyWriter; + final long reportPeriodMs; + + long lastReportTimeMs = 0; + long numRecordsScanned = 0; + long numRecordsScannedLast = 0; + long numExpiredRecords = 0; + long numGoodKeys = 0; + long numTotalKeys = 0; + + /** + * Will output progress reports every 5 seconds. + * + * @param badKeyWriter Writer to which to output bad keys. Null is OK. + */ + public Reporter(BufferedWriter badKeyWriter) { + this(badKeyWriter, 5000); + } + + /** + * @param badKeyWriter Writer to which to output bad keys. Null is OK. 
+ * @param intervalMs Milliseconds between progress reports. + */ + public Reporter(BufferedWriter badKeyWriter, long intervalMs) { + this.badKeyWriter = badKeyWriter; + this.reportPeriodMs = intervalMs; + } + + public void recordScans(long count) { + numRecordsScanned += count; + } + + public void recordExpired(long count) { + numExpiredRecords += count; + } + + public String tryProgressReport() { + if(System.currentTimeMillis() > lastReportTimeMs + reportPeriodMs) { + long currentTimeMs = System.currentTimeMillis(); + StringBuilder s = new StringBuilder(); + s.append("=====Progress=====\n"); + s.append("Records Scanned: " + numRecordsScanned + "\n"); + s.append("Records Ignored: " + numExpiredRecords + " (Out of Retention)\n"); + s.append("Last Fetch Rate: " + (numRecordsScanned - numRecordsScannedLast) + / ((currentTimeMs - lastReportTimeMs) / 1000) + " (records/s)\n"); + lastReportTimeMs = currentTimeMs; + numRecordsScannedLast = numRecordsScanned; + return s.toString(); + } else { + return null; + } + } + + public void processInconsistentKeys(String storeName, + Integer partitionId, + Map>> keyVersionNodeSetMap) + throws IOException { + if(logger.isDebugEnabled()) { + logger.debug("TYPE,Store,ParId,Key,ServerSet,VersionTS,VectorClock[,ValueHash]"); + } + for(Map.Entry>> entry: keyVersionNodeSetMap.entrySet()) { + ByteArray key = entry.getKey(); + if(badKeyWriter != null) { + badKeyWriter.write(ByteUtils.toHexString(key.get()) + "\n"); + } + if(logger.isDebugEnabled()) { + Map> versionMap = entry.getValue(); + logger.debug(keyVersionToString(key, versionMap, storeName, partitionId)); + } + } + + recordInconsistentKey(keyVersionNodeSetMap.size()); + } + + public void recordGoodKey(long count) { + numGoodKeys += count; + numTotalKeys += count; + } + + public void recordInconsistentKey(long count) { + numTotalKeys += count; + } + } + + /** + * Return args parser + * + * @return program parser + * */ + private static OptionParser getParser() { + /* parse options */ + OptionParser parser = new OptionParser(); + parser.accepts("help", "print help information"); + parser.accepts("urls", "[REQUIRED] bootstrap URLs") + .withRequiredArg() + .describedAs("bootstrap-url") + .withValuesSeparatedBy(',') + .ofType(String.class); + parser.accepts("partitions", "partition-id") + .withRequiredArg() + .describedAs("partition-id") + .withValuesSeparatedBy(',') + .ofType(Integer.class); + parser.accepts("store", "store name") + .withRequiredArg() + .describedAs("store-name") + .ofType(String.class); + parser.accepts("bad-key-file", "File name to which inconsistent keys are to be written.") + .withRequiredArg() + .describedAs("badKeyFileOut") + .ofType(String.class); + return parser; + } + + /** + * Print Usage to STDOUT + */ + private static void printUsage() { + StringBuilder help = new StringBuilder(); + help.append("ConsistencyCheck Tool\n"); + help.append(" Scan partitions of a store by bootstrap url(s) for consistency and\n"); + help.append(" output inconsistent keys to a file.\n"); + help.append("Options:\n"); + help.append(" Required:\n"); + help.append(" --partitions [,...]\n"); + help.append(" --urls [,...]\n"); + help.append(" --store \n"); + help.append(" --bad-key-file \n"); + help.append(" Optional:\n"); + help.append(" --help\n"); + help.append(" Note:\n"); + help.append(" If you have two or more clusters to scan for consistency across them,\n"); + help.append(" You will need to supply multiple bootstrap urls, one for each cluster.\n"); + help.append(" When multiple urls are used, all 
versions are considered as concurrent.\n"); + help.append(" Versioned objects from different nodes are identified by value hashes,\n"); + help.append(" instead of VectorClocks\n"); + help.append(" If specified clusters do not have the same number of partitions, \n"); + help.append(" checking will fail.\n"); + System.out.print(help.toString()); + } + + /** + * Determine the consistency level of a key + * + * @param versionNodeSetMap A map that maps version to set of PrefixNodes + * @param replicationFactor Total replication factor for the set of clusters + * @return ConsistencyLevel Enum + */ + public static ConsistencyLevel determineConsistency(Map> versionNodeSetMap, + int replicationFactor) { + boolean fullyConsistent = true; + Version latestVersion = null; + for(Map.Entry> versionNodeSetEntry: versionNodeSetMap.entrySet()) { + Version version = versionNodeSetEntry.getKey(); + if(version instanceof VectorClock) { + if(latestVersion == null + || ((VectorClock) latestVersion).getTimestamp() < ((VectorClock) version).getTimestamp()) { + latestVersion = version; + } + } + Set nodeSet = versionNodeSetEntry.getValue(); + fullyConsistent = fullyConsistent && (nodeSet.size() == replicationFactor); + } + if(fullyConsistent) { + return ConsistencyLevel.FULL; + } else { + // latest write consistent, effectively consistent + if(latestVersion != null + && versionNodeSetMap.get(latestVersion).size() == replicationFactor) { + return ConsistencyLevel.LATEST_CONSISTENT; + } + // all other states inconsistent + return ConsistencyLevel.INCONSISTENT; + } + } + + /** + * Determine if a key version is invalid by comparing the version's + * existance and required writes configuration + * + * @param keyVersionNodeSetMap A map that contains keys mapping to a map + * that maps versions to set of PrefixNodes + * @param requiredWrite Required Write configuration + */ + public static void cleanIneligibleKeys(Map>> keyVersionNodeSetMap, + int requiredWrite) { + Set keysToDelete = new HashSet(); + for(Map.Entry>> entry: keyVersionNodeSetMap.entrySet()) { + Set versionsToDelete = new HashSet(); + + ByteArray key = entry.getKey(); + Map> versionNodeSetMap = entry.getValue(); + // mark version for deletion if not enough writes + for(Map.Entry> versionNodeSetEntry: versionNodeSetMap.entrySet()) { + Set nodeSet = versionNodeSetEntry.getValue(); + if(nodeSet.size() < requiredWrite) { + versionsToDelete.add(versionNodeSetEntry.getKey()); + } + } + // delete versions + for(Version v: versionsToDelete) { + versionNodeSetMap.remove(v); + } + // mark key for deletion if no versions left + if(versionNodeSetMap.size() == 0) { + keysToDelete.add(key); + } + } + // delete keys + for(ByteArray k: keysToDelete) { + keyVersionNodeSetMap.remove(k); + } + } + + @SuppressWarnings("unchecked") + public static void main(String[] args) throws Exception { + OptionSet options = getParser().parse(args); + + /* validate options */ + if(options.hasArgument("help")) { + printUsage(); + return; + } + if(!options.hasArgument("urls") || !options.hasArgument("partitions") + || !options.hasArgument("store") || !options.hasArgument("bad-key-file")) { + printUsage(); + return; + } + + List urls = (List) options.valuesOf("urls"); + String storeName = (String) options.valueOf("store"); + List partitionIds = (List) options.valuesOf("partitions"); + String badKeyFile = (String) options.valueOf("bad-key-file"); + + BufferedWriter badKeyWriter = null; + try { + badKeyWriter = new BufferedWriter(new FileWriter(badKeyFile)); + } catch(IOException e) { + 
Utils.croak("Failure to open output file : " + e.getMessage()); + } + + Map partitionStatsMap = new HashMap(); + /* scan each partitions */ + try { + for(Integer partitionId: partitionIds) { + ConsistencyCheck checker = new ConsistencyCheck(urls, + storeName, + partitionId, + badKeyWriter); + checker.connect(); + Reporter reporter = checker.execute(); + partitionStatsMap.put(partitionId, reporter); + } + } catch(Exception e) { + Utils.croak("Exception during consistency checking : " + e.getMessage()); + } finally { + badKeyWriter.close(); + } + + /* print stats */ + StringBuilder statsString = new StringBuilder(); + long totalGoodKeys = 0; + long totalTotalKeys = 0; + // each partition + statsString.append("TYPE,Store,ParitionId,KeysConsistent,KeysTotal,Consistency\n"); + for(Map.Entry entry: partitionStatsMap.entrySet()) { + Integer partitionId = entry.getKey(); + Reporter reporter = entry.getValue(); + totalGoodKeys += reporter.numGoodKeys; + totalTotalKeys += reporter.numTotalKeys; + statsString.append("STATS,"); + statsString.append(storeName + ","); + statsString.append(partitionId + ","); + statsString.append(reporter.numGoodKeys + ","); + statsString.append(reporter.numTotalKeys + ","); + statsString.append((double) (reporter.numGoodKeys) / (double) reporter.numTotalKeys); + statsString.append("\n"); + } + // all partitions + statsString.append("STATS,"); + statsString.append(storeName + ","); + statsString.append("aggregate,"); + statsString.append(totalGoodKeys + ","); + statsString.append(totalTotalKeys + ","); + statsString.append((double) (totalGoodKeys) / (double) totalTotalKeys); + statsString.append("\n"); + + for(String line: statsString.toString().split("\n")) { + logger.info(line); + } + } + + /** + * Convert a key-version-nodeSet information to string + * + * @param key The key + * @param versionMap mapping versions to set of PrefixNodes + * @param storeName store's name + * @param partitionId partition scanned + * @return a string that describe the information passed in + */ + public static String keyVersionToString(ByteArray key, + Map> versionMap, + String storeName, + Integer partitionId) { + StringBuilder record = new StringBuilder(); + for(Map.Entry> versionSet: versionMap.entrySet()) { + Version version = versionSet.getKey(); + Set nodeSet = versionSet.getValue(); + + record.append("BAD_KEY,"); + record.append(storeName + ","); + record.append(partitionId + ","); + record.append(ByteUtils.toHexString(key.get()) + ","); + record.append(nodeSet.toString().replace(", ", ";") + ","); + if(version instanceof VectorClock) { + record.append(((VectorClock) version).getTimestamp() + ","); + record.append(version.toString() + .replaceAll(", ", ";") + .replaceAll(" ts:[0-9]*", "") + .replaceAll("version\\((.*)\\)", "[$1]")); + } + if(version instanceof HashedValue) { + Integer hashValue = ((HashedValue) version).getValueHash(); + Version realVersion = ((HashedValue) version).getInner(); + record.append(((VectorClock) realVersion).getTimestamp() + ","); + record.append(realVersion.toString() + .replaceAll(", ", ";") + .replaceAll(" ts:[0-9]*", "") + .replaceAll("version\\((.*)\\)", "[$1],")); + record.append(hashValue); + } + } + return record.toString(); + } + +} diff --git a/src/java/voldemort/utils/ConsistencyFix.java b/src/java/voldemort/utils/ConsistencyFix.java new file mode 100644 index 0000000000..9615b0cdbc --- /dev/null +++ b/src/java/voldemort/utils/ConsistencyFix.java @@ -0,0 +1,656 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.utils; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.RejectedExecutionHandler; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; + +import org.apache.log4j.Logger; + +import voldemort.client.ClientConfig; +import voldemort.client.protocol.admin.AdminClient; +import voldemort.client.protocol.admin.AdminClientConfig; +import voldemort.client.protocol.admin.QueryKeyResult; +import voldemort.cluster.Cluster; +import voldemort.store.StoreDefinition; +import voldemort.versioning.ClockEntry; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Versioned; + +// TODO: (refactor) Move to new directory voldemort/tools. Also move +// ConsistencyCheck, Rebalance, and possibly other tools (shells and so on). +// This would reduce the amount of different stuff in the utils directory. 
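+// Typical driver flow, as wired up by ConsistencyFixCLI further below. This is
+// only an illustrative sketch with placeholder argument values:
+//
+//     ConsistencyFix fixer = new ConsistencyFix("tcp://host:6666", "test-store",
+//                                               1000,    // progress period (ops)
+//                                               100,     // per-server QPS limit
+//                                               false,   // dryRun
+//                                               false);  // parseOnly
+//     String summary = fixer.execute(8, "bad-keys.in", false, "bad-keys.out");
+//     fixer.close();
+//     System.out.println(summary);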
+public class ConsistencyFix { + + private static final Logger logger = Logger.getLogger(ConsistencyFix.class); + + private final String storeName; + private final AdminClient adminClient; + private final StoreInstance storeInstance; + private final Stats stats; + private final long perServerQPSLimit; + private final ConcurrentMap putThrottlers; + private final boolean dryRun; + private final boolean parseOnly; + + ConsistencyFix(String url, + String storeName, + long progressBar, + long perServerQPSLimit, + boolean dryRun, + boolean parseOnly) { + this.storeName = storeName; + logger.info("Connecting to bootstrap server: " + url); + this.adminClient = new AdminClient(url, new AdminClientConfig(), new ClientConfig(), 0); + Cluster cluster = adminClient.getAdminClientCluster(); + logger.info("Cluster determined to be: " + cluster.getName()); + + Versioned> storeDefinitions = adminClient.metadataMgmtOps.getRemoteStoreDefList(0); + List storeDefs = storeDefinitions.getValue(); + StoreDefinition storeDefinition = StoreDefinitionUtils.getStoreDefinitionWithName(storeDefs, + storeName); + logger.info("Store definition for store " + storeName + " has been determined."); + + storeInstance = new StoreInstance(cluster, storeDefinition); + + stats = new Stats(progressBar); + + this.perServerQPSLimit = perServerQPSLimit; + this.putThrottlers = new ConcurrentHashMap(); + this.dryRun = dryRun; + this.parseOnly = parseOnly; + } + + public String getStoreName() { + return storeName; + } + + public StoreInstance getStoreInstance() { + return storeInstance; + } + + public AdminClient getAdminClient() { + return adminClient; + } + + public void close() { + adminClient.close(); + } + + public Stats getStats() { + return stats; + } + + public boolean isDryRun() { + return dryRun; + } + + public boolean isParseOnly() { + return parseOnly; + } + + /** + * Throttle put (repair) activity per server. + * + * @param nodeId The node for which to possibly throttle put activity. 
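+     *            Throttlers are created lazily, one per node id, the first
+     *            time a repair put is routed to that node.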
+ */ + public void maybePutThrottle(int nodeId) { + if(!putThrottlers.containsKey(nodeId)) { + putThrottlers.putIfAbsent(nodeId, new EventThrottler(perServerQPSLimit)); + } + putThrottlers.get(nodeId).maybeThrottle(1); + } + + /** + * Status of the repair of a specific "bad key" + */ + public enum Status { + SUCCESS("success"), + BAD_INIT("bad initialization of fix key"), + FETCH_EXCEPTION("exception during fetch"), + REPAIR_EXCEPTION("exception during repair"); + + private final String name; + + private Status(String name) { + this.name = name; + } + + @Override + public String toString() { + return name; + } + } + + public String execute(int parallelism, + String badKeyFileIn, + boolean orphanFormat, + String badKeyFileOut) { + ExecutorService badKeyReaderService; + ExecutorService badKeyWriterService; + ExecutorService consistencyFixWorkers; + + // Create BadKeyWriter thread + BlockingQueue badKeyQOut = new ArrayBlockingQueue(parallelism * 10); + badKeyWriterService = Executors.newSingleThreadExecutor(); + badKeyWriterService.submit(new BadKeyWriter(badKeyFileOut, badKeyQOut)); + logger.info("Created badKeyWriter."); + + // Create ConsistencyFixWorker thread pool + BlockingQueue blockingQ = new ArrayBlockingQueue(parallelism); + RejectedExecutionHandler rejectedExecutionHandler = new ThreadPoolExecutor.CallerRunsPolicy(); + consistencyFixWorkers = new ThreadPoolExecutor(parallelism, + parallelism, + 0L, + TimeUnit.MILLISECONDS, + blockingQ, + rejectedExecutionHandler); + logger.info("Created ConsistencyFixWorker pool."); + + // Create BadKeyReader thread + CountDownLatch allBadKeysReadLatch = new CountDownLatch(1); + badKeyReaderService = Executors.newSingleThreadExecutor(); + BadKeyReader badKeyReader = null; + if(!orphanFormat) { + badKeyReader = new BadKeyReader(allBadKeysReadLatch, + badKeyFileIn, + this, + consistencyFixWorkers, + badKeyQOut); + } else { + badKeyReader = new BadKeyOrphanReader(allBadKeysReadLatch, + badKeyFileIn, + this, + consistencyFixWorkers, + badKeyQOut); + } + badKeyReaderService.submit(badKeyReader); + + logger.info("Created badKeyReader."); + + try { + allBadKeysReadLatch.await(); + + badKeyReaderService.shutdown(); + badKeyReaderService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); + logger.info("Bad key reader service has shutdown."); + + consistencyFixWorkers.shutdown(); + consistencyFixWorkers.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); + logger.info("All workers have shutdown."); + + // Poison the bad key writer to have it exit. + badKeyQOut.put(new BadKeyStatus()); + badKeyWriterService.shutdown(); + badKeyWriterService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); + logger.info("Bad key writer service has shutdown."); + } catch(InterruptedException e) { + logger.error("InterruptedException caught."); + if(logger.isDebugEnabled()) { + e.printStackTrace(); + } + } finally { + adminClient.close(); + } + + // Cobble together a status string for overall execution. 
+ StringBuilder sb = new StringBuilder(); + sb.append("\n\n"); + sb.append("Exit statuses of various threads:\n"); + sb.append("\tBadKeyReader: "); + if(badKeyReader.hasException()) { + sb.append("Had exception!\n"); + } else { + sb.append("OK.\n"); + } + sb.append("\tBadKeyWriter: "); + if(badKeyReader.hasException()) { + sb.append("Had exception!\n"); + } else { + sb.append("OK.\n"); + } + sb.append("\n\n"); + sb.append(stats.summary()); + + return sb.toString(); + } + + /** + * Type with which to wrap a "bad key" + */ + public static class BadKey { + + private final String keyInHexFormat; + private final String readerInput; + + BadKey(String keyInHexFormat, String readerInput) { + this.keyInHexFormat = keyInHexFormat; + this.readerInput = readerInput; + } + + public String getKeyInHexFormat() { + return keyInHexFormat; + } + + public String getReaderInput() { + return readerInput; + } + } + + /** + * Type with which to wrap a "bad key" that could not be repaired and so + * needs to be written to output file. Has a "poison" value to effectively + * signal end-of-stream. + */ + public static class BadKeyStatus { + + private final BadKey badKey; + private final Status status; + private final boolean poison; + + /** + * Common case constructor. + */ + BadKeyStatus(BadKey badKey, Status fixKeyResult) { + this.badKey = badKey; + this.status = fixKeyResult; + this.poison = false; + } + + /** + * Constructs a "poison" object. + */ + BadKeyStatus() { + this.badKey = null; + this.status = null; + this.poison = true; + } + + public boolean isPoison() { + return poison; + } + + public BadKey getBadKey() { + return badKey; + } + + public Status getStatus() { + return status; + } + } + + public static class BadKeyReader implements Runnable { + + protected final CountDownLatch latch; + protected final String badKeyFileIn; + + protected final ConsistencyFix consistencyFix; + protected final ExecutorService consistencyFixWorkers; + protected final BlockingQueue badKeyQOut; + + protected BufferedReader fileReader; + protected boolean hasException; + + BadKeyReader(CountDownLatch latch, + String badKeyFileIn, + ConsistencyFix consistencyFix, + ExecutorService consistencyFixWorkers, + BlockingQueue badKeyQOut) { + this.latch = latch; + this.badKeyFileIn = badKeyFileIn; + + this.consistencyFix = consistencyFix; + this.consistencyFixWorkers = consistencyFixWorkers; + this.badKeyQOut = badKeyQOut; + + try { + this.fileReader = new BufferedReader(new FileReader(badKeyFileIn)); + } catch(IOException e) { + Utils.croak("Failure to open input stream: " + e.getMessage()); + } + + this.hasException = false; + } + + @Override + public void run() { + try { + int counter = 0; + for(String keyLine = fileReader.readLine(); keyLine != null; keyLine = fileReader.readLine()) { + BadKey badKey = new BadKey(keyLine.trim(), keyLine); + if(!keyLine.isEmpty()) { + counter++; + logger.debug("BadKeyReader read line: key (" + keyLine + ") and counter (" + + counter + ")"); + if(!consistencyFix.isParseOnly()) { + consistencyFixWorkers.submit(new ConsistencyFixWorker(badKey, + consistencyFix, + badKeyQOut)); + } + } + } + } catch(IOException ioe) { + logger.error("IO exception reading badKeyFile " + badKeyFileIn + " : " + + ioe.getMessage()); + hasException = true; + } finally { + latch.countDown(); + try { + fileReader.close(); + } catch(IOException ioe) { + logger.warn("IOException during fileReader.close in BadKeyReader thread."); + } + } + } + + boolean hasException() { + return hasException; + } + } + + public static class 
BadKeyOrphanReader extends BadKeyReader { + + BadKeyOrphanReader(CountDownLatch latch, + String badKeyFileIn, + ConsistencyFix consistencyFix, + ExecutorService consistencyFixWorkers, + BlockingQueue badKeyQOut) { + super(latch, badKeyFileIn, consistencyFix, consistencyFixWorkers, badKeyQOut); + } + + /** + * Parses a "version" string of the following format: + * + * 'version(2:25, 25:2, 29:156) ts:1355451322089' + * + * and converts this parsed value back into a VectorClock type. Note + * that parsing is white space sensitive. I.e., trim the string first + * and make skippy sure that the white space matches the above. + * + * This method should not be necessary. VectorClock.toBytes() should be + * used for serialization, *not* VectorClock.toString(). VectorClocks + * serialized via toBytes can be deserialized via VectorClock(byte[]). + * + * @param versionString + * @return + * @throws IOException + */ + @Deprecated + private VectorClock parseVersion(String versionString) throws IOException { + List versions = new ArrayList(); + long timestamp = 0; + + String parsed[] = versionString.split(" ts:"); + logger.trace("parsed[0]: " + parsed[0]); + if(parsed.length != 2) { + throw new IOException("Could not parse vector clock: " + versionString); + } + timestamp = Long.parseLong(parsed[1]); + // "version(" + // _01234567_ + // => 8 is the magic offset to elide "version(" + // '-1' gets rid of the last ")" + String clockEntryList = parsed[0].substring(8, parsed[0].length() - 1); + logger.trace("clockEntryList: <" + clockEntryList + ">"); + String parsedClockEntryList[] = clockEntryList.split(", "); + for(int i = 0; i < parsedClockEntryList.length; ++i) { + logger.trace("parsedClockEntry... : <" + parsedClockEntryList[i] + ">"); + String parsedClockEntry[] = parsedClockEntryList[i].split(":"); + if(parsedClockEntry.length != 2) { + throw new IOException("Could not parse ClockEntry: <" + parsedClockEntryList[i] + + ">"); + } + short nodeId = Short.parseShort(parsedClockEntry[0]); + long version = Long.parseLong(parsedClockEntry[1]); + logger.trace("clock entry parsed: <" + nodeId + "> : <" + version + ">"); + versions.add(new ClockEntry(nodeId, version)); + } + + return new VectorClock(versions, timestamp); + } + + @Override + public void run() { + try { + int counter = 0; + for(String keyNumValsLine = fileReader.readLine(); keyNumValsLine != null; keyNumValsLine = fileReader.readLine()) { + String badKeyEntry = keyNumValsLine; + + String keyNumVals = keyNumValsLine.trim(); + if(!keyNumVals.isEmpty()) { + counter++; + String parsed[] = keyNumVals.split(","); + if(parsed.length != 2) { + throw new IOException("KeyNumVal line did not parse into two elements: " + + keyNumVals); + } + logger.trace("parsed[0]: <" + parsed[0] + ">, parsed[1] <" + parsed[1] + + ">"); + String key = parsed[0]; + ByteArray keyByteArray = new ByteArray(ByteUtils.fromHexString(key)); + int numVals = Integer.parseInt(parsed[1]); + logger.debug("BadKeyReader read line: key (" + key + ") and counter (" + + counter + ") and numVals is (" + numVals + ")"); + + List> values = new ArrayList>(); + for(int i = 0; i < numVals; ++i) { + String valueVersionLine = fileReader.readLine(); + badKeyEntry.concat(valueVersionLine); + String valueVersion = valueVersionLine.trim(); + + if(valueVersion.isEmpty()) { + throw new IOException("ValueVersion line was empty!"); + } + parsed = valueVersion.split(",", 2); + if(parsed.length != 2) { + throw new IOException("ValueVersion line did not parse into two elements: " + + valueVersion); + } + 
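+                            // parsed[0] is the hex-encoded value bytes; parsed[1] is the
+                            // textual vector clock, e.g. "version(2:25) ts:1355451322089".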
byte[] value = ByteUtils.fromHexString(parsed[0]); + VectorClock vectorClock = parseVersion(parsed[1]); + + values.add(new Versioned(value, vectorClock)); + } + QueryKeyResult queryKeyResult = new QueryKeyResult(keyByteArray, values); + if(!consistencyFix.isParseOnly()) { + BadKey badKey = new BadKey(key, badKeyEntry); + consistencyFixWorkers.submit(new ConsistencyFixWorker(badKey, + consistencyFix, + badKeyQOut, + queryKeyResult)); + } + } + } + } catch(Exception e) { + logger.error("Exception reading badKeyFile " + badKeyFileIn + " : " + + e.getMessage()); + hasException = true; + } finally { + latch.countDown(); + try { + fileReader.close(); + } catch(IOException ioe) { + logger.warn("IOException during fileReader.close in BadKeyReader thread."); + } + } + } + } + + public static class BadKeyWriter implements Runnable { + + private final String badKeyFileOut; + private final BlockingQueue badKeyQOut; + + private BufferedWriter fileWriter = null; + private boolean hasException; + + BadKeyWriter(String badKeyFile, BlockingQueue badKeyQOut) { + this.badKeyFileOut = badKeyFile; + this.badKeyQOut = badKeyQOut; + + try { + fileWriter = new BufferedWriter(new FileWriter(badKeyFileOut)); + } catch(IOException e) { + Utils.croak("Failure to open output file : " + e.getMessage()); + } + this.hasException = false; + } + + @Override + public void run() { + try { + BadKeyStatus badKeyStatus = badKeyQOut.take(); + while(!badKeyStatus.isPoison()) { + logger.debug("BADKEY," + badKeyStatus.getBadKey().getKeyInHexFormat() + "," + + badKeyStatus.getStatus().name() + "\n"); + + fileWriter.write(badKeyStatus.getBadKey().getReaderInput()); + badKeyStatus = badKeyQOut.take(); + } + } catch(IOException ioe) { + logger.error("IO exception writing badKeyFile " + badKeyFileOut + " : " + + ioe.getMessage()); + hasException = true; + } catch(InterruptedException ie) { + logger.error("Interrupted exception during writing of badKeyFile " + badKeyFileOut + + " : " + ie.getMessage()); + hasException = true; + } finally { + try { + fileWriter.close(); + } catch(IOException ioe) { + logger.warn("Interrupted exception during fileWriter.close:" + ioe.getMessage()); + } + } + } + + boolean hasException() { + return hasException; + } + } + + public static class Stats { + + final long progressPeriodOps; + long fixCount; + long putCount; + long failures; + Map failureDistribution; + long oveCount; // ObsoleteVersionExceptions + long lastTimeMs; + final long startTimeMs; + + /** + * + * @param progressPeriodOps Number of operations between progress bar + * updates. 
+ */ + Stats(long progressPeriodOps) { + this.progressPeriodOps = progressPeriodOps; + this.fixCount = 0; + this.putCount = 0; + this.failures = 0; + this.failureDistribution = new HashMap(); + this.oveCount = 0; + this.lastTimeMs = System.currentTimeMillis(); + this.startTimeMs = lastTimeMs; + } + + private synchronized String getPrettyQPS(long count, long ms) { + long periodS = TimeUnit.MILLISECONDS.toSeconds(ms); + double qps = (count * 1.0 / periodS); + DecimalFormat df = new DecimalFormat("0.##"); + return df.format(qps); + } + + public synchronized void incrementFixCount() { + fixCount++; + if(fixCount % progressPeriodOps == 0) { + long nowTimeMs = System.currentTimeMillis(); + StringBuilder sb = new StringBuilder(); + sb.append("\nConsistencyFix Progress\n"); + sb.append("\tBad keys processed : " + fixCount + + " (during this progress period of " + progressPeriodOps + " ops)\n"); + sb.append("\tBad key processing rate : " + + getPrettyQPS(progressPeriodOps, nowTimeMs - lastTimeMs) + + " bad keys/second)\n"); + sb.append("\tServer-puts issued : " + putCount + " (since fixer started)\n"); + sb.append("\tObsoleteVersionExceptions encountered : " + oveCount + + " (since fixer started)\n"); + logger.info(sb.toString()); + lastTimeMs = nowTimeMs; + } + } + + public synchronized void incrementPutCount() { + putCount++; + } + + public synchronized void incrementObsoleteVersionExceptions() { + oveCount++; + } + + public synchronized void incrementFailures(Status status) { + failures++; + if(failures % progressPeriodOps == 0) { + logger.info("Bad key failed to process count = " + failures); + } + if(!failureDistribution.containsKey(status)) { + failureDistribution.put(status, 0L); + } + failureDistribution.put(status, failureDistribution.get(status) + 1); + } + + public synchronized String summary() { + StringBuilder summary = new StringBuilder(); + summary.append("\n\n"); + summary.append("Consistency Fix Summary\n"); + summary.append("-----------------------\n"); + summary.append("Total bad keys processed: " + fixCount + "\n"); + summary.append("Total server-puts issued: " + putCount + "\n"); + summary.append("Total ObsoleteVersionExceptions encountered: " + oveCount + "\n"); + summary.append("Total keys processed that were not corrected: " + failures + "\n"); + for(Status status: failureDistribution.keySet()) { + summary.append("\t" + status + " : " + failureDistribution.get(status) + "\n"); + } + + long nowTimeMs = System.currentTimeMillis(); + summary.append("Keys per second processed: " + + getPrettyQPS(fixCount, nowTimeMs - startTimeMs) + "\n"); + + return summary.toString(); + } + } +} diff --git a/src/java/voldemort/utils/ConsistencyFixCLI.java b/src/java/voldemort/utils/ConsistencyFixCLI.java new file mode 100644 index 0000000000..9cc12e0217 --- /dev/null +++ b/src/java/voldemort/utils/ConsistencyFixCLI.java @@ -0,0 +1,201 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.utils; + +import java.io.IOException; + +import joptsimple.OptionParser; +import joptsimple.OptionSet; + +public class ConsistencyFixCLI { + + public static void printUsage() { + StringBuilder sb = new StringBuilder(); + sb.append("\n"); + sb.append("Required arguments: \n"); + sb.append("\t--url \n"); + sb.append("\t--store \n"); + sb.append("\t--bad-key-file-in \n"); + sb.append("\t--bad-key-file-out )\n"); + sb.append("Optional arguments: \n"); + sb.append("\t--orphan-format\n"); + sb.append("\t--dry-run\n"); + sb.append("\t--progress-period-ops \n"); + sb.append("\t--parallelism \n"); + sb.append("\t--per-server-iops-limit \n"); + sb.append("\n"); + + System.out.println(sb.toString()); + } + + public static void printUsage(String errMessage, OptionParser parser) { + System.err.println("Error: " + errMessage); + try { + parser.printHelpOn(System.out); + } catch(IOException ioe) { + System.err.println("Caught IOException while trying to print out parser options: " + + ioe.getMessage()); + } + printUsage(); + System.exit(1); + } + + private static class Options { + + public final static int defaultParallelism = 8; + public final static long defaultProgressPeriodOps = 1000; + public final static long defaultPerServerQPSLimit = 100; + + public String url = null; + public String storeName = null; + public String badKeyFileIn = null; + public boolean badKeyFileInOrphanFormat = false; + public String badKeyFileOut = null; + public int parallelism = defaultParallelism; + public long progressPeriodOps = defaultProgressPeriodOps; + public long perServerQPSLimit = defaultPerServerQPSLimit; + public boolean dryRun = false; + public boolean parseOnly = false; + } + + /** + * All the logic for parsing and validating options. + * + * @param args + * @return A struct containing validated options. + * @throws IOException + */ + private static ConsistencyFixCLI.Options parseArgs(String[] args) { + OptionParser parser = new OptionParser(); + parser.accepts("help", "print help information"); + parser.accepts("url", "The bootstrap url.") + .withRequiredArg() + .describedAs("bootstrapUrl") + .ofType(String.class); + parser.accepts("store", "The store name.") + .withRequiredArg() + .describedAs("storeName") + .ofType(String.class); + parser.accepts("bad-key-file-in", + "Name of bad-key-file-in. " + "Each key must be in hexadecimal format. " + + "Each key must be on a separate line in the file. ") + .withRequiredArg() + .describedAs("badKeyFileIn") + .ofType(String.class); + parser.accepts("orphan-format", + "Indicates format of bad-key-file-in is of 'orphan' key-values."); + parser.accepts("dry-run", + "Indicates to go through all of the read actions until the point of issuing repair puts. Then, do a 'no-op'."); + parser.accepts("parse-only", + "Indicates to only parse the input file. Does not perform any key queries or repair puts. " + + "Does bootstrap though so bootstrapUrl and storeName must be specified."); + parser.accepts("bad-key-file-out", + "Name of bad-key-file-out. " + + "Keys that are not mae consistent are output to this file.") + .withRequiredArg() + .describedAs("badKeyFileOut") + .ofType(String.class); + parser.accepts("parallelism", + "Number of consistency fix messages outstanding in parallel. ") + .withRequiredArg() + .describedAs("parallelism [Default value: " + Options.defaultParallelism + "]") + .ofType(Integer.class); + parser.accepts("progress-period-ops", + "Number of operations between 'info' progress messages. 
") + .withRequiredArg() + .describedAs("period (in operations) between outputting progress [Default value: " + + Options.defaultProgressPeriodOps + "]") + .ofType(Long.class); + parser.accepts("per-server-qps-limit", + "Number of operations that the consistency fixer will issue to any individual server in one second. ") + .withRequiredArg() + .describedAs("perServerQPSLimit [Default value: " + Options.defaultPerServerQPSLimit + + "]") + .ofType(Long.class); + + OptionSet optionSet = parser.parse(args); + + if(optionSet.hasArgument("help")) { + try { + parser.printHelpOn(System.out); + } catch(IOException e) { + e.printStackTrace(); + } + printUsage(); + System.exit(0); + } + if(!optionSet.hasArgument("url")) { + printUsage("Missing required 'url' argument.", parser); + } + if(!optionSet.hasArgument("store")) { + printUsage("Missing required 'store' argument.", parser); + } + if(!optionSet.has("bad-key-file-in")) { + printUsage("Missing required 'bad-key-file-in' argument.", parser); + } + if(!optionSet.has("bad-key-file-out")) { + printUsage("Missing required 'bad-key-file-out' argument.", parser); + } + + Options options = new Options(); + + options.url = (String) optionSet.valueOf("url"); + options.storeName = (String) optionSet.valueOf("store"); + options.badKeyFileIn = (String) optionSet.valueOf("bad-key-file-in"); + options.badKeyFileOut = (String) optionSet.valueOf("bad-key-file-out"); + if(optionSet.has("orphan-format")) { + options.badKeyFileInOrphanFormat = true; + } + if(optionSet.has("parallelism")) { + options.parallelism = (Integer) optionSet.valueOf("parallelism"); + } + if(optionSet.has("progress-period-ops")) { + options.progressPeriodOps = (Long) optionSet.valueOf("progress-period-ops"); + } + if(optionSet.has("per-server-qps-limit")) { + options.perServerQPSLimit = (Long) optionSet.valueOf("per-server-qps-limit"); + } + if(optionSet.has("dry-run")) { + options.dryRun = true; + } + if(optionSet.has("parse-only")) { + options.parseOnly = true; + } + + return options; + } + + public static void main(String[] args) throws Exception { + Options options = parseArgs(args); + + ConsistencyFix consistencyFix = new ConsistencyFix(options.url, + options.storeName, + options.progressPeriodOps, + options.perServerQPSLimit, + options.dryRun, + options.parseOnly); + + String summary = consistencyFix.execute(options.parallelism, + options.badKeyFileIn, + options.badKeyFileInOrphanFormat, + options.badKeyFileOut); + + consistencyFix.close(); + + System.out.println(summary); + } +} diff --git a/src/java/voldemort/utils/ConsistencyFixWorker.java b/src/java/voldemort/utils/ConsistencyFixWorker.java new file mode 100644 index 0000000000..63b6a0b97c --- /dev/null +++ b/src/java/voldemort/utils/ConsistencyFixWorker.java @@ -0,0 +1,389 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.utils; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.BlockingQueue; + +import org.apache.log4j.Logger; + +import voldemort.VoldemortException; +import voldemort.client.protocol.admin.QueryKeyResult; +import voldemort.store.routed.NodeValue; +import voldemort.store.routed.ReadRepairer; +import voldemort.utils.ConsistencyFix.BadKey; +import voldemort.utils.ConsistencyFix.BadKeyStatus; +import voldemort.utils.ConsistencyFix.Status; +import voldemort.versioning.ObsoleteVersionException; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Versioned; + +import com.google.common.collect.Lists; + +class ConsistencyFixWorker implements Runnable { + + private static final Logger logger = Logger.getLogger(ConsistencyFixWorker.class); + private static final int fakeNodeID = Integer.MIN_VALUE; + + private final BadKey badKey; + private final ConsistencyFix consistencyFix; + private final BlockingQueue badKeyQOut; + private final QueryKeyResult orphanedValues; + + /** + * Normal use case constructor. + * + * @param keyInHexFormat + * @param consistencyFix + * @param badKeyQOut + */ + ConsistencyFixWorker(BadKey badKey, + ConsistencyFix consistencyFix, + BlockingQueue badKeyQOut) { + this(badKey, consistencyFix, badKeyQOut, null); + } + + /** + * Constructor for "orphaned values" use case. I.e., there are values for + * the specific key that exist somewhere and may need to be written to the + * nodes which actually host the key. + * + * @param keyInHexFormat + * @param consistencyFix + * @param badKeyQOut + * @param orphanedValues Set to null if no orphaned values to be included. + */ + ConsistencyFixWorker(BadKey badKey, + ConsistencyFix consistencyFix, + BlockingQueue badKeyQOut, + QueryKeyResult orphanedValues) { + this.badKey = badKey; + this.consistencyFix = consistencyFix; + this.badKeyQOut = badKeyQOut; + this.orphanedValues = orphanedValues; + } + + private String myName() { + return Thread.currentThread().getName() + "-" + ConsistencyFixWorker.class.getName(); + } + + @Override + public void run() { + logger.trace("About to process key " + badKey + " (" + myName() + ")"); + Status status = doConsistencyFix(badKey); + logger.trace("Finished processing key " + badKey + " (" + myName() + ")"); + consistencyFix.getStats().incrementFixCount(); + + if(status != Status.SUCCESS) { + try { + badKeyQOut.put(new BadKeyStatus(badKey, status)); + } catch(InterruptedException ie) { + logger.warn("Worker thread " + myName() + " interrupted."); + } + consistencyFix.getStats().incrementFailures(status); + } + } + + public Status doConsistencyFix(BadKey badKey) { + // Initialization. 
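+        // Map the key to its master partition and to the list of node ids that
+        // replicate that partition. Any failure here (for example, a key that
+        // is not valid hex) aborts the fix for this key with Status.BAD_INIT.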
+ byte[] keyInBytes; + List nodeIdList = null; + int masterPartitionId = -1; + try { + keyInBytes = ByteUtils.fromHexString(badKey.getKeyInHexFormat()); + masterPartitionId = consistencyFix.getStoreInstance().getMasterPartitionId(keyInBytes); + nodeIdList = consistencyFix.getStoreInstance() + .getReplicationNodeList(masterPartitionId); + } catch(Exception exception) { + logger.info("Aborting fixKey due to bad init."); + if(logger.isDebugEnabled()) { + exception.printStackTrace(); + } + return Status.BAD_INIT; + } + ByteArray keyAsByteArray = new ByteArray(keyInBytes); + + // Do the reads + Map nodeIdToKeyValues = doReads(nodeIdList, + keyInBytes, + badKey.getKeyInHexFormat()); + + // Process read replies (i.e., nodeIdToKeyValues) + ProcessReadRepliesResult result = processReadReplies(nodeIdList, + keyAsByteArray, + badKey.getKeyInHexFormat(), + nodeIdToKeyValues); + if(result.status != Status.SUCCESS) { + return result.status; + } + + // Resolve conflicts indicated in nodeValues + List> toReadRepair = resolveReadConflicts(result.nodeValues); + if(logger.isTraceEnabled()) { + if(toReadRepair.size() == 0) { + logger.trace("Nothing to repair"); + } + for(NodeValue nodeValue: toReadRepair) { + logger.trace(nodeValue.getNodeId() + " --- " + nodeValue.getKey().toString()); + } + } + + // Do the repairs + Status status = doRepairPut(toReadRepair); + + // return status of last operation (success or otherwise) + return status; + } + + /** + * + * @param nodeIdList + * @param keyInBytes + * @param keyInHexFormat + * @return + */ + private Map doReads(final List nodeIdList, + final byte[] keyInBytes, + final String keyInHexFormat) { + Map nodeIdToKeyValues = new HashMap(); + + ByteArray key = new ByteArray(keyInBytes); + for(int nodeId: nodeIdList) { + List> values = null; + try { + values = consistencyFix.getAdminClient().storeOps.getNodeKey(consistencyFix.getStoreName(), + nodeId, + key); + nodeIdToKeyValues.put(nodeId, new QueryKeyResult(key, values)); + } catch(VoldemortException ve) { + nodeIdToKeyValues.put(nodeId, new QueryKeyResult(key, ve)); + } + } + + return nodeIdToKeyValues; + } + + /** + * Result of an invocation of processReadReplies + */ + private class ProcessReadRepliesResult { + + public final Status status; + public final List> nodeValues; + + /** + * Constructor for error status + */ + ProcessReadRepliesResult(Status status) { + this.status = status; + this.nodeValues = null; + } + + /** + * Constructor for success + */ + ProcessReadRepliesResult(List> nodeValues) { + this.status = Status.SUCCESS; + this.nodeValues = nodeValues; + } + } + + /** + * + * @param nodeIdList + * @param keyAsByteArray + * @param keyInHexFormat + * @param nodeIdToKeyValues + * @param nodeValues Effectively the output of this method. Must pass in a + * non-null object to be populated by this method. 
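+     * (The list of node values is actually built inside this method and handed
+     * back through ProcessReadRepliesResult, not through a caller-supplied
+     * argument.)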
+ * @return + */ + private ProcessReadRepliesResult processReadReplies(final List nodeIdList, + final ByteArray keyAsByteArray, + final String keyInHexFormat, + final Map nodeIdToKeyValues) { + List> nodeValues = new ArrayList>(); + boolean exceptionsEncountered = false; + for(int nodeId: nodeIdList) { + QueryKeyResult keyValue; + if(nodeIdToKeyValues.containsKey(nodeId)) { + keyValue = nodeIdToKeyValues.get(nodeId); + + if(keyValue.hasException()) { + logger.debug("Exception encountered while fetching key " + keyInHexFormat + + " from node with nodeId " + nodeId + " : " + + keyValue.getException().getMessage()); + exceptionsEncountered = true; + } else { + if(keyValue.getValues().isEmpty()) { + Versioned versioned = new Versioned(null); + nodeValues.add(new NodeValue(nodeId, + keyValue.getKey(), + versioned)); + + } else { + for(Versioned value: keyValue.getValues()) { + nodeValues.add(new NodeValue(nodeId, + keyValue.getKey(), + value)); + } + } + } + } else { + logger.debug("No key-value returned from node with id:" + nodeId); + Versioned versioned = new Versioned(null); + nodeValues.add(new NodeValue(nodeId, keyAsByteArray, versioned)); + } + } + if(exceptionsEncountered) { + logger.info("Aborting fixKey because exceptions were encountered when fetching key-values."); + return new ProcessReadRepliesResult(Status.FETCH_EXCEPTION); + } + + if(logger.isDebugEnabled()) { + for(NodeValue nkv: nodeValues) { + logger.debug("\tRead NodeKeyValue : " + ByteUtils.toHexString(nkv.getKey().get()) + + " on node with id " + nkv.getNodeId() + " for version " + + nkv.getVersion()); + } + } + + return new ProcessReadRepliesResult(nodeValues); + } + + /** + * Decide on the specific key-value to write everywhere. + * + * @param nodeValues + * @return The subset of entries from nodeValues that need to be repaired. + */ + private List> resolveReadConflicts(final List> nodeValues) { + + if(logger.isTraceEnabled()) { + logger.trace("NodeValues passed into resolveReadConflicts."); + if(nodeValues.size() == 0) { + logger.trace("Empty nodeValues passed to resolveReadConflicts"); + } + for(NodeValue nodeValue: nodeValues) { + logger.trace("\t" + nodeValue.getNodeId() + " - " + nodeValue.getKey().toString() + + " - " + nodeValue.getVersion().toString()); + } + } + + // If orphaned values exist, add them to fake nodes to be processed by + // "getRepairs" + int currentFakeNodeId = fakeNodeID; + if(this.orphanedValues != null) { + for(Versioned value: this.orphanedValues.getValues()) { + nodeValues.add(new NodeValue(currentFakeNodeId, + this.orphanedValues.getKey(), + value)); + currentFakeNodeId++; + } + } + + // Some cut-paste-and-modify coding from + // store/routed/action/AbstractReadRepair.java and + // store/routed/ThreadPoolRoutedStore.java + ReadRepairer readRepairer = new ReadRepairer(); + List> nodeKeyValues = readRepairer.getRepairs(nodeValues); + + if(logger.isTraceEnabled()) { + if(nodeKeyValues.size() == 0) { + logger.trace("\treadRepairer returned an empty list."); + } + for(NodeValue nodeKeyValue: nodeKeyValues) { + logger.trace("\tNodeKeyValue result from readRepairer.getRepairs : " + + ByteUtils.toHexString(nodeKeyValue.getKey().get()) + + " on node with id " + nodeKeyValue.getNodeId() + " for version " + + nodeKeyValue.getVersion()); + } + } + + List> toReadRepair = Lists.newArrayList(); + for(NodeValue v: nodeKeyValues) { + if(v.getNodeId() > currentFakeNodeId) { + // Only copy repairs intended for real nodes. 
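+                // Fake node ids for orphaned values are assigned upward from
+                // Integer.MIN_VALUE, so any id greater than the last fake id
+                // handed out belongs to a real node. The vector clock is cloned
+                // so the repair put does not share a mutable clock with the
+                // read reply.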
+ Versioned versioned = Versioned.value(v.getVersioned().getValue(), + ((VectorClock) v.getVersion()).clone()); + toReadRepair.add(new NodeValue(v.getNodeId(), + v.getKey(), + versioned)); + } else { + if(logger.isDebugEnabled()) { + logger.debug("\tIgnoring repair to fake node: " + + ByteUtils.toHexString(v.getKey().get()) + " on node with id " + + v.getNodeId() + " for version " + v.getVersion()); + } + } + + } + + if(logger.isTraceEnabled()) { + if(toReadRepair.size() == 0) { + logger.trace("\ttoReadRepair is empty."); + } + for(NodeValue nodeKeyValue: toReadRepair) { + logger.trace("\tRepair key " + ByteUtils.toHexString(nodeKeyValue.getKey().get()) + + " on node with id " + nodeKeyValue.getNodeId() + " for version " + + nodeKeyValue.getVersion()); + + } + } + return toReadRepair; + } + + /** + * + * @param toReadRepair Effectively the output of this method. Must pass in a + * non-null object to be populated by this method. + * @return + */ + public Status doRepairPut(final List> toReadRepair) { + if(this.consistencyFix.isDryRun()) { + logger.debug("Returning success from ConsistencyFixWorker.doRepairPut because this is a dry run."); + return Status.SUCCESS; + } + + boolean allRepairsSuccessful = true; + for(NodeValue nodeKeyValue: toReadRepair) { + try { + consistencyFix.maybePutThrottle(nodeKeyValue.getNodeId()); + consistencyFix.getAdminClient().storeOps.putNodeKeyValue(consistencyFix.getStoreName(), + nodeKeyValue); + consistencyFix.getStats().incrementPutCount(); + } catch(ObsoleteVersionException ove) { + // Treat OVE as success. + consistencyFix.getStats().incrementObsoleteVersionExceptions(); + } catch(VoldemortException ve) { + allRepairsSuccessful = false; + logger.debug("Repair of key " + nodeKeyValue.getKey() + "on node with id " + + nodeKeyValue.getNodeId() + " for version " + + nodeKeyValue.getVersion() + " failed because of exception : " + + ve.getMessage()); + } + } + if(!allRepairsSuccessful) { + logger.info("Aborting fixKey because exceptions were encountered when repairing key-values."); + return Status.REPAIR_EXCEPTION; + } + return Status.SUCCESS; + } +} \ No newline at end of file diff --git a/src/java/voldemort/utils/Entropy.java b/src/java/voldemort/utils/Entropy.java index 36bf063682..302bffccfd 100644 --- a/src/java/voldemort/utils/Entropy.java +++ b/src/java/voldemort/utils/Entropy.java @@ -1,3 +1,19 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + package voldemort.utils; import java.io.File; @@ -12,6 +28,7 @@ import joptsimple.OptionParser; import joptsimple.OptionSet; +import voldemort.client.ClientConfig; import voldemort.client.protocol.RequestFormatType; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; @@ -31,6 +48,10 @@ import com.google.common.base.Joiner; import com.google.common.collect.Maps; +// TODO: Remove from the code base. +// Entropy is replaced by KeySamplerCLI and KeyVersionFetcherCLI. 
Entropy +// never really worked as described and had a complicated interface. +@Deprecated public class Entropy { private int nodeId; @@ -187,7 +208,8 @@ public void generateEntropy(Cluster cluster, AdminClient adminClient = null; try { adminClient = new AdminClient(cluster, - new AdminClientConfig().setMaxConnectionsPerNode(storeDefs.size())); + new AdminClientConfig().setMaxConnectionsPerNode(storeDefs.size()), + new ClientConfig()); if(opType) { System.out.println("Running entropy calculator"); @@ -222,26 +244,32 @@ public void generateEntropy(Cluster cluster, int numKeysPerNode = (int) Math.floor(numKeys / cluster.getNumberOfNodes()); + int numKeysStored = 0; for(Node node: cluster.getNodes()) { - keys = adminClient.fetchKeys(node.getId(), - storeDef.getName(), - cluster.getNodeById(node.getId()) - .getPartitionIds(), - null, - false); + System.out.println("Fetching " + numKeysPerNode + + " keys from node " + node.getHost()); + keys = adminClient.bulkFetchOps.fetchKeys(node.getId(), + storeDef.getName(), + cluster.getNodeById(node.getId()) + .getPartitionIds(), + null, + false, + numKeysPerNode); for(long keyId = 0; keyId < numKeysPerNode && keys.hasNext(); keyId++) { ByteArray key = keys.next(); // entropy returns distinct keys from each // node - record the key only if this node // holds the primary partition of the key - if(RebalanceUtils.getNodeIds(strategy.routeRequest(key.get()) - .subList(0, 1)) - .contains(node.getId())) { + if(NodeUtils.getNodeIds(strategy.routeRequest(key.get()) + .subList(0, 1)) + .contains(node.getId())) { writer.write(key.length()); writer.write(key.get()); + numKeysStored++; } } } + System.out.println("Fetched a total of " + numKeysStored + " keys."); } else { List partitions = cluster.getNodeById(nodeId) .getPartitionIds(); @@ -253,19 +281,20 @@ public void generateEntropy(Cluster cluster, partitionMap.put(partitionId, 0); } - keys = adminClient.fetchKeys(nodeId, - storeDef.getName(), - partitions, - null, - false); + keys = adminClient.bulkFetchOps.fetchKeys(nodeId, + storeDef.getName(), + partitions, + null, + false, + numKeysPerPartition); while(keys.hasNext() && numKeysStored < numKeys) { ByteArray key = keys.next(); // entropy returns distinct keys from each // node - record the key only if this node // holds the primary partition of the key - if(RebalanceUtils.getNodeIds(strategy.routeRequest(key.get()) - .subList(0, 1)) - .contains(nodeId)) { + if(NodeUtils.getNodeIds(strategy.routeRequest(key.get()).subList(0, + 1)) + .contains(nodeId)) { int targetPartition = strategy.getPartitionList(key.get()) .get(0); int partitionCount = partitionMap.get(targetPartition); @@ -321,6 +350,7 @@ public void generateEntropy(Cluster cluster, long deletedKeys = 0L; long foundKeys = 0L; long totalKeys = 0L; + long keysRead = 0L; try { reader = new FileInputStream(storesKeyFile); @@ -328,12 +358,14 @@ public void generateEntropy(Cluster cluster, int size = reader.read(); if(size <= 0) { + System.out.println("End of file reached."); break; } // Read the key byte[] key = new byte[size]; reader.read(key); + keysRead++; List responsibleNodes = strategy.routeRequest(key); @@ -378,7 +410,8 @@ public void generateEntropy(Cluster cluster, } if(!negativeTest) { - System.out.println("Found = " + foundKeys + " Total = " + totalKeys); + System.out.println("Found = " + foundKeys + ", Total = " + totalKeys + + ", Keys read = " + keysRead); if(foundKeys > 0 && totalKeys > 0) { System.out.println("%age found - " + 100.0 * (double) foundKeys / totalKeys); @@ -404,7 +437,7 @@ public 
void generateEntropy(Cluster cluster, } } finally { if(adminClient != null) - adminClient.stop(); + adminClient.close(); } } } diff --git a/src/java/voldemort/utils/EventThrottler.java b/src/java/voldemort/utils/EventThrottler.java index 770660b345..54fbb5f081 100644 --- a/src/java/voldemort/utils/EventThrottler.java +++ b/src/java/voldemort/utils/EventThrottler.java @@ -1,3 +1,18 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ package voldemort.utils; import org.apache.log4j.Logger; @@ -46,7 +61,17 @@ public EventThrottler(Time time, long ratePerSecond, long intervalMs) { this.startTime = 0L; } + /** + * Sleeps if necessary to slow down the caller. + * + * @param eventsSeen Number of events seen since last invocation. Basis for + * determining whether its necessary to sleep. + */ public synchronized void maybeThrottle(int eventsSeen) { + // TODO: This implements "bang bang" control. This is OK. But, this + // permits unbounded bursts of activity within the intervalMs. A + // controller that has more memory and explicitly bounds peak activity + // within the intervalMs may be better. long rateLimit = getRate(); if(logger.isDebugEnabled()) diff --git a/src/java/voldemort/utils/KeySamplerCLI.java b/src/java/voldemort/utils/KeySamplerCLI.java new file mode 100644 index 0000000000..c8464a16ad --- /dev/null +++ b/src/java/voldemort/utils/KeySamplerCLI.java @@ -0,0 +1,470 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ +package voldemort.utils; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +import joptsimple.OptionException; +import joptsimple.OptionParser; +import joptsimple.OptionSet; + +import org.apache.log4j.Logger; + +import voldemort.VoldemortException; +import voldemort.client.ClientConfig; +import voldemort.client.protocol.admin.AdminClient; +import voldemort.client.protocol.admin.AdminClientConfig; +import voldemort.cluster.Cluster; +import voldemort.cluster.Node; +import voldemort.store.StoreDefinition; + +/** + * The KeySamplerCLI tool samples keys for every partition for every store on a + * cluster. A distinct file of sampled keys is generated for each store. + * + * By default, the "first" key of each partition is sampled. Optional arguments + * control sampling more keys per partition. + */ +public class KeySamplerCLI { + + private static Logger logger = Logger.getLogger(KeySamplerCLI.class); + + private final static int DEFAULT_NODE_PARALLELISM = 8; + private final static int DEFAULT_RECORDS_PER_PARTITION = 0; // INF + private final static int DEFAULT_KEYS_PER_SECOND_LIMIT = 200; + private final static int DEFAULT_PROGRESS_PERIOD_OPS = 1000; + + private final AdminClient adminClient; + private final Cluster cluster; + private final List storeDefinitions; + private final Map storeNameToKeyStringsMap; + + private final String outDir; + + private final List partitionIds; + + private final ExecutorService nodeSamplerService; + private final int recordsPerPartition; + private final int keysPerSecondLimit; + private final int progressPeriodOps; + + public KeySamplerCLI(String url, + String outDir, + List storeNames, + List partitionIds, + int nodeParallelism, + int recordsPerPartition, + int keysPerSecondLimit, + int progressPeriodOps) { + if(logger.isInfoEnabled()) { + logger.info("Connecting to bootstrap server: " + url); + } + this.adminClient = new AdminClient(url, new AdminClientConfig(), new ClientConfig()); + this.cluster = adminClient.getAdminClientCluster(); + this.storeDefinitions = adminClient.metadataMgmtOps.getRemoteStoreDefList(0).getValue(); + this.storeNameToKeyStringsMap = new HashMap(); + for(StoreDefinition storeDefinition: storeDefinitions) { + String storeName = storeDefinition.getName(); + if(storeNames != null) { + if(!storeNames.contains(storeName)) { + logger.debug("Will not sample store " + + storeName + + " since it is not in list of storeNames provided on command line."); + continue; + } + } + this.storeNameToKeyStringsMap.put(storeName, new StringBuilder()); + } + + if(storeNames != null) { + List badStoreNames = new LinkedList(); + for(String storeName: storeNames) { + if(!this.storeNameToKeyStringsMap.keySet().contains(storeName)) { + badStoreNames.add(storeName); + } + } + if(badStoreNames.size() > 0) { + Utils.croak("Some storeNames provided on the command line were not found on this cluster: " + + badStoreNames); + } + } + + this.outDir = outDir; + + this.partitionIds = partitionIds; + + this.nodeSamplerService = Executors.newFixedThreadPool(nodeParallelism); + this.recordsPerPartition = 
recordsPerPartition; + this.keysPerSecondLimit = keysPerSecondLimit; + this.progressPeriodOps = progressPeriodOps; + } + + public boolean sampleStores() { + for(StoreDefinition storeDefinition: storeDefinitions) { + if(storeNameToKeyStringsMap.keySet().contains(storeDefinition.getName())) { + if(!sampleStore(storeDefinition)) { + return false; + } + } + } + return true; + } + + public static class NodeSampleResult { + + public final boolean success; + public final String keysString; + + NodeSampleResult(boolean success, String keysString) { + this.success = success; + this.keysString = keysString; + } + } + + public class NodeSampler implements Callable { + + private final Node node; + private final StoreDefinition storeDefinition; + private final EventThrottler throttler; + + NodeSampler(Node node, StoreDefinition storeDefinition) { + this.node = node; + this.storeDefinition = storeDefinition; + this.throttler = new EventThrottler(keysPerSecondLimit); + } + + @Override + public NodeSampleResult call() throws Exception { + String storeName = storeDefinition.getName(); + StringBuilder hexKeysString = new StringBuilder(); + String nodeTag = node.getId() + " [" + node.getHost() + "]"; + + List nodePartitionIds = new ArrayList(node.getPartitionIds()); + if(partitionIds != null) { + nodePartitionIds.retainAll(partitionIds); + if(nodePartitionIds.size() == 0) { + logger.info("No partitions to sample for store '" + storeName + "' on node " + + nodeTag); + return new NodeSampleResult(true, hexKeysString.toString()); + } + } + + String infoTag = "store " + storeName + ", partitionIDs " + nodePartitionIds + + " on node " + nodeTag; + logger.info("Starting sample --- " + infoTag); + + long startTimeMs = System.currentTimeMillis(); + + try { + Iterator fetchIterator; + fetchIterator = adminClient.bulkFetchOps.fetchKeys(node.getId(), + storeName, + nodePartitionIds, + null, + true, + recordsPerPartition); + long keyCount = 0; + while(fetchIterator.hasNext()) { + ByteArray key = fetchIterator.next(); + String hexKeyString = ByteUtils.toHexString(key.get()); + hexKeysString.append(hexKeyString + "\n"); + keyCount++; + + throttler.maybeThrottle(1); + + if(0 == keyCount % progressPeriodOps) { + if(logger.isInfoEnabled()) { + long durationS = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() + - startTimeMs); + logger.info(infoTag + " --- " + keyCount + " keys sampled in " + + durationS + " seconds."); + } + } + } + + long expectedKeyCount = recordsPerPartition * node.getPartitionIds().size(); + if(keyCount < expectedKeyCount) { + logger.warn("Fewer keys (" + keyCount + ") than expected (" + expectedKeyCount + + ") returned --- " + infoTag); + } else if(keyCount < recordsPerPartition) { + logger.warn("More keys (" + keyCount + ") than expected (" + expectedKeyCount + + ") returned --- " + infoTag); + } + + logger.info("Finished sample --- " + infoTag); + return new NodeSampleResult(true, hexKeysString.toString()); + } catch(VoldemortException ve) { + logger.error("Failed to sample --- " + infoTag + " --- VoldemortException caught (" + + ve.getMessage() + ") caused by (" + ve.getCause().getMessage() + ")"); + throw ve; + } + } + } + + public boolean sampleStore(StoreDefinition storeDefinition) { + String storeName = storeDefinition.getName(); + String fileName = outDir + System.getProperty("file.separator") + storeName + ".keys"; + + File file = new File(fileName); + if(file.exists()) { + logger.warn("Key file " + fileName + " already exists. 
Skipping sampling store " + + storeName + "."); + return true; + } + + Writer keyWriter = null; + try { + keyWriter = new FileWriter(file); + + Map> results = new HashMap>(); + for(Node node: cluster.getNodes()) { + Future future = nodeSamplerService.submit(new NodeSampler(node, + storeDefinition)); + results.put(node, future); + } + + boolean success = true; + for(Node node: cluster.getNodes()) { + Future future = results.get(node); + if(!success) { + future.cancel(true); + continue; + } + + try { + NodeSampleResult nodeSampleResult = future.get(); + if(nodeSampleResult.success) { + keyWriter.write(nodeSampleResult.keysString); + } else { + success = false; + logger.error("Sampling on node " + node.getHost() + " of store " + + storeDefinition.getName() + " failed."); + } + } catch(ExecutionException ee) { + success = false; + logger.error("Encountered an execution exception on node " + node.getHost() + + " while sampling " + storeName + ": " + ee.getMessage()); + ee.printStackTrace(); + } catch(InterruptedException ie) { + success = false; + logger.error("Waiting for node " + node.getHost() + " to be sampled for store " + + storeName + ", but was interrupted: " + ie.getMessage()); + } + } + return success; + } catch(IOException e) { + logger.error("IOException encountered for store " + storeName + " : " + e.getMessage()); + return false; + } finally { + try { + keyWriter.close(); + } catch(IOException e) { + logger.error("IOException caught while trying to close keyWriter for store " + + storeName + " : " + e.getMessage()); + } + } + } + + public void stop() { + if(adminClient != null) { + adminClient.close(); + } + nodeSamplerService.shutdown(); + } + + /** + * Return args parser + * + * @return program parser + * */ + private static OptionParser getParser() { + OptionParser parser = new OptionParser(); + parser.accepts("help", "print help information"); + parser.accepts("url", "[REQUIRED] bootstrap URL") + .withRequiredArg() + .describedAs("bootstrap-url") + .ofType(String.class); + parser.accepts("out-dir", + "[REQUIRED] Directory in which to output the key files (named \"{storeName}.keys\".") + .withRequiredArg() + .describedAs("outputDirectory") + .ofType(String.class); + parser.accepts("store-names", + "Store names to sample. Comma delimited list or singleton. [Default: ALL]") + .withRequiredArg() + .describedAs("storeNames") + .withValuesSeparatedBy(',') + .ofType(String.class); + parser.accepts("partition-ids", + "Partition IDs to sample for each store. Comma delimited list or singleton. [Default: ALL]") + .withRequiredArg() + .describedAs("partitionIds") + .withValuesSeparatedBy(',') + .ofType(Integer.class); + parser.accepts("parallelism", + "Number of nodes to sample in parallel. [Default: " + + DEFAULT_NODE_PARALLELISM + " ]") + .withRequiredArg() + .describedAs("storeParallelism") + .ofType(Integer.class); + parser.accepts("records-per-partition", + "Number of keys sampled per partition. [Default: INF]") + .withRequiredArg() + .describedAs("recordsPerPartition") + .ofType(Integer.class); + parser.accepts("keys-per-second-limit", + "Number of keys sampled per second limit. [Default: " + + DEFAULT_KEYS_PER_SECOND_LIMIT + " ]") + .withRequiredArg() + .describedAs("keysPerSecondLimit") + .ofType(Integer.class); + parser.accepts("progress-period-ops", + "Number of operations between progress info is displayed. 
[Default: " + + DEFAULT_PROGRESS_PERIOD_OPS + " ]") + .withRequiredArg() + .describedAs("progressPeriodOps") + .ofType(Integer.class); + return parser; + } + + /** + * Print Usage to STDOUT + */ + private static void printUsage() { + StringBuilder help = new StringBuilder(); + help.append("KeySamplerCLI Tool\n"); + help.append(" Sample keys from store-partitions. Output keys per store.\n"); + help.append("Options:\n"); + help.append(" Required:\n"); + help.append(" --url \n"); + help.append(" --out-dir \n"); + help.append(" Optional:\n"); + help.append(" --store-names [,...]\n"); + help.append(" --partition-ids [,...]\n"); + help.append(" --parallelism \n"); + help.append(" --records-per-partition \n"); + help.append(" --keys-per-second-limit \n"); + help.append(" --progress-period-ops \n"); + help.append(" --help\n"); + help.append(" Notes:\n"); + help.append(" To select ALL storeNames or partitionIds, you must\n"); + help.append(" not specify the pertinent optional argument.\n"); + help.append(" To select INF records per partitoin, either do not\n"); + help.append(" specify the argument, or specify a value <= 0.\n"); + System.out.print(help.toString()); + } + + private static void printUsageAndDie(String errMessage) { + printUsage(); + Utils.croak("\n" + errMessage); + } + + public static void main(String[] args) throws Exception { + OptionParser parser = null; + OptionSet options = null; + try { + parser = getParser(); + options = parser.parse(args); + } catch(OptionException oe) { + parser.printHelpOn(System.out); + printUsageAndDie("Exception when parsing arguments : " + oe.getMessage()); + return; + } + + /* validate options */ + if(options.hasArgument("help")) { + parser.printHelpOn(System.out); + printUsage(); + return; + } + if(!options.hasArgument("url") || !options.hasArgument("out-dir")) { + parser.printHelpOn(System.out); + printUsageAndDie("Missing a required argument."); + return; + } + + String url = (String) options.valueOf("url"); + + String outDir = (String) options.valueOf("out-dir"); + Utils.mkdirs(new File(outDir)); + + List storeNames = null; + if(options.hasArgument("store-names")) { + @SuppressWarnings("unchecked") + List list = (List) options.valuesOf("store-names"); + storeNames = list; + } + + List partitionIds = null; + if(options.hasArgument("partition-ids")) { + @SuppressWarnings("unchecked") + List list = (List) options.valuesOf("partition-ids"); + partitionIds = list; + } + + Integer nodeParallelism = DEFAULT_NODE_PARALLELISM; + if(options.hasArgument("parallelism")) { + nodeParallelism = (Integer) options.valueOf("parallelism"); + } + + Integer recordsPerPartition = DEFAULT_RECORDS_PER_PARTITION; + if(options.hasArgument("records-per-partition")) { + recordsPerPartition = (Integer) options.valueOf("records-per-partition"); + } + + Integer keysPerSecondLimit = DEFAULT_KEYS_PER_SECOND_LIMIT; + if(options.hasArgument("keys-per-second-limit")) { + keysPerSecondLimit = (Integer) options.valueOf("keys-per-second-limit"); + } + System.err.println("throttle: " + keysPerSecondLimit); + + Integer progressPeriodOps = DEFAULT_PROGRESS_PERIOD_OPS; + if(options.hasArgument("progress-period-ops")) { + progressPeriodOps = (Integer) options.valueOf("progress-period-ops"); + } + + KeySamplerCLI sampler = new KeySamplerCLI(url, + outDir, + storeNames, + partitionIds, + nodeParallelism, + recordsPerPartition, + keysPerSecondLimit, + progressPeriodOps); + try { + if(!sampler.sampleStores()) { + logger.error("Some stores were not successfully sampled."); + } + } finally { + 
sampler.stop(); + } + } +} diff --git a/src/java/voldemort/utils/KeyVersionFetcherCLI.java b/src/java/voldemort/utils/KeyVersionFetcherCLI.java new file mode 100644 index 0000000000..a9b608a4e7 --- /dev/null +++ b/src/java/voldemort/utils/KeyVersionFetcherCLI.java @@ -0,0 +1,414 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package voldemort.utils; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +import joptsimple.OptionException; +import joptsimple.OptionParser; +import joptsimple.OptionSet; + +import org.apache.commons.codec.DecoderException; +import org.apache.log4j.Logger; + +import voldemort.client.ClientConfig; +import voldemort.client.protocol.admin.AdminClient; +import voldemort.client.protocol.admin.AdminClientConfig; +import voldemort.cluster.Cluster; +import voldemort.store.StoreDefinition; +import voldemort.versioning.Versioned; + +/** + * The KeyVersionFetcherCLI is a rudimentary tool that outputs a sampling of + * existing keys from a cluster. For each store in the cluster, a distinct file + * of keys to sample is expected. And, for each of these, a distint file of + * key-versions is generated. 
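+ * Input key files are expected in the form produced by KeySamplerCLI: one file
+ * per store named {storeName}.keys, with one hexadecimal key per line. The
+ * corresponding output is written to {storeName}.kvs in the output directory.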
+ * + */ +public class KeyVersionFetcherCLI { + + private static Logger logger = Logger.getLogger(KeyVersionFetcherCLI.class); + + private final static int DEFAULT_KEY_PARALLELISM = 4; + private final static int DEFAULT_PROGRESS_PERIOD_OPS = 1000; + + private final AdminClient adminClient; + private final Cluster cluster; + private final List storeDefinitions; + private final Map storeNameToKeyStringsMap; + + private final String inDir; + private final String outDir; + + private final ExecutorService kvFetcherService; + private final int progressPeriodOps; + + private final long startTimeMs; + private static AtomicInteger fetches = new AtomicInteger(0); + + public KeyVersionFetcherCLI(String url, + String inDir, + String outDir, + List storeNames, + int keyParallelism, + int progressPeriodOps) { + if(logger.isInfoEnabled()) { + logger.info("Connecting to bootstrap server: " + url); + } + this.adminClient = new AdminClient(url, new AdminClientConfig(), new ClientConfig()); + this.cluster = adminClient.getAdminClientCluster(); + this.storeDefinitions = adminClient.metadataMgmtOps.getRemoteStoreDefList(0).getValue(); + this.storeNameToKeyStringsMap = new HashMap(); + for(StoreDefinition storeDefinition: storeDefinitions) { + String storeName = storeDefinition.getName(); + if(storeNames != null) { + if(!storeNames.contains(storeName)) { + logger.debug("Will not sample store " + + storeName + + " since it is not in list of storeNames provided on command line."); + continue; + } + } + this.storeNameToKeyStringsMap.put(storeName, new StringBuilder()); + } + + if(storeNames != null) { + List badStoreNames = new LinkedList(); + for(String storeName: storeNames) { + if(!this.storeNameToKeyStringsMap.keySet().contains(storeName)) { + badStoreNames.add(storeName); + } + } + if(badStoreNames.size() > 0) { + Utils.croak("Some storeNames provided on the command line were not found on this cluster: " + + badStoreNames); + } + } + + this.inDir = inDir; + this.outDir = outDir; + + this.kvFetcherService = Executors.newFixedThreadPool(keyParallelism); + + this.progressPeriodOps = progressPeriodOps; + this.startTimeMs = System.currentTimeMillis(); + } + + public boolean sampleStores() { + for(StoreDefinition storeDefinition: storeDefinitions) { + if(storeNameToKeyStringsMap.keySet().contains(storeDefinition.getName())) { + if(!sampleStore(storeDefinition)) { + return false; + } + } + } + return true; + } + + public void updateFetchProgress() { + int curFetches = fetches.incrementAndGet(); + + if(0 == curFetches % progressPeriodOps) { + if(logger.isInfoEnabled()) { + long durationS = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() + - startTimeMs); + logger.info("Fetched " + curFetches + " in " + durationS + " seconds."); + } + } + } + + public class KeyVersionFetcher implements Callable { + + private final StoreInstance storeInstance; + private final byte[] key; + + KeyVersionFetcher(StoreInstance storeInstance, byte[] key) { + this.storeInstance = storeInstance; + this.key = key; + } + + @Override + public String call() throws Exception { + String storeName = storeInstance.getStoreDefinition().getName(); + int masterPartitionId = storeInstance.getMasterPartitionId(key); + List replicatingNodeIds = storeInstance.getReplicationNodeList(masterPartitionId); + + int replicationOffset = 0; + StringBuilder sb = new StringBuilder(); + for(int replicatingNodeId: replicatingNodeIds) { + List> values = adminClient.storeOps.getNodeKey(storeName, + replicatingNodeId, + new ByteArray(key)); + 
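+                // Emit one line per replica: the replication offset, " : ", the
+                // key in hex, then every version found on that replica, all tab
+                // separated.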
sb.append(replicationOffset + " : " + ByteUtils.toHexString(key) + "\t"); + for(Versioned value: values) { + sb.append(value.getVersion().toString() + "\t"); + } + sb.append("\n"); + replicationOffset++; + } + updateFetchProgress(); + return sb.toString(); + } + } + + public boolean sampleStore(StoreDefinition storeDefinition) { + String storeName = storeDefinition.getName(); + + String keysFileName = inDir + System.getProperty("file.separator") + storeName + ".keys"; + File keysFile = new File(keysFileName); + if(!keysFile.exists()) { + logger.error("Keys file " + keysFileName + "does not exist!"); + return false; + } + + String kvFileName = outDir + System.getProperty("file.separator") + storeName + ".kvs"; + File kvFile = new File(kvFileName); + if(kvFile.exists()) { + logger.info("Key-Version file " + kvFileName + + " exists, so will not sample keys from file " + keysFileName + "."); + return true; + } + + StoreInstance storeInstance = new StoreInstance(cluster, storeDefinition); + BufferedReader keyReader = null; + BufferedWriter kvWriter = null; + try { + keyReader = new BufferedReader(new FileReader(keysFileName)); + + Queue> futureKVs = new LinkedList>(); + for(String keyLine = keyReader.readLine(); keyLine != null; keyLine = keyReader.readLine()) { + byte[] keyInBytes = ByteUtils.fromHexString(keyLine.trim()); + + KeyVersionFetcher kvFetcher = new KeyVersionFetcher(storeInstance, keyInBytes); + Future future = kvFetcherService.submit(kvFetcher); + futureKVs.add(future); + } + + kvWriter = new BufferedWriter(new FileWriter(kvFileName)); + while(!futureKVs.isEmpty()) { + Future future = futureKVs.poll(); + String keyVersions = future.get(); + kvWriter.append(keyVersions); + } + + return true; + } catch(DecoderException de) { + logger.error("Could not decode key to sample for store " + storeName + " : " + + de.getMessage()); + return false; + } catch(IOException ioe) { + logger.error("IOException caught while sampling store " + storeName + " : " + + ioe.getMessage()); + return false; + } catch(InterruptedException ie) { + logger.error("InterruptedException caught while sampling store " + storeName + " : " + + ie.getMessage()); + return false; + } catch(ExecutionException ee) { + logger.error("Encountered an execution exception while sampling " + storeName + ": " + + ee.getMessage()); + ee.printStackTrace(); + return false; + } finally { + if(keyReader != null) { + try { + keyReader.close(); + } catch(IOException e) { + logger.error("IOException caught while trying to close keyReader for store " + + storeName + " : " + e.getMessage()); + e.printStackTrace(); + } + } + if(kvWriter != null) { + try { + kvWriter.close(); + } catch(IOException e) { + logger.error("IOException caught while trying to close kvWriter for store " + + storeName + " : " + e.getMessage()); + e.printStackTrace(); + } + } + } + } + + public void stop() { + if(adminClient != null) { + adminClient.close(); + } + kvFetcherService.shutdown(); + } + + /** + * Return args parser + * + * @return program parser + * */ + private static OptionParser getParser() { + OptionParser parser = new OptionParser(); + parser.accepts("help", "print help information"); + parser.accepts("url", "[REQUIRED] bootstrap URL") + .withRequiredArg() + .describedAs("bootstrap-url") + .ofType(String.class); + parser.accepts("in-dir", + "[REQUIRED] Directory in which to find the input key files (named \"{storeName}.kvs\", generated by KeyFetcherCLI.") + .withRequiredArg() + .describedAs("inputDirectory") + .ofType(String.class); + 
parser.accepts("out-dir", + "[REQUIRED] Directory in which to output the key files (named \"{storeName}.kvs\".") + .withRequiredArg() + .describedAs("outputDirectory") + .ofType(String.class); + parser.accepts("store-names", + "Store names to sample. Comma delimited list or singleton. [Default: ALL]") + .withRequiredArg() + .describedAs("storeNames") + .withValuesSeparatedBy(',') + .ofType(String.class); + parser.accepts("parallelism", + "Number of key-versions to sample in parallel. [Default: " + + DEFAULT_KEY_PARALLELISM + " ]") + .withRequiredArg() + .describedAs("storeParallelism") + .ofType(Integer.class); + parser.accepts("progress-period-ops", + "Number of operations between progress info is displayed. [Default: " + + DEFAULT_PROGRESS_PERIOD_OPS + " ]") + .withRequiredArg() + .describedAs("progressPeriodOps") + .ofType(Integer.class); + return parser; + } + + /** + * Print Usage to STDOUT + */ + private static void printUsage() { + StringBuilder help = new StringBuilder(); + help.append("KeyFetcherCLI Tool\n"); + help.append(" Find one key from each store-partition. Output keys per store.\n"); + help.append("Options:\n"); + help.append(" Required:\n"); + help.append(" --url \n"); + help.append(" --in-dir \n"); + help.append(" --out-dir \n"); + help.append(" Optional:\n"); + help.append(" --store-names [,...]\n"); + help.append(" --parallelism \n"); + help.append(" --progress-period-ops \n"); + help.append(" --help\n"); + System.out.print(help.toString()); + } + + private static void printUsageAndDie(String errMessage) { + printUsage(); + Utils.croak("\n" + errMessage); + } + + // In the future, this tool could be expanded with the following options: + // - fetch value in addition to version + // - choose between printing human readable data (.toString()) or computer + // readable data (ByteUtils.toHexString(byte[])). 
+ public static void main(String[] args) throws Exception { + OptionParser parser = null; + OptionSet options = null; + try { + parser = getParser(); + options = parser.parse(args); + } catch(OptionException oe) { + parser.printHelpOn(System.out); + printUsageAndDie("Exception when parsing arguments : " + oe.getMessage()); + return; + } + + /* validate options */ + if(options.hasArgument("help")) { + parser.printHelpOn(System.out); + printUsage(); + return; + } + if(!options.hasArgument("url") || !options.hasArgument("in-dir") + || !options.hasArgument("out-dir")) { + parser.printHelpOn(System.out); + printUsageAndDie("Missing a required argument."); + return; + } + + String url = (String) options.valueOf("url"); + + String inDir = (String) options.valueOf("in-dir"); + Utils.mkdirs(new File(inDir)); + + String outDir = (String) options.valueOf("out-dir"); + Utils.mkdirs(new File(outDir)); + + List storeNames = null; + if(options.hasArgument("store-names")) { + @SuppressWarnings("unchecked") + List list = (List) options.valuesOf("store-names"); + storeNames = list; + } + + Integer keyParallelism = DEFAULT_KEY_PARALLELISM; + if(options.hasArgument("parallelism")) { + keyParallelism = (Integer) options.valueOf("parallelism"); + } + + Integer progressPeriodOps = DEFAULT_PROGRESS_PERIOD_OPS; + if(options.hasArgument("progress-period-ops")) { + progressPeriodOps = (Integer) options.valueOf("progress-period-ops"); + } + + try { + KeyVersionFetcherCLI sampler = new KeyVersionFetcherCLI(url, + inDir, + outDir, + storeNames, + keyParallelism, + progressPeriodOps); + + try { + if(!sampler.sampleStores()) { + logger.error("Key-versions were not successfully sampled from some stores."); + } + } finally { + sampler.stop(); + } + + } catch(Exception e) { + Utils.croak("Exception during key-version sampling: " + e.getMessage()); + } + + } +} diff --git a/src/java/voldemort/utils/ManifestFileReader.java b/src/java/voldemort/utils/ManifestFileReader.java index 8ba26d7515..8ff477dc89 100644 --- a/src/java/voldemort/utils/ManifestFileReader.java +++ b/src/java/voldemort/utils/ManifestFileReader.java @@ -16,9 +16,11 @@ package voldemort.utils; -import java.io.FileInputStream; import java.io.IOException; -import java.util.Properties; +import java.net.URL; +import java.util.Enumeration; +import java.util.jar.Attributes; +import java.util.jar.Manifest; import org.apache.log4j.Logger; @@ -31,17 +33,32 @@ public class ManifestFileReader { protected static final Logger logger = Logger.getLogger(ManifestFileReader.class); private static String MANIFEST_FILE = "META-INF/MANIFEST.MF"; - private static String RELEASE_VERSION_KEY = "Implementation-Version"; + + private static String RELEASE_VERSION_KEY = "Voldemort-Implementation-Version"; public static String getReleaseVersion() { - String version = null; - Properties properties = new Properties(); + try { - properties.load(new FileInputStream(MANIFEST_FILE)); - version = properties.getProperty(RELEASE_VERSION_KEY); - } catch(IOException e) { - logger.warn("Unable to load voldemort release version due to the following error:", e); + Enumeration resources = ManifestFileReader.class.getClassLoader() + .getResources(MANIFEST_FILE); + while(resources.hasMoreElements()) { + + Manifest manifest = new Manifest(resources.nextElement().openStream()); + + Attributes mainAttribs = manifest.getMainAttributes(); + String version = mainAttribs.getValue(RELEASE_VERSION_KEY); + + if(version != null) { + logger.debug("Voldemort Release version is:" + version); + return version; + } + + 
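+                // This manifest has no Voldemort version attribute; keep
+                // scanning the remaining MANIFEST.MF resources on the classpath.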
} + } catch(IOException IoE) { + logger.warn("Unable to load voldemort release version, could not find a manifest file"); } - return version; + + return null; + } } diff --git a/src/java/voldemort/utils/NodeUtils.java b/src/java/voldemort/utils/NodeUtils.java new file mode 100644 index 0000000000..117cc7d2c2 --- /dev/null +++ b/src/java/voldemort/utils/NodeUtils.java @@ -0,0 +1,124 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.utils; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Set; + +import org.apache.log4j.Logger; + +import voldemort.cluster.Node; + +import com.google.common.collect.Sets; + +// TODO: (refactor) Move all of the static "util" methods for which Node is +// the only complex type that the method operates on to be members of the +// Cluster class. Unclear how to treat the Integer and List types of partition +// ids... + +/** + * NodeUtils provides basic tools for manipulating and inspecting nodes. + * + * Methods in this util module take take exactly one Node object, or a + * collection of Node objects, and possibly some other minor, simple arguments. + */ +public class NodeUtils { + + private static Logger logger = Logger.getLogger(NodeUtils.class); + + /** + * Creates a replica of the node with the new partitions list + * + * @param node The node whose replica we are creating + * @param partitionsList The new partitions list + * @return Replica of node with new partitions list + */ + public static Node updateNode(Node node, List partitionsList) { + return new Node(node.getId(), + node.getHost(), + node.getHttpPort(), + node.getSocketPort(), + node.getAdminPort(), + node.getZoneId(), + partitionsList); + } + + /** + * Add a partition to the node provided + * + * @param node The node to which we'll add the partition + * @param donatedPartition The partition to add + * @return The new node with the new partition + */ + public static Node addPartitionToNode(final Node node, Integer donatedPartition) { + return addPartitionToNode(node, Sets.newHashSet(donatedPartition)); + } + + /** + * Remove a partition from the node provided + * + * @param node The node from which we're removing the partition + * @param donatedPartition The partitions to remove + * @return The new node without the partition + */ + public static Node removePartitionToNode(final Node node, Integer donatedPartition) { + return removePartitionToNode(node, Sets.newHashSet(donatedPartition)); + } + + /** + * Add the set of partitions to the node provided + * + * @param node The node to which we'll add the partitions + * @param donatedPartitions The list of partitions to add + * @return The new node with the new partitions + */ + public static Node addPartitionToNode(final Node node, final Set donatedPartitions) { + List deepCopy = new ArrayList(node.getPartitionIds()); + deepCopy.addAll(donatedPartitions); + Collections.sort(deepCopy); + return updateNode(node, deepCopy); + } + + 
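+    // Note that these helpers never mutate the Node they are given; updateNode()
+    // builds a fresh Node carrying the adjusted partition list.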
/** + * Remove the set of partitions from the node provided + * + * @param node The node from which we're removing the partitions + * @param donatedPartitions The list of partitions to remove + * @return The new node without the partitions + */ + public static Node removePartitionToNode(final Node node, final Set donatedPartitions) { + List deepCopy = new ArrayList(node.getPartitionIds()); + deepCopy.removeAll(donatedPartitions); + return updateNode(node, deepCopy); + } + + /** + * Given a list of nodes, retrieves the list of node ids + * + * @param nodes The list of nodes + * @return Returns a list of node ids + */ + public static List getNodeIds(List nodes) { + List nodeIds = new ArrayList(nodes.size()); + for(Node node: nodes) { + nodeIds.add(node.getId()); + } + return nodeIds; + } +} diff --git a/src/java/voldemort/utils/RebalanceClusterUtils.java b/src/java/voldemort/utils/RebalanceClusterUtils.java new file mode 100644 index 0000000000..110eaa66b4 --- /dev/null +++ b/src/java/voldemort/utils/RebalanceClusterUtils.java @@ -0,0 +1,997 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.utils; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Random; + +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Logger; + +import voldemort.client.rebalance.RebalanceClusterPlan; +import voldemort.cluster.Cluster; +import voldemort.cluster.Node; +import voldemort.store.StoreDefinition; +import voldemort.xml.ClusterMapper; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; + +/** + * RebalanceClusterUtils provides functions that balance the distribution of + * partitions across a cluster. + * + */ +public class RebalanceClusterUtils { + + // TODO: (refactor) Improve upon the name "RebalanceClusterUtils". All of + // these util methods support moving partitions around a cluster to achieve + // better balance. + + private static Logger logger = Logger.getLogger(RebalanceClusterUtils.class); + + /** + * Outputs an optimized cluster based on the existing cluster and the new + * nodes that are being added. + * + * @param currentCluster Current cluster metadata + * @param targetCluster The target cluster metadata which contains the nodes + * of the current cluster + new nodes with empty partitions + * @param storeDefs List of store definitions + * @param outputDir The output directory where we'll store the cluster + * metadata ( if not null ) + * @param maxTriesRebalancing See RebalanceCLI. + * @param generateEnableXzonePrimary See RebalanceCLI. + * @param generateEnableXzoneNary See RebalanceCLI. + * @param enableRandomSwaps See RebalanceCLI. + * @param randomSwapAttempts See RebalanceCLI. + * @param randomSwapSuccesses See RebalanceCLI. + * @param enableGreedySwaps See RebalanceCLI. + * @param greedySwapAttempts See RebalanceCLI. 
+ * @param greedySwapMaxPartitionsPerNode See RebalanceCLI. + * @param greedySwapMaxPartitionsPerZone See RebalanceCLI. + * @param maxContiguousPartitionsPerZone See RebalanceCLI. + */ + public static void balanceTargetCluster(final Cluster currentCluster, + final Cluster targetCluster, + final List storeDefs, + final String outputDir, + final int maxTriesRebalancing, + final boolean generateDisablePrimaryBalancing, + final boolean generateEnableXzonePrimary, + final boolean generateEnableAnyXzoneNary, + final boolean generateEnableLastResortXzoneNary, + final boolean enableXzoneShuffle, + final boolean enableRandomSwaps, + final int randomSwapAttempts, + final int randomSwapSuccesses, + final boolean enableGreedySwaps, + final int greedySwapAttempts, + final int greedySwapMaxPartitionsPerNode, + final int greedySwapMaxPartitionsPerZone, + final int maxContiguousPartitionsPerZone) { + Pair analysis = new ClusterInstance(currentCluster, storeDefs).analyzeBalanceVerbose(); + dumpAnalysisToFile(outputDir, + RebalanceUtils.initialClusterFileName + ".analysis", + analysis.getSecond()); + + Cluster minCluster = targetCluster; + double minMaxMinRatio = Double.MAX_VALUE; + + for(int numTries = 0; numTries < maxTriesRebalancing; numTries++) { + Cluster nextCluster = targetCluster; + + if(maxContiguousPartitionsPerZone > 0) { + nextCluster = repeatedlyBalanceContiguousPartitionsPerZone(nextCluster, + maxContiguousPartitionsPerZone); + } + + if(!generateDisablePrimaryBalancing) { + nextCluster = balancePrimaryPartitionsPerNode(nextCluster, + storeDefs, + generateEnableXzonePrimary, + generateEnableAnyXzoneNary, + generateEnableLastResortXzoneNary); + } + + if(enableRandomSwaps) { + nextCluster = randomShufflePartitions(nextCluster, + enableXzoneShuffle, + randomSwapAttempts, + randomSwapSuccesses, + storeDefs); + } + if(enableGreedySwaps) { + nextCluster = greedyShufflePartitions(nextCluster, + enableXzoneShuffle, + greedySwapAttempts, + greedySwapMaxPartitionsPerNode, + greedySwapMaxPartitionsPerZone, + storeDefs); + } + + if(!validateClusterUpdate(currentCluster, nextCluster)) { + System.err.println("The modified cluster does not pass validation. Reverting to initial cluster..."); + nextCluster = currentCluster; + } + + System.out.println("-------------------------\n"); + analysis = new ClusterInstance(nextCluster, storeDefs).analyzeBalanceVerbose(); + double currentMaxMinRatio = analysis.getFirst(); + System.out.println("Optimization number " + numTries + ": " + currentMaxMinRatio + + " max/min ratio"); + + if(currentMaxMinRatio <= minMaxMinRatio) { + minMaxMinRatio = currentMaxMinRatio; + minCluster = nextCluster; + + dumpClusterToFile(outputDir, + RebalanceUtils.finalClusterFileName + numTries, + minCluster); + dumpAnalysisToFile(outputDir, RebalanceUtils.finalClusterFileName + numTries + + ".analysis", analysis.getSecond()); + } + System.out.println("-------------------------\n"); + } + + System.out.println("\n=========================="); + System.out.println("Final distribution"); + analysis = new ClusterInstance(minCluster, storeDefs).analyzeBalanceVerbose(); + System.out.println(analysis.getSecond()); + + dumpClusterToFile(outputDir, RebalanceUtils.finalClusterFileName, minCluster); + dumpAnalysisToFile(outputDir, + RebalanceUtils.finalClusterFileName + ".analysis", + analysis.getSecond()); + return; + } + + /** + * Determines how many primary partitions each node within each zone should + * have. 
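[Editor's note] For context, a sketch of how balanceTargetCluster() might be driven outside of RebalanceCLI (illustrative only: the flag values and file names are made up, the Reader-based ClusterMapper/StoreDefinitionsMapper calls are assumed, and only the parameter order is taken from the signature above). It reads the current and target cluster.xml plus stores.xml and writes the optimized cluster and analysis files into an output directory:

import java.io.FileReader;
import java.util.List;

import voldemort.cluster.Cluster;
import voldemort.store.StoreDefinition;
import voldemort.utils.RebalanceClusterUtils;
import voldemort.xml.ClusterMapper;
import voldemort.xml.StoreDefinitionsMapper;

public class BalanceTargetClusterSketch {

    public static void main(String[] args) throws Exception {
        Cluster currentCluster = new ClusterMapper().readCluster(new FileReader("current-cluster.xml"));
        Cluster targetCluster = new ClusterMapper().readCluster(new FileReader("target-cluster.xml"));
        List<StoreDefinition> storeDefs =
                new StoreDefinitionsMapper().readStoreList(new FileReader("stores.xml"));

        // Argument order follows the signature above; the values are illustrative.
        RebalanceClusterUtils.balanceTargetCluster(currentCluster,
                                                   targetCluster,
                                                   storeDefs,
                                                   "/tmp/rebalance-output", // outputDir
                                                   10,    // maxTriesRebalancing
                                                   false, // generateDisablePrimaryBalancing
                                                   false, // generateEnableXzonePrimary
                                                   false, // generateEnableAnyXzoneNary
                                                   true,  // generateEnableLastResortXzoneNary
                                                   false, // enableXzoneShuffle
                                                   true,  // enableRandomSwaps
                                                   100,   // randomSwapAttempts
                                                   10,    // randomSwapSuccesses
                                                   false, // enableGreedySwaps
                                                   0,     // greedySwapAttempts
                                                   0,     // greedySwapMaxPartitionsPerNode
                                                   0,     // greedySwapMaxPartitionsPerZone
                                                   0);    // maxContiguousPartitionsPerZone
    }
}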
The list of integers returned per zone is the same length as the + * number of nodes in that zone. + * + * @param targetCluster + * @return A map of zoneId to list of target number of partitions per node + * within zone. + */ + public static HashMap> getBalancedNumberOfPrimaryPartitionsPerNodePerZone(final Cluster targetCluster) { + HashMap> numPartitionsPerNodePerZone = Maps.newHashMap(); + for(Integer zoneId: targetCluster.getZoneIds()) { + int numNodesInZone = targetCluster.getNumberOfNodesInZone(zoneId); + int numPartitionsInZone = targetCluster.getNumberOfPartitionsInZone(zoneId); + int floorPartitionsPerNodeInZone = numPartitionsInZone / numNodesInZone; + int numNodesInZoneWithCeil = numPartitionsInZone + - (numNodesInZone * floorPartitionsPerNodeInZone); + + ArrayList partitionsOnNode = new ArrayList(numNodesInZone); + for(int i = 0; i < numNodesInZoneWithCeil; i++) { + partitionsOnNode.add(i, floorPartitionsPerNodeInZone + 1); + } + for(int i = numNodesInZoneWithCeil; i < numNodesInZone; i++) { + partitionsOnNode.add(i, floorPartitionsPerNodeInZone); + } + numPartitionsPerNodePerZone.put(zoneId, partitionsOnNode); + } + return numPartitionsPerNodePerZone; + } + + /** + * Assign target number of partitions per node to specific node IDs. Then, + * separates Nodes into donorNodes and stealerNodes based on whether the + * node needs to donate or steal primary partitions. + * + * @param targetCluster + * @return a Pair. First element is donorNodes, second element is + * stealerNodes. Each element in the pair is a HashMap of Node to + * Integer where the integer value is the number of partitions to + * store. + */ + public static Pair, HashMap> getDonorsAndStealersForBalancedPrimaries(final Cluster targetCluster) { + HashMap> numPartitionsPerNodePerZone = getBalancedNumberOfPrimaryPartitionsPerNodePerZone(targetCluster); + + HashMap donorNodes = Maps.newHashMap(); + HashMap stealerNodes = Maps.newHashMap(); + + HashMap numNodesAssignedInZone = Maps.newHashMap(); + for(Integer zoneId: targetCluster.getZoneIds()) { + numNodesAssignedInZone.put(zoneId, 0); + } + for(Node node: targetCluster.getNodes()) { + int zoneId = node.getZoneId(); + + int offset = numNodesAssignedInZone.get(zoneId); + numNodesAssignedInZone.put(zoneId, offset + 1); + + int numPartitions = numPartitionsPerNodePerZone.get(zoneId).get(offset); + + if(numPartitions < node.getNumberOfPartitions()) { + donorNodes.put(node, numPartitions); + } else if(numPartitions > node.getNumberOfPartitions()) { + stealerNodes.put(node, numPartitions); + } + } + + // Print out donor/stealer information + for(Node node: donorNodes.keySet()) { + System.out.println("Donor Node: " + node.getId() + ", zoneId " + node.getZoneId() + + ", numPartitions " + node.getNumberOfPartitions() + + ", target number of partitions " + donorNodes.get(node)); + } + for(Node node: stealerNodes.keySet()) { + System.out.println("Stealer Node: " + node.getId() + ", zoneId " + node.getZoneId() + + ", numPartitions " + node.getNumberOfPartitions() + + ", target number of partitions " + stealerNodes.get(node)); + } + + return new Pair, HashMap>(donorNodes, stealerNodes); + } + + /** + * Balance the number of primary partitions per node per zone. Balanced + * means that the number of primary partitions per node within a zone are + * all within one of one another. A common usage of this method is to assign + * partitions to newly added nodes that do not have any partitions yet. 
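[Editor's note] A worked instance of the floor/ceil arithmetic in getBalancedNumberOfPrimaryPartitionsPerNodePerZone above, on made-up numbers: with 20 partitions spread over 6 nodes in one zone, the targets come out as [4, 4, 3, 3, 3, 3], so every node in the zone is within one partition of every other.

public class BalancedPrimariesArithmeticSketch {

    public static void main(String[] args) {
        // Same arithmetic as the method above, on fixed illustrative numbers.
        int numNodesInZone = 6;
        int numPartitionsInZone = 20;
        int floorPartitionsPerNodeInZone = numPartitionsInZone / numNodesInZone;      // 3
        int numNodesInZoneWithCeil = numPartitionsInZone
                - (numNodesInZone * floorPartitionsPerNodeInZone);                    // 20 - 18 = 2
        // The first 2 nodes are assigned floor + 1 = 4 partitions; the remaining 4 nodes get 3 each.
        System.out.println("floor=" + floorPartitionsPerNodeInZone
                           + ", nodesWithCeil=" + numNodesInZoneWithCeil);
    }
}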
+ * + * @param targetCluster Target cluster metadata ( which contains old nodes + + * new nodes [ empty partitions ]) + * @param storeDefs List of store definitions + * @param generateEnableXzonePrimary See RebalanceCLI. + * @param generateEnableAnyXzoneNary See RebalanceCLI. + * @param generateEnableLastResortXzoneNary See RebalanceCLI. + * @return Return new cluster metadata + */ + public static Cluster balancePrimaryPartitionsPerNode(final Cluster targetCluster, + final List storeDefs, + final boolean generateEnableXzonePrimary, + final boolean generateEnableAnyXzoneNary, + final boolean generateEnableLastResortXzoneNary) { + System.out.println("Balance number of partitions per node within a zone."); + System.out.println("numPartitionsPerZone"); + for(int zoneId: targetCluster.getZoneIds()) { + System.out.println(zoneId + " : " + targetCluster.getNumberOfPartitionsInZone(zoneId)); + } + System.out.println("numNodesPerZone"); + for(int zoneId: targetCluster.getZoneIds()) { + System.out.println(zoneId + " : " + targetCluster.getNumberOfNodesInZone(zoneId)); + } + + Pair, HashMap> donorsAndStealers = getDonorsAndStealersForBalancedPrimaries(targetCluster); + HashMap donorNodes = donorsAndStealers.getFirst(); + List donorNodeKeys = new ArrayList(donorNodes.keySet()); + + HashMap stealerNodes = donorsAndStealers.getSecond(); + List stealerNodeKeys = new ArrayList(stealerNodes.keySet()); + + // Go over every stealerNode and steal partitions from donor nodes + Cluster returnCluster = ClusterUtils.copyCluster(targetCluster); + + Collections.shuffle(stealerNodeKeys, new Random(System.currentTimeMillis())); + for(Node stealerNode: stealerNodeKeys) { + int partitionsToSteal = stealerNodes.get(stealerNode) + - stealerNode.getNumberOfPartitions(); + + System.out.println("Node (" + stealerNode.getId() + ") in zone (" + + stealerNode.getZoneId() + ") has partitionsToSteal of " + + partitionsToSteal); + + while(partitionsToSteal > 0) { + Collections.shuffle(donorNodeKeys, new Random(System.currentTimeMillis())); + + // Repeatedly loop over donor nodes to distribute stealing + for(Node donorNode: donorNodeKeys) { + Node currentDonorNode = returnCluster.getNodeById(donorNode.getId()); + + if(!generateEnableXzonePrimary + && (currentDonorNode.getZoneId() != stealerNode.getZoneId())) { + // Only steal from donor nodes within same zone + continue; + } + // Only steal from donor nodes with extra partitions + if(currentDonorNode.getNumberOfPartitions() == donorNodes.get(donorNode)) { + continue; + } + + List donorPartitions = Lists.newArrayList(currentDonorNode.getPartitionIds()); + + Collections.shuffle(donorPartitions, new Random(System.currentTimeMillis())); + for(int donorPartition: donorPartitions) { + Cluster intermediateCluster = RebalanceUtils.createUpdatedCluster(returnCluster, + stealerNode.getId(), + Lists.newArrayList(donorPartition)); + + int crossZoneMoves = 0; + if(!generateEnableAnyXzoneNary) { + // getCrossZoneMoves can be a *slow* call. E.g., for + // an 11 node cluster, the call takes ~230 ms, + // whereas for a 39 node cluster the call takes ~10 + // s (40x longer). 
+ long startTimeNs = System.nanoTime(); + crossZoneMoves = RebalanceUtils.getCrossZoneMoves(intermediateCluster, + new RebalanceClusterPlan(returnCluster, + intermediateCluster, + storeDefs, + true)); + System.out.println("getCrossZoneMoves took " + + (System.nanoTime() - startTimeNs) + " ns."); + } + if(crossZoneMoves == 0) { + returnCluster = intermediateCluster; + partitionsToSteal--; + System.out.println("Stealer node " + stealerNode.getId() + + ", donor node " + currentDonorNode.getId() + + ", partition stolen " + donorPartition); + break; + } else { + System.out.println("Stealer node " + stealerNode.getId() + + ", donor node " + currentDonorNode.getId() + + ", attempted to steal partition " + donorPartition + + " however, getCrossZoneMoves did NOT return 0!"); + + if(donorPartition == donorPartitions.get(donorPartitions.size() - 1)) { + partitionsToSteal--; + if(generateEnableLastResortXzoneNary) { + returnCluster = intermediateCluster; + System.out.println("Stealer node " + + stealerNode.getId() + + ", donor node " + + currentDonorNode.getId() + + ", is stealing partition " + + donorPartition + + " in spite of the fact that getCrossZoneMoves did NOT return 0!"); + } else { + System.out.println("Stealer node " + + stealerNode.getId() + + " is reducing number of partitions to steal because getCrossZoneMoves did not return 0 for all possible partitions."); + } + } + } + } + + if(partitionsToSteal == 0) + break; + } + } + } + + return returnCluster; + } + + // TODO: Add a similar method that rebalances a cluster to ensure that no + // Node hosts contiguous partition IDs (rather than doing so at zone level). + /** + * Loops over cluster and repeatedly tries to break up contiguous runs of + * partitions. After each phase of breaking up contiguous partitions, random + * partitions are selected to move between zones to balance the number of + * partitions in each zone. The second phase may re-introduce contiguous + * partition runs in another zone. Therefore, this overall process is + * repeated multiple times. + * + * @param nextCluster + * @param maxContiguousPartitionsPerZone See RebalanceCLI. + * @return + */ + public static Cluster repeatedlyBalanceContiguousPartitionsPerZone(final Cluster targetCluster, + final int maxContiguousPartitionsPerZone) { + // TODO: Make this loop definitive. I.e., ensure that + // maxContiguousPartitionsPerZone is truly met. + System.out.println("Looping to evenly balance partitions across zones while limiting contiguous partitions"); + int repeatContigBalance = 10; + Cluster nextCluster = targetCluster; + for(int i = 0; i < repeatContigBalance; i++) { + nextCluster = balanceContiguousPartitionsPerZone(nextCluster, + maxContiguousPartitionsPerZone); + + nextCluster = balanceNumPartitionsPerZone(nextCluster); + System.out.println("Completed round of balancing contiguous partitions: round " + + (i + 1) + " of " + repeatContigBalance); + } + + return nextCluster; + } + + /** + * Ensures that no more than maxContiguousPartitionsPerZone partitions are + * contiguous within a single zone. + * + * Moves the necessary partitions to break up contiguous runs from each zone + * to some other random zone/node. There is some chance that such random + * moves could result in contiguous partitions in other zones. + * + * @param targetCluster Target cluster metadata + * @param maxContiguousPartitionsPerZone See RebalanceCLI. + * @return Return a pair of cluster metadata and number of primary + * partitions that have moved. 
+ */ + public static Cluster balanceContiguousPartitionsPerZone(final Cluster targetCluster, + final int maxContiguousPartitionsPerZone) { + System.out.println("Balance number of contiguous partitions within a zone."); + System.out.println("numPartitionsPerZone"); + for(int zoneId: targetCluster.getZoneIds()) { + System.out.println(zoneId + " : " + targetCluster.getNumberOfPartitionsInZone(zoneId)); + } + System.out.println("numNodesPerZone"); + for(int zoneId: targetCluster.getZoneIds()) { + System.out.println(zoneId + " : " + targetCluster.getNumberOfNodesInZone(zoneId)); + } + + // Break up contiguous partitions within each zone + HashMap> partitionsToRemoveFromZone = Maps.newHashMap(); + System.out.println("Contiguous partitions"); + for(Integer zoneId: targetCluster.getZoneIds()) { + System.out.println("\tZone: " + zoneId); + List partitions = new ArrayList(targetCluster.getPartitionIdsInZone(zoneId)); + + List partitionsToRemoveFromThisZone = new ArrayList(); + List contiguousPartitions = new ArrayList(); + int lastPartitionId = partitions.get(0); + for(int i = 1; i < partitions.size(); ++i) { + if(partitions.get(i) == lastPartitionId + 1) { + contiguousPartitions.add(partitions.get(i)); + } else { + if(contiguousPartitions.size() > maxContiguousPartitionsPerZone) { + System.out.println("Contiguous partitions: " + contiguousPartitions); + partitionsToRemoveFromThisZone.addAll(removeItemsToSplitListEvenly(contiguousPartitions, + maxContiguousPartitionsPerZone)); + } + contiguousPartitions.clear(); + } + lastPartitionId = partitions.get(i); + } + partitionsToRemoveFromZone.put(zoneId, partitionsToRemoveFromThisZone); + System.out.println("\t\tPartitions to remove: " + partitionsToRemoveFromThisZone); + } + + Cluster returnCluster = ClusterUtils.copyCluster(targetCluster); + + Random r = new Random(); + for(int zoneId: returnCluster.getZoneIds()) { + for(int partitionId: partitionsToRemoveFromZone.get(zoneId)) { + // Pick a random other zone Id + List otherZoneIds = new ArrayList(); + for(int otherZoneId: returnCluster.getZoneIds()) { + if(otherZoneId != zoneId) { + otherZoneIds.add(otherZoneId); + } + } + int whichOtherZoneId = otherZoneIds.get(r.nextInt(otherZoneIds.size())); + + // Pick a random node from other zone ID + int whichNodeOffset = r.nextInt(returnCluster.getNumberOfNodesInZone(whichOtherZoneId)); + int whichNodeId = new ArrayList(returnCluster.getNodeIdsInZone(whichOtherZoneId)).get(whichNodeOffset); + + // Steal partition from one zone to another! + returnCluster = RebalanceUtils.createUpdatedCluster(returnCluster, + whichNodeId, + Lists.newArrayList(partitionId)); + } + } + + return returnCluster; + } + + /** + * Ensures that all zones have within 1 number of partitions. + * + * Moves some number of partitions from each zone to some other random + * zone/node. There is some chance that such moves could result in + * contiguous partitions in other zones. + * + * @param targetCluster Target cluster metadata + * @return Return a pair of cluster metadata and number of primary + * partitions that have moved. 
+ */ + public static Cluster balanceNumPartitionsPerZone(final Cluster targetCluster) { + System.out.println("Balance number of partitions per zone."); + System.out.println("numPartitionsPerZone"); + for(int zoneId: targetCluster.getZoneIds()) { + System.out.println(zoneId + " : " + targetCluster.getNumberOfPartitionsInZone(zoneId)); + } + System.out.println("numNodesPerZone"); + for(int zoneId: targetCluster.getZoneIds()) { + System.out.println(zoneId + " : " + targetCluster.getNumberOfNodesInZone(zoneId)); + } + + int numPartitions = targetCluster.getNumberOfPartitions(); + + // Set up a balanced target number of partitions to move per zone + HashMap targetNumPartitionsPerZone = Maps.newHashMap(); + int numZones = targetCluster.getNumberOfZones(); + int floorPartitions = numPartitions / numZones; + int numZonesWithCeil = numPartitions - (numZones * floorPartitions); + int zoneCounter = 0; + for(Integer zoneId: targetCluster.getZoneIds()) { + int floorPartitionsInZone = floorPartitions + - targetCluster.getNumberOfPartitionsInZone(zoneId); + if(zoneCounter < numZonesWithCeil) { + targetNumPartitionsPerZone.put(zoneId, floorPartitionsInZone + 1); + } else { + targetNumPartitionsPerZone.put(zoneId, floorPartitionsInZone); + } + zoneCounter++; + } + + List donorZoneIds = new ArrayList(); + List stealerZoneIds = new ArrayList(); + for(Integer zoneId: targetCluster.getZoneIds()) { + if(targetNumPartitionsPerZone.get(zoneId) > 0) { + stealerZoneIds.add(zoneId); + } else if(targetNumPartitionsPerZone.get(zoneId) < 0) { + donorZoneIds.add(zoneId); + } + } + + Cluster returnCluster = ClusterUtils.copyCluster(targetCluster); + Random r = new Random(); + + for(Integer stealerZoneId: stealerZoneIds) { + while(targetNumPartitionsPerZone.get(stealerZoneId) > 0) { + for(Integer donorZoneId: donorZoneIds) { + if(targetNumPartitionsPerZone.get(donorZoneId) < 0) { + // Select random stealer node + int stealerNodeOffset = r.nextInt(targetCluster.getNumberOfNodesInZone(stealerZoneId)); + Integer stealerNodeId = new ArrayList(targetCluster.getNodeIdsInZone(stealerZoneId)).get(stealerNodeOffset); + + // Select random donor partition + List partitionsThisZone = new ArrayList(targetCluster.getPartitionIdsInZone(donorZoneId)); + int donorPartitionOffset = r.nextInt(partitionsThisZone.size()); + int donorPartitionId = partitionsThisZone.get(donorPartitionOffset); + + // Accounting + targetNumPartitionsPerZone.put(donorZoneId, + targetNumPartitionsPerZone.get(donorZoneId) + 1); + targetNumPartitionsPerZone.put(stealerZoneId, + targetNumPartitionsPerZone.get(stealerZoneId) - 1); + + // Steal it! + returnCluster = RebalanceUtils.createUpdatedCluster(returnCluster, + stealerNodeId, + Lists.newArrayList(donorPartitionId)); + } + } + } + } + + return returnCluster; + } + + /** + * Swaps two specified partitions + * + * @return modified cluster metadata. + */ + public static Cluster swapPartitions(final Cluster targetCluster, + final int nodeIdA, + final int partitionIdA, + final int nodeIdB, + final int partitionIdB) { + Cluster returnCluster = ClusterUtils.copyCluster(targetCluster); + + // Swap partitions between nodes! + returnCluster = RebalanceUtils.createUpdatedCluster(returnCluster, + nodeIdA, + Lists.newArrayList(partitionIdB)); + returnCluster = RebalanceUtils.createUpdatedCluster(returnCluster, + nodeIdB, + Lists.newArrayList(partitionIdA)); + + return returnCluster; + } + + /** + * Within a single zone, swaps one random partition on one random node with + * another random partition on different random node. 
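[Editor's note] A minimal usage sketch for swapPartitions() above (node and partition ids are made up, and cluster is assumed to be an existing Cluster instance): the two createUpdatedCluster() calls inside it each move one partition to its new owner and strip it from its old owner, so a single call performs the full exchange without touching the input cluster.

// Illustrative only: exchange partition 7 (currently on node 2) with partition 11
// (currently on node 5); a rebuilt Cluster copy is returned.
Cluster swapped = RebalanceClusterUtils.swapPartitions(cluster, 2, 7, 5, 11);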
+ * + * @param targetCluster + * @param zoneId Zone ID within which to shuffle partitions + * @return + */ + public static Cluster swapRandomPartitionsWithinZone(final Cluster targetCluster, + final int zoneId) { + List nodeIdsInZone = new ArrayList(targetCluster.getNodeIdsInZone(zoneId)); + + Cluster returnCluster = ClusterUtils.copyCluster(targetCluster); + Random r = new Random(); + + // Select random stealer node + int stealerNodeOffset = r.nextInt(nodeIdsInZone.size()); + Integer stealerNodeId = nodeIdsInZone.get(stealerNodeOffset); + + // Select random stealer partition + List stealerPartitions = returnCluster.getNodeById(stealerNodeId) + .getPartitionIds(); + int stealerPartitionOffset = r.nextInt(stealerPartitions.size()); + int stealerPartitionId = stealerPartitions.get(stealerPartitionOffset); + + // Select random donor node + List donorNodeIds = new ArrayList(); + donorNodeIds.addAll(nodeIdsInZone); + donorNodeIds.remove(stealerNodeId); + + if(donorNodeIds.isEmpty()) { // No donor nodes! + return returnCluster; + } + int donorIdOffset = r.nextInt(donorNodeIds.size()); + Integer donorNodeId = donorNodeIds.get(donorIdOffset); + + // Select random donor partition + List donorPartitions = returnCluster.getNodeById(donorNodeId).getPartitionIds(); + int donorPartitionOffset = r.nextInt(donorPartitions.size()); + int donorPartitionId = donorPartitions.get(donorPartitionOffset); + + return swapPartitions(returnCluster, + stealerNodeId, + stealerPartitionId, + donorNodeId, + donorPartitionId); + } + + /** + * Randomly shuffle partitions between nodes within every zone. + * + * @param targetCluster Target cluster object. + * @param randomSwapAttempts See RebalanceCLI. + * @param randomSwapSuccesses See RebalanceCLI. + * @param storeDefs List of store definitions + * @return + */ + public static Cluster randomShufflePartitions(final Cluster targetCluster, + final boolean enableXzoneShuffle, + final int randomSwapAttempts, + final int randomSwapSuccesses, + List storeDefs) { + List zoneIds = new ArrayList(targetCluster.getZoneIds()); + Cluster returnCluster = ClusterUtils.copyCluster(targetCluster); + + double currentMaxMinRatio = new ClusterInstance(returnCluster, storeDefs).analyzeBalance(); + + int successes = 0; + for(int i = 0; i < randomSwapAttempts; i++) { + Collections.shuffle(zoneIds, new Random(System.currentTimeMillis())); + for(Integer zoneId: zoneIds) { + Cluster shuffleResults = swapRandomPartitionsWithinZone(returnCluster, zoneId); + double nextMaxMinRatio = new ClusterInstance(shuffleResults, storeDefs).analyzeBalance(); + if(nextMaxMinRatio < currentMaxMinRatio) { + System.out.println("Swap improved max-min ratio: " + currentMaxMinRatio + + " -> " + nextMaxMinRatio + " (improvement " + successes + + " on swap attempt " + i + " in zone " + zoneId + ")"); + int xZoneMoves = 0; + if(!enableXzoneShuffle) { + xZoneMoves = RebalanceUtils.getCrossZoneMoves(shuffleResults, + new RebalanceClusterPlan(returnCluster, + shuffleResults, + storeDefs, + true)); + } + if(xZoneMoves == 0) { + successes++; + returnCluster = shuffleResults; + currentMaxMinRatio = nextMaxMinRatio; + } else { + System.out.println("BUT, swap resulted in a cross zone move and so is ignored."); + } + } + } + if(successes >= randomSwapSuccesses) { + // Enough successes, move on. + break; + } + } + + return returnCluster; + } + + /** + * Within a single zone, tries swapping every partition with every other + * partition (ignoring those on the same node) and chooses the best swap. 
+ * This is very expensive and is not feasible for clusters with a desirable + * number of partitions. + * + * @param targetCluster + * @param zoneId Zone ID within which to shuffle partitions + * @param storeDefs List of store definitions + * @return + */ + public static Cluster swapGreedyPartitionsWithinZone(final Cluster targetCluster, + final int zoneId, + List storeDefs) { + List nodeIdsInZone = new ArrayList(targetCluster.getNodeIdsInZone(zoneId)); + + Cluster returnCluster = ClusterUtils.copyCluster(targetCluster); + double currentMaxMinRatio = new ClusterInstance(returnCluster, storeDefs).analyzeBalance(); + int nodeIdA = -1; + int nodeIdB = -1; + int partitionIdA = -1; + int partitionIdB = -1; + + // O(n^2) where n is the number of partitions. Yikes! + int progressCounter = 0; + for(int nodeIdEh: nodeIdsInZone) { + List partitionIdsEh = returnCluster.getNodeById(nodeIdEh).getPartitionIds(); + for(Integer partitionIdEh: partitionIdsEh) { + for(int nodeIdBee: nodeIdsInZone) { + if(nodeIdBee == nodeIdEh) + continue; + List partitionIdsBee = returnCluster.getNodeById(nodeIdBee) + .getPartitionIds(); + for(Integer partitionIdBee: partitionIdsBee) { + progressCounter++; + if(progressCounter % 500 == 0) + System.out.println("o"); + else if(progressCounter % 25 == 0) + System.out.println("."); + + Cluster swapResult = swapPartitions(returnCluster, + nodeIdEh, + partitionIdEh, + nodeIdBee, + partitionIdBee); + double swapMaxMinRatio = new ClusterInstance(swapResult, storeDefs).analyzeBalance(); + if(swapMaxMinRatio < currentMaxMinRatio) { + currentMaxMinRatio = swapMaxMinRatio; + System.out.println(" -> " + currentMaxMinRatio); + nodeIdA = nodeIdEh; + partitionIdA = partitionIdEh; + nodeIdB = nodeIdBee; + partitionIdB = partitionIdBee; + } + } + } + } + } + + if(nodeIdA == -1) { + return returnCluster; + } + return swapPartitions(returnCluster, nodeIdA, partitionIdA, nodeIdB, partitionIdB); + } + + /** + * Within a single zone, tries swapping some minimum number of random + * partitions per node with some minimum number of random partitions from + * other nodes within the zone. Chooses the best swap in each iteration. + * Large values of the greedSwapMaxPartitions... arguments make this method + * equivalent to comparing every possible swap. This is very expensive. + * + * @param targetCluster + * @param zoneId Zone ID within which to shuffle partitions + * @param greedySwapMaxPartitionsPerNode See RebalanceCLI. + * @param greedySwapMaxPartitionsPerZone See RebalanceCLI. 
+ * @param storeDefs + * @return + */ + public static Cluster swapGreedyRandomPartitionsWithinZone(final Cluster targetCluster, + final int zoneId, + final int greedySwapMaxPartitionsPerNode, + final int greedySwapMaxPartitionsPerZone, + List storeDefs) { + List nodeIdsInZone = new ArrayList(targetCluster.getNodeIdsInZone(zoneId)); + + System.out.println("GreedyRandom : nodeIdsInZone:" + nodeIdsInZone); + + Cluster returnCluster = ClusterUtils.copyCluster(targetCluster); + double currentMaxMinRatio = new ClusterInstance(returnCluster, storeDefs).analyzeBalance(); + int nodeIdA = -1; + int nodeIdB = -1; + int partitionIdA = -1; + int partitionIdB = -1; + + for(int nodeIdEh: nodeIdsInZone) { + System.out.println("GreedyRandom : processing nodeId:" + nodeIdEh); + List partitionIdsEh = new ArrayList(); + partitionIdsEh.addAll(returnCluster.getNodeById(nodeIdEh).getPartitionIds()); + Collections.shuffle(partitionIdsEh); + + int maxPartitionsInEh = Math.min(greedySwapMaxPartitionsPerNode, partitionIdsEh.size()); + for(int offsetEh = 0; offsetEh < maxPartitionsInEh; ++offsetEh) { + Integer partitionIdEh = partitionIdsEh.get(offsetEh); + + List> partitionIdsZone = new ArrayList>(); + for(int nodeIdBee: nodeIdsInZone) { + if(nodeIdBee == nodeIdEh) + continue; + for(Integer partitionIdBee: returnCluster.getNodeById(nodeIdBee) + .getPartitionIds()) { + partitionIdsZone.add(new Pair(nodeIdBee, partitionIdBee)); + } + } + + Collections.shuffle(partitionIdsZone); + int maxPartitionsInZone = Math.min(greedySwapMaxPartitionsPerZone, + partitionIdsZone.size()); + for(int offsetZone = 0; offsetZone < maxPartitionsInZone; offsetZone++) { + Integer nodeIdBee = partitionIdsZone.get(offsetZone).getFirst(); + Integer partitionIdBee = partitionIdsZone.get(offsetZone).getSecond(); + Cluster swapResult = swapPartitions(returnCluster, + nodeIdEh, + partitionIdEh, + nodeIdBee, + partitionIdBee); + double swapMaxMinRatio = new ClusterInstance(swapResult, storeDefs).analyzeBalance(); + if(swapMaxMinRatio < currentMaxMinRatio) { + currentMaxMinRatio = swapMaxMinRatio; + System.out.println(" -> " + currentMaxMinRatio); + nodeIdA = nodeIdEh; + partitionIdA = partitionIdEh; + nodeIdB = nodeIdBee; + partitionIdB = partitionIdBee; + } + } + } + } + + if(nodeIdA == -1) { + return returnCluster; + } + return swapPartitions(returnCluster, nodeIdA, partitionIdA, nodeIdB, partitionIdB); + } + + /** + * Within a single zone, tries swapping some minimum number of random + * partitions per node with some minimum number of random partitions from + * other nodes within the zone. Chooses the best swap in each iteration. + * Large values of the greedSwapMaxPartitions... arguments make this method + * equivalent to comparing every possible swap. This is very expensive. + * + * Normal case should be : + * + * #zones X #nodes/zone X max partitions/node X max partitions/zone + * + * @param targetCluster Target cluster object. + * @param greedyAttempts See RebalanceCLI. + * @param greedySwapMaxPartitionsPerNode See RebalanceCLI. + * @param greedySwapMaxPartitionsPerZone See RebalanceCLI. 
+ * @param storeDefs + * @return + */ + public static Cluster greedyShufflePartitions(final Cluster targetCluster, + final boolean enableXzoneShuffle, + final int greedyAttempts, + final int greedySwapMaxPartitionsPerNode, + final int greedySwapMaxPartitionsPerZone, + List storeDefs) { + List zoneIds = new ArrayList(targetCluster.getZoneIds()); + Cluster returnCluster = ClusterUtils.copyCluster(targetCluster); + + double currentMaxMinRatio = new ClusterInstance(returnCluster, storeDefs).analyzeBalance(); + + for(int i = 0; i < greedyAttempts; i++) { + Collections.shuffle(zoneIds, new Random(System.currentTimeMillis())); + for(Integer zoneId: zoneIds) { + System.out.println("Greedy swap attempt: zone " + zoneId + " , attempt " + i + + " of " + greedyAttempts); + Cluster shuffleResults = swapGreedyRandomPartitionsWithinZone(returnCluster, + zoneId, + greedySwapMaxPartitionsPerNode, + greedySwapMaxPartitionsPerZone, + storeDefs); + double nextMaxMinRatio = new ClusterInstance(shuffleResults, storeDefs).analyzeBalance(); + + if(nextMaxMinRatio == currentMaxMinRatio) { + System.out.println("Not improving for zone: " + zoneId); + } else { + System.out.println("Swap improved max-min ratio: " + currentMaxMinRatio + + " -> " + nextMaxMinRatio + " (swap attempt " + i + + " in zone " + zoneId + ")"); + + int xZoneMoves = 0; + if(!enableXzoneShuffle) { + xZoneMoves = RebalanceUtils.getCrossZoneMoves(shuffleResults, + new RebalanceClusterPlan(returnCluster, + shuffleResults, + storeDefs, + true)); + } + if(xZoneMoves == 0) { + returnCluster = shuffleResults; + currentMaxMinRatio = nextMaxMinRatio; + } else { + System.out.println("BUT, swap resulted in a cross zone move and so is ignored."); + } + } + } + } + + return returnCluster; + } + + /** + * Validate that two cluster metadata instances are consistent with one + * another. I.e., that they have the same number of partitions. Note that + * the Cluster object does additional verification upon construction (e.g., + * that partitions are numbered consecutively) and so there is no risk of + * duplicate partitions. + * + * @param before cluster metadata before any changes + * @param after cluster metadata after any changes + * @return false if the 'after' metadata is not consistent with the 'before' + * metadata + */ + public static boolean validateClusterUpdate(final Cluster before, final Cluster after) { + if(before.getNumberOfPartitions() != after.getNumberOfPartitions()) { + return false; + } + return true; + } + + /** + * This method breaks the inputList into distinct lists that are no longer + * than maxContiguous in length. It does so by removing elements from the + * inputList. This method removes the minimum necessary items to achieve the + * goal. This method chooses items to remove that minimize the length of the + * maximum remaining run. E.g. given an inputList of 20 elements and + * maxContiguous=8, this method will return the 2 elements that break the + * inputList into 3 runs of 6 items. (As opposed to 2 elements that break + * the inputList into two runs of eight items and one run of two items. + * + * @param inputList The list to be broken into separate runs. + * @param maxContiguous The upper limit on sub-list size + * @return A list of Integers to be removed from inputList to achieve the + * maxContiguous goal. 
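[Editor's note] A concrete trace of the run-splitting just described (illustrative numbers; the method itself follows below): for a contiguous run of 17 partition ids 10..26 and maxContiguous = 4, numToRemove = 17 / 5 = 3, leaving 14 ids split into four runs of lengths 4, 4, 3 and 3.

import java.util.ArrayList;
import java.util.List;

import voldemort.utils.RebalanceClusterUtils;

public class SplitListSketch {

    public static void main(String[] args) {
        // Partition ids 10..26 form one contiguous run of 17 ids.
        List<Integer> run = new ArrayList<Integer>();
        for(int id = 10; id <= 26; id++)
            run.add(id);

        // With maxContiguous = 4 the method picks 3 ids to remove (here 14, 19 and 23),
        // breaking the run into pieces of lengths 4, 4, 3 and 3.
        List<Integer> toRemove = RebalanceClusterUtils.removeItemsToSplitListEvenly(run, 4);
        System.out.println(toRemove); // [14, 19, 23]
    }
}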
+ */ + public static List removeItemsToSplitListEvenly(final List inputList, + int maxContiguous) { + List itemsToRemove = new ArrayList(); + int contiguousCount = inputList.size(); + if(contiguousCount > maxContiguous) { + // Determine how many items must be removed to ensure no contig run + // longer than maxContiguous + int numToRemove = contiguousCount / (maxContiguous + 1); + // Breaking in numToRemove places results in numToRemove+1 runs. + int numRuns = numToRemove + 1; + // Num items left to break into numRuns + int numItemsLeft = contiguousCount - numToRemove; + // Determine minimum length of each run after items are removed. + int floorOfEachRun = numItemsLeft / numRuns; + // Determine how many runs need one extra element to evenly + // distribute numItemsLeft among all numRuns + int numOfRunsWithExtra = numItemsLeft - (floorOfEachRun * numRuns); + + int offset = 0; + for(int i = 0; i < numToRemove; ++i) { + offset += floorOfEachRun; + if(i < numOfRunsWithExtra) + offset++; + itemsToRemove.add(inputList.get(offset)); + offset++; + } + } + return itemsToRemove; + } + + public static void dumpClusterToFile(String outputDir, String fileName, Cluster cluster) { + if(outputDir != null) { + try { + FileUtils.writeStringToFile(new File(outputDir, fileName), + new ClusterMapper().writeCluster(cluster)); + } catch(Exception e) {} + } + } + + public static void dumpAnalysisToFile(String outputDir, String fileName, String analysis) { + if(outputDir != null) { + try { + FileUtils.writeStringToFile(new File(outputDir, fileName), analysis); + } catch(Exception e) {} + } + } + +} diff --git a/src/java/voldemort/utils/RebalanceUtils.java b/src/java/voldemort/utils/RebalanceUtils.java index 26cd4d6b44..e66895963c 100644 --- a/src/java/voldemort/utils/RebalanceUtils.java +++ b/src/java/voldemort/utils/RebalanceUtils.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2010 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -25,13 +25,11 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.Random; +import java.util.Map.Entry; import java.util.Set; import java.util.TreeMap; -import java.util.Map.Entry; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ThreadFactory; @@ -41,6 +39,7 @@ import org.apache.log4j.Logger; import voldemort.VoldemortException; +import voldemort.client.ClientConfig; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; import voldemort.client.rebalance.RebalanceClusterPlan; @@ -50,7 +49,6 @@ import voldemort.cluster.Node; import voldemort.routing.RoutingStrategy; import voldemort.routing.RoutingStrategyFactory; -import voldemort.routing.RoutingStrategyType; import voldemort.server.VoldemortConfig; import voldemort.server.rebalance.VoldemortRebalancingException; import voldemort.store.StoreDefinition; @@ -108,7 +106,7 @@ public static HashMap> getOptimizedReplicaToPartitionList // preference list before, a copy of the // data will already exist - Don't copy // it! 
- if(!RebalanceUtils.containsPreferenceList(cluster, preferenceList, stealerNodeId)) { + if(!ClusterUtils.containsPreferenceList(cluster, preferenceList, stealerNodeId)) { partitionList.add(partition); } } @@ -143,7 +141,7 @@ public static Versioned getLatestCluster(List requiredNodes, clusterList.add(latestCluster); for(Node node: adminClient.getAdminClientCluster().getNodes()) { try { - Versioned versionedCluster = adminClient.getRemoteCluster(node.getId()); + Versioned versionedCluster = adminClient.metadataMgmtOps.getRemoteCluster(node.getId()); VectorClock newClock = (VectorClock) versionedCluster.getVersion(); if(null != newClock && !clusterList.contains(versionedCluster)) { // check no two clocks are concurrent. @@ -240,330 +238,6 @@ public static void assertSameDonor(List partitionInfos, } } - /** - * Outputs an optimized cluster based on the existing cluster and the new - * nodes that are being added. - * - * @param currentCluster Current cluster metadata - * @param targetCluster The target cluster metadata which contains the nodes - * of the current cluster + new nodes with empty partitions - * @param storeDefs List of store definitions - * @param outputDir The output directory where we'll store the cluster - * metadata ( if not null ) - * @param tries Number of times we'll try to optimize the metadata - * generation - */ - public static void generateMinCluster(final Cluster currentCluster, - final Cluster targetCluster, - final List storeDefs, - final String outputDir, - final int tries) { - - HashMap uniqueStores = KeyDistributionGenerator.getUniqueStoreDefinitionsWithCounts(storeDefs); - - List keys = KeyDistributionGenerator.generateKeys(KeyDistributionGenerator.DEFAULT_NUM_KEYS); - Cluster minCluster = targetCluster; - int minMoves = Integer.MAX_VALUE; - double minStdDev = Double.MAX_VALUE; - for(int numTries = 0; numTries < tries; numTries++) { - Pair minClusterMove = RebalanceUtils.generateMinCluster(currentCluster, - targetCluster, - storeDefs); - - double currentStdDev = KeyDistributionGenerator.getStdDeviation(KeyDistributionGenerator.generateOverallDistributionWithUniqueStores(minClusterMove.getFirst(), - uniqueStores, - keys)); - - System.out.println("Optimization number " + numTries + ": " - + minClusterMove.getSecond() + " moves, " + currentStdDev - + " std dev"); - System.out.println("Current min moves: " + minMoves + "; current min std dev: " - + minStdDev); - - if(currentStdDev <= minStdDev) { - if(minClusterMove.getSecond() > minMoves) { - System.out.println("Warning: the newly chosen cluster requires " - + (minClusterMove.getSecond() - minMoves) - + " addition moves!"); - } - minMoves = minClusterMove.getSecond(); - minStdDev = currentStdDev; - minCluster = minClusterMove.getFirst(); - - System.out.println("Current distribution"); - System.out.println(KeyDistributionGenerator.printOverallDistribution(currentCluster, - storeDefs, - keys)); - System.out.println("-------------------------\n"); - - System.out.println("Target distribution"); - System.out.println(KeyDistributionGenerator.printOverallDistribution(minCluster, - storeDefs, - keys)); - System.out.println("=========================\n"); - // If output directory exists, output the optimized cluster - if(outputDir != null) { - try { - FileUtils.writeStringToFile(new File(outputDir, - RebalanceUtils.finalClusterFileName - + numTries), - new ClusterMapper().writeCluster(minCluster)); - } catch(Exception e) {} - } - } - } - - System.out.println("\n=========================="); - System.out.println("Final 
distribution"); - System.out.println(KeyDistributionGenerator.printOverallDistribution(minCluster, - storeDefs, - keys)); - System.out.println("=========================\n"); - - // If output directory exists, output the optimized cluster - if(outputDir != null) { - try { - FileUtils.writeStringToFile(new File(outputDir, RebalanceUtils.finalClusterFileName), - new ClusterMapper().writeCluster(minCluster)); - } catch(Exception e) {} - } - - return; - - } - - /** - * Takes the current cluster metadata and target cluster metadata ( which - * contains all the nodes of current cluster + new nodes with empty - * partitions ), and generates a new cluster with some partitions moved to - * the new node - * - * @param currentCluster Current cluster metadata - * @param targetCluster Target cluster metadata ( which contains old nodes + - * new nodes [ empty partitions ]) - * @param storeDefs List of store definitions - * @return Return a pair of cluster metadata and number of primary - * partitions that have moved - */ - public static Pair generateMinCluster(final Cluster currentCluster, - final Cluster targetCluster, - final List storeDefs) { - int currentNumNodes = currentCluster.getNumberOfNodes(); - int targetNumNodes = targetCluster.getNumberOfNodes(); - - // Find all the new nodes added + clone to a new list of nodes - List newNodeIds = Lists.newArrayList(); - List donorNodeIds = Lists.newArrayList(); - List allNodes = Lists.newArrayList(); - - HashMap numPartitionsPerZone = Maps.newHashMap(); - HashMap numNodesPerZone = Maps.newHashMap(); - HashMap numDonorNodesPerZone = Maps.newHashMap(); - - for(Node node: targetCluster.getNodes()) { - if(node.getPartitionIds().isEmpty()) { - newNodeIds.add(node.getId()); - } else { - donorNodeIds.add(node.getId()); - - // Update the number of nodes - if(numDonorNodesPerZone.containsKey(node.getZoneId())) { - int currentNumDonorNodesInZone = numNodesPerZone.get(node.getZoneId()); - currentNumDonorNodesInZone += 1; - numDonorNodesPerZone.put(node.getZoneId(), currentNumDonorNodesInZone); - } else { - numDonorNodesPerZone.put(node.getZoneId(), 1); - } - - } - allNodes.add(updateNode(node, Lists.newArrayList(node.getPartitionIds()))); - - // Update the number of partitions - if(numPartitionsPerZone.containsKey(node.getZoneId())) { - int currentNumPartitionsInZone = numPartitionsPerZone.get(node.getZoneId()); - currentNumPartitionsInZone += node.getNumberOfPartitions(); - numPartitionsPerZone.put(node.getZoneId(), currentNumPartitionsInZone); - } else { - numPartitionsPerZone.put(node.getZoneId(), node.getNumberOfPartitions()); - } - - // Update the number of nodes - if(numNodesPerZone.containsKey(node.getZoneId())) { - int currentNumNodesInZone = numNodesPerZone.get(node.getZoneId()); - currentNumNodesInZone += 1; - numNodesPerZone.put(node.getZoneId(), currentNumNodesInZone); - } else { - numNodesPerZone.put(node.getZoneId(), 1); - } - } - - Cluster returnCluster = updateCluster(targetCluster, allNodes); - int totalPrimaryPartitionsMoved = 0; - - if(currentNumNodes == targetNumNodes) { - // Number of nodes is the same, done! 
- return Pair.create(returnCluster, totalPrimaryPartitionsMoved); - } - - // Go over every new node and give it some partitions - for(int newNodeId: newNodeIds) { - - Node newNode = targetCluster.getNodeById(newNodeId); - int partitionsToSteal = (int) Math.floor(numPartitionsPerZone.get(newNode.getZoneId()) - * 1.0 - / numNodesPerZone.get(newNode.getZoneId())); - - int nodesStolenFrom = 0; - for(int index = 0; index < donorNodeIds.size(); index++) { - int donorNodeId = donorNodeIds.get(index); - - Node donorNode = currentCluster.getNodeById(donorNodeId); - - // Only steal from nodes in same zone - if(donorNode.getZoneId() != newNode.getZoneId()) { - continue; - } - - // Done stealing - if(partitionsToSteal <= 0) - break; - - // One of the valid donor nodes - int partitionsToDonate = Math.max((int) Math.floor(partitionsToSteal - / (numDonorNodesPerZone.get(newNode.getZoneId()) - nodesStolenFrom)), - 1); - - nodesStolenFrom++; - - // Donor node can't donate since itself has few partitions - if(returnCluster.getNodeById(donorNodeId).getNumberOfPartitions() <= partitionsToDonate) { - continue; - } - - List donorPartitions = Lists.newArrayList(returnCluster.getNodeById(donorNodeId) - .getPartitionIds()); - Collections.shuffle(donorPartitions, new Random(System.currentTimeMillis())); - - // Go over every donor partition till we satisfy the - // partitionsToDonate number - int partitionsDonated = 0; - for(int donorPartition: donorPartitions) { - if(partitionsDonated == partitionsToDonate) - break; - Cluster intermediateCluster = createUpdatedCluster(returnCluster, - newNodeId, - Lists.newArrayList(donorPartition)); - if(RebalanceUtils.getCrossZoneMoves(intermediateCluster, - new RebalanceClusterPlan(returnCluster, - intermediateCluster, - storeDefs, - true)) == 0) { - returnCluster = intermediateCluster; - partitionsDonated++; - totalPrimaryPartitionsMoved++; - } - } - partitionsToSteal -= partitionsDonated; - - } - } - - return Pair.create(returnCluster, totalPrimaryPartitionsMoved); - } - - /** - * Check that the key belongs to one of the partitions in the map of replica - * type to partitions - * - * @param nodeId Node on which this is running ( generally stealer node ) - * @param key The key to check - * @param replicaToPartitionList Mapping of replica type to partition list - * @param cluster Cluster metadata - * @param storeDef The store definition - * @return Returns a boolean to indicate if this belongs to the map - */ - public static boolean checkKeyBelongsToPartition(int nodeId, - byte[] key, - HashMap> replicaToPartitionList, - Cluster cluster, - StoreDefinition storeDef) { - boolean checkResult = false; - if(storeDef.getRoutingStrategyType().equals(RoutingStrategyType.TO_ALL_STRATEGY) - || storeDef.getRoutingStrategyType() - .equals(RoutingStrategyType.TO_ALL_LOCAL_PREF_STRATEGY)) { - checkResult = true; - } else { - List keyPartitions = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, - cluster) - .getPartitionList(key); - List nodePartitions = cluster.getNodeById(nodeId).getPartitionIds(); - checkResult = checkKeyBelongsToPartition(keyPartitions, - nodePartitions, - replicaToPartitionList); - } - return checkResult; - } - - /** - * Check that the key belongs to one of the partitions in the map of replica - * type to partitions - * - * @param keyPartitions Preference list of the key - * @param nodePartitions Partition list on this node - * @param replicaToPartitionList Mapping of replica type to partition list - * @return Returns a boolean to indicate if this belongs to 
the map - */ - public static boolean checkKeyBelongsToPartition(List keyPartitions, - List nodePartitions, - HashMap> replicaToPartitionList) { - // Check for null - replicaToPartitionList = Utils.notNull(replicaToPartitionList); - - for(int replicaNum = 0; replicaNum < keyPartitions.size(); replicaNum++) { - - // If this partition belongs to node partitions + master is in - // replicaToPartitions list -> match - if(nodePartitions.contains(keyPartitions.get(replicaNum))) { - List partitionsToMove = replicaToPartitionList.get(replicaNum); - if(partitionsToMove != null && partitionsToMove.size() > 0) { - if(partitionsToMove.contains(keyPartitions.get(0))) { - return true; - } - } - } - } - return false; - } - - /** - * Given a key and a list of steal infos give back a list of stealer node - * ids which will steal this. - * - * @param key Byte array of key - * @param stealerNodeToMappingTuples Pairs of stealer node id to their - * corresponding [ partition - replica ] tuples - * @param cluster Cluster metadata - * @param storeDef Store definitions - * @return List of node ids - */ - public static List checkKeyBelongsToPartition(byte[] key, - Set>>> stealerNodeToMappingTuples, - Cluster cluster, - StoreDefinition storeDef) { - List keyPartitions = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, - cluster) - .getPartitionList(key); - List nodesToPush = Lists.newArrayList(); - for(Pair>> stealNodeToMap: stealerNodeToMappingTuples) { - List nodePartitions = cluster.getNodeById(stealNodeToMap.getFirst()) - .getPartitionIds(); - if(checkKeyBelongsToPartition(keyPartitions, nodePartitions, stealNodeToMap.getSecond())) { - nodesToPush.add(stealNodeToMap.getFirst()); - } - } - return nodesToPush; - } - /** * Check the execution state of the server by checking the state of * {@link VoldemortState}
        @@ -577,7 +251,7 @@ public static List checkKeyBelongsToPartition(byte[] key, */ public static void validateClusterState(final Cluster cluster, final AdminClient adminClient) { for(Node node: cluster.getNodes()) { - Versioned versioned = adminClient.getRemoteServerState(node.getId()); + Versioned versioned = adminClient.rebalanceOps.getRemoteServerState(node.getId()); if(!VoldemortState.NORMAL_SERVER.equals(versioned.getValue())) { throw new VoldemortRebalancingException("Cannot rebalance since node " @@ -605,8 +279,8 @@ public static void validateClusterState(final Cluster cluster, final AdminClient public static Cluster getClusterWithNewNodes(Cluster currentCluster, Cluster targetCluster) { ArrayList newNodes = new ArrayList(); for(Node node: targetCluster.getNodes()) { - if(!containsNode(currentCluster, node.getId())) { - newNodes.add(updateNode(node, new ArrayList())); + if(!ClusterUtils.containsNode(currentCluster, node.getId())) { + newNodes.add(NodeUtils.updateNode(node, new ArrayList())); } } return updateCluster(currentCluster, newNodes); @@ -635,43 +309,6 @@ public static Cluster updateCluster(Cluster currentCluster, List updatedNo Lists.newArrayList(currentCluster.getZones())); } - /** - * Given a cluster and a node id checks if the node exists - * - * @param cluster The cluster metadata to check in - * @param nodeId The node id to search for - * @return True if cluster contains the node id, else false - */ - public static boolean containsNode(Cluster cluster, int nodeId) { - try { - cluster.getNodeById(nodeId); - return true; - } catch(VoldemortException e) { - return false; - } - } - - /** - * Given a preference list and a node id, check if any one of the partitions - * is on the node in picture - * - * @param cluster Cluster metadata - * @param preferenceList Preference list of partition ids - * @param nodeId Node id which we are checking for - * @return True if the preference list contains a node whose id = nodeId - */ - public static boolean containsPreferenceList(Cluster cluster, - List preferenceList, - int nodeId) { - - for(int partition: preferenceList) { - if(RebalanceUtils.getNodeByPartitionId(cluster, partition).getId() == nodeId) { - return true; - } - } - return false; - } - /** * Updates the existing cluster such that we remove partitions mentioned * from the stealer node and add them to the donor node @@ -695,7 +332,7 @@ public static Cluster createUpdatedCluster(Cluster currentCluster, for(int donatedPartition: donatedPartitions) { // Gets the donor Node that owns this donated partition - Node donorNode = RebalanceUtils.getNodeByPartitionId(updatedCluster, donatedPartition); + Node donorNode = ClusterUtils.getNodeByPartitionId(updatedCluster, donatedPartition); Node stealerNode = updatedCluster.getNodeById(stealerNodeId); if(donorNode == stealerNode) { @@ -704,111 +341,18 @@ public static Cluster createUpdatedCluster(Cluster currentCluster, } // Update the list of partitions for this node - donorNode = RebalanceUtils.removePartitionToNode(donorNode, donatedPartition); - stealerNode = RebalanceUtils.addPartitionToNode(stealerNode, donatedPartition); + donorNode = NodeUtils.removePartitionToNode(donorNode, donatedPartition); + stealerNode = NodeUtils.addPartitionToNode(stealerNode, donatedPartition); // Sort the nodes - updatedCluster = updateCluster(updatedCluster, Lists.newArrayList(donorNode, - stealerNode)); + updatedCluster = updateCluster(updatedCluster, + Lists.newArrayList(donorNode, stealerNode)); } return updatedCluster; } - /** - * Creates a 
replica of the node with the new partitions list - * - * @param node The node whose replica we are creating - * @param partitionsList The new partitions list - * @return Replica of node with new partitions list - */ - public static Node updateNode(Node node, List partitionsList) { - return new Node(node.getId(), - node.getHost(), - node.getHttpPort(), - node.getSocketPort(), - node.getAdminPort(), - node.getZoneId(), - partitionsList); - } - - /** - * Add a partition to the node provided - * - * @param node The node to which we'll add the partition - * @param donatedPartition The partition to add - * @return The new node with the new partition - */ - public static Node addPartitionToNode(final Node node, Integer donatedPartition) { - return addPartitionToNode(node, Sets.newHashSet(donatedPartition)); - } - - /** - * Remove a partition from the node provided - * - * @param node The node from which we're removing the partition - * @param donatedPartition The partitions to remove - * @return The new node without the partition - */ - public static Node removePartitionToNode(final Node node, Integer donatedPartition) { - return removePartitionToNode(node, Sets.newHashSet(donatedPartition)); - } - - /** - * Add the set of partitions to the node provided - * - * @param node The node to which we'll add the partitions - * @param donatedPartitions The list of partitions to add - * @return The new node with the new partitions - */ - public static Node addPartitionToNode(final Node node, final Set donatedPartitions) { - List deepCopy = new ArrayList(node.getPartitionIds()); - deepCopy.addAll(donatedPartitions); - Collections.sort(deepCopy); - return updateNode(node, deepCopy); - } - - /** - * Remove the set of partitions from the node provided - * - * @param node The node from which we're removing the partitions - * @param donatedPartitions The list of partitions to remove - * @return The new node without the partitions - */ - public static Node removePartitionToNode(final Node node, final Set donatedPartitions) { - List deepCopy = new ArrayList(node.getPartitionIds()); - deepCopy.removeAll(donatedPartitions); - return updateNode(node, deepCopy); - } - - /** - * Given the cluster metadata returns a mapping of partition to node - * - * @param currentCluster Cluster metadata - * @return Map of partition id to node id - */ - public static Map getCurrentPartitionMapping(Cluster currentCluster) { - - Map partitionToNode = new LinkedHashMap(); - - for(Node node: currentCluster.getNodes()) { - for(Integer partition: node.getPartitionIds()) { - // Check if partition is on another node - Integer previousRegisteredNodeId = partitionToNode.get(partition); - if(previousRegisteredNodeId != null) { - throw new IllegalArgumentException("Partition id " + partition - + " found on two nodes : " + node.getId() - + " and " + previousRegisteredNodeId); - } - - partitionToNode.put(partition, node.getId()); - } - } - - return partitionToNode; - } - /** * Attempt to propagate a cluster definition to all nodes. 
Also rollback is * in place in case one of them fails @@ -828,7 +372,7 @@ public static void propagateCluster(AdminClient adminClient, Cluster cluster) { for(Node node: cluster.getNodes()) { try { - Versioned versionedCluster = adminClient.getRemoteCluster(node.getId()); + Versioned versionedCluster = adminClient.metadataMgmtOps.getRemoteCluster(node.getId()); VectorClock newClock = (VectorClock) versionedCluster.getVersion(); // Update the current cluster information @@ -862,7 +406,7 @@ public static void propagateCluster(AdminClient adminClient, Cluster cluster) { try { for(Node node: cluster.getNodes()) { logger.info("Updating cluster definition on remote node " + node); - adminClient.updateRemoteCluster(node.getId(), cluster, latestClock); + adminClient.metadataMgmtOps.updateRemoteCluster(node.getId(), cluster, latestClock); logger.info("Updated cluster definition " + cluster + " on remote node " + node.getId()); completedNodeIds.add(node.getId()); @@ -871,9 +415,9 @@ public static void propagateCluster(AdminClient adminClient, Cluster cluster) { // Fail early... for(Integer completedNodeId: completedNodeIds) { try { - adminClient.updateRemoteCluster(completedNodeId, - currentClusters.get(completedNodeId), - latestClock); + adminClient.metadataMgmtOps.updateRemoteCluster(completedNodeId, + currentClusters.get(completedNodeId), + latestClock); } catch(VoldemortException exception) { logger.error("Could not revert cluster metadata back on node " + completedNodeId); @@ -902,7 +446,7 @@ public static List getStolenPrimaryPartitions(final Cluster currentClus .getPartitionIds()); List currentList = new ArrayList(); - if(containsNode(currentCluster, stealNodeId)) + if(ClusterUtils.containsNode(currentCluster, stealNodeId)) currentList = currentCluster.getNodeById(stealNodeId).getPartitionIds(); // remove all current partitions from targetList @@ -931,7 +475,7 @@ public static Map>> getStolenPartitionTuples true); Map>> stealerNodeToStolenPartitionTuples = Maps.newHashMap(); - for(int stealerId: getNodeIds(Lists.newArrayList(targetCluster.getNodes()))) { + for(int stealerId: NodeUtils.getNodeIds(Lists.newArrayList(targetCluster.getNodes()))) { Set> clusterStealerReplicas = currentNodeIdToReplicas.get(stealerId); Set> targetStealerReplicas = targetNodeIdToReplicas.get(stealerId); @@ -985,7 +529,7 @@ public static Map>> getNodeIdToAllPartitions cluster); final Map>> nodeIdToReplicas = new HashMap>>(); - final Map partitionToNodeIdMap = getCurrentPartitionMapping(cluster); + final Map partitionToNodeIdMap = ClusterUtils.getCurrentPartitionMapping(cluster); // Map initialization. 
for(Node node: cluster.getNodes()) { @@ -1027,36 +571,6 @@ public static Map>> getNodeIdToAllPartitions return nodeIdToReplicas; } - /** - * Given the initial and final cluster dumps it into the output directory - * - * @param initialCluster Initial cluster metadata - * @param finalCluster Final cluster metadata - * @param outputDir Output directory where to dump this file - * @throws IOException - */ - public static void dumpCluster(Cluster initialCluster, Cluster finalCluster, File outputDir) { - - // Create the output directory if it doesn't exist - if(!outputDir.exists()) { - Utils.mkdirs(outputDir); - } - - // Get the file paths - File initialClusterFile = new File(outputDir, initialClusterFileName); - File finalClusterFile = new File(outputDir, finalClusterFileName); - - // Write the output - ClusterMapper mapper = new ClusterMapper(); - try { - FileUtils.writeStringToFile(initialClusterFile, mapper.writeCluster(initialCluster)); - FileUtils.writeStringToFile(finalClusterFile, mapper.writeCluster(finalCluster)); - } catch(IOException e) { - logger.error("Error writing cluster metadata to file"); - } - - } - /** * Print log to the following logger ( Info level ) * @@ -1083,22 +597,6 @@ public static void printErrorLog(int taskId, Logger logger, String message, Exce } } - /** - * Returns the Node associated to the provided partition. - * - * @param cluster The cluster in which to find the node - * @param partitionId Partition id for which we want the corresponding node - * @return Node that owns the partition - */ - public static Node getNodeByPartitionId(Cluster cluster, int partitionId) { - for(Node node: cluster.getNodes()) { - if(node.getPartitionIds().contains(partitionId)) { - return node; - } - } - return null; - } - public static AdminClient createTempAdminClient(VoldemortConfig voldemortConfig, Cluster cluster, int numConnPerNode) { @@ -1107,7 +605,7 @@ public static AdminClient createTempAdminClient(VoldemortConfig voldemortConfig, .setAdminSocketTimeoutSec(voldemortConfig.getAdminSocketTimeout()) .setAdminSocketBufferSize(voldemortConfig.getAdminSocketBufferSize()); - return new AdminClient(cluster, config); + return new AdminClient(cluster, config, new ClientConfig()); } /** @@ -1126,8 +624,8 @@ public static AdminClient createTempAdminClient(VoldemortConfig voldemortConfig, public static List getStoreDefinition(Cluster cluster, AdminClient adminClient) { List storeDefs = null; for(Node node: cluster.getNodes()) { - List storeDefList = adminClient.getRemoteStoreDefList(node.getId()) - .getValue(); + List storeDefList = adminClient.metadataMgmtOps.getRemoteStoreDefList(node.getId()) + .getValue(); if(storeDefs == null) { storeDefs = storeDefList; } else { @@ -1187,19 +685,19 @@ public static List validateRebalanceStore(List public static void validateReadOnlyStores(Cluster cluster, List storeDefs, AdminClient adminClient) { - List readOnlyStores = filterStores(storeDefs, true); + List readOnlyStores = StoreDefinitionUtils.filterStores(storeDefs, true); if(readOnlyStores.size() == 0) { // No read-only stores return; } - List storeNames = getStoreNames(readOnlyStores); + List storeNames = StoreDefinitionUtils.getStoreNames(readOnlyStores); for(Node node: cluster.getNodes()) { if(node.getNumberOfPartitions() != 0) { - for(Entry storeToStorageFormat: adminClient.getROStorageFormat(node.getId(), - storeNames) - .entrySet()) { + for(Entry storeToStorageFormat: adminClient.readonlyOps.getROStorageFormat(node.getId(), + storeNames) + .entrySet()) { if(storeToStorageFormat.getValue() 
.compareTo(ReadOnlyStorageFormat.READONLY_V2.getCode()) != 0) { throw new VoldemortRebalancingException("Cannot rebalance since node " @@ -1252,6 +750,52 @@ public static String printMap(final Map>> no return sb.toString(); } + /** + * Given the initial and final cluster dumps it into the output directory + * + * @param initialCluster Initial cluster metadata + * @param finalCluster Final cluster metadata + * @param outputDir Output directory where to dump this file + * @param filePrefix String to prepend to the initial & final cluster + * metadata files + * @throws IOException + */ + public static void dumpCluster(Cluster initialCluster, + Cluster finalCluster, + File outputDir, + String filePrefix) { + + // Create the output directory if it doesn't exist + if(!outputDir.exists()) { + Utils.mkdirs(outputDir); + } + + // Get the file paths + File initialClusterFile = new File(outputDir, filePrefix + initialClusterFileName); + File finalClusterFile = new File(outputDir, filePrefix + finalClusterFileName); + + // Write the output + ClusterMapper mapper = new ClusterMapper(); + try { + FileUtils.writeStringToFile(initialClusterFile, mapper.writeCluster(initialCluster)); + FileUtils.writeStringToFile(finalClusterFile, mapper.writeCluster(finalCluster)); + } catch(IOException e) { + logger.error("Error writing cluster metadata to file"); + } + } + + /** + * Given the initial and final cluster dumps it into the output directory + * + * @param initialCluster Initial cluster metadata + * @param finalCluster Final cluster metadata + * @param outputDir Output directory where to dump this file + * @throws IOException + */ + public static void dumpCluster(Cluster initialCluster, Cluster finalCluster, File outputDir) { + dumpCluster(initialCluster, finalCluster, outputDir, ""); + } + /** * Given a list of tuples of [replica_type, partition], flattens it and * generates a map of replica_type to partition mapping @@ -1337,7 +881,7 @@ public static List getPartitionsFromTuples(Set> public static List filterPartitionPlanWithStores(List existingPlanList, List storeDefs) { List plans = Lists.newArrayList(); - List storeNames = getStoreNames(storeDefs); + List storeNames = StoreDefinitionUtils.getStoreNames(storeDefs); for(RebalancePartitionsInfo existingPlan: existingPlanList) { RebalancePartitionsInfo info = RebalancePartitionsInfo.create(existingPlan.toJsonString()); @@ -1392,77 +936,6 @@ public static HashMap> groupPartitionsInf return nodeToPartitionsInfo; } - /** - * Given a store name and a list of store definitions, returns the - * appropriate store definition ( if it exists ) - * - * @param storeDefs List of store definitions - * @param storeName The store name whose store definition is required - * @return The store definition - */ - public static StoreDefinition getStoreDefinitionWithName(List storeDefs, - String storeName) { - StoreDefinition def = null; - for(StoreDefinition storeDef: storeDefs) { - if(storeDef.getName().compareTo(storeName) == 0) { - def = storeDef; - break; - } - } - - if(def == null) { - throw new VoldemortException("Could not find store " + storeName); - } - return def; - } - - /** - * Given a list of store definitions, filters the list depending on the - * boolean - * - * @param storeDefs Complete list of store definitions - * @param isReadOnly Boolean indicating whether filter on read-only or not? 
- * @return List of filtered store definition - */ - public static List filterStores(List storeDefs, - final boolean isReadOnly) { - List filteredStores = Lists.newArrayList(); - for(StoreDefinition storeDef: storeDefs) { - if(storeDef.getType().equals(ReadOnlyStorageConfiguration.TYPE_NAME) == isReadOnly) { - filteredStores.add(storeDef); - } - } - return filteredStores; - } - - /** - * Given a list of store definitions return a list of store names - * - * @param storeDefList The list of store definitions - * @return Returns a list of store names - */ - public static List getStoreNames(List storeDefList) { - List storeList = new ArrayList(); - for(StoreDefinition def: storeDefList) { - storeList.add(def.getName()); - } - return storeList; - } - - /** - * Given a list of nodes, retrieves the list of node ids - * - * @param nodes The list of nodes - * @return Returns a list of node ids - */ - public static List getNodeIds(List nodes) { - List nodeIds = new ArrayList(nodes.size()); - for(Node node: nodes) { - nodeIds.add(node.getId()); - } - return nodeIds; - } - /** * Wait to shutdown service * @@ -1482,6 +955,7 @@ public static ExecutorService createExecutors(int numThreads) { return Executors.newFixedThreadPool(numThreads, new ThreadFactory() { + @Override public Thread newThread(Runnable r) { Thread thread = new Thread(r); thread.setName(r.getClass().getName()); diff --git a/src/java/voldemort/utils/StoreDefinitionUtils.java b/src/java/voldemort/utils/StoreDefinitionUtils.java new file mode 100644 index 0000000000..3eed947419 --- /dev/null +++ b/src/java/voldemort/utils/StoreDefinitionUtils.java @@ -0,0 +1,87 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.utils; + +import java.util.ArrayList; +import java.util.List; + +import voldemort.VoldemortException; +import voldemort.store.StoreDefinition; +import voldemort.store.readonly.ReadOnlyStorageConfiguration; + +import com.google.common.collect.Lists; + +public class StoreDefinitionUtils { + + /** + * Given a list of store definitions, filters the list depending on the + * boolean + * + * @param storeDefs Complete list of store definitions + * @param isReadOnly Boolean indicating whether filter on read-only or not? 
+ * @return List of filtered store definition + */ + public static List filterStores(List storeDefs, + final boolean isReadOnly) { + List filteredStores = Lists.newArrayList(); + for(StoreDefinition storeDef: storeDefs) { + if(storeDef.getType().equals(ReadOnlyStorageConfiguration.TYPE_NAME) == isReadOnly) { + filteredStores.add(storeDef); + } + } + return filteredStores; + } + + /** + * Given a list of store definitions return a list of store names + * + * @param storeDefList The list of store definitions + * @return Returns a list of store names + */ + public static List getStoreNames(List storeDefList) { + List storeList = new ArrayList(); + for(StoreDefinition def: storeDefList) { + storeList.add(def.getName()); + } + return storeList; + } + + /** + * Given a store name and a list of store definitions, returns the + * appropriate store definition ( if it exists ) + * + * @param storeDefs List of store definitions + * @param storeName The store name whose store definition is required + * @return The store definition + */ + public static StoreDefinition getStoreDefinitionWithName(List storeDefs, + String storeName) { + StoreDefinition def = null; + for(StoreDefinition storeDef: storeDefs) { + if(storeDef.getName().compareTo(storeName) == 0) { + def = storeDef; + break; + } + } + + if(def == null) { + throw new VoldemortException("Could not find store " + storeName); + } + return def; + } + +} diff --git a/src/java/voldemort/utils/StoreInstance.java b/src/java/voldemort/utils/StoreInstance.java new file mode 100644 index 0000000000..bc731b9371 --- /dev/null +++ b/src/java/voldemort/utils/StoreInstance.java @@ -0,0 +1,324 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.utils; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import voldemort.VoldemortException; +import voldemort.cluster.Cluster; +import voldemort.routing.RoutingStrategyFactory; +import voldemort.routing.RoutingStrategyType; +import voldemort.store.StoreDefinition; + +import com.google.common.collect.Lists; + +// TODO: Add StoreInstanceTest unit test for these helper methods. + +/** + * This class wraps up a Cluster object and a StoreDefinition. The methods are + * effectively helper or util style methods for analyzing partitions and so on + * which are a function of both Cluster and StoreDefinition. + */ +public class StoreInstance { + + // TODO: (refactor) Improve upon the name "StoreInstance". Object-oriented + // meaning of 'instance' is too easily confused with system notion of an + // "instance of a cluster" (the intended usage in this class name). 
+ + private final Cluster cluster; + private final StoreDefinition storeDefinition; + + private final Map partitionIdToNodeIdMap; + + public StoreInstance(Cluster cluster, StoreDefinition storeDefinition) { + this.cluster = cluster; + this.storeDefinition = storeDefinition; + + partitionIdToNodeIdMap = ClusterUtils.getCurrentPartitionMapping(cluster); + } + + public Cluster getCluster() { + return cluster; + } + + public StoreDefinition getStoreDefinition() { + return storeDefinition; + } + + /** + * Determines list of partition IDs that replicate the master partition ID. + * + * @param masterPartitionId + * @return List of partition IDs that replicate the master partition ID. + */ + public List getReplicationPartitionList(int masterPartitionId) { + return new RoutingStrategyFactory().updateRoutingStrategy(storeDefinition, cluster) + .getReplicatingPartitionList(masterPartitionId); + } + + /** + * Determines list of partition IDs that replicate the key. + * + * @param key + * @return List of partition IDs that replicate the partition ID. + */ + public List getReplicationPartitionList(final byte[] key) { + return getReplicationPartitionList(getMasterPartitionId(key)); + } + + /** + * Determines master partition ID for the key. + * + * @param key + * @return + */ + public int getMasterPartitionId(final byte[] key) { + return new RoutingStrategyFactory().updateRoutingStrategy(storeDefinition, cluster) + .getMasterPartition(key); + } + + /** + * Determines node ID that hosts the specified partition ID. + * + * @param partitionId + * @return + */ + public int getNodeIdForPartitionId(int partitionId) { + return partitionIdToNodeIdMap.get(partitionId); + } + + /** + * Determines the partition ID that replicates the key on the given node. + * + * @param nodeId of the node + * @param key to look up. + * @return partitionId if found, otherwise null. + */ + public Integer getNodesPartitionIdForKey(int nodeId, final byte[] key) { + List partitionIds = getReplicationPartitionList(key); + for(Integer partitionId: partitionIds) { + if(getNodeIdForPartitionId(partitionId) == nodeId) { + return partitionId; + } + } + return null; + } + + /** + * Converts from partitionId to nodeId. The list of partition IDs, + * partitionIds, is expected to be a "replicating partition list", i.e., the + * mapping from partition ID to node ID should be one to one. + * + * @param partitionIds List of partition IDs for which to find the Node ID + * for the Node that owns the partition. + * @return List of node ids, one for each partition ID in partitionIds + * @throws VoldemortException If multiple partition IDs in partitionIds map + * to the same Node ID. + */ + private List getNodeIdListForPartitionIdList(List partitionIds) + throws VoldemortException { + List nodeIds = new ArrayList(partitionIds.size()); + for(Integer partitionId: partitionIds) { + int nodeId = getNodeIdForPartitionId(partitionId); + if(nodeIds.contains(nodeId)) { + throw new VoldemortException("Node ID " + nodeId + " already in list of Node IDs."); + } else { + nodeIds.add(nodeId); + } + } + return nodeIds; + } + + public List getReplicationNodeList(int partitionId) throws VoldemortException { + return getNodeIdListForPartitionIdList(getReplicationPartitionList(partitionId)); + } + + // TODO: (refactor) Move from static methods to non-static methods that use + // this object's cluster and storeDefinition member for the various + // check*BelongsTo* methods. 
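    // Illustrative sketch only (hypothetical, not part of the patch): one way the
    // instance-level helpers above could be exercised. The Cluster and
    // StoreDefinition are assumed to have been loaded elsewhere (e.g. via
    // ClusterMapper / StoreDefinitionsMapper); the method name is made up.
    public static void printReplicasForKey(Cluster cluster, StoreDefinition storeDef, byte[] key) {
        StoreInstance storeInstance = new StoreInstance(cluster, storeDef);

        // Master partition the key hashes to, plus the full replicating partition list
        int masterPartitionId = storeInstance.getMasterPartitionId(key);
        List<Integer> replicatingPartitions = storeInstance.getReplicationPartitionList(masterPartitionId);

        // One node id per replicating partition (fails if the mapping is not one to one)
        List<Integer> replicatingNodeIds = storeInstance.getReplicationNodeList(masterPartitionId);

        System.out.println("key -> master partition " + masterPartitionId + ", partitions "
                           + replicatingPartitions + ", nodes " + replicatingNodeIds);
    }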
+ + /** + * Check that the key belongs to one of the partitions in the map of replica + * type to partitions + * + * @param keyPartitions Preference list of the key + * @param nodePartitions Partition list on this node + * @param replicaToPartitionList Mapping of replica type to partition list + * @return Returns a boolean to indicate if this belongs to the map + */ + public static boolean checkKeyBelongsToPartition(List keyPartitions, + List nodePartitions, + HashMap> replicaToPartitionList) { + // Check for null + replicaToPartitionList = Utils.notNull(replicaToPartitionList); + + for(int replicaNum = 0; replicaNum < keyPartitions.size(); replicaNum++) { + + // If this partition belongs to node partitions + master is in + // replicaToPartitions list -> match + if(nodePartitions.contains(keyPartitions.get(replicaNum))) { + List partitionsToMove = replicaToPartitionList.get(replicaNum); + if(partitionsToMove != null && partitionsToMove.size() > 0) { + if(partitionsToMove.contains(keyPartitions.get(0))) { + return true; + } + } + } + } + return false; + } + + /** + * Check that the key belongs to one of the partitions in the map of replica + * type to partitions + * + * @param nodeId Node on which this is running ( generally stealer node ) + * @param key The key to check + * @param replicaToPartitionList Mapping of replica type to partition list + * @param cluster Cluster metadata + * @param storeDef The store definition + * @return Returns a boolean to indicate if this belongs to the map + */ + public static boolean checkKeyBelongsToPartition(int nodeId, + byte[] key, + HashMap> replicaToPartitionList, + Cluster cluster, + StoreDefinition storeDef) { + boolean checkResult = false; + if(storeDef.getRoutingStrategyType().equals(RoutingStrategyType.TO_ALL_STRATEGY) + || storeDef.getRoutingStrategyType() + .equals(RoutingStrategyType.TO_ALL_LOCAL_PREF_STRATEGY)) { + checkResult = true; + } else { + List keyPartitions = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, + cluster) + .getPartitionList(key); + List nodePartitions = cluster.getNodeById(nodeId).getPartitionIds(); + checkResult = StoreInstance.checkKeyBelongsToPartition(keyPartitions, + nodePartitions, + replicaToPartitionList); + } + return checkResult; + } + + /*** + * + * @return true if the partition belongs to the node with given replicatype + */ + public static boolean checkPartitionBelongsToNode(int partitionId, + int replicaType, + int nodeId, + Cluster cluster, + StoreDefinition storeDef) { + boolean belongs = false; + List nodePartitions = cluster.getNodeById(nodeId).getPartitionIds(); + List replicatingPartitions = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, + cluster) + .getReplicatingPartitionList(partitionId); + // validate replicaType + if(replicaType < replicatingPartitions.size()) { + // check if the replicaType'th partition in the replicating list, + // belongs to the given node + if(nodePartitions.contains(replicatingPartitions.get(replicaType))) + belongs = true; + } + + return belongs; + } + + /** + * Given a key and a list of steal infos give back a list of stealer node + * ids which will steal this. 
+ * + * @param key Byte array of key + * @param stealerNodeToMappingTuples Pairs of stealer node id to their + * corresponding [ partition - replica ] tuples + * @param cluster Cluster metadata + * @param storeDef Store definitions + * @return List of node ids + */ + public static List checkKeyBelongsToPartition(byte[] key, + Set>>> stealerNodeToMappingTuples, + Cluster cluster, + StoreDefinition storeDef) { + List keyPartitions = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, + cluster) + .getPartitionList(key); + List nodesToPush = Lists.newArrayList(); + for(Pair>> stealNodeToMap: stealerNodeToMappingTuples) { + List nodePartitions = cluster.getNodeById(stealNodeToMap.getFirst()) + .getPartitionIds(); + if(StoreInstance.checkKeyBelongsToPartition(keyPartitions, + nodePartitions, + stealNodeToMap.getSecond())) { + nodesToPush.add(stealNodeToMap.getFirst()); + } + } + return nodesToPush; + } + + /*** + * Checks if a given partition is stored in the node. (It can be primary or + * a secondary) + * + * @param partition + * @param nodeId + * @param cluster + * @param storeDef + * @return + */ + public static boolean checkPartitionBelongsToNode(int partition, + int nodeId, + Cluster cluster, + StoreDefinition storeDef) { + List nodePartitions = cluster.getNodeById(nodeId).getPartitionIds(); + List replicatingPartitions = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, + cluster) + .getReplicatingPartitionList(partition); + // remove all partitions from the list, except those that belong to the + // node + replicatingPartitions.retainAll(nodePartitions); + return replicatingPartitions.size() > 0; + } + + /** + * + * @param key + * @param nodeId + * @param cluster + * @param storeDef + * @return true if the key belongs to the node as some replica + */ + public static boolean checkKeyBelongsToNode(byte[] key, + int nodeId, + Cluster cluster, + StoreDefinition storeDef) { + List nodePartitions = cluster.getNodeById(nodeId).getPartitionIds(); + List replicatingPartitions = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, + cluster) + .getPartitionList(key); + // remove all partitions from the list, except those that belong to the + // node + replicatingPartitions.retainAll(nodePartitions); + return replicatingPartitions.size() > 0; + } + +} diff --git a/src/java/voldemort/utils/Utils.java b/src/java/voldemort/utils/Utils.java index 59e0da14a7..8289cfa49f 100644 --- a/src/java/voldemort/utils/Utils.java +++ b/src/java/voldemort/utils/Utils.java @@ -22,8 +22,10 @@ import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Calendar; import java.util.Collections; import java.util.Comparator; +import java.util.GregorianCalendar; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -542,4 +544,46 @@ public static boolean isSymLink(File symlinkFile) { } } + /** + * Given a start time, computes the next time when the wallclock will reach + * a certain hour of the day, on a certain day of the week Eg: From today, + * when is the next Saturday, 12PM ? 
+ * + * @param startTime start time + * @param targetDay day of the week to choose + * @param targetHour hour of the day to choose + * @return calendar object representing the target time + */ + public static GregorianCalendar getCalendarForNextRun(GregorianCalendar startTime, + int targetDay, + int targetHour) { + long startTimeMs = startTime.getTimeInMillis(); + GregorianCalendar cal = new GregorianCalendar(); + cal.setTimeInMillis(startTimeMs); + + // adjust time to targetHour on startDay + cal.set(Calendar.HOUR_OF_DAY, targetHour); + cal.set(Calendar.MINUTE, 0); + cal.set(Calendar.SECOND, 0); + cal.set(Calendar.MILLISECOND, 0); + + // check if we are past the targetHour for the current day + if(cal.get(Calendar.DAY_OF_WEEK) != targetDay || cal.getTimeInMillis() < startTimeMs) { + do { + cal.add(Calendar.DAY_OF_YEAR, 1); + } while(cal.get(Calendar.DAY_OF_WEEK) != targetDay); + } + return cal; + } + + /** + * Returns the day of week, 'nDays' from today + * + * @return Calendar constant representing the day of the week + */ + public static int getDayOfTheWeekFromNow(int nDays) { + GregorianCalendar cal = new GregorianCalendar(); + cal.add(Calendar.DAY_OF_YEAR, nDays); + return cal.get(Calendar.DAY_OF_WEEK); + } } diff --git a/src/java/voldemort/utils/pool/KeyedResourcePool.java b/src/java/voldemort/utils/pool/KeyedResourcePool.java index e71c582b74..4b9a147e2f 100644 --- a/src/java/voldemort/utils/pool/KeyedResourcePool.java +++ b/src/java/voldemort/utils/pool/KeyedResourcePool.java @@ -124,14 +124,12 @@ public V checkout(K key) throws Exception { long startNs = System.nanoTime(); Pool resourcePool = getResourcePoolForKey(key); - // Always attempt to grow. This protects against running out of - // resources because they were destroyed. - attemptGrow(key, resourcePool); - V resource = null; try { checkNotClosed(); - resource = attemptCheckout(resourcePool); + // Must attempt a non blocking checkout before blockingGet to ensure + // resources are created for the pool. + resource = attemptNonBlockingCheckout(key, resourcePool); if(resource == null) { long timeRemainingNs = resourcePoolConfig.getTimeout(TimeUnit.NANOSECONDS) @@ -155,25 +153,27 @@ public V checkout(K key) throws Exception { } /** - * Get a free resource if one exists. This method does not block. It either - * returns null or a resource. + * Get a free resource if one exists. If there are no free resources, + * attempt to create a new resource (up to the max allowed for the pool). + * This method does not block per se. However, creating a resource may be + * (relatively) expensive. This method either returns null or a resource. + * + * This method is the only way in which new resources are created for the + * pool. So, non blocking checkouts must be attempted to populate the + * resource pool. */ - protected V attemptCheckout(Pool pool) throws Exception { + protected V attemptNonBlockingCheckout(K key, Pool pool) throws Exception { V resource = pool.nonBlockingGet(); + if(resource == null) { + while(pool.attemptGrow(key, this.objectFactory)) { + resource = pool.nonBlockingGet(); + if(resource != null) + break; + } + } return resource; } - /** - * Attempt to create a new object and add it to the pool--this only happens - * if there is room for the new object. This method does not block. This - * method returns true if it adds a resource to the pool. (Returning true - * does not guarantee subsequent checkout will succeed because concurrent - * checkouts may occur.) 
- */ - protected boolean attemptGrow(K key, Pool pool) throws Exception { - return pool.attemptGrow(key, this.objectFactory); - } - /** * Get the pool for the given key. If no pool exists, create one. */ @@ -239,8 +239,6 @@ public void checkin(K key, V resource) throws Exception { } } - // This method may be made protected in the future for the benefit of - // classes which extend from KeyedResourcePool. protected boolean isOpenAndValid(K key, V resource) throws Exception { if(isOpen.get() && objectFactory.validate(key, resource)) { return true; @@ -276,13 +274,13 @@ public void close() { } /** - * Reset a specific resource pool. Destroys all the resources in the pool. - * This method does not affect whether the pool is "open" in the sense of - * permitting new resources to be added to it. + * Reset a specific resource pool. Destroys all of the idle resources in the + * pool. This method does not affect whether the pool is "open" in the sense + * of permitting new resources to be added to it. * * @param key The key for the pool to reset. */ - public synchronized void reset(K key) { + public void reset(K key) { Pool resourcePool = getResourcePoolForExistingKey(key); List list = resourcePool.close(); for(V value: list) @@ -302,7 +300,9 @@ public int getTotalResourceCount(K key) { Pool resourcePool = getResourcePoolForExistingKey(key); return resourcePool.size.get(); } catch(IllegalArgumentException iae) { - logger.debug("getTotalResourceCount called on invalid key: ", iae); + if(logger.isDebugEnabled()) { + logger.debug("getTotalResourceCount called on invalid key: ", iae); + } } } return 0; @@ -335,7 +335,9 @@ public int getCheckedInResourcesCount(K key) { Pool resourcePool = getResourcePoolForExistingKey(key); return resourcePool.queue.size(); } catch(IllegalArgumentException iae) { - logger.debug("getCheckedInResourceCount called on invalid key: ", iae); + if(logger.isDebugEnabled()) { + logger.debug("getCheckedInResourceCount called on invalid key: ", iae); + } } } return 0; @@ -368,7 +370,9 @@ public int getBlockingGetsCount(K key) { Pool resourcePool = getResourcePoolForExistingKey(key); return resourcePool.blockingGets.get(); } catch(IllegalArgumentException iae) { - logger.debug("getBlockingGetsCount called on invalid key: ", iae); + if(logger.isDebugEnabled()) { + logger.debug("getBlockingGetsCount called on invalid key: ", iae); + } } } return 0; @@ -442,10 +446,24 @@ public boolean attemptGrow(K key, ResourceFactory objectFactory) throw if(!nonBlockingPut(resource)) { this.size.decrementAndGet(); objectFactory.destroy(key, resource); + if(logger.isInfoEnabled()) { + logger.info("attemptGrow established new connection for key " + + key.toString() + + " and immediately destroyed the new connection " + + "because there were too many connections already established."); + } return false; } + if(logger.isDebugEnabled()) { + logger.debug("attemptGrow established new connection for key " + + key.toString() + ". " + + " After checking in to KeyedResourcePool, there are " + + queue.size() + " destinations checked in."); + } } } catch(Exception e) { + // If nonBlockingPut throws an exception, then we could leak + // the resource created by objectFactory.create(). 
this.size.decrementAndGet(); throw e; } diff --git a/src/java/voldemort/utils/pool/QueuedKeyedResourcePool.java b/src/java/voldemort/utils/pool/QueuedKeyedResourcePool.java index 8407fc748e..4c34537e75 100644 --- a/src/java/voldemort/utils/pool/QueuedKeyedResourcePool.java +++ b/src/java/voldemort/utils/pool/QueuedKeyedResourcePool.java @@ -103,24 +103,17 @@ public void registerResourceRequest(K key, AsyncResourceRequest resourceReque Queue> requestQueue = getRequestQueueForKey(key); if(requestQueue.isEmpty()) { - // Optimistically attempt non-blocking checkout iff requestQueue is - // empty. - Pool resourcePool = getResourcePoolForKey(key); - try { - attemptGrow(key, resourcePool); - } catch(Exception e) { - resourceRequest.handleException(e); - return; - } + // Attempt non-blocking checkout iff requestQueue is empty. + Pool resourcePool = getResourcePoolForKey(key); V resource = null; - try { - resource = attemptCheckout(resourcePool); + resource = attemptNonBlockingCheckout(key, resourcePool); } catch(Exception e) { destroyResource(key, resourcePool, resource); resource = null; resourceRequest.handleException(e); + return; } if(resource != null) { resourceRequest.useResource(resource); @@ -173,9 +166,9 @@ private boolean processQueue(K key) { V resource = null; try { - // Always attempt to grow to deal with destroyed resources. - attemptGrow(key, resourcePool); - resource = attemptCheckout(resourcePool); + // Must attempt non-blocking checkout to ensure resources are + // created for the pool. + resource = attemptNonBlockingCheckout(key, resourcePool); } catch(Exception e) { destroyResource(key, resourcePool, resource); resource = null; @@ -250,11 +243,13 @@ protected void destroyRequest(AsyncResourceRequest resourceRequest) { * @param requestQueue The queue for which all resource requests are to be * destroyed. */ - private synchronized void destroyRequestQueue(Queue> requestQueue) { - AsyncResourceRequest resourceRequest = requestQueue.poll(); - while(resourceRequest != null) { - destroyRequest(resourceRequest); - resourceRequest = requestQueue.poll(); + private void destroyRequestQueue(Queue> requestQueue) { + if(requestQueue != null) { + AsyncResourceRequest resourceRequest = requestQueue.poll(); + while(resourceRequest != null) { + destroyRequest(resourceRequest); + resourceRequest = requestQueue.poll(); + } } } @@ -280,25 +275,6 @@ public void close() { internalClose(); } - /** - * Reset a specific resource pool and resource request queue. First, - * "destroy" all registered resource requests. Second, destroy all resources - * in the pool. - * - * @param key The key for the pool to reset. - */ - @Override - public void reset(K key) { - // First, destroy enqueued resource requests (if any exist). - Queue> requestQueue = requestQueueMap.get(key); - if(requestQueue != null) { - destroyRequestQueue(requestQueue); - } - - // Second, destroy resources in the pool. - super.reset(key); - } - /* * Get the queue of work for the given key. If no queue exists, create one. */ @@ -340,7 +316,9 @@ public int getRegisteredResourceRequestCount(K key) { // FYI: .size() is not constant time in the next call. 
;) return requestQueue.size(); } catch(IllegalArgumentException iae) { - logger.debug("getRegisteredResourceRequestCount called on invalid key: ", iae); + if(logger.isDebugEnabled()) { + logger.debug("getRegisteredResourceRequestCount called on invalid key: ", iae); + } } } return 0; diff --git a/src/java/voldemort/versioning/ClockEntry.java b/src/java/voldemort/versioning/ClockEntry.java index 481026cfa3..27bc0cac19 100644 --- a/src/java/voldemort/versioning/ClockEntry.java +++ b/src/java/voldemort/versioning/ClockEntry.java @@ -32,8 +32,16 @@ public final class ClockEntry implements Cloneable, Serializable { private static final long serialVersionUID = 1; - private final short nodeId; - private final long version; + private short nodeId; + private long version; + + /** + * Default constructor + */ + public ClockEntry() { + this.nodeId = -1; + this.version = -1; + } /** * Create a new Version from constituate parts @@ -99,4 +107,12 @@ public String toString() { return nodeId + ":" + version; } + public void setNodeId(short nodeId) { + this.nodeId = nodeId; + } + + public void setVersion(long version) { + this.version = version; + } + } diff --git a/src/java/voldemort/versioning/VectorClock.java b/src/java/voldemort/versioning/VectorClock.java index db35c1fc88..de38a009ea 100644 --- a/src/java/voldemort/versioning/VectorClock.java +++ b/src/java/voldemort/versioning/VectorClock.java @@ -114,24 +114,27 @@ public VectorClock(byte[] bytes, int offset) { public byte[] toBytes() { byte[] serialized = new byte[sizeInBytes()]; + toBytes(serialized, 0); + return serialized; + } + + public int toBytes(byte[] buf, int offset) { // write the number of versions - ByteUtils.writeShort(serialized, (short) versions.size(), 0); + ByteUtils.writeShort(buf, (short) versions.size(), offset); + offset += ByteUtils.SIZE_OF_SHORT; // write the size of each version in bytes byte versionSize = ByteUtils.numberOfBytesRequired(getMaxVersion()); - serialized[2] = versionSize; + buf[offset] = versionSize; + offset++; int clockEntrySize = ByteUtils.SIZE_OF_SHORT + versionSize; - int start = 3; for(ClockEntry v: versions) { - ByteUtils.writeShort(serialized, v.getNodeId(), start); - ByteUtils.writeBytes(serialized, - v.getVersion(), - start + ByteUtils.SIZE_OF_SHORT, - versionSize); - start += clockEntrySize; + ByteUtils.writeShort(buf, v.getNodeId(), offset); + ByteUtils.writeBytes(buf, v.getVersion(), offset + ByteUtils.SIZE_OF_SHORT, versionSize); + offset += clockEntrySize; } - ByteUtils.writeLong(serialized, this.timestamp, start); - return serialized; + ByteUtils.writeLong(buf, this.timestamp, offset); + return sizeInBytes(); } public int sizeInBytes() { @@ -225,6 +228,7 @@ public String toString() { builder.append(this.versions.get(this.versions.size() - 1)); } builder.append(")"); + builder.append(" ts:" + timestamp); return builder.toString(); } diff --git a/src/java/voldemort/xml/ClusterMapper.java b/src/java/voldemort/xml/ClusterMapper.java index ab2973a11d..01343f7d63 100644 --- a/src/java/voldemort/xml/ClusterMapper.java +++ b/src/java/voldemort/xml/ClusterMapper.java @@ -66,6 +66,8 @@ public class ClusterMapper { private static final String SOCKET_PORT_ELMT = "socket-port"; private static final String ADMIN_PORT_ELMT = "admin-port"; + public static final Integer MAX_PARTITIONID = 65535; + private final Schema schema; public ClusterMapper() { @@ -135,7 +137,7 @@ private Zone readZone(Element zone) { return new Zone(zoneId, proximityList); } - public Node readServer(Element server) { + public Node 
readServer(Element server) throws SAXException { int id = Integer.parseInt(server.getChildText(SERVER_ID_ELMT)); String host = server.getChildText(HOST_ELMT); int httpPort = Integer.parseInt(server.getChildText(HTTP_PORT_ELMT)); @@ -149,8 +151,13 @@ public Node readServer(Element server) { String partitionsText = server.getChildText(SERVER_PARTITIONS_ELMT).trim(); List partitions = new ArrayList(); for(String aPartition: Utils.COMMA_SEP.split(partitionsText)) - if(aPartition.trim().length() > 0) - partitions.add(Integer.parseInt(aPartition.trim())); + if(aPartition.trim().length() > 0) { + Integer partition = Integer.parseInt(aPartition.trim()); + if(partition > MAX_PARTITIONID) { + throw new SAXException("Partition id cannot be greater than " + MAX_PARTITIONID); + } + partitions.add(partition); + } return new Node(id, host, httpPort, socketPort, adminPort, zoneId, partitions); } diff --git a/src/proto/slop.proto b/src/proto/slop.proto index 981c04c327..0b92aa1e78 100644 --- a/src/proto/slop.proto +++ b/src/proto/slop.proto @@ -1,3 +1,8 @@ +// Please use protoc version 2.3.0 to recompile: +// https://code.google.com/p/protobuf/downloads/detail?name=protobuf-2.3.0.tar.gz&can=2&q= +// Use command like the below to recompile: +// $ protoc -I=src/proto --java_out=src/java src/proto/slop.proto + package voldemort; option java_package = "voldemort.serialization"; @@ -16,4 +21,4 @@ message Slop { - \ No newline at end of file + diff --git a/src/proto/voldemort-admin.proto b/src/proto/voldemort-admin.proto index 581ee2e016..4d8710ff0a 100644 --- a/src/proto/voldemort-admin.proto +++ b/src/proto/voldemort-admin.proto @@ -1,3 +1,8 @@ +// Please use protoc version 2.3.0 to recompile: +// https://code.google.com/p/protobuf/downloads/detail?name=protobuf-2.3.0.tar.gz&can=2&q= +// Use command like the below to recompile: +// $ protoc -I=src/proto --java_out=src/java src/proto/voldemort-admin.proto + package voldemort; option java_package = "voldemort.client.protocol.pb"; @@ -71,8 +76,10 @@ message FetchPartitionEntriesRequest { required string store = 2; optional VoldemortFilter filter = 3; optional bool fetch_values = 4; - optional int64 skip_records = 5; + optional int64 OBSOLETE__DO_NOT_USE__skip_records = 5; optional string initial_cluster = 6; + optional bool fetch_orphaned = 7; + optional int64 records_per_partition = 8; } message FetchPartitionEntriesResponse { diff --git a/src/proto/voldemort-client.proto b/src/proto/voldemort-client.proto index f6c0879709..4da84f942d 100644 --- a/src/proto/voldemort-client.proto +++ b/src/proto/voldemort-client.proto @@ -1,3 +1,8 @@ +// Please use protoc version 2.3.0 to recompile: +// https://code.google.com/p/protobuf/downloads/detail?name=protobuf-2.3.0.tar.gz&can=2&q= +// Use command like the below to recompile: +// $ protoc -I=src/proto --java_out=src/java src/proto/voldemort-client.proto + package voldemort; option java_package = "voldemort.client.protocol.pb"; @@ -97,4 +102,4 @@ message VoldemortRequest { optional PutRequest put = 6; optional DeleteRequest delete = 7; optional int32 requestRouteType = 8; -} \ No newline at end of file +} diff --git a/test/common/voldemort/ServerTestUtils.java b/test/common/voldemort/ServerTestUtils.java index 1da8a3994c..e8aa77b6b7 100644 --- a/test/common/voldemort/ServerTestUtils.java +++ b/test/common/voldemort/ServerTestUtils.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2012 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this 
file except in compliance with the License. You may obtain a copy of @@ -39,6 +39,7 @@ import org.mortbay.jetty.servlet.Context; import org.mortbay.jetty.servlet.ServletHolder; +import voldemort.client.ClientConfig; import voldemort.client.RoutingTier; import voldemort.client.protocol.RequestFormatFactory; import voldemort.client.protocol.RequestFormatType; @@ -653,12 +654,12 @@ public static VoldemortConfig createServerConfig(boolean useNio, public static AdminClient getAdminClient(Cluster cluster) { AdminClientConfig config = new AdminClientConfig(); - return new AdminClient(cluster, config); + return new AdminClient(cluster, config, new ClientConfig()); } public static AdminClient getAdminClient(String bootstrapURL) { AdminClientConfig config = new AdminClientConfig(); - return new AdminClient(bootstrapURL, config); + return new AdminClient(bootstrapURL, config, new ClientConfig()); } public static RequestHandlerFactory getSocketRequestHandlerFactory(StoreRepository repository) { @@ -673,9 +674,25 @@ public static void stopVoldemortServer(VoldemortServer server) throws IOExceptio } } + /** + * Starts a Voldemort server for testing purposes. + * + * Unless the ports passed in via cluster are guaranteed to be available, + * this method is susceptible to BindExceptions in VoldemortServer.start(). + * (And, there is no good way of guaranteeing that ports will be available, + * so...) + * + * The method {@link ServerTestUtils#startVoldemortCluster} should be used + * in preference to this method.} + * + * @param socketStoreFactory + * @param config + * @param cluster + * @return + */ public static VoldemortServer startVoldemortServer(SocketStoreFactory socketStoreFactory, VoldemortConfig config, - Cluster cluster) { + Cluster cluster) throws BindException { // TODO: Some tests that use this method fail intermittently with the // following output: @@ -689,10 +706,19 @@ public static VoldemortServer startVoldemortServer(SocketStoreFactory socketStor // config, Cluster cluster) to understand how this error is possible, // and why it only happens intermittently. VoldemortServer server = new VoldemortServer(config, cluster); - server.start(); + try { + server.start(); + } catch(VoldemortException ve) { + if(ve.getCause() instanceof BindException) { + ve.printStackTrace(); + throw new BindException(ve.getMessage()); + } else { + throw ve; + } + } ServerTestUtils.waitForServerStart(socketStoreFactory, server.getIdentityNode()); - // wait till server start or throw exception + // wait till server starts or throw exception return server; } @@ -760,11 +786,13 @@ protected static Cluster internalStartVoldemortCluster(int numServers, } /** - * This method wraps up work that is done in many different tests to set up - * some number of Voldemort servers in a cluster. This method masks an - * intermittent TOCTOU problem with the ports identified by + * This method wraps up all of the work that is done in many different tests + * to set up some number of Voldemort servers in a cluster. This method + * masks an intermittent TOCTOU problem with the ports identified by * {@link #findFreePorts(int)} not actually being free when a server needs - * to bind to them. + * to bind to them. If this method returns, it will return a non-null + * cluster. This method is not guaranteed to return, but will likely + * eventually do so... * * @param numServers * @param voldemortServers @@ -778,6 +806,8 @@ protected static Cluster internalStartVoldemortCluster(int numServers, * servers. 
* @throws IOException */ + // TODO: numServers is likely not needed. If this method is refactored in + // the future, then try and drop the numServers argument. public static Cluster startVoldemortCluster(int numServers, VoldemortServer[] voldemortServers, int[][] partitionMap, diff --git a/test/common/voldemort/TestUtils.java b/test/common/voldemort/TestUtils.java index 9d0a6e2612..997d9611ac 100644 --- a/test/common/voldemort/TestUtils.java +++ b/test/common/voldemort/TestUtils.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -17,11 +17,14 @@ package voldemort; import java.io.File; +import java.io.StringReader; import java.io.UnsupportedEncodingException; import java.lang.reflect.Field; import java.util.ArrayList; import java.util.Arrays; +import java.util.Calendar; import java.util.Collections; +import java.util.GregorianCalendar; import java.util.List; import java.util.Random; import java.util.SortedSet; @@ -30,12 +33,15 @@ import junit.framework.AssertionFailedError; import voldemort.cluster.Cluster; import voldemort.cluster.Node; +import voldemort.routing.RoutingStrategy; +import voldemort.routing.RoutingStrategyFactory; import voldemort.store.Store; import voldemort.store.StoreDefinition; import voldemort.utils.ByteArray; import voldemort.utils.Utils; import voldemort.versioning.VectorClock; import voldemort.versioning.Versioned; +import voldemort.xml.StoreDefinitionsMapper; /** * Helper utilities for tests @@ -64,6 +70,16 @@ public static VectorClock getClock(int... nodes) { return clock; } + /** + * Helper method to construct Versioned byte value. + * + * @param nodes See getClock method for explanation of this argument + * @return + */ + public static Versioned getVersioned(byte[] value, int... 
nodes) { + return new Versioned(value, getClock(nodes)); + } + /** * Record events for the given sequence of nodes * @@ -405,4 +421,36 @@ public static StoreDefinition makeStoreDefinition(String storeName, long memFoot null, memFootprintMB); } + + /** + * Provides a routing strategy for local tests to work with + * + * @return + */ + public static RoutingStrategy makeSingleNodeRoutingStrategy() { + Cluster cluster = VoldemortTestConstants.getOneNodeCluster(); + StoreDefinitionsMapper mapper = new StoreDefinitionsMapper(); + List storeDefs = mapper.readStoreList(new StringReader(VoldemortTestConstants.getSingleStoreDefinitionsXml())); + return new RoutingStrategyFactory().updateRoutingStrategy(storeDefs.get(0), cluster); + } + + /** + * Constructs a calendar object representing the given time + */ + public static GregorianCalendar getCalendar(int year, + int month, + int day, + int hour, + int mins, + int secs) { + GregorianCalendar cal = new GregorianCalendar(); + cal.set(Calendar.YEAR, year); + cal.set(Calendar.MONTH, month); + cal.set(Calendar.DATE, day); + cal.set(Calendar.HOUR_OF_DAY, hour); + cal.set(Calendar.MINUTE, mins); + cal.set(Calendar.SECOND, secs); + cal.set(Calendar.MILLISECOND, 0); + return cal; + } } diff --git a/test/common/voldemort/VoldemortTestConstants.java b/test/common/voldemort/VoldemortTestConstants.java index 723e95fa6e..3f74f99c3b 100644 --- a/test/common/voldemort/VoldemortTestConstants.java +++ b/test/common/voldemort/VoldemortTestConstants.java @@ -90,6 +90,10 @@ public static String getSingleStoreWithZonesXml() { return readString("config/single-store-with-zones.xml"); } + public static String getTwoStoresWithZonesXml() { + return readString("config/two-stores-with-zones.xml"); + } + public static Cluster getTenNodeCluster() { return new ClusterMapper().readCluster(new StringReader(getTenNodeClusterXml())); } diff --git a/test/common/voldemort/cluster/failuredetector/MutableStoreVerifier.java b/test/common/voldemort/cluster/failuredetector/MutableStoreVerifier.java index 6ec1fd5e22..b5a6b1061e 100644 --- a/test/common/voldemort/cluster/failuredetector/MutableStoreVerifier.java +++ b/test/common/voldemort/cluster/failuredetector/MutableStoreVerifier.java @@ -7,6 +7,7 @@ import voldemort.VoldemortException; import voldemort.cluster.Node; +import voldemort.store.CompositeVoldemortRequest; import voldemort.store.Store; import voldemort.store.StoreCapabilityType; import voldemort.store.UnreachableStoreException; @@ -66,38 +67,67 @@ public static MutableStoreVerifier create(Collection nodes) { private static Store createStore() { return new Store() { + @Override public void close() throws VoldemortException {} + @Override public boolean delete(ByteArray key, Version version) throws VoldemortException { return false; } + @Override public List> get(ByteArray key, byte[] transforms) throws VoldemortException { return null; } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { return null; } + @Override public Object getCapability(StoreCapabilityType capability) { return null; } + @Override public String getName() { return null; } + @Override public List getVersions(ByteArray key) { return null; } + @Override public void put(ByteArray key, Versioned value, byte[] transforms) throws VoldemortException {} + @Override + public List> get(CompositeVoldemortRequest request) + throws VoldemortException { + return null; + } + + @Override + public Map>> getAll(CompositeVoldemortRequest request) + throws VoldemortException { + return null; 
+ } + + @Override + public void put(CompositeVoldemortRequest request) + throws VoldemortException {} + + @Override + public boolean delete(CompositeVoldemortRequest request) + throws VoldemortException { + return false; + } }; } diff --git a/test/common/voldemort/config/consistency-stores.xml b/test/common/voldemort/config/consistency-stores.xml new file mode 100644 index 0000000000..52f43260be --- /dev/null +++ b/test/common/voldemort/config/consistency-stores.xml @@ -0,0 +1,41 @@ + + + + consistency-check + memory + client + 4 + 1 + 1 + 2 + 2 + + string + UTF-8 + + + string + UTF-8 + + 1 + + + consistency-fix + memory + client + 4 + 1 + 1 + 2 + 2 + + string + UTF-8 + + + string + UTF-8 + + 1 + + diff --git a/test/common/voldemort/config/one-node-cluster.xml b/test/common/voldemort/config/one-node-cluster.xml index fcc503e279..4b7a0ff113 100644 --- a/test/common/voldemort/config/one-node-cluster.xml +++ b/test/common/voldemort/config/one-node-cluster.xml @@ -6,6 +6,6 @@ localhost 8080 6666 - 0,1 + 0,1,2,3,4,5,6,7,8,9 diff --git a/test/common/voldemort/config/two-stores-with-zones.xml b/test/common/voldemort/config/two-stores-with-zones.xml new file mode 100644 index 0000000000..07b41a1ff1 --- /dev/null +++ b/test/common/voldemort/config/two-stores-with-zones.xml @@ -0,0 +1,50 @@ + + + + cstore + bdb + client + consistent-routing + 3 + 3 + 3 + 3 + 2 + 1 + 1 + + string + UTF-8 + + + string + UTF-8 + + + + zstore + bdb + client + zone-routing + 3 + + 1 + 1 + 1 + + 3 + 3 + 3 + 2 + 1 + 1 + + string + UTF-8 + + + string + UTF-8 + + + diff --git a/test/common/voldemort/store/DoNothingStore.java b/test/common/voldemort/store/DoNothingStore.java index a014980761..04d26ef15a 100644 --- a/test/common/voldemort/store/DoNothingStore.java +++ b/test/common/voldemort/store/DoNothingStore.java @@ -16,61 +16,23 @@ package voldemort.store; -import java.util.List; -import java.util.Map; - import voldemort.VoldemortException; -import voldemort.utils.Utils; import voldemort.versioning.Version; -import voldemort.versioning.Versioned; /** * A store that does no Harm :) * * */ -public class DoNothingStore implements Store { - - private final String name; +public class DoNothingStore extends AbstractStore { public DoNothingStore(String name) { - this.name = Utils.notNull(name); - } - - public void close() throws VoldemortException { - // Do nothing; - } - - public List> get(K key, T transforms) throws VoldemortException { - // do nothing - return null; - } - - public String getName() { - return name; + super(name); } + @Override public boolean delete(K key, Version value) throws VoldemortException { // Do nothing return true; } - - public void put(K key, Versioned value, T transforms) throws VoldemortException { - // Do nothing - } - - public Map>> getAll(Iterable keys, Map transforms) - throws VoldemortException { - return null; - } - - public Object getCapability(StoreCapabilityType capability) { - throw new NoSuchCapabilityException(capability, getName()); - } - - public List getVersions(K key) { - // Do nothing - return null; - } - } diff --git a/test/common/voldemort/store/FailingReadsStore.java b/test/common/voldemort/store/FailingReadsStore.java index b0a7da6779..5d95f39d64 100644 --- a/test/common/voldemort/store/FailingReadsStore.java +++ b/test/common/voldemort/store/FailingReadsStore.java @@ -8,43 +8,37 @@ import voldemort.versioning.Version; import voldemort.versioning.Versioned; -public class FailingReadsStore implements Store { +public class FailingReadsStore extends AbstractStore { - private final 
String name; private final InMemoryStorageEngine engine; public FailingReadsStore(String name) { - this.name = name; + super(name); this.engine = new InMemoryStorageEngine(name); } - public void close() throws VoldemortException {} - + @Override public boolean delete(K key, Version version) throws VoldemortException { return engine.delete(key, version); } + @Override public List> get(K key, T transforms) throws VoldemortException { throw new VoldemortException("Operation failed"); } + @Override public java.util.List getVersions(K key) { throw new VoldemortException("Operation failed"); } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { throw new VoldemortException("Operation failed"); } - public Object getCapability(StoreCapabilityType capability) { - throw new NoSuchCapabilityException(capability, getName()); - } - - public String getName() { - return name; - } - + @Override public void put(K key, Versioned value, T transforms) throws VoldemortException { engine.put(key, value, transforms); } diff --git a/test/common/voldemort/store/FailingStore.java b/test/common/voldemort/store/FailingStore.java index 9bdceadf94..9c0a4551ba 100644 --- a/test/common/voldemort/store/FailingStore.java +++ b/test/common/voldemort/store/FailingStore.java @@ -20,7 +20,6 @@ import java.util.Map; import voldemort.VoldemortException; -import voldemort.utils.Utils; import voldemort.versioning.Version; import voldemort.versioning.Versioned; @@ -29,9 +28,8 @@ * * */ -public class FailingStore implements Store { +public class FailingStore extends AbstractStore { - private final String name; private final VoldemortException exception; public FailingStore(String name) { @@ -39,40 +37,38 @@ public FailingStore(String name) { } public FailingStore(String name, VoldemortException e) { - this.name = Utils.notNull(name); + super(name); this.exception = e; } + @Override public void close() throws VoldemortException { throw exception; } + @Override public List> get(K key, T transforms) throws VoldemortException { throw exception; } - public String getName() { - return name; - } - + @Override public boolean delete(K key, Version value) throws VoldemortException { throw exception; } + @Override public void put(K key, Versioned value, T transforms) throws VoldemortException { throw exception; } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { throw exception; } + @Override public java.util.List getVersions(K key) { throw exception; } - - public Object getCapability(StoreCapabilityType capability) { - throw new NoSuchCapabilityException(capability, getName()); - } } diff --git a/test/common/voldemort/store/RandomlyFailingDelegatingStore.java b/test/common/voldemort/store/RandomlyFailingDelegatingStore.java index 9ed86f6e45..bb77a4ca90 100644 --- a/test/common/voldemort/store/RandomlyFailingDelegatingStore.java +++ b/test/common/voldemort/store/RandomlyFailingDelegatingStore.java @@ -16,19 +16,23 @@ public RandomlyFailingDelegatingStore(StorageEngine innerStorageEngine) this.innerStorageEngine = innerStorageEngine; } + @Override public ClosableIterator>> entries() { return new ClosableIterator>>() { ClosableIterator>> iterator = innerStorageEngine.entries(); + @Override public void close() { iterator.close(); } + @Override public boolean hasNext() { return iterator.hasNext(); } + @Override public Pair> next() { if(Math.random() > FAIL_PROBABILITY) return iterator.next(); @@ -36,23 +40,28 @@ public Pair> next() { throw new 
VoldemortException("Failing now !!"); } + @Override public void remove() {} }; } + @Override public ClosableIterator keys() { return new ClosableIterator() { ClosableIterator iterator = innerStorageEngine.keys(); + @Override public void close() { iterator.close(); } + @Override public boolean hasNext() { return iterator.hasNext(); } + @Override public K next() { if(Math.random() > FAIL_PROBABILITY) return iterator.next(); @@ -60,10 +69,70 @@ public K next() { throw new VoldemortException("Failing now !!"); } + @Override public void remove() {} }; } + @Override + public ClosableIterator>> entries(final int partition) { + return new ClosableIterator>>() { + + ClosableIterator>> iterator = innerStorageEngine.entries(partition); + + @Override + public void close() { + iterator.close(); + } + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public Pair> next() { + if(Math.random() > FAIL_PROBABILITY) + return iterator.next(); + + throw new VoldemortException("Failing now !!"); + } + + @Override + public void remove() {} + }; + } + + @Override + public ClosableIterator keys(final int partition) { + return new ClosableIterator() { + + ClosableIterator iterator = innerStorageEngine.keys(partition); + + @Override + public void close() { + iterator.close(); + } + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public K next() { + if(Math.random() > FAIL_PROBABILITY) + return iterator.next(); + + throw new VoldemortException("Failing now !!"); + } + + @Override + public void remove() {} + }; + } + + @Override public void truncate() { if(Math.random() > FAIL_PROBABILITY) { innerStorageEngine.truncate(); @@ -72,7 +141,23 @@ public void truncate() { throw new VoldemortException("Failing now !!"); } + @Override public boolean isPartitionAware() { return innerStorageEngine.isPartitionAware(); } + + @Override + public boolean isPartitionScanSupported() { + return innerStorageEngine.isPartitionScanSupported(); + } + + @Override + public boolean beginBatchModifications() { + return false; + } + + @Override + public boolean endBatchModifications() { + return false; + } } \ No newline at end of file diff --git a/test/integration/voldemort/CatBdbStore.java b/test/integration/voldemort/CatBdbStore.java index 44b170fa8d..2d62e0762b 100644 --- a/test/integration/voldemort/CatBdbStore.java +++ b/test/integration/voldemort/CatBdbStore.java @@ -24,6 +24,7 @@ import voldemort.store.StorageEngine; import voldemort.store.bdb.BdbRuntimeConfig; import voldemort.store.bdb.BdbStorageEngine; +import voldemort.store.bdb.PartitionPrefixedBdbStorageEngine; import voldemort.store.serialized.SerializingStorageEngine; import voldemort.utils.ByteArray; import voldemort.utils.Pair; @@ -58,12 +59,19 @@ public static void main(String[] args) throws Exception { DatabaseConfig databaseConfig = new DatabaseConfig(); databaseConfig.setAllowCreate(true); databaseConfig.setTransactional(config.isBdbWriteTransactionsEnabled()); - databaseConfig.setSortedDuplicates(config.isBdbSortedDuplicatesEnabled()); + databaseConfig.setSortedDuplicates(false); Database database = environment.openDatabase(null, storeName, databaseConfig); - StorageEngine store = new BdbStorageEngine(storeName, - environment, - database, - new BdbRuntimeConfig()); + + StorageEngine store = null; + if(config.getBdbPrefixKeysWithPartitionId()) { + store = new PartitionPrefixedBdbStorageEngine(storeName, + environment, + database, + new BdbRuntimeConfig(), + 
TestUtils.makeSingleNodeRoutingStrategy()); + } else { + store = new BdbStorageEngine(storeName, environment, database, new BdbRuntimeConfig()); + } StorageEngine stringStore = SerializingStorageEngine.wrap(store, new StringSerializer(), new StringSerializer(), diff --git a/test/integration/voldemort/nonblocking/E2ENonblockingCheckoutTest.java b/test/integration/voldemort/nonblocking/E2ENonblockingCheckoutTest.java index b6eebbd048..10f1dea45d 100644 --- a/test/integration/voldemort/nonblocking/E2ENonblockingCheckoutTest.java +++ b/test/integration/voldemort/nonblocking/E2ENonblockingCheckoutTest.java @@ -154,7 +154,7 @@ public void setUp() throws Exception { + InMemoryStorageConfiguration.class.getName() + "," + SlowStorageConfiguration.class.getName(); p.setProperty("storage.configs", storageConfigs); - p.setProperty("slow.queueing.put.ms", Long.toString(SLOW_PUT_MS)); + p.setProperty("testing.slow.queueing.put.ms", Long.toString(SLOW_PUT_MS)); p.setProperty("client.connection.timeout.ms", Integer.toString(CONNECTION_TIMEOUT_MS)); p.setProperty("client.routing.timeout.ms", Integer.toString(ROUTING_TIMEOUT_MS)); diff --git a/test/integration/voldemort/performance/AdminTest.java b/test/integration/voldemort/performance/AdminTest.java index 497940a84f..aa43a5ba57 100644 --- a/test/integration/voldemort/performance/AdminTest.java +++ b/test/integration/voldemort/performance/AdminTest.java @@ -131,11 +131,11 @@ public void testFetch(final SetMultimap nodePartitions) { public long apply() { long i = 0; - Iterator>> result = adminClient.fetchEntries(node, - storeName, - new ArrayList(nodePartitions.get(node)), - null, - false); + Iterator>> result = adminClient.bulkFetchOps.fetchEntries(node, + storeName, + new ArrayList(nodePartitions.get(node)), + null, + false); while(result.hasNext()) { i++; result.next(); @@ -154,13 +154,13 @@ public void testFetchAndUpdate(final SetMultimap from, final i public void apply() { HashMap> replicaToPartitionList = Maps.newHashMap(); replicaToPartitionList.put(0, Lists.newArrayList(from.get(node))); - adminClient.migratePartitions(node, - to, - storeName, - replicaToPartitionList, - null, - null, - false); + adminClient.storeMntOps.migratePartitions(node, + to, + storeName, + replicaToPartitionList, + null, + null, + false); } }, 1); diff --git a/test/integration/voldemort/performance/CacheStorageEnginePerformanceTest.java b/test/integration/voldemort/performance/CacheStorageEnginePerformanceTest.java index 49d8584018..903b057866 100644 --- a/test/integration/voldemort/performance/CacheStorageEnginePerformanceTest.java +++ b/test/integration/voldemort/performance/CacheStorageEnginePerformanceTest.java @@ -39,7 +39,8 @@ public static void main(String[] args) { final int mod = 100; final int readMax = (int) readPercent * mod; - final Store store = new CacheStorageConfiguration(null).getStore(TestUtils.makeStoreDefinition("test")); + final Store store = new CacheStorageConfiguration(null).getStore(TestUtils.makeStoreDefinition("test"), + TestUtils.makeSingleNodeRoutingStrategy()); final AtomicInteger obsoletes = new AtomicInteger(0); PerformanceTest readWriteTest = new PerformanceTest() { diff --git a/test/integration/voldemort/performance/RequestFileFilter.java b/test/integration/voldemort/performance/RequestFileFilter.java index 027731855a..5014bff3bf 100644 --- a/test/integration/voldemort/performance/RequestFileFilter.java +++ b/test/integration/voldemort/performance/RequestFileFilter.java @@ -1,12 +1,12 @@ /* - * Copyright 2008-2010 LinkedIn, Inc - * + * 
Copyright 2008-2013 LinkedIn, Inc + * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the @@ -16,10 +16,19 @@ package voldemort.performance; -import com.google.common.base.Joiner; +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.List; +import java.util.Set; + import joptsimple.OptionParser; import joptsimple.OptionSet; import voldemort.VoldemortException; +import voldemort.client.ClientConfig; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; import voldemort.cluster.Cluster; @@ -33,14 +42,13 @@ import voldemort.utils.CmdUtils; import voldemort.utils.Utils; -import java.io.*; -import java.util.List; -import java.util.Set; +import com.google.common.base.Joiner; /** * Filter a request file for keys that are mastered by a specified node. */ public class RequestFileFilter { + private final StoreDefinition storeDefinition; private final RoutingStrategy routingStrategy; private final String inputFile; @@ -62,7 +70,6 @@ public RequestFileFilter(StoreDefinition storeDefinition, this.stringKeys = stringKeys; } - // TODO: support keys other than integer or string, general cleanup public void filter() throws IOException { SerializerFactory factory = new DefaultSerializerFactory(); @@ -73,7 +80,7 @@ public void filter() throws IOException { BufferedWriter out = new BufferedWriter(new FileWriter(outputFile)); try { String line = null; - while ((line = in.readLine()) != null) { + while((line = in.readLine()) != null) { String keyStr = line.replaceAll("\\s+$", ""); Object key = null; if(stringKeys) { @@ -83,7 +90,7 @@ public void filter() throws IOException { } byte[] keyBytes = keySerializer.toBytes(key); List nodes = routingStrategy.routeRequest(keyBytes); - if (nodes.contains(node)) { + if(nodes.contains(node)) { out.write(key + "\n"); } } @@ -97,13 +104,13 @@ public void filter() throws IOException { } /** - * Filter requests specified in a file, generating a new file containing only - * requests destined for a specific node. - * + * Filter requests specified in a file, generating a new file containing + * only requests destined for a specific node. 
+ * * @param args See usage for more information * @throws Exception In case of I/O or Voldemort-specific errors */ - public static void main (String [] args) throws Exception { + public static void main(String[] args) throws Exception { OptionParser parser = new OptionParser(); parser.accepts("help", "print usage information"); parser.accepts("node", "[REQUIRED] node id") @@ -126,7 +133,7 @@ public static void main (String [] args) throws Exception { OptionSet options = parser.parse(args); - if (options.has("help")) { + if(options.has("help")) { parser.printHelpOn(System.out); System.exit(0); } @@ -137,7 +144,7 @@ public static void main (String [] args) throws Exception { "url", "input", "output"); - if (missing.size() > 0) { + if(missing.size() > 0) { System.err.println("Missing required arguments: " + Joiner.on(", ").join(missing)); parser.printHelpOn(System.err); System.exit(1); @@ -150,29 +157,33 @@ public static void main (String [] args) throws Exception { String outputFile = (String) options.valueOf("output"); boolean stringKeys = options.has("string-keys"); - AdminClient adminClient = new AdminClient(bootstrapURL, new AdminClientConfig()); - List storeDefinitionList = adminClient.getRemoteStoreDefList(nodeId).getValue(); + AdminClient adminClient = new AdminClient(bootstrapURL, + new AdminClientConfig(), + new ClientConfig()); + List storeDefinitionList = adminClient.metadataMgmtOps.getRemoteStoreDefList(nodeId) + .getValue(); StoreDefinition storeDefinition = null; - for (StoreDefinition def: storeDefinitionList) { - if (storeName.equals(def.getName())) { + for(StoreDefinition def: storeDefinitionList) { + if(storeName.equals(def.getName())) { storeDefinition = def; } } - if (storeDefinition == null) { + if(storeDefinition == null) { Utils.croak("No store found with name\"" + storeName + "\""); } - Cluster cluster = adminClient.getRemoteCluster(nodeId).getValue(); + Cluster cluster = adminClient.metadataMgmtOps.getRemoteCluster(nodeId).getValue(); Node node = null; try { node = cluster.getNodeById(nodeId); - } catch (VoldemortException e) { + } catch(VoldemortException e) { Utils.croak("Can't find a node with id " + nodeId); } - RoutingStrategy routingStrategy = new RoutingStrategyFactory().updateRoutingStrategy(storeDefinition, cluster); + RoutingStrategy routingStrategy = new RoutingStrategyFactory().updateRoutingStrategy(storeDefinition, + cluster); try { new RequestFileFilter(storeDefinition, routingStrategy, @@ -180,7 +191,7 @@ public static void main (String [] args) throws Exception { outputFile, node, stringKeys).filter(); - } catch (FileNotFoundException e) { + } catch(FileNotFoundException e) { Utils.croak(e.getMessage()); } } diff --git a/test/integration/voldemort/performance/StorageEnginePerformanceTest.java b/test/integration/voldemort/performance/StorageEnginePerformanceTest.java index 320de8c44f..d0beb4fd0c 100644 --- a/test/integration/voldemort/performance/StorageEnginePerformanceTest.java +++ b/test/integration/voldemort/performance/StorageEnginePerformanceTest.java @@ -108,7 +108,8 @@ public static void main(String[] args) throws Exception { VoldemortConfig config = new VoldemortConfig(props); StorageConfiguration storageConfig = (StorageConfiguration) ReflectUtils.callConstructor(ReflectUtils.loadClass(storageEngineClass), new Object[] { config }); - StorageEngine engine = storageConfig.getStore(TestUtils.makeStoreDefinition("test")); + StorageEngine engine = storageConfig.getStore(TestUtils.makeStoreDefinition("test"), + 
TestUtils.makeSingleNodeRoutingStrategy()); @SuppressWarnings("unchecked") final Store store = new SerializingStore(engine, new StringSerializer(), diff --git a/test/integration/voldemort/performance/benchmark/Benchmark.java b/test/integration/voldemort/performance/benchmark/Benchmark.java index 0ef5285e2c..0ae14809f7 100644 --- a/test/integration/voldemort/performance/benchmark/Benchmark.java +++ b/test/integration/voldemort/performance/benchmark/Benchmark.java @@ -350,6 +350,7 @@ public void initializeStore(Props benchmarkProps) throws Exception { TimeUnit.MILLISECONDS) .setRequestFormatType(RequestFormatType.VOLDEMORT_V3) .setBootstrapUrls(socketUrl); + // .enableDefaultClient(true); if(clientZoneId >= 0) { clientConfig.setClientZoneId(clientZoneId); @@ -372,7 +373,8 @@ public void initializeStore(Props benchmarkProps) throws Exception { StorageConfiguration conf = (StorageConfiguration) ReflectUtils.callConstructor(ReflectUtils.loadClass(storageEngineClass), new Object[] { ServerTestUtils.getVoldemortConfig() }); - StorageEngine engine = conf.getStore(TestUtils.makeStoreDefinition(DUMMY_DB)); + StorageEngine engine = conf.getStore(TestUtils.makeStoreDefinition(DUMMY_DB), + TestUtils.makeSingleNodeRoutingStrategy()); if(conf.getType().compareTo(ViewStorageConfiguration.TYPE_NAME) == 0) { engine = new ViewStorageEngine(STORE_NAME, engine, diff --git a/test/integration/voldemort/store/noop/NoopStorageConfiguration.java b/test/integration/voldemort/store/noop/NoopStorageConfiguration.java index 741009ad1d..f6992ad4fb 100644 --- a/test/integration/voldemort/store/noop/NoopStorageConfiguration.java +++ b/test/integration/voldemort/store/noop/NoopStorageConfiguration.java @@ -17,6 +17,7 @@ package voldemort.store.noop; import voldemort.VoldemortException; +import voldemort.routing.RoutingStrategy; import voldemort.server.VoldemortConfig; import voldemort.store.StorageConfiguration; import voldemort.store.StorageEngine; @@ -51,7 +52,8 @@ public NoopStorageConfiguration(VoldemortConfig config) { reflect = config.getAllProps().getBoolean(REFLECT_PROPERTY, false); } - public StorageEngine getStore(StoreDefinition storeDef) { + public StorageEngine getStore(StoreDefinition storeDef, + RoutingStrategy strategy) { return new NoopStorageEngine(storeDef.getName(), reflect); } diff --git a/test/integration/voldemort/store/noop/NoopStorageEngine.java b/test/integration/voldemort/store/noop/NoopStorageEngine.java index d40f75156d..1bb6fe873a 100644 --- a/test/integration/voldemort/store/noop/NoopStorageEngine.java +++ b/test/integration/voldemort/store/noop/NoopStorageEngine.java @@ -22,8 +22,8 @@ import java.util.Map; import voldemort.VoldemortException; +import voldemort.store.AbstractStorageEngine; import voldemort.store.NoSuchCapabilityException; -import voldemort.store.StorageEngine; import voldemort.store.StoreCapabilityType; import voldemort.store.StoreUtils; import voldemort.utils.ByteArray; @@ -39,9 +39,8 @@ * knowledge of the serializer being used * */ -public class NoopStorageEngine implements StorageEngine { +public class NoopStorageEngine extends AbstractStorageEngine { - protected String name; protected boolean dataReflect; protected ByteArray key; protected Versioned value; @@ -49,36 +48,38 @@ public class NoopStorageEngine implements StorageEngine>> dataMap = new MyMap(); public NoopStorageEngine(String name, boolean reflect) { - this.name = name; + super(name); this.dataReflect = reflect; } - public ClosableIterator>> entries() { - return null; + @Override + public ClosableIterator>> 
entries(int partition) { + throw new UnsupportedOperationException("Partition based entries scan not supported for this storage type"); } - public ClosableIterator keys() { - return null; - } - - public void truncate() { - + @Override + public ClosableIterator keys(int partition) { + throw new UnsupportedOperationException("Partition based key scan not supported for this storage type"); } + @Override public List> get(ByteArray key, byte[] transforms) throws VoldemortException { return dataList; } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { return dataMap; } + @Override public List getVersions(ByteArray key) { return StoreUtils.getVersions(get(key, null)); } + @Override public void put(ByteArray key, Versioned value, byte[] transforms) throws VoldemortException { @@ -88,16 +89,12 @@ public void put(ByteArray key, Versioned value, byte[] transforms) } } + @Override public boolean delete(ByteArray key, Version version) throws VoldemortException { return true; } - public String getName() { - return name; - } - - public void close() throws VoldemortException {} - + @Override public Object getCapability(StoreCapabilityType capability) { throw new NoSuchCapabilityException(capability, getName()); } @@ -126,8 +123,4 @@ public int size() { return value == null ? 0 : 1; } } - - public boolean isPartitionAware() { - return false; - } } diff --git a/test/integration/voldemort/store/pausable/PausableStorageConfiguration.java b/test/integration/voldemort/store/pausable/PausableStorageConfiguration.java index 225f68b514..bf29e94453 100644 --- a/test/integration/voldemort/store/pausable/PausableStorageConfiguration.java +++ b/test/integration/voldemort/store/pausable/PausableStorageConfiguration.java @@ -1,6 +1,7 @@ package voldemort.store.pausable; import voldemort.VoldemortException; +import voldemort.routing.RoutingStrategy; import voldemort.server.VoldemortConfig; import voldemort.store.StorageConfiguration; import voldemort.store.StorageEngine; @@ -21,7 +22,8 @@ public PausableStorageConfiguration(@SuppressWarnings("unused") VoldemortConfig public void close() {} - public StorageEngine getStore(StoreDefinition storeDef) { + public StorageEngine getStore(StoreDefinition storeDef, + RoutingStrategy strategy) { return new PausableStorageEngine(new InMemoryStorageEngine(storeDef.getName())); } diff --git a/test/integration/voldemort/store/pausable/PausableStorageEngine.java b/test/integration/voldemort/store/pausable/PausableStorageEngine.java index 91fd4be5ab..32fba9b4b4 100644 --- a/test/integration/voldemort/store/pausable/PausableStorageEngine.java +++ b/test/integration/voldemort/store/pausable/PausableStorageEngine.java @@ -7,7 +7,7 @@ import voldemort.VoldemortException; import voldemort.annotations.jmx.JmxOperation; -import voldemort.store.StorageEngine; +import voldemort.store.AbstractStorageEngine; import voldemort.store.StoreCapabilityType; import voldemort.store.memory.InMemoryStorageEngine; import voldemort.utils.ClosableIterator; @@ -25,7 +25,7 @@ * @param The type of the value * @param The type of the transforms */ -public class PausableStorageEngine implements StorageEngine { +public class PausableStorageEngine extends AbstractStorageEngine { private static final Logger logger = Logger.getLogger(PausableStorageEngine.class); @@ -34,14 +34,16 @@ public class PausableStorageEngine implements StorageEngine { private volatile boolean paused; public PausableStorageEngine(InMemoryStorageEngine inner) { - super(); + super(inner.getName()); 
this.inner = inner; } + @Override public void close() throws VoldemortException { inner.close(); } + @Override public boolean delete(K key, Version version) { blockIfNecessary(); return inner.delete(key); @@ -59,49 +61,65 @@ private void blockIfNecessary() { } } + @Override public List> get(K key, T transforms) { blockIfNecessary(); return inner.get(key, transforms); } + @Override public Map>> getAll(Iterable keys, Map transforms) { blockIfNecessary(); return inner.getAll(keys, transforms); } + @Override public void put(K key, Versioned value, T transforms) { blockIfNecessary(); inner.put(key, value, transforms); } + @Override public ClosableIterator>> entries() { blockIfNecessary(); return inner.entries(); } + @Override public ClosableIterator keys() { blockIfNecessary(); return inner.keys(); } + @Override + public ClosableIterator>> entries(int partition) { + blockIfNecessary(); + return inner.entries(partition); + } + + @Override + public ClosableIterator keys(int partition) { + blockIfNecessary(); + return inner.keys(partition); + } + + @Override public void truncate() { blockIfNecessary(); inner.deleteAll(); } + @Override public List getVersions(K key) { blockIfNecessary(); return inner.getVersions(key); } + @Override public Object getCapability(StoreCapabilityType capability) { return inner.getCapability(capability); } - public String getName() { - return inner.getName(); - } - @JmxOperation(description = "Pause all operations on the storage engine.") public void pause() { logger.info("Pausing store '" + getName() + "'."); @@ -117,7 +135,13 @@ public void unpause() { } } + @Override public boolean isPartitionAware() { return inner.isPartitionAware(); } + + @Override + public boolean isPartitionScanSupported() { + return inner.isPartitionScanSupported(); + } } diff --git a/test/integration/voldemort/store/slow/SlowStorageConfiguration.java b/test/integration/voldemort/store/slow/SlowStorageConfiguration.java index 062aa4e803..00ab263c47 100644 --- a/test/integration/voldemort/store/slow/SlowStorageConfiguration.java +++ b/test/integration/voldemort/store/slow/SlowStorageConfiguration.java @@ -17,6 +17,7 @@ import voldemort.VoldemortException; import voldemort.common.OpTimeMap; +import voldemort.routing.RoutingStrategy; import voldemort.server.VoldemortConfig; import voldemort.store.StorageConfiguration; import voldemort.store.StorageEngine; @@ -39,7 +40,8 @@ public SlowStorageConfiguration(VoldemortConfig config) { this.voldemortConfig = config; } - public StorageEngine getStore(StoreDefinition storeDef) { + public StorageEngine getStore(StoreDefinition storeDef, + RoutingStrategy strategy) { if(voldemortConfig != null) { return new SlowStorageEngine(new InMemoryStorageEngine(storeDef.getName()), this.voldemortConfig.testingGetSlowQueueingDelays(), diff --git a/test/integration/voldemort/store/slow/SlowStorageEngine.java b/test/integration/voldemort/store/slow/SlowStorageEngine.java index f9a47a94cb..bc10e6001f 100644 --- a/test/integration/voldemort/store/slow/SlowStorageEngine.java +++ b/test/integration/voldemort/store/slow/SlowStorageEngine.java @@ -23,6 +23,7 @@ import voldemort.VoldemortException; import voldemort.common.OpTimeMap; import voldemort.common.VoldemortOpCode; +import voldemort.store.AbstractStorageEngine; import voldemort.store.StorageEngine; import voldemort.store.StoreCapabilityType; import voldemort.utils.ClosableIterator; @@ -47,7 +48,7 @@ * does not affect concurrentDelays. 
* */ -public class SlowStorageEngine implements StorageEngine { +public class SlowStorageEngine extends AbstractStorageEngine { private final StorageEngine innerStorageEngine; private final OpTimeMap queueingDelays; @@ -60,6 +61,7 @@ public SlowStorageEngine(StorageEngine innerStorageEngine) { public SlowStorageEngine(StorageEngine innerStorageEngine, OpTimeMap queueingDelays, OpTimeMap concurrentDelays) { + super(innerStorageEngine.getName()); this.innerStorageEngine = innerStorageEngine; this.queueingDelays = queueingDelays; this.concurrentDelays = concurrentDelays; @@ -92,58 +94,79 @@ public boolean delete(K key) { return delete(key, null); } + @Override public boolean delete(K key, Version version) { delayByOp(VoldemortOpCode.DELETE_OP_CODE); return innerStorageEngine.delete(key, version); } + @Override public List getVersions(K key) { delayByOp(VoldemortOpCode.GET_VERSION_OP_CODE); return innerStorageEngine.getVersions(key); } + @Override public List> get(K key, T transform) throws VoldemortException { delayByOp(VoldemortOpCode.GET_OP_CODE); return innerStorageEngine.get(key, transform); } + @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { delayByOp(VoldemortOpCode.GET_ALL_OP_CODE); return innerStorageEngine.getAll(keys, transforms); } + @Override public void put(K key, Versioned value, T transforms) throws VoldemortException { delayByOp(VoldemortOpCode.PUT_OP_CODE); innerStorageEngine.put(key, value, transforms); } + @Override public ClosableIterator>> entries() { return innerStorageEngine.entries(); } + @Override public ClosableIterator keys() { return innerStorageEngine.keys(); } + @Override public void truncate() { innerStorageEngine.truncate(); } + @Override public boolean isPartitionAware() { return innerStorageEngine.isPartitionAware(); } - public String getName() { - return innerStorageEngine.getName(); - } - + @Override public void close() { innerStorageEngine.close(); } + @Override public Object getCapability(StoreCapabilityType capability) { return innerStorageEngine.getCapability(capability); } + @Override + public ClosableIterator>> entries(int partition) { + return innerStorageEngine.entries(partition); + } + + @Override + public ClosableIterator keys(int partition) { + return innerStorageEngine.keys(partition); + } + + @Override + public boolean isPartitionScanSupported() { + return innerStorageEngine.isPartitionScanSupported(); + } } diff --git a/test/long/voldemort/client/rebalance/RebalanceLongTest.java b/test/long/voldemort/client/rebalance/RebalanceLongTest.java index dba0de81ef..34bc0c6099 100644 --- a/test/long/voldemort/client/rebalance/RebalanceLongTest.java +++ b/test/long/voldemort/client/rebalance/RebalanceLongTest.java @@ -1,44 +1,23 @@ package voldemort.client.rebalance; -import java.io.IOException; import java.util.Arrays; import java.util.Collection; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Properties; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; import org.junit.runners.Parameterized.Parameters; -import voldemort.ServerTestUtils; -import voldemort.TestUtils; -import voldemort.VoldemortException; -import voldemort.cluster.Cluster; -import voldemort.server.VoldemortConfig; -import voldemort.server.VoldemortServer; -import voldemort.store.metadata.MetadataStore.VoldemortState; - /** - * Start VoldemortServer locally using ServerTestUtils and run rebalancing - * tests. 
- * + * Run a version of RebalanceTests with a lot more keys. * */ @RunWith(Parameterized.class) -public class RebalanceLongTest extends AbstractRebalanceTest { +public class RebalanceLongTest extends RebalanceTest { - Map serverMap; - private final boolean useNio; - private final boolean useDonorBased; - protected static int NUM_MANY_KEYS = 10100; + private final int NUM_KEYS = 10100; public RebalanceLongTest(boolean useNio, boolean useDonorBased) { - this.useNio = useNio; - this.useDonorBased = useDonorBased; - this.serverMap = new HashMap(); + super(useNio, useDonorBased); } @Parameters @@ -49,73 +28,7 @@ public static Collection configs() { @Override protected int getNumKeys() { - return NUM_MANY_KEYS; - } - - @Override - protected VoldemortState getCurrentState(int nodeId) { - VoldemortServer server = serverMap.get(nodeId); - if(server == null) { - throw new VoldemortException("Node id " + nodeId + " does not exist"); - } else { - return server.getMetadataStore().getServerState(); - } + return NUM_KEYS; } - @Override - protected Cluster getCurrentCluster(int nodeId) { - VoldemortServer server = serverMap.get(nodeId); - if(server == null) { - throw new VoldemortException("Node id " + nodeId + " does not exist"); - } else { - return server.getMetadataStore().getCluster(); - } - } - - @Override - protected Cluster startServers(Cluster cluster, - String storeXmlFile, - List nodeToStart, - Map configProps) throws IOException { - for(int node: nodeToStart) { - Properties properties = new Properties(); - if(null != configProps) { - for(Entry property: configProps.entrySet()) { - properties.put(property.getKey(), property.getValue()); - } - } - - VoldemortConfig config = ServerTestUtils.createServerConfig(useNio, - node, - TestUtils.createTempDir() - .getAbsolutePath(), - null, - storeXmlFile, - properties); - - VoldemortServer server = ServerTestUtils.startVoldemortServer(socketStoreFactory, - config, - cluster); - serverMap.put(node, server); - } - - return cluster; - } - - @Override - protected void stopServer(List nodesToStop) throws IOException { - for(int node: nodesToStop) { - try { - ServerTestUtils.stopVoldemortServer(serverMap.get(node)); - } catch(VoldemortException e) { - // ignore these at stop time - } - } - serverMap = null; - } - - @Override - protected boolean useDonorBased() { - return this.useDonorBased; - } } diff --git a/test/long/voldemort/socketpool/E2EClientRequestExecutorPoolAndFailureDetectorTest.java b/test/long/voldemort/socketpool/E2EClientRequestExecutorPoolAndFailureDetectorTest.java new file mode 100644 index 0000000000..a9d5667653 --- /dev/null +++ b/test/long/voldemort/socketpool/E2EClientRequestExecutorPoolAndFailureDetectorTest.java @@ -0,0 +1,342 @@ +/* + * Copyright 2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.socketpool; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; +import java.util.Random; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; + +import org.junit.Test; + +import voldemort.ServerTestUtils; +import voldemort.TestUtils; +import voldemort.client.ClientConfig; +import voldemort.client.RoutingTier; +import voldemort.client.SocketStoreClientFactory; +import voldemort.client.StoreClient; +import voldemort.client.StoreClientFactory; +import voldemort.cluster.Cluster; +import voldemort.cluster.Node; +import voldemort.routing.RoutingStrategyType; +import voldemort.serialization.SerializerDefinition; +import voldemort.server.VoldemortConfig; +import voldemort.server.VoldemortServer; +import voldemort.store.InsufficientOperationalNodesException; +import voldemort.store.StoreDefinition; +import voldemort.store.StoreDefinitionBuilder; +import voldemort.store.bdb.BdbStorageConfiguration; +import voldemort.store.memory.InMemoryStorageConfiguration; +import voldemort.store.slop.strategy.HintedHandoffStrategyType; +import voldemort.store.slow.SlowStorageConfiguration; +import voldemort.store.socket.SocketStoreFactory; +import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; +import voldemort.versioning.ObsoleteVersionException; + +/** + * Does an end-to-end unit test of a Voldemort cluster with in memory storage + * servers. Applies so much load that timeouts and connection resets are + * expected. + * + */ +public class E2EClientRequestExecutorPoolAndFailureDetectorTest { + + private final boolean useNio = true; + private static final String STORE_NAME = "test"; + + private Random random = new Random(); + + private static final int KEY_RANGE = 100; + private static final int SOCKET_BUFFER_SIZE = 32 * 1024; + private static final boolean SOCKET_KEEP_ALIVE = false; + + private static final int CONNECTION_TIMEOUT_MS = 20; + private static final int SOCKET_TIMEOUT_MS = 40; + private static final int ROUTING_TIMEOUT_MS = 40; + + private SocketStoreFactory socketStoreFactory = null; + + private final int numServers = 4; + private List servers; + private Cluster cluster; + StoreClientFactory storeClientFactory; + + public E2EClientRequestExecutorPoolAndFailureDetectorTest() {} + + public static List getStoreDef() { + List defs = new ArrayList(); + SerializerDefinition serDef = new SerializerDefinition("string"); + String storageConfiguration = SlowStorageConfiguration.TYPE_NAME; + defs.add(new StoreDefinitionBuilder().setName(STORE_NAME) + .setType(storageConfiguration) + .setKeySerializer(serDef) + .setValueSerializer(serDef) + .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) + .setHintedHandoffStrategy(HintedHandoffStrategyType.PROXIMITY_STRATEGY) + .setReplicationFactor(3) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .build()); + return defs; + } + + public void setUp(int opSlowMs, int numSelectors, int connectionsPerNode) throws Exception { + socketStoreFactory = new ClientRequestExecutorPool(numSelectors, + connectionsPerNode, + CONNECTION_TIMEOUT_MS, + SOCKET_TIMEOUT_MS, + SOCKET_BUFFER_SIZE, + SOCKET_KEEP_ALIVE); + + cluster = ServerTestUtils.getLocalCluster(numServers, new int[][] { { 0, 4 }, { 1, 5 }, + { 2, 6 }, { 3, 7 } }); + servers = new ArrayList(); + Properties p = new Properties(); + String storageConfigs = 
BdbStorageConfiguration.class.getName() + "," + + InMemoryStorageConfiguration.class.getName() + "," + + SlowStorageConfiguration.class.getName(); + p.setProperty("storage.configs", storageConfigs); + p.setProperty("testing.slow.queueing.put.ms", Long.toString(opSlowMs)); + p.setProperty("testing.slow.queueing.get.ms", Long.toString(opSlowMs)); + + // Susceptible to BindException... + for(int i = 0; i < numServers; i++) { + VoldemortConfig voldemortConfig = ServerTestUtils.createServerConfigWithDefs(this.useNio, + i, + TestUtils.createTempDir() + .getAbsolutePath(), + cluster, + getStoreDef(), + p); + VoldemortServer voldemortServer = ServerTestUtils.startVoldemortServer(socketStoreFactory, + voldemortConfig); + servers.add(voldemortServer); + } + + Node node = cluster.getNodeById(0); + String bootstrapUrl = "tcp://" + node.getHost() + ":" + node.getSocketPort(); + storeClientFactory = new SocketStoreClientFactory(new ClientConfig().setBootstrapUrls(bootstrapUrl) + .setMaxConnectionsPerNode(connectionsPerNode) + .setConnectionTimeout(CONNECTION_TIMEOUT_MS, + TimeUnit.MILLISECONDS) + .setSocketTimeout(SOCKET_TIMEOUT_MS, + TimeUnit.MILLISECONDS) + .setRoutingTimeout(ROUTING_TIMEOUT_MS, + TimeUnit.MILLISECONDS) + .setFailureDetectorThreshold(99) + .setFailureDetectorThresholdInterval(250)); + } + + public void tearDown() throws IOException { + // Servers + for(VoldemortServer server: servers) { + ServerTestUtils.stopVoldemortServer(server); + } + servers = null; + cluster = null; + + // Clients + storeClientFactory.close(); + storeClientFactory = null; + socketStoreFactory.close(); + socketStoreFactory = null; + } + + public abstract class Oper implements Runnable { + + private final StoreClient storeClient; + private final CountDownLatch startSignal; + private final CountDownLatch doneSignal; + private final int numOps; + + private int numIONEs; + + Oper(CountDownLatch startSignal, CountDownLatch doneSignal, int numOps) { + this.startSignal = startSignal; + this.doneSignal = doneSignal; + this.numOps = numOps; + + this.numIONEs = 0; + + storeClient = storeClientFactory.getStoreClient(STORE_NAME); + } + + public String getKey() { + return new Integer(random.nextInt(KEY_RANGE)).toString(); + } + + public String getValue() { + return "Value ..............................................................................................................."; + } + + abstract public void doOp(); + + @Override + public void run() { + startSignal.countDown(); + try { + try { + startSignal.await(); + } catch(InterruptedException e) { + e.printStackTrace(); + return; + } + + for(int i = 0; i < this.numOps; ++i) { + try { + doOp(); + } catch(InsufficientOperationalNodesException ione) { + this.numIONEs++; + // System.out.println("Caught an IONE"); + try { + Thread.sleep(250); + } catch(InterruptedException ie) { + // Noop + } + } + if(i > 0 && i % 500 == 0) { + System.out.println("oper making progress ... 
(IONES = " + this.numIONEs + + ", op count = " + i + ")"); + } + } + + } finally { + doneSignal.countDown(); + } + if(this.numIONEs > 0) + System.out.println("Number of IONEs: " + this.numIONEs); + } + } + + public class Putter extends Oper { + + Putter(CountDownLatch startSignal, CountDownLatch doneSignal, int numOps) { + super(startSignal, doneSignal, numOps); + } + + @Override + public void doOp() { + String key = getKey(); + String value = getValue(); + try { + super.storeClient.put(key, value); + } catch(ObsoleteVersionException e) { + // System.out.println("ObsoleteVersionException caught on put."); + } + } + } + + public class Getter extends Oper { + + Getter(CountDownLatch startSignal, CountDownLatch doneSignal, int numOps) { + super(startSignal, doneSignal, numOps); + } + + @Override + public void doOp() { + String key = getKey(); + super.storeClient.get(key); + } + } + + public void doStressTest(int numPutters, int numGetters, int numOps) { + int numOpers = numPutters + numGetters; + CountDownLatch waitForStart = new CountDownLatch(numOpers); + CountDownLatch waitForDone = new CountDownLatch(numOpers); + + for(int i = 0; i < numPutters; ++i) { + new Thread(new Putter(waitForStart, waitForDone, numOps)).start(); + } + for(int i = 0; i < numGetters; ++i) { + new Thread(new Getter(waitForStart, waitForDone, numOps)).start(); + } + + try { + waitForDone.await(); + } catch(InterruptedException e) { + e.printStackTrace(); + } + } + + public void runStressTest(int opSlowMs, + int numSelectors, + int connectionsPerNode, + int numPutters, + int numGetters, + int numOps) { + System.out.println("STARTING: opSlowMs (" + opSlowMs + "), numSelectors (" + numSelectors + + "), connectionsPerNode (" + connectionsPerNode + ") putters (" + + numPutters + "), getters (" + numGetters + "), and ops (" + numOps + + ")."); + try { + setUp(opSlowMs, numSelectors, connectionsPerNode); + doStressTest(numPutters, numGetters, numOps); + tearDown(); + } catch(Exception ex) { + ex.printStackTrace(); + } finally { + System.out.println("ENDING..."); + System.out.println("***********************************************************************************"); + } + } + + @Test + public void stressTest() { + final int OP_SLOW_MS = 2; + + final int NUM_SELECTORS_START = 2; + final int NUM_SELECTORS_END = 4; + final int NUM_SELECTORS_STEP = 2; + + final int CONNECTIONS_PER_NODE_START = 10; + final int CONNECTIONS_PER_NODE_END = 20; + final int CONNECTIONS_PER_NODE_STEP = 10; + + final int NUM_PUTTERS_START = 50; + final int NUM_PUTTERS_END = 50; + final int NUM_PUTTERS_STEP = 25; + + final int NUM_GETTERS_START = 50; + final int NUM_GETTERS_END = 50; + final int NUM_GETTERS_STEP = 25; + + final int NUM_OPS = 1000; + + for(int putters = NUM_PUTTERS_START; putters <= NUM_PUTTERS_END; putters += NUM_PUTTERS_STEP) { + for(int getters = NUM_GETTERS_START; getters <= NUM_GETTERS_END; getters += NUM_GETTERS_STEP) { + for(int numSelectors = NUM_SELECTORS_START; numSelectors <= NUM_SELECTORS_END; numSelectors += NUM_SELECTORS_STEP) { + for(int connectionsPerNode = CONNECTIONS_PER_NODE_START; connectionsPerNode <= CONNECTIONS_PER_NODE_END; connectionsPerNode += CONNECTIONS_PER_NODE_STEP) { + if(putters + getters > 0) { + runStressTest(OP_SLOW_MS, + numSelectors, + connectionsPerNode, + putters, + getters, + NUM_OPS); + } + } + } + } + } + } +} diff --git a/test/unit/voldemort/client/AbstractAdminServiceFilterTest.java b/test/unit/voldemort/client/AbstractAdminServiceFilterTest.java index 6f00634603..f5043574b7 100644 --- 
a/test/unit/voldemort/client/AbstractAdminServiceFilterTest.java +++ b/test/unit/voldemort/client/AbstractAdminServiceFilterTest.java @@ -1,3 +1,18 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ package voldemort.client; import java.util.Iterator; @@ -16,8 +31,8 @@ import voldemort.store.StoreDefinition; import voldemort.utils.ByteArray; import voldemort.utils.ByteUtils; +import voldemort.utils.NodeUtils; import voldemort.utils.Pair; -import voldemort.utils.RebalanceUtils; import voldemort.versioning.Versioned; import com.google.common.collect.Lists; @@ -47,7 +62,7 @@ public void testFetchAsStreamWithFilter() { RoutingStrategy strategy = new RoutingStrategyFactory().updateRoutingStrategy(getStoreDef(), getCluster()); for(Pair> pair: createEntries()) { - if(RebalanceUtils.getNodeIds(strategy.routeRequest(pair.getFirst().get())).contains(0)) { + if(NodeUtils.getNodeIds(strategy.routeRequest(pair.getFirst().get())).contains(0)) { store.put(pair.getFirst(), pair.getSecond(), null); if(!filter.accept(pair.getFirst(), pair.getSecond())) { shouldFilterCount++; @@ -56,12 +71,12 @@ public void testFetchAsStreamWithFilter() { } // make fetch stream call with filter - Iterator>> entryIterator = getAdminClient().fetchEntries(0, - testStoreName, - getCluster().getNodeById(0) - .getPartitionIds(), - filter, - false); + Iterator>> entryIterator = getAdminClient().bulkFetchOps.fetchEntries(0, + testStoreName, + getCluster().getNodeById(0) + .getPartitionIds(), + filter, + false); // assert none of the filtered entries are returned. while(entryIterator.hasNext()) { @@ -84,16 +99,19 @@ public void testDeleteStreamWithFilter() { RoutingStrategy strategy = new RoutingStrategyFactory().updateRoutingStrategy(getStoreDef(), getCluster()); for(Pair> pair: entrySet) { - if(RebalanceUtils.getNodeIds(strategy.routeRequest(pair.getFirst().get())).contains(0)) + if(NodeUtils.getNodeIds(strategy.routeRequest(pair.getFirst().get())).contains(0)) store.put(pair.getFirst(), pair.getSecond(), null); } // make delete stream call with filter - getAdminClient().deletePartitions(0, testStoreName, Lists.newArrayList(0, 1), filter); + getAdminClient().storeMntOps.deletePartitions(0, + testStoreName, + Lists.newArrayList(0, 1), + filter); // assert none of the filtered entries are returned. 
for(Pair> entry: entrySet) { - if(RebalanceUtils.getNodeIds(strategy.routeRequest(entry.getFirst().get())).contains(0)) { + if(NodeUtils.getNodeIds(strategy.routeRequest(entry.getFirst().get())).contains(0)) { if(filter.accept(entry.getFirst(), entry.getSecond())) { assertEquals("All entries should be deleted except the filtered ones.", 0, @@ -116,7 +134,7 @@ public void testUpdateAsStreamWithFilter() { Set>> entrySet = createEntries(); // make update stream call with filter - getAdminClient().updateEntries(0, testStoreName, entrySet.iterator(), filter); + getAdminClient().streamingOps.updateEntries(0, testStoreName, entrySet.iterator(), filter); // assert none of the filtered entries are updated. // user store should be present diff --git a/test/unit/voldemort/client/AdminFetchTest.java b/test/unit/voldemort/client/AdminFetchTest.java new file mode 100644 index 0000000000..e0d1ccabcb --- /dev/null +++ b/test/unit/voldemort/client/AdminFetchTest.java @@ -0,0 +1,386 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package voldemort.client; + +import static junit.framework.Assert.assertEquals; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Properties; +import java.util.Set; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +import voldemort.ServerTestUtils; +import voldemort.client.protocol.admin.AdminClient; +import voldemort.cluster.Cluster; +import voldemort.cluster.Node; +import voldemort.routing.RoutingStrategy; +import voldemort.routing.RoutingStrategyFactory; +import voldemort.server.VoldemortServer; +import voldemort.store.StoreDefinition; +import voldemort.store.socket.SocketStoreFactory; +import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; +import voldemort.utils.ByteArray; +import voldemort.utils.Pair; +import voldemort.versioning.Versioned; +import voldemort.xml.StoreDefinitionsMapper; + +@RunWith(Parameterized.class) +public class AdminFetchTest { + + private static int TEST_STREAM_KEYS_SIZE = 100; + private static String testStoreName = "users"; + private static String storesXmlfile = "test/common/voldemort/config/stores.xml"; + + private StoreDefinition testStoreDef; + private VoldemortServer[] servers; + private Cluster cluster; + private AdminClient adminClient; + private RoutingStrategy routingStrategy; + private HashMap> partitionToKeysMap; + + private final boolean useNio; + private final Properties properties; + + public AdminFetchTest(boolean useNio, boolean usePIDScan) { + this.useNio = useNio; + properties = new Properties(); + if(usePIDScan) { + properties.put("bdb.prefix.keys.with.partitionid", "true"); + } else { + 
properties.put("bdb.prefix.keys.with.partitionid", "false"); + } + } + + @Parameters + public static Collection configs() { + return Arrays.asList(new Object[][] { { true, true }, { true, false }, { false, true }, + { false, false } }); + } + + @Before + public void setUp() throws IOException { + + partitionToKeysMap = new HashMap>(); + + SocketStoreFactory socketStoreFactory = new ClientRequestExecutorPool(2, + 10000, + 100000, + 32 * 1024); + + final int numServers = 2; + servers = new VoldemortServer[numServers]; + int partitionMap[][] = { { 0, 1, 2, 3 }, { 4, 5, 6, 7 } }; + + cluster = ServerTestUtils.startVoldemortCluster(numServers, + servers, + partitionMap, + socketStoreFactory, + this.useNio, + null, + storesXmlfile, + properties); + + List storeDefs = new StoreDefinitionsMapper().readStoreList(new File(storesXmlfile)); + + for(StoreDefinition storeDef: storeDefs) + if(storeDef.getName().equals(testStoreName)) + testStoreDef = storeDef; + + routingStrategy = new RoutingStrategyFactory().updateRoutingStrategy(testStoreDef, cluster); + + adminClient = ServerTestUtils.getAdminClient(cluster); + + // load data into the servers + Node firstServer = cluster.getNodes().iterator().next(); + + String bootstrapUrl = "tcp://" + firstServer.getHost() + ":" + firstServer.getSocketPort(); + StoreClientFactory factory = new SocketStoreClientFactory(new ClientConfig().setBootstrapUrls(bootstrapUrl) + .setSelectors(2)); + + // create a client that executes operations on a single store + StoreClient voldClient = factory.getStoreClient("users"); + for(int i = 0; i < TEST_STREAM_KEYS_SIZE; i++) { + String key = "key" + i; + byte[] bkey = key.getBytes("UTF-8"); + int partition = routingStrategy.getPartitionList(bkey).get(0); + if(!partitionToKeysMap.containsKey(partition)) + partitionToKeysMap.put(partition, new HashSet()); + partitionToKeysMap.get(partition).add(key); + voldClient.put(key, "value" + i); + } + } + + @After + public void tearDown() throws IOException { + for(VoldemortServer server: servers) { + ServerTestUtils.stopVoldemortServer(server); + } + } + + private Set getEntries(Iterator>> itr) { + HashSet keySet = new HashSet(); + while(itr.hasNext()) { + Pair> entry = itr.next(); + keySet.add(new String(entry.getFirst().get())); + } + return keySet; + } + + @Test + public void testFetchPartitionPrimaryEntries() { + HashMap> replicaToPartitionList = new HashMap>(); + replicaToPartitionList.put(0, Arrays.asList(0, 3)); + Iterator>> entriesItr = adminClient.bulkFetchOps.fetchEntries(0, + testStoreName, + replicaToPartitionList, + null, + false, + cluster, + 0); + // gather all the keys obtained + Set fetchedKeys = getEntries(entriesItr); + // make sure it fetched all the entries from the partitions requested + Set partition0Keys = new HashSet(partitionToKeysMap.get(0)); + Set partition3Keys = new HashSet(partitionToKeysMap.get(3)); + + partition0Keys.removeAll(fetchedKeys); + partition3Keys.removeAll(fetchedKeys); + assertEquals("Remainder in partition 0" + partition0Keys, 0, partition0Keys.size()); + assertEquals("Remainder in partition 3" + partition3Keys, 0, partition3Keys.size()); + } + + @Test + public void testFetchPartitionSecondaryEntries() { + HashMap> replicaToPartitionList = new HashMap>(); + replicaToPartitionList.put(1, Arrays.asList(4, 6)); + Iterator>> entriesItr = adminClient.bulkFetchOps.fetchEntries(0, + testStoreName, + replicaToPartitionList, + null, + false, + cluster, + 0); + // gather all the keys obtained + Set fetchedKeys = getEntries(entriesItr); + // make sure it 
fetched all the entries from the partitions requested + Set partition4Keys = new HashSet(partitionToKeysMap.get(4)); + Set partition6Keys = new HashSet(partitionToKeysMap.get(6)); + + partition4Keys.removeAll(fetchedKeys); + partition6Keys.removeAll(fetchedKeys); + assertEquals("Remainder in partition 4" + partition4Keys, 0, partition4Keys.size()); + assertEquals("Remainder in partition 6" + partition6Keys, 0, partition6Keys.size()); + } + + @Test + public void testFetchPartitionPrimaryTwoEntries() { + HashMap> replicaToPartitionList = new HashMap>(); + replicaToPartitionList.put(0, Arrays.asList(0, 3)); + Iterator>> entriesItr = adminClient.bulkFetchOps.fetchEntries(0, + testStoreName, + replicaToPartitionList, + null, + false, + cluster, + 2); + Set fetchedKeys = getEntries(entriesItr); + + Set partition0Keys = new HashSet(partitionToKeysMap.get(0)); + int numPartition0Keys = partition0Keys.size(); + partition0Keys.removeAll(fetchedKeys); + assertEquals("Remainder in partition 0 should be two less.", + numPartition0Keys - 2, + partition0Keys.size()); + + Set partition3Keys = new HashSet(partitionToKeysMap.get(3)); + int numPartition3Keys = partition3Keys.size(); + partition3Keys.removeAll(fetchedKeys); + assertEquals("Remainder in partition 3 should be two less.", + numPartition3Keys - 2, + partition3Keys.size()); + + assertEquals("Total of four entries fetched.", 4, fetchedKeys.size()); + } + + @Test + public void testFetchNonExistentEntriesPrimary() { + HashMap> replicaToPartitionList = new HashMap>(); + replicaToPartitionList.put(0, Arrays.asList(5, 7)); + Iterator>> entriesItr = adminClient.bulkFetchOps.fetchEntries(0, + testStoreName, + replicaToPartitionList, + null, + false, + cluster, + 0); + // gather all the keys obtained + Set fetchedKeys = getEntries(entriesItr); + // make sure it fetched nothing since these partitions belong to server + // 1 + assertEquals("Obtained something:" + fetchedKeys, 0, fetchedKeys.size()); + } + + @Test + public void testFetchNonExistentEntriesSecondary() { + HashMap> replicaToPartitionList = new HashMap>(); + replicaToPartitionList.put(1, Arrays.asList(1, 2)); + Iterator>> entriesItr = adminClient.bulkFetchOps.fetchEntries(0, + testStoreName, + replicaToPartitionList, + null, + false, + cluster, + 0); + // gather all the keys obtained + Set fetchedKeys = getEntries(entriesItr); + // make sure it fetched nothing since these partitions belong to server + // 0 as primary + assertEquals("Obtained something:" + fetchedKeys, 0, fetchedKeys.size()); + } + + private Set getKeys(Iterator itr) { + HashSet keySet = new HashSet(); + while(itr.hasNext()) { + keySet.add(new String(itr.next().get())); + } + return keySet; + } + + @Test + public void testFetchPartitionPrimaryKeys() { + HashMap> replicaToPartitionList = new HashMap>(); + replicaToPartitionList.put(0, Arrays.asList(0, 3)); + Iterator keysItr = adminClient.bulkFetchOps.fetchKeys(0, + testStoreName, + replicaToPartitionList, + null, + false, + cluster, + 0); + // gather all the keys obtained + Set fetchedKeys = getKeys(keysItr); + // make sure it fetched all the keys from the partitions requested + Set partition0Keys = new HashSet(partitionToKeysMap.get(0)); + Set partition3Keys = new HashSet(partitionToKeysMap.get(3)); + + partition0Keys.removeAll(fetchedKeys); + partition3Keys.removeAll(fetchedKeys); + assertEquals("Remainder in partition 0" + partition0Keys, 0, partition0Keys.size()); + assertEquals("Remainder in partition 3" + partition3Keys, 0, partition3Keys.size()); + } + + @Test + public void 
testFetchPartitionSecondaryKeys() { + HashMap> replicaToPartitionList = new HashMap>(); + replicaToPartitionList.put(1, Arrays.asList(4, 6)); + Iterator keysItr = adminClient.bulkFetchOps.fetchKeys(0, + testStoreName, + replicaToPartitionList, + null, + false, + cluster, + 0); + // gather all the keys obtained + Set fetchedKeys = getKeys(keysItr); + // make sure it fetched all the keys from the partitions requested + Set partition4Keys = new HashSet(partitionToKeysMap.get(4)); + Set partition6Keys = new HashSet(partitionToKeysMap.get(6)); + + partition4Keys.removeAll(fetchedKeys); + partition6Keys.removeAll(fetchedKeys); + assertEquals("Remainder in partition 4" + partition4Keys, 0, partition4Keys.size()); + assertEquals("Remainder in partition 6" + partition6Keys, 0, partition6Keys.size()); + } + + @Test + public void testFetchPartitionPrimaryTwoKeys() { + HashMap> replicaToPartitionList = new HashMap>(); + replicaToPartitionList.put(0, Arrays.asList(0, 3)); + Iterator keysItr = adminClient.bulkFetchOps.fetchKeys(0, + testStoreName, + replicaToPartitionList, + null, + false, + cluster, + 2); + Set fetchedKeys = getKeys(keysItr); + + Set partition0Keys = new HashSet(partitionToKeysMap.get(0)); + int numPartition0Keys = partition0Keys.size(); + partition0Keys.removeAll(fetchedKeys); + assertEquals("Remainder in partition 0 should be two less.", + numPartition0Keys - 2, + partition0Keys.size()); + + Set partition3Keys = new HashSet(partitionToKeysMap.get(3)); + int numPartition3Keys = partition3Keys.size(); + partition3Keys.removeAll(fetchedKeys); + assertEquals("Remainder in partition 3 should be two less.", + numPartition3Keys - 2, + partition3Keys.size()); + + assertEquals("Total of four keys fetched.", 4, fetchedKeys.size()); + } + + @Test + public void testFetchNonExistentKeysPrimary() { + HashMap> replicaToPartitionList = new HashMap>(); + replicaToPartitionList.put(0, Arrays.asList(5, 7)); + Iterator keysItr = adminClient.bulkFetchOps.fetchKeys(0, + testStoreName, + replicaToPartitionList, + null, + false, + cluster, + 0); + // gather all the keys obtained + Set fetchedKeys = getKeys(keysItr); + // make sure it fetched nothing since these partitions belong to server + // 1 + assertEquals("Obtained something:" + fetchedKeys, 0, fetchedKeys.size()); + } + + @Test + public void testFetchNonExistentKeysSecondary() { + HashMap> replicaToPartitionList = new HashMap>(); + replicaToPartitionList.put(1, Arrays.asList(1, 2)); + Iterator keysItr = adminClient.bulkFetchOps.fetchKeys(0, + testStoreName, + replicaToPartitionList, + null, + false, + cluster, + 0); + // gather all the keys obtained + Set fetchedKeys = getKeys(keysItr); + // make sure it fetched nothing since these partitions belong to server + // 0 as primary + assertEquals("Obtained something:" + fetchedKeys, 0, fetchedKeys.size()); + } +} diff --git a/test/unit/voldemort/client/AdminServiceBasicTest.java b/test/unit/voldemort/client/AdminServiceBasicTest.java index 2dbddf1817..1da3a7db89 100644 --- a/test/unit/voldemort/client/AdminServiceBasicTest.java +++ b/test/unit/voldemort/client/AdminServiceBasicTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2012 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -55,6 +55,7 @@ import voldemort.VoldemortException; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; +import voldemort.client.protocol.admin.QueryKeyResult; import voldemort.cluster.Cluster; import voldemort.cluster.Node; import voldemort.cluster.Zone; @@ -80,8 +81,10 @@ import voldemort.utils.ByteUtils; import voldemort.utils.Pair; import voldemort.utils.RebalanceUtils; +import voldemort.utils.StoreDefinitionUtils; import voldemort.utils.Utils; import voldemort.versioning.VectorClock; +import voldemort.versioning.Version; import voldemort.versioning.Versioned; import voldemort.xml.StoreDefinitionsMapper; @@ -109,14 +112,17 @@ public class AdminServiceBasicTest { private AdminClient adminClient; private final boolean useNio; + private final boolean onlineRetention; - public AdminServiceBasicTest(boolean useNio) { + public AdminServiceBasicTest(boolean useNio, boolean onlineRetention) { this.useNio = useNio; + this.onlineRetention = onlineRetention; } @Parameters public static Collection configs() { - return Arrays.asList(new Object[][] { { true }, { false } }); + return Arrays.asList(new Object[][] { { true, false }, { true, true }, { false, false }, + { false, true } }); } @Before @@ -126,6 +132,8 @@ public void setUp() throws IOException { int partitionMap[][] = { { 0, 1, 2, 3 }, { 4, 5, 6, 7 } }; Properties serverProperties = new Properties(); serverProperties.setProperty("client.max.connections.per.node", "20"); + serverProperties.setProperty("enforce.retention.policy.on.read", + Boolean.toString(onlineRetention)); cluster = ServerTestUtils.startVoldemortCluster(numServers, servers, partitionMap, @@ -139,7 +147,9 @@ public void setUp() throws IOException { Properties adminProperties = new Properties(); adminProperties.setProperty("max_connections", "20"); - adminClient = new AdminClient(cluster, new AdminClientConfig(adminProperties)); + adminClient = new AdminClient(cluster, + new AdminClientConfig(adminProperties), + new ClientConfig()); } /** @@ -154,7 +164,7 @@ private VoldemortServer getServer(int nodeId) { @After public void tearDown() throws IOException { - adminClient.stop(); + adminClient.close(); for(VoldemortServer server: servers) { ServerTestUtils.stopVoldemortServer(server); } @@ -195,19 +205,22 @@ public void testUpdateClusterMetadata() { Cluster updatedCluster = ServerTestUtils.getLocalCluster(4); AdminClient client = getAdminClient(); for(int i = 0; i < NUM_RUNS; i++) { - VectorClock clock = ((VectorClock) client.getRemoteCluster(0).getVersion()).incremented(0, - System.currentTimeMillis()); - client.updateRemoteCluster(0, updatedCluster, clock); + VectorClock clock = ((VectorClock) client.metadataMgmtOps.getRemoteCluster(0) + .getVersion()).incremented(0, + System.currentTimeMillis()); + client.metadataMgmtOps.updateRemoteCluster(0, updatedCluster, clock); assertEquals("Cluster should match", updatedCluster, getVoldemortServer(0).getMetadataStore().getCluster()); - assertEquals("AdminClient.getMetdata() should match", client.getRemoteCluster(0) - .getValue(), updatedCluster); + assertEquals("AdminClient.getMetdata() should match", + client.metadataMgmtOps.getRemoteCluster(0).getValue(), + updatedCluster); // version should match - assertEquals("versions should match as well.", clock, client.getRemoteCluster(0) - .getVersion()); + assertEquals("versions should match as well.", + clock, + client.metadataMgmtOps.getRemoteCluster(0).getVersion()); } } @@ -231,7 +244,7 @@ public void 
testAddStore() throws Exception { .setRequiredWrites(1) .build(); try { - adminClient.addStore(definition); + adminClient.storeMgmtOps.addStore(definition); fail("Should have thrown an exception because we cannot add a store with a replication factor greater than number of nodes"); } catch(Exception e) {} @@ -247,7 +260,7 @@ public void testAddStore() throws Exception { .setPreferredWrites(1) .setRequiredWrites(1) .build(); - adminClient.addStore(definition); + adminClient.storeMgmtOps.addStore(definition); // now test the store StoreClientFactory factory = new SocketStoreClientFactory(new ClientConfig().setBootstrapUrls(cluster.getNodeById(0) @@ -272,12 +285,14 @@ public void testAddStore() throws Exception { } // make sure that the store list we get back from AdminClient - Versioned> list = adminClient.getRemoteStoreDefList(0); + Versioned> list = adminClient.metadataMgmtOps.getRemoteStoreDefList(0); assertTrue(list.getValue().contains(definition)); } @Test public void testReplicationMapping() { + List zones = ServerTestUtils.getZones(2); + List nodes = Lists.newArrayList(); nodes.add(new Node(0, "localhost", 1, 2, 3, 0, Lists.newArrayList(0, 4, 8))); nodes.add(new Node(1, "localhost", 1, 2, 3, 0, Lists.newArrayList(1, 5, 9))); @@ -292,10 +307,10 @@ public void testReplicationMapping() { 1, 1, RoutingStrategyType.CONSISTENT_STRATEGY); - Cluster newCluster = new Cluster("single_zone_cluster", nodes); + Cluster newCluster = new Cluster("single_zone_cluster", nodes, zones); try { - adminClient.getReplicationMapping(0, newCluster, storeDef); + adminClient.helperOps.getReplicationMapping(0, newCluster, storeDef); fail("Should have thrown an exception since rep-factor = 1"); } catch(VoldemortException e) {} @@ -309,9 +324,9 @@ public void testReplicationMapping() { RoutingStrategyType.CONSISTENT_STRATEGY); // On node 0 - Map>> replicationMapping = adminClient.getReplicationMapping(0, - newCluster, - storeDef); + Map>> replicationMapping = adminClient.helperOps.getReplicationMapping(0, + newCluster, + storeDef); { HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); @@ -327,7 +342,9 @@ public void testReplicationMapping() { { // On node 1 - replicationMapping = adminClient.getReplicationMapping(1, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(1, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(0, Lists.newArrayList(0, 4, 8)); @@ -341,7 +358,9 @@ public void testReplicationMapping() { { // On node 2 - replicationMapping = adminClient.getReplicationMapping(2, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(2, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(0, Lists.newArrayList(1, 5, 9)); @@ -354,7 +373,9 @@ public void testReplicationMapping() { } { // On node 3 - replicationMapping = adminClient.getReplicationMapping(3, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(3, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(1, Lists.newArrayList(3, 7, 11)); @@ -367,7 +388,6 @@ public void testReplicationMapping() { } // Test 2 - With zone routing strategy - List zones = ServerTestUtils.getZones(2); HashMap zoneReplicationFactors = Maps.newHashMap(); for(int zoneIds = 0; 
zoneIds < 2; zoneIds++) { zoneReplicationFactors.put(zoneIds, 1); @@ -386,7 +406,9 @@ public void testReplicationMapping() { { // On node 0 - replicationMapping = adminClient.getReplicationMapping(0, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(0, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(0, Lists.newArrayList(2, 6, 10)); @@ -400,7 +422,9 @@ public void testReplicationMapping() { } { // On node 1 - replicationMapping = adminClient.getReplicationMapping(1, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(1, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(1, Lists.newArrayList(1, 5, 9)); @@ -411,7 +435,9 @@ public void testReplicationMapping() { { // On node 2 - replicationMapping = adminClient.getReplicationMapping(2, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(2, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(0, Lists.newArrayList(0, 4, 8)); @@ -426,7 +452,9 @@ public void testReplicationMapping() { { // On node 3 - replicationMapping = adminClient.getReplicationMapping(3, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(3, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(1, Lists.newArrayList(3, 7, 11)); @@ -443,10 +471,12 @@ public void testReplicationMapping() { 1, 1, RoutingStrategyType.CONSISTENT_STRATEGY); - newCluster = new Cluster("single_zone_cluster", nodes); + newCluster = new Cluster("single_zone_cluster", nodes, zones); { - replicationMapping = adminClient.getReplicationMapping(0, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(0, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(1, Lists.newArrayList(0, 4, 8)); @@ -463,7 +493,9 @@ public void testReplicationMapping() { } { - replicationMapping = adminClient.getReplicationMapping(1, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(1, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(0, Lists.newArrayList(0, 4, 8)); @@ -479,7 +511,9 @@ public void testReplicationMapping() { } { - replicationMapping = adminClient.getReplicationMapping(2, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(2, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(0, Lists.newArrayList(0, 4, 8)); @@ -496,7 +530,9 @@ public void testReplicationMapping() { } { - replicationMapping = adminClient.getReplicationMapping(3, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(3, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(1, Lists.newArrayList(3, 7, 11)); @@ -529,7 +565,9 @@ public void testReplicationMapping() { RoutingStrategyType.ZONE_STRATEGY); newCluster = new Cluster("multi_zone_cluster", nodes, zones); { - 
replicationMapping = adminClient.getReplicationMapping(0, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(0, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(0, Lists.newArrayList(1, 5, 9)); @@ -548,7 +586,9 @@ public void testReplicationMapping() { } { - replicationMapping = adminClient.getReplicationMapping(1, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(1, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(0, Lists.newArrayList(0, 4, 8)); @@ -567,7 +607,9 @@ public void testReplicationMapping() { } { - replicationMapping = adminClient.getReplicationMapping(2, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(2, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(0, Lists.newArrayList(0, 4, 8)); @@ -586,7 +628,9 @@ public void testReplicationMapping() { } { - replicationMapping = adminClient.getReplicationMapping(3, newCluster, storeDef); + replicationMapping = adminClient.helperOps.getReplicationMapping(3, + newCluster, + storeDef); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(0, Lists.newArrayList(0, 4, 8)); @@ -606,6 +650,8 @@ public void testReplicationMapping() { @Test public void testReplicationMappingWithZonePreference() { + List zones = ServerTestUtils.getZones(2); + List nodes = Lists.newArrayList(); nodes.add(new Node(0, "localhost", 1, 2, 3, 0, Lists.newArrayList(0, 4, 8))); nodes.add(new Node(1, "localhost", 1, 2, 3, 0, Lists.newArrayList(1, 5, 9))); @@ -620,10 +666,10 @@ public void testReplicationMappingWithZonePreference() { 1, 1, RoutingStrategyType.CONSISTENT_STRATEGY); - Cluster newCluster = new Cluster("single_zone_cluster", nodes); + Cluster newCluster = new Cluster("single_zone_cluster", nodes, zones); try { - adminClient.getReplicationMapping(0, newCluster, storeDef, 1); + adminClient.helperOps.getReplicationMapping(0, newCluster, storeDef, 1); fail("Should have thrown an exception since rep-factor = 1"); } catch(VoldemortException e) {} @@ -635,10 +681,10 @@ public void testReplicationMappingWithZonePreference() { 1, 1, RoutingStrategyType.CONSISTENT_STRATEGY); - newCluster = new Cluster("single_zone_cluster", nodes); + newCluster = new Cluster("single_zone_cluster", nodes, zones); try { - adminClient.getReplicationMapping(0, newCluster, storeDef, 0); + adminClient.helperOps.getReplicationMapping(0, newCluster, storeDef, 0); fail("Should have thrown an exception since rep-factor = 1"); } catch(VoldemortException e) {} @@ -652,10 +698,10 @@ public void testReplicationMappingWithZonePreference() { RoutingStrategyType.CONSISTENT_STRATEGY); // On node 0; zone id 1 - Map>> replicationMapping = adminClient.getReplicationMapping(0, - newCluster, - storeDef, - 1); + Map>> replicationMapping = adminClient.helperOps.getReplicationMapping(0, + newCluster, + storeDef, + 1); { HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); @@ -671,7 +717,7 @@ public void testReplicationMappingWithZonePreference() { } // On node 0; zone id 0 - replicationMapping = adminClient.getReplicationMapping(0, newCluster, storeDef, 0); + replicationMapping = adminClient.helperOps.getReplicationMapping(0, 
newCluster, storeDef, 0); { HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); @@ -686,7 +732,6 @@ public void testReplicationMappingWithZonePreference() { } // Test 2 - With zone routing strategy, and zone replication factor 1 - List zones = ServerTestUtils.getZones(2); HashMap zoneReplicationFactors = Maps.newHashMap(); for(int zoneIds = 0; zoneIds < 2; zoneIds++) { zoneReplicationFactors.put(zoneIds, 1); @@ -707,14 +752,20 @@ public void testReplicationMappingWithZonePreference() { // On node 0, zone 0 - failure case since zoneReplicationFactor is 1 try { - replicationMapping = adminClient.getReplicationMapping(0, newCluster, storeDef, 0); + replicationMapping = adminClient.helperOps.getReplicationMapping(0, + newCluster, + storeDef, + 0); fail("Should have thrown an exception since zoneReplicationFactor is 1"); } catch(VoldemortException e) {} } { // On node 0, zone 1 - replicationMapping = adminClient.getReplicationMapping(0, newCluster, storeDef, 1); + replicationMapping = adminClient.helperOps.getReplicationMapping(0, + newCluster, + storeDef, + 1); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(0, Lists.newArrayList(2, 6, 10)); @@ -729,7 +780,10 @@ public void testReplicationMappingWithZonePreference() { { // On node 1, zone 1 - replicationMapping = adminClient.getReplicationMapping(1, newCluster, storeDef, 1); + replicationMapping = adminClient.helperOps.getReplicationMapping(1, + newCluster, + storeDef, + 1); HashMap>> expectedMapping = Maps.newHashMap(); HashMap> partitionTuple = Maps.newHashMap(); partitionTuple.put(1, Lists.newArrayList(1, 5, 9)); @@ -755,7 +809,7 @@ public void testDeleteStore() throws Exception { .setPreferredWrites(1) .setRequiredWrites(1) .build(); - adminClient.addStore(definition); + adminClient.storeMgmtOps.addStore(definition); // now test the store StoreClientFactory factory = new SocketStoreClientFactory(new ClientConfig().setBootstrapUrls(cluster.getNodeById(0) @@ -764,13 +818,20 @@ public void testDeleteStore() throws Exception { StoreClient client = factory.getStoreClient("deleteTest"); - int numStores = adminClient.getRemoteStoreDefList(0).getValue().size(); + int numStores = adminClient.metadataMgmtOps.getRemoteStoreDefList(0).getValue().size(); // delete the store - assertEquals(adminClient.getRemoteStoreDefList(0).getValue().contains(definition), true); - adminClient.deleteStore("deleteTest"); - assertEquals(adminClient.getRemoteStoreDefList(0).getValue().size(), numStores - 1); - assertEquals(adminClient.getRemoteStoreDefList(0).getValue().contains(definition), false); + assertEquals(adminClient.metadataMgmtOps.getRemoteStoreDefList(0) + .getValue() + .contains(definition), + true); + adminClient.storeMgmtOps.deleteStore("deleteTest"); + assertEquals(adminClient.metadataMgmtOps.getRemoteStoreDefList(0).getValue().size(), + numStores - 1); + assertEquals(adminClient.metadataMgmtOps.getRemoteStoreDefList(0) + .getValue() + .contains(definition), + false); // test with deleted store try { @@ -784,7 +845,7 @@ public void testDeleteStore() throws Exception { throw e; } // try adding the store again - adminClient.addStore(definition); + adminClient.storeMgmtOps.addStore(definition); client = factory.getStoreClient("deleteTest"); client.put("abc", "123"); @@ -792,14 +853,34 @@ public void testDeleteStore() throws Exception { assertEquals(s, "123"); } + /** + * Update the server state ( + * {@link 
voldemort.store.metadata.MetadataStore.VoldemortState}) on a + * remote node. + * + * @param nodeId The node id on which we want to update the state + * @param state The state to update to + * @param clock The vector clock + */ + private void updateRemoteServerState(AdminClient client, + int nodeId, + MetadataStore.VoldemortState state, + Version clock) { + client.metadataMgmtOps.updateRemoteMetadata(nodeId, + MetadataStore.SERVER_STATE_KEY, + new Versioned(state.toString(), clock)); + } + @Test public void testStateTransitions() { // change to REBALANCING STATE AdminClient client = getAdminClient(); - client.updateRemoteServerState(getVoldemortServer(0).getIdentityNode().getId(), - MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER, - ((VectorClock) client.getRemoteServerState(0).getVersion()).incremented(0, - System.currentTimeMillis())); + updateRemoteServerState(client, + getVoldemortServer(0).getIdentityNode().getId(), + MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER, + ((VectorClock) client.rebalanceOps.getRemoteServerState(0) + .getVersion()).incremented(0, + System.currentTimeMillis())); MetadataStore.VoldemortState state = getVoldemortServer(0).getMetadataStore() .getServerState(); @@ -808,10 +889,12 @@ public void testStateTransitions() { state); // change back to NORMAL state - client.updateRemoteServerState(getVoldemortServer(0).getIdentityNode().getId(), - MetadataStore.VoldemortState.NORMAL_SERVER, - ((VectorClock) client.getRemoteServerState(0).getVersion()).incremented(0, - System.currentTimeMillis())); + updateRemoteServerState(client, + getVoldemortServer(0).getIdentityNode().getId(), + MetadataStore.VoldemortState.NORMAL_SERVER, + ((VectorClock) client.rebalanceOps.getRemoteServerState(0) + .getVersion()).incremented(0, + System.currentTimeMillis())); state = getVoldemortServer(0).getMetadataStore().getServerState(); assertEquals("State should be changed correctly to rebalancing state", @@ -819,10 +902,12 @@ public void testStateTransitions() { state); // lets revert back to REBALANCING STATE AND CHECK - client.updateRemoteServerState(getVoldemortServer(0).getIdentityNode().getId(), - MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER, - ((VectorClock) client.getRemoteServerState(0).getVersion()).incremented(0, - System.currentTimeMillis())); + updateRemoteServerState(client, + getVoldemortServer(0).getIdentityNode().getId(), + MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER, + ((VectorClock) client.rebalanceOps.getRemoteServerState(0) + .getVersion()).incremented(0, + System.currentTimeMillis())); state = getVoldemortServer(0).getMetadataStore().getServerState(); @@ -830,10 +915,12 @@ public void testStateTransitions() { MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER, state); - client.updateRemoteServerState(getVoldemortServer(0).getIdentityNode().getId(), - MetadataStore.VoldemortState.NORMAL_SERVER, - ((VectorClock) client.getRemoteServerState(0).getVersion()).incremented(0, - System.currentTimeMillis())); + updateRemoteServerState(client, + getVoldemortServer(0).getIdentityNode().getId(), + MetadataStore.VoldemortState.NORMAL_SERVER, + ((VectorClock) client.rebalanceOps.getRemoteServerState(0) + .getVersion()).incremented(0, + System.currentTimeMillis())); state = getVoldemortServer(0).getMetadataStore().getServerState(); assertEquals("State should be changed correctly to rebalancing state", @@ -854,7 +941,7 @@ public void testDeletePartitionEntries() { List deletePartitionsList = Arrays.asList(0, 2); // do delete partitions request - 
getAdminClient().deletePartitions(0, testStoreName, deletePartitionsList, null); + getAdminClient().storeMntOps.deletePartitions(0, testStoreName, deletePartitionsList, null); store = getStore(0, testStoreName); for(Entry entry: entrySet.entrySet()) { @@ -882,11 +969,11 @@ public void testFetchPartitionKeys() { } } - Iterator fetchIt = getAdminClient().fetchKeys(0, - testStoreName, - fetchPartitionsList, - null, - false); + Iterator fetchIt = getAdminClient().bulkFetchOps.fetchKeys(0, + testStoreName, + fetchPartitionsList, + null, + false); // check values int count = 0; while(fetchIt.hasNext()) { @@ -949,8 +1036,8 @@ private void generateROFiles(int numChunks, private void generateAndFetchFiles(int numChunks, long versionId, long indexSize, long dataSize) throws IOException { Map>> buckets = RebalanceUtils.getNodeIdToAllPartitions(cluster, - RebalanceUtils.getStoreDefinitionWithName(storeDefs, - "test-readonly-fetchfiles"), + StoreDefinitionUtils.getStoreDefinitionWithName(storeDefs, + "test-readonly-fetchfiles"), true); for(Node node: cluster.getNodes()) { ReadOnlyStorageEngine store = (ReadOnlyStorageEngine) getStore(node.getId(), @@ -998,24 +1085,24 @@ private void generateAndFetchFiles(int numChunks, long versionId, long indexSize HashMap> dumbMap = Maps.newHashMap(); dumbMap.put(0, Lists.newArrayList(100)); try { - getAdminClient().fetchPartitionFiles(node.getId(), - "test-readonly-fetchfiles", - dumbMap, - tempDir.getAbsolutePath(), - null, - running); + getAdminClient().readonlyOps.fetchPartitionFiles(node.getId(), + "test-readonly-fetchfiles", + dumbMap, + tempDir.getAbsolutePath(), + null, + running); fail("Should throw exception since partition map passed is bad"); } catch(VoldemortException e) {} // Test 1) Fetch all the primary partitions... tempDir = TestUtils.createTempDir(); - getAdminClient().fetchPartitionFiles(node.getId(), - "test-readonly-fetchfiles", - primaryNodeBuckets, - tempDir.getAbsolutePath(), - null, - running); + getAdminClient().readonlyOps.fetchPartitionFiles(node.getId(), + "test-readonly-fetchfiles", + primaryNodeBuckets, + tempDir.getAbsolutePath(), + null, + running); // Check it... assertEquals(tempDir.list().length, 2 * primaryPartitions * numChunks + 1); @@ -1047,12 +1134,12 @@ private void generateAndFetchFiles(int numChunks, long versionId, long indexSize // Test 2) Fetch all the replica partitions... tempDir = TestUtils.createTempDir(); - getAdminClient().fetchPartitionFiles(node.getId(), - "test-readonly-fetchfiles", - replicaNodeBuckets, - tempDir.getAbsolutePath(), - null, - running); + getAdminClient().readonlyOps.fetchPartitionFiles(node.getId(), + "test-readonly-fetchfiles", + replicaNodeBuckets, + tempDir.getAbsolutePath(), + null, + running); // Check it... assertEquals(tempDir.list().length, 2 * replicaPartitions * numChunks + 1); @@ -1082,12 +1169,12 @@ private void generateAndFetchFiles(int numChunks, long versionId, long indexSize // Test 3) Fetch all the partitions... tempDir = TestUtils.createTempDir(); - getAdminClient().fetchPartitionFiles(node.getId(), - "test-readonly-fetchfiles", - nodeBuckets, - tempDir.getAbsolutePath(), - null, - running); + getAdminClient().readonlyOps.fetchPartitionFiles(node.getId(), + "test-readonly-fetchfiles", + nodeBuckets, + tempDir.getAbsolutePath(), + null, + running); // Check it... 
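// The directory-size checks in these fetch-file tests assume the read-only store layout
// exercised elsewhere in this file: every fetched partition contributes one .index and
// one .data file per chunk (hence the factor of 2 * partitions * numChunks), and the
// fetch also copies the store's single metadata file, which is what the trailing "+ 1"
// in these assertions accounts for.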
assertEquals(tempDir.list().length, 2 * (primaryPartitions + replicaPartitions) @@ -1122,9 +1209,9 @@ private void generateAndFetchFiles(int numChunks, long versionId, long indexSize @Test public void testGetROStorageFormat() { - Map storesToStorageFormat = getAdminClient().getROStorageFormat(0, - Lists.newArrayList("test-readonly-fetchfiles", - "test-readonly-versions")); + Map storesToStorageFormat = getAdminClient().readonlyOps.getROStorageFormat(0, + Lists.newArrayList("test-readonly-fetchfiles", + "test-readonly-versions")); assertEquals(storesToStorageFormat.size(), 2); assertEquals(storesToStorageFormat.get("test-readonly-fetchfiles"), ReadOnlyStorageFormat.READONLY_V0.getCode()); @@ -1136,24 +1223,24 @@ public void testGetROStorageFormat() { public void testGetROVersions() { // Tests get current version - Map storesToVersions = getAdminClient().getROCurrentVersion(0, - Lists.newArrayList("test-readonly-fetchfiles", - "test-readonly-versions")); + Map storesToVersions = getAdminClient().readonlyOps.getROCurrentVersion(0, + Lists.newArrayList("test-readonly-fetchfiles", + "test-readonly-versions")); assertEquals(storesToVersions.size(), 2); assertEquals(storesToVersions.get("test-readonly-fetchfiles").longValue(), 0); assertEquals(storesToVersions.get("test-readonly-versions").longValue(), 0); // Tests get maximum version - storesToVersions = getAdminClient().getROMaxVersion(0, - Lists.newArrayList("test-readonly-fetchfiles", - "test-readonly-versions")); + storesToVersions = getAdminClient().readonlyOps.getROMaxVersion(0, + Lists.newArrayList("test-readonly-fetchfiles", + "test-readonly-versions")); assertEquals(storesToVersions.size(), 2); assertEquals(storesToVersions.get("test-readonly-fetchfiles").longValue(), 0); assertEquals(storesToVersions.get("test-readonly-versions").longValue(), 0); // Tests global get maximum versions - storesToVersions = getAdminClient().getROMaxVersion(Lists.newArrayList("test-readonly-fetchfiles", - "test-readonly-versions")); + storesToVersions = getAdminClient().readonlyOps.getROMaxVersion(Lists.newArrayList("test-readonly-fetchfiles", + "test-readonly-versions")); assertEquals(storesToVersions.size(), 2); assertEquals(storesToVersions.get("test-readonly-fetchfiles").longValue(), 0); assertEquals(storesToVersions.get("test-readonly-versions").longValue(), 0); @@ -1170,29 +1257,29 @@ public void testGetROVersions() { // Node 0 // Test current version - storesToVersions = getAdminClient().getROCurrentVersion(0, - Lists.newArrayList("test-readonly-fetchfiles")); + storesToVersions = getAdminClient().readonlyOps.getROCurrentVersion(0, + Lists.newArrayList("test-readonly-fetchfiles")); assertEquals(storesToVersions.get("test-readonly-fetchfiles").longValue(), 0); // Test max version - storesToVersions = getAdminClient().getROMaxVersion(0, - Lists.newArrayList("test-readonly-fetchfiles")); + storesToVersions = getAdminClient().readonlyOps.getROMaxVersion(0, + Lists.newArrayList("test-readonly-fetchfiles")); assertEquals(storesToVersions.get("test-readonly-fetchfiles").longValue(), 10); // Node 1 // Test current version - storesToVersions = getAdminClient().getROCurrentVersion(1, - Lists.newArrayList("test-readonly-fetchfiles")); + storesToVersions = getAdminClient().readonlyOps.getROCurrentVersion(1, + Lists.newArrayList("test-readonly-fetchfiles")); assertEquals(storesToVersions.get("test-readonly-fetchfiles").longValue(), 11); // Test max version - storesToVersions = getAdminClient().getROMaxVersion(1, - Lists.newArrayList("test-readonly-fetchfiles")); 
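// Note on the rewritten query-key assertions further below: streamingOps.queryKeys()
// now hands back QueryKeyResult objects instead of the old nested Pair structure, so
// each result is inspected roughly as sketched here (illustrative only; handleError()
// is a hypothetical placeholder):
//
//     QueryKeyResult result = results.next();
//     ByteArray key = result.getKey();
//     if(result.hasException()) {
//         handleError(result.getException());
//     } else if(result.hasValues()) {
//         List<Versioned<byte[]>> values = result.getValues();
//     }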
+ storesToVersions = getAdminClient().readonlyOps.getROMaxVersion(1, + Lists.newArrayList("test-readonly-fetchfiles")); assertEquals(storesToVersions.get("test-readonly-fetchfiles").longValue(), 11); // Test global max - storesToVersions = getAdminClient().getROMaxVersion(Lists.newArrayList("test-readonly-fetchfiles", - "test-readonly-versions")); + storesToVersions = getAdminClient().readonlyOps.getROMaxVersion(Lists.newArrayList("test-readonly-fetchfiles", + "test-readonly-versions")); assertEquals(storesToVersions.get("test-readonly-fetchfiles").longValue(), 11); assertEquals(storesToVersions.get("test-readonly-versions").longValue(), 0); @@ -1209,7 +1296,7 @@ public void testTruncate() throws Exception { } // do truncate request - getAdminClient().truncate(0, testStoreName); + getAdminClient().storeMntOps.truncate(0, testStoreName); store = getStore(0, testStoreName); @@ -1234,11 +1321,11 @@ public void testFetch() { } } - Iterator>> fetchIt = getAdminClient().fetchEntries(0, - testStoreName, - fetchPartitionsList, - null, - false); + Iterator>> fetchIt = getAdminClient().bulkFetchOps.fetchEntries(0, + testStoreName, + fetchPartitionsList, + null, + false); // check values int count = 0; while(fetchIt.hasNext()) { @@ -1325,109 +1412,112 @@ public void testQuery() { ArrayList notBelongToServer0AndOutsideBothKeys = new ArrayList(notBelongToServer0AndOutsideBoth.keySet()); List queryKeys; - Iterator>, Exception>>> results; - Pair>, Exception>> entry; + Iterator results; + QueryKeyResult entry; // test one key on store 0 queryKeys = new ArrayList(); queryKeys.add(belongToAndInsideServer0Keys.get(0)); - results = getAdminClient().queryKeys(0, testStoreName, queryKeys.iterator()); + results = getAdminClient().streamingOps.queryKeys(0, testStoreName, queryKeys.iterator()); assertTrue("Results should not be empty", results.hasNext()); entry = results.next(); - assertEquals(queryKeys.get(0), entry.getFirst()); - assertNull("There should not be exception in response", entry.getSecond().getSecond()); - assertEquals("There should be only 1 value in versioned list", 1, entry.getSecond() - .getFirst() - .size()); + assertEquals(queryKeys.get(0), entry.getKey()); + assertNull("There should not be exception in response", entry.getException()); + assertEquals("There should be only 1 value in versioned list", 1, entry.getValues().size()); assertEquals("Two byte[] should be equal", 0, ByteUtils.compare(belongToAndInsideServer0.get(queryKeys.get(0)), - entry.getSecond().getFirst().get(0).getValue())); + entry.getValues().get(0).getValue())); assertFalse("There should be only one result", results.hasNext()); // test one key belongs to but not exists in server 0 queryKeys = new ArrayList(); queryKeys.add(belongToServer0ButOutsideBothKeys.get(0)); - results = getAdminClient().queryKeys(0, testStoreName, queryKeys.iterator()); + results = getAdminClient().streamingOps.queryKeys(0, testStoreName, queryKeys.iterator()); assertTrue("Results should not be empty", results.hasNext()); entry = results.next(); assertFalse("There should not be more results", results.hasNext()); - assertEquals("Not the right key", queryKeys.get(0), entry.getFirst()); - assertNotNull("Response should be non-null", entry.getSecond()); - assertEquals("Value should be empty list", 0, entry.getSecond().getFirst().size()); - assertNull("There should not be exception", entry.getSecond().getSecond()); + assertEquals("Not the right key", queryKeys.get(0), entry.getKey()); + assertFalse("There should not be exception", entry.hasException()); + 
assertTrue("There should be values", entry.hasValues()); + assertNotNull("Response should be non-null", entry.getValues()); + assertEquals("Value should be empty list", 0, entry.getValues().size()); + assertNull("There should not be exception", entry.getException()); // test one key not exist and does not belong to server 0 queryKeys = new ArrayList(); queryKeys.add(notBelongToServer0AndOutsideBothKeys.get(0)); - results = getAdminClient().queryKeys(0, testStoreName, queryKeys.iterator()); + results = getAdminClient().streamingOps.queryKeys(0, testStoreName, queryKeys.iterator()); assertTrue("Results should not be empty", results.hasNext()); entry = results.next(); assertFalse("There should not be more results", results.hasNext()); - assertEquals("Not the right key", queryKeys.get(0), entry.getFirst()); - assertNotNull("Response should be non-null", entry.getSecond()); - assertNull("Value should be null", entry.getSecond().getFirst()); + assertEquals("Not the right key", queryKeys.get(0), entry.getKey()); + assertTrue("There should be exception", entry.hasException()); + assertFalse("There should not be values", entry.hasValues()); + assertNull("Value should be null", entry.getValues()); assertTrue("There should be InvalidMetadataException exception", - entry.getSecond().getSecond() instanceof InvalidMetadataException); + entry.getException() instanceof InvalidMetadataException); // test one key that exists on server 0 but does not belong to server 0 queryKeys = new ArrayList(); queryKeys.add(notBelongServer0ButInsideServer0Keys.get(0)); - results = getAdminClient().queryKeys(0, testStoreName, queryKeys.iterator()); + results = getAdminClient().streamingOps.queryKeys(0, testStoreName, queryKeys.iterator()); assertTrue("Results should not be empty", results.hasNext()); entry = results.next(); assertFalse("There should not be more results", results.hasNext()); - assertEquals("Not the right key", queryKeys.get(0), entry.getFirst()); - assertNotNull("Response should be non-null", entry.getSecond()); - assertNull("Value should be null", entry.getSecond().getFirst()); + assertEquals("Not the right key", queryKeys.get(0), entry.getKey()); + assertTrue("There should be exception", entry.hasException()); + assertFalse("There should not be values", entry.hasValues()); + assertNull("Value should be null", entry.getValues()); assertTrue("There should be InvalidMetadataException exception", - entry.getSecond().getSecond() instanceof InvalidMetadataException); + entry.getException() instanceof InvalidMetadataException); // test one key deleted store0.delete(belongToAndInsideServer0Keys.get(4), null); queryKeys = new ArrayList(); queryKeys.add(belongToAndInsideServer0Keys.get(4)); - results = getAdminClient().queryKeys(0, testStoreName, queryKeys.iterator()); + results = getAdminClient().streamingOps.queryKeys(0, testStoreName, queryKeys.iterator()); assertTrue("Results should not be empty", results.hasNext()); entry = results.next(); assertFalse("There should not be more results", results.hasNext()); - assertEquals("Not the right key", queryKeys.get(0), entry.getFirst()); - assertNotNull("Response should be non-null", entry.getSecond()); - assertEquals("Value should be empty list", 0, entry.getSecond().getFirst().size()); - assertNull("There should not be exception", entry.getSecond().getSecond()); + assertFalse("There should not be exception", entry.hasException()); + assertTrue("There should be values", entry.hasValues()); + assertEquals("Not the right key", queryKeys.get(0), entry.getKey()); + 
assertEquals("Value should be empty list", 0, entry.getValues().size()); // test empty request queryKeys = new ArrayList(); - results = getAdminClient().queryKeys(0, testStoreName, queryKeys.iterator()); + results = getAdminClient().streamingOps.queryKeys(0, testStoreName, queryKeys.iterator()); assertFalse("Results should be empty", results.hasNext()); // test null key queryKeys = new ArrayList(); queryKeys.add(null); assertEquals(1, queryKeys.size()); - results = getAdminClient().queryKeys(0, testStoreName, queryKeys.iterator()); + results = getAdminClient().streamingOps.queryKeys(0, testStoreName, queryKeys.iterator()); assertTrue("Results should not be empty", results.hasNext()); entry = results.next(); assertFalse("There should not be more results", results.hasNext()); - assertNotNull("Response should be non-null", entry.getSecond()); - assertNull("Value should be null", entry.getSecond().getFirst()); + assertTrue("There should be exception", entry.hasException()); + assertFalse("There should not be values", entry.hasValues()); + assertNull("Value should be null", entry.getValues()); assertTrue("There should be IllegalArgumentException exception", - entry.getSecond().getSecond() instanceof IllegalArgumentException); + entry.getException() instanceof IllegalArgumentException); // test multiple keys (3) on store 1 queryKeys = new ArrayList(); queryKeys.add(belongToAndInsideServer1Keys.get(0)); queryKeys.add(belongToAndInsideServer1Keys.get(1)); queryKeys.add(belongToAndInsideServer1Keys.get(2)); - results = getAdminClient().queryKeys(1, testStoreName, queryKeys.iterator()); + results = getAdminClient().streamingOps.queryKeys(1, testStoreName, queryKeys.iterator()); assertTrue("Results should not be empty", results.hasNext()); Map>> entries = new HashMap>>(); int resultCount = 0; while(results.hasNext()) { resultCount++; entry = results.next(); - assertNull("There should not be exception in response", entry.getSecond().getSecond()); - assertNotNull("Value should not be null for Key: ", entry.getSecond().getFirst()); - entries.put(entry.getFirst(), entry.getSecond().getFirst()); + assertNull("There should not be exception in response", entry.getException()); + assertNotNull("Value should not be null for Key: ", entry.getValues()); + entries.put(entry.getKey(), entry.getValues()); } assertEquals("There should 3 and only 3 results", 3, resultCount); for(ByteArray key: queryKeys) { @@ -1456,44 +1546,44 @@ public void testQuery() { queryKeys.add(belongToAndInsideServer0Keys.get(3)); queryKeys.add(belongToAndInsideServer0Keys.get(5)); queryKeys.add(notBelongServer0ButInsideServer0Keys.get(2)); - results = getAdminClient().queryKeys(0, testStoreName, queryKeys.iterator()); + results = getAdminClient().streamingOps.queryKeys(0, testStoreName, queryKeys.iterator()); // key 0 entry = results.next(); - assertEquals(0, ByteUtils.compare(queryKeys.get(0).get(), entry.getFirst().get())); + assertEquals(0, ByteUtils.compare(queryKeys.get(0).get(), entry.getKey().get())); assertEquals(0, ByteUtils.compare(belongToAndInsideServer0.get(queryKeys.get(0)), - entry.getSecond().getFirst().get(0).getValue())); - assertNull(entry.getSecond().getSecond()); + entry.getValues().get(0).getValue())); + assertNull(entry.getException()); // key 1 entry = results.next(); - assertEquals(0, ByteUtils.compare(queryKeys.get(1).get(), entry.getFirst().get())); + assertEquals(0, ByteUtils.compare(queryKeys.get(1).get(), entry.getKey().get())); assertTrue("There should be InvalidMetadataException exception", - 
entry.getSecond().getSecond() instanceof InvalidMetadataException); + entry.getException() instanceof InvalidMetadataException); // key 2 entry = results.next(); - assertEquals(0, ByteUtils.compare(queryKeys.get(2).get(), entry.getFirst().get())); - assertEquals(0, entry.getSecond().getFirst().size()); - assertNull(entry.getSecond().getSecond()); + assertEquals(0, ByteUtils.compare(queryKeys.get(2).get(), entry.getKey().get())); + assertEquals(0, entry.getValues().size()); + assertNull(entry.getException()); // key 3 entry = results.next(); - assertEquals(0, ByteUtils.compare(queryKeys.get(3).get(), entry.getFirst().get())); + assertEquals(0, ByteUtils.compare(queryKeys.get(3).get(), entry.getKey().get())); assertTrue("There should be InvalidMetadataException exception", - entry.getSecond().getSecond() instanceof InvalidMetadataException); + entry.getException() instanceof InvalidMetadataException); // key 4 entry = results.next(); - assertEquals(0, ByteUtils.compare(queryKeys.get(4).get(), entry.getFirst().get())); + assertEquals(0, ByteUtils.compare(queryKeys.get(4).get(), entry.getKey().get())); assertEquals(0, ByteUtils.compare(belongToAndInsideServer0.get(queryKeys.get(4)), - entry.getSecond().getFirst().get(0).getValue())); - assertNull(entry.getSecond().getSecond()); + entry.getValues().get(0).getValue())); + assertNull(entry.getException()); // key 5 entry = results.next(); - assertEquals(0, ByteUtils.compare(queryKeys.get(5).get(), entry.getFirst().get())); - assertEquals(0, entry.getSecond().getFirst().size()); - assertNull(entry.getSecond().getSecond()); + assertEquals(0, ByteUtils.compare(queryKeys.get(5).get(), entry.getKey().get())); + assertEquals(0, entry.getValues().size()); + assertNull(entry.getException()); // key 6 entry = results.next(); - assertEquals(0, ByteUtils.compare(queryKeys.get(6).get(), entry.getFirst().get())); + assertEquals(0, ByteUtils.compare(queryKeys.get(6).get(), entry.getKey().get())); assertTrue("There should be InvalidMetadataException exception", - entry.getSecond().getSecond() instanceof InvalidMetadataException); + entry.getException() instanceof InvalidMetadataException); // no more keys assertFalse(results.hasNext()); } @@ -1517,7 +1607,7 @@ protected Pair> computeNext() { } }; - getAdminClient().updateEntries(0, testStoreName, iterator, null); + getAdminClient().streamingOps.updateEntries(0, testStoreName, iterator, null); // check updated values Store store = getStore(0, testStoreName); @@ -1542,7 +1632,7 @@ public void testUpdateSlops() { "test-consistent-with-pref-list"); Iterator> slopIterator = entrySet.iterator(); - getAdminClient().updateSlopEntries(0, slopIterator); + getAdminClient().streamingOps.updateSlopEntries(0, slopIterator); // check updated values Iterator> entrysetItr = entrySet.iterator(); @@ -1586,7 +1676,7 @@ public void testRecoverData() { } // recover all data - adminClient.restoreDataFromReplications(1, 2); + adminClient.restoreOps.restoreDataFromReplications(1, 2); // assert server 1 has all entries for its partitions store = getStore(1, testStoreName); @@ -1617,8 +1707,8 @@ public void testFetchAndUpdateRW() { HashMap keysMoved = Maps.newHashMap(); // insert it into server-0 store - RoutingStrategy strategy = new RoutingStrategyFactory().updateRoutingStrategy(RebalanceUtils.getStoreDefinitionWithName(storeDefs, - "test-recovery-data"), + RoutingStrategy strategy = new RoutingStrategyFactory().updateRoutingStrategy(StoreDefinitionUtils.getStoreDefinitionWithName(storeDefs, + "test-recovery-data"), cluster); Store 
store0 = getStore(0, "test-recovery-data"); @@ -1644,14 +1734,14 @@ public void testFetchAndUpdateRW() { // Migrate the partition AdminClient client = getAdminClient(); - int id = client.migratePartitions(0, - 1, - "test-recovery-data", - replicaToPartitions, - null, - cluster, - false); - client.waitForCompletion(1, id, 120, TimeUnit.SECONDS); + int id = client.storeMntOps.migratePartitions(0, + 1, + "test-recovery-data", + replicaToPartitions, + null, + cluster, + false); + client.rpcOps.waitForCompletion(1, id, 120, TimeUnit.SECONDS); // Check the values for(Entry entry: keysMoved.entrySet()) { diff --git a/test/unit/voldemort/client/AdminServiceFailureTest.java b/test/unit/voldemort/client/AdminServiceFailureTest.java index 9ee5e902c3..ac542afd1a 100644 --- a/test/unit/voldemort/client/AdminServiceFailureTest.java +++ b/test/unit/voldemort/client/AdminServiceFailureTest.java @@ -1,3 +1,19 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + package voldemort.client; import java.io.IOException; @@ -6,8 +22,8 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.Map.Entry; +import java.util.Properties; import junit.framework.TestCase; @@ -124,7 +140,7 @@ private AbstractSocketService getAdminServer(Node node, public void tearDown() throws IOException { try { adminServer.stop(); - adminClient.stop(); + adminClient.close(); } catch(Exception e) { // ignore } @@ -169,33 +185,36 @@ private void doOperation(StreamOperations e, switch(e) { case DELETE_PARTITIONS: putAlltoStore(); - getAdminClient().deletePartitions(nodeId, storeName, partitionList, null); + getAdminClient().storeMntOps.deletePartitions(nodeId, + storeName, + partitionList, + null); return; case FETCH_ENTRIES: putAlltoStore(); - consumeIterator(getAdminClient().fetchEntries(nodeId, - storeName, - partitionList, - null, - false)); + consumeIterator(getAdminClient().bulkFetchOps.fetchEntries(nodeId, + storeName, + partitionList, + null, + false)); return; case FETCH_KEYS: putAlltoStore(); - consumeIterator(getAdminClient().fetchKeys(nodeId, - storeName, - partitionList, - null, - false)); + consumeIterator(getAdminClient().bulkFetchOps.fetchKeys(nodeId, + storeName, + partitionList, + null, + false)); return; case UPDATE_ENTRIES: - getAdminClient().updateEntries(nodeId, - storeName, - getRandomlyFailingIterator(ServerTestUtils.createRandomKeyValuePairs(TEST_KEYS)), - null); + getAdminClient().streamingOps.updateEntries(nodeId, + storeName, + getRandomlyFailingIterator(ServerTestUtils.createRandomKeyValuePairs(TEST_KEYS)), + null); return; case TRUNCATE_ENTRIES: putAlltoStore(); - getAdminClient().truncate(nodeId, storeName); + getAdminClient().storeMntOps.truncate(nodeId, storeName); return; default: diff --git a/test/unit/voldemort/client/AdminServiceFilterTest.java b/test/unit/voldemort/client/AdminServiceFilterTest.java index 1511a3ed55..57d6e4ff3f 100644 --- 
a/test/unit/voldemort/client/AdminServiceFilterTest.java +++ b/test/unit/voldemort/client/AdminServiceFilterTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -22,9 +22,9 @@ import java.util.Collection; import java.util.HashSet; import java.util.List; +import java.util.Map.Entry; import java.util.Properties; import java.util.Set; -import java.util.Map.Entry; import org.apache.commons.io.FileUtils; import org.junit.After; @@ -43,7 +43,7 @@ import voldemort.store.StoreDefinition; import voldemort.utils.ByteArray; import voldemort.utils.Pair; -import voldemort.utils.RebalanceUtils; +import voldemort.utils.StoreDefinitionUtils; import voldemort.versioning.Versioned; import voldemort.xml.StoreDefinitionsMapper; @@ -89,7 +89,7 @@ public void setUp() throws IOException { config.setEnableNetworkClassLoader(true); List storeDefs = new StoreDefinitionsMapper().readStoreList(new File(storesXmlfile)); - storeDef = RebalanceUtils.getStoreDefinitionWithName(storeDefs, testStoreName); + storeDef = StoreDefinitionUtils.getStoreDefinitionWithName(storeDefs, testStoreName); server = new VoldemortServer(config, cluster); server.start(); @@ -100,7 +100,7 @@ public void setUp() throws IOException { @Override @After public void tearDown() throws IOException, InterruptedException { - adminClient.stop(); + adminClient.close(); server.stop(); FileUtils.deleteDirectory(new File(server.getVoldemortConfig().getVoldemortHome())); } diff --git a/test/unit/voldemort/client/AdminServiceMultiJVMTest.java b/test/unit/voldemort/client/AdminServiceMultiJVMTest.java index 1a50b40d48..b98b66718a 100644 --- a/test/unit/voldemort/client/AdminServiceMultiJVMTest.java +++ b/test/unit/voldemort/client/AdminServiceMultiJVMTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -22,8 +22,8 @@ import java.util.Collection; import java.util.HashSet; import java.util.List; -import java.util.Set; import java.util.Map.Entry; +import java.util.Set; import org.apache.commons.io.FileUtils; import org.junit.After; @@ -43,8 +43,8 @@ import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; import voldemort.utils.ByteArray; import voldemort.utils.Pair; -import voldemort.utils.RebalanceUtils; import voldemort.utils.ServerJVMTestUtils; +import voldemort.utils.StoreDefinitionUtils; import voldemort.versioning.Versioned; import voldemort.xml.StoreDefinitionsMapper; @@ -97,7 +97,7 @@ public void setUp() throws IOException { storesXmlfile, cluster); List storeDefs = new StoreDefinitionsMapper().readStoreList(new File(storesXmlfile)); - storeDef = RebalanceUtils.getStoreDefinitionWithName(storeDefs, testStoreName); + storeDef = StoreDefinitionUtils.getStoreDefinitionWithName(storeDefs, testStoreName); pid = ServerJVMTestUtils.startServerJVM(socketStoreFactory, cluster.getNodeById(0), @@ -108,7 +108,7 @@ public void setUp() throws IOException { @Override @After public void tearDown() throws IOException { - adminClient.stop(); + adminClient.close(); ServerJVMTestUtils.StopServerJVM(pid); FileUtils.deleteDirectory(new File(voldemortHome)); socketStoreFactory.close(); diff --git a/test/unit/voldemort/client/ClientRegistryTest.java b/test/unit/voldemort/client/ClientRegistryTest.java index 87cb8b8512..c1b3e88ba6 100644 --- a/test/unit/voldemort/client/ClientRegistryTest.java +++ b/test/unit/voldemort/client/ClientRegistryTest.java @@ -16,6 +16,11 @@ package voldemort.client; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + import java.io.ByteArrayInputStream; import java.util.ArrayList; import java.util.Iterator; @@ -23,14 +28,11 @@ import java.util.Properties; import java.util.concurrent.TimeUnit; -import junit.framework.TestCase; - import org.junit.After; import org.junit.Before; import org.junit.Test; import voldemort.ServerTestUtils; -import voldemort.TestUtils; import voldemort.client.protocol.admin.AdminClient; import voldemort.cluster.Cluster; import voldemort.serialization.DefaultSerializerFactory; @@ -47,7 +49,7 @@ import com.google.common.collect.Lists; @SuppressWarnings({ "unchecked" }) -public class ClientRegistryTest extends TestCase { +public class ClientRegistryTest { public static final String SERVER_LOCAL_URL = "tcp://localhost:"; public static final String TEST_STORE_NAME = "test-store-eventual-1"; @@ -64,8 +66,7 @@ public class ClientRegistryTest extends TestCase { 32 * 1024); private static VoldemortServer[] servers = null; private static int[] serverPorts = null; - private Cluster cluster = ServerTestUtils.getLocalCluster(2, new int[][] { { 0, 1, 2, 3 }, - { 4, 5, 6, 7 } }); + private Cluster cluster = null; private static AdminClient adminClient; private SerializerFactory serializerFactory = new DefaultSerializerFactory(); @@ -76,22 +77,24 @@ public class ClientRegistryTest extends TestCase { @Before public void setUp() throws Exception { - if(null == servers) { + if(cluster == null) { servers = new VoldemortServer[TOTAL_SERVERS]; - serverPorts = new int[TOTAL_SERVERS]; + int partitionMap[][] = { { 0, 1, 2, 3 }, { 4, 5, 6, 7 } }; + cluster = ServerTestUtils.startVoldemortCluster(TOTAL_SERVERS, + servers, + partitionMap, + socketStoreFactory, + true, // useNio + null, + STORES_XML_FILE, + new 
Properties()); + + serverPorts = new int[TOTAL_SERVERS]; for(int i = 0; i < TOTAL_SERVERS; i++) { - servers[i] = ServerTestUtils.startVoldemortServer(socketStoreFactory, - ServerTestUtils.createServerConfig(true, - i, - TestUtils.createTempDir() - .getAbsolutePath(), - null, - STORES_XML_FILE, - new Properties()), - cluster); serverPorts[i] = servers[i].getIdentityNode().getSocketPort(); } + adminClient = ServerTestUtils.getAdminClient(cluster); } @@ -101,6 +104,9 @@ public void setUp() throws Exception { @After public void tearDown() throws Exception { this.clearRegistryContent(); + for(VoldemortServer server: servers) { + ServerTestUtils.stopVoldemortServer(server); + } } /* @@ -116,16 +122,17 @@ public void testHappyPath() { .setBootstrapUrls(SERVER_LOCAL_URL + serverPorts[0]) .setClientContextName(CLIENT_CONTEXT_NAME) + .enableDefaultClient(false) .setClientRegistryUpdateIntervalInSecs(CLIENT_REGISTRY_REFRESH_INTERVAL) .setEnableLazy(false); SocketStoreClientFactory socketFactory = new SocketStoreClientFactory(clientConfig); StoreClient client1 = socketFactory.getStoreClient(TEST_STORE_NAME); client1.put("k", "v"); - Iterator>> it = adminClient.fetchEntries(0, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + Iterator>> it = adminClient.bulkFetchOps.fetchEntries(0, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); ArrayList infoList = getClientRegistryContent(it); assertEquals(TEST_STORE_NAME, infoList.get(0).getStoreName()); assertEquals(CLIENT_CONTEXT_NAME, infoList.get(0).getContext()); @@ -136,11 +143,11 @@ public void testHappyPath() { assertNotNull("Client version is null", infoList.get(0).getReleaseVersion()); assertEquals(1, infoList.size()); - it = adminClient.fetchEntries(1, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + it = adminClient.bulkFetchOps.fetchEntries(1, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); infoList = getClientRegistryContent(it); assertEquals(TEST_STORE_NAME, infoList.get(0).getStoreName()); assertEquals(CLIENT_CONTEXT_NAME, infoList.get(0).getContext()); @@ -155,11 +162,11 @@ public void testHappyPath() { } catch(InterruptedException e) {} // now the periodical update has gone through, it shall be higher than // the bootstrap time - it = adminClient.fetchEntries(1, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + it = adminClient.bulkFetchOps.fetchEntries(1, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); infoList = getClientRegistryContent(it); assertTrue("Client registry not updated.", infoList.get(0).getBootstrapTime() < infoList.get(0).getUpdateTime()); @@ -182,6 +189,7 @@ public void testTwoClients() { .setBootstrapUrls(SERVER_LOCAL_URL + serverPorts[0]) .setClientContextName(CLIENT_CONTEXT_NAME) + .enableDefaultClient(false) .setClientRegistryUpdateIntervalInSecs(CLIENT_REGISTRY_REFRESH_INTERVAL) .setEnableLazy(false); SocketStoreClientFactory socketFactory = new SocketStoreClientFactory(clientConfig); @@ -191,11 +199,11 @@ public void testTwoClients() { client1.put("k1", "v1"); client2.put("k2", "v2"); - Iterator>> it = adminClient.fetchEntries(0, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + 
Iterator>> it = adminClient.bulkFetchOps.fetchEntries(0, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); ArrayList infoList = getClientRegistryContent(it); assertEquals(TEST_STORE_NAME, infoList.get(0).getStoreName()); assertEquals(CLIENT_CONTEXT_NAME, infoList.get(0).getContext()); @@ -214,11 +222,11 @@ public void testTwoClients() { assertNotNull("Client version is null", infoList.get(1).getReleaseVersion()); assertEquals(infoList.size(), 2); - it = adminClient.fetchEntries(1, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + it = adminClient.bulkFetchOps.fetchEntries(1, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); infoList = getClientRegistryContent(it); assertEquals(TEST_STORE_NAME, infoList.get(0).getStoreName()); assertEquals(CLIENT_CONTEXT_NAME, infoList.get(0).getContext()); @@ -243,11 +251,11 @@ public void testTwoClients() { } catch(InterruptedException e) {} // now the periodical update has gone through, it shall be higher than // the bootstrap time - it = adminClient.fetchEntries(1, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + it = adminClient.bulkFetchOps.fetchEntries(1, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); infoList = getClientRegistryContent(it); assertTrue("Client registry not updated.", infoList.get(0).getBootstrapTime() < infoList.get(0).getUpdateTime()); @@ -270,6 +278,7 @@ public void testTwoStores() { .setBootstrapUrls(SERVER_LOCAL_URL + serverPorts[0]) .setClientContextName(CLIENT_CONTEXT_NAME) + .enableDefaultClient(false) .setClientRegistryUpdateIntervalInSecs(CLIENT_REGISTRY_REFRESH_INTERVAL) .setEnableLazy(false); SocketStoreClientFactory socketFactory = new SocketStoreClientFactory(clientConfig); @@ -279,11 +288,11 @@ public void testTwoStores() { client1.put("k1", "v1"); client2.put("k2", "v2"); - Iterator>> it = adminClient.fetchEntries(0, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + Iterator>> it = adminClient.bulkFetchOps.fetchEntries(0, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); ArrayList infoList = getClientRegistryContent(it); assertEquals(CLIENT_CONTEXT_NAME, infoList.get(0).getContext()); @@ -311,11 +320,11 @@ public void testTwoStores() { infoList.get(0).getBootstrapTime() >= infoList.get(1).getBootstrapTime()); } - it = adminClient.fetchEntries(1, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + it = adminClient.bulkFetchOps.fetchEntries(1, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); infoList = getClientRegistryContent(it); assertEquals(CLIENT_CONTEXT_NAME, infoList.get(0).getContext()); @@ -348,11 +357,11 @@ public void testTwoStores() { } catch(InterruptedException e) {} // now the periodical update has gone through, it shall be higher than // the bootstrap time - it = adminClient.fetchEntries(1, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + it = adminClient.bulkFetchOps.fetchEntries(1, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, 
+ false); infoList = getClientRegistryContent(it); assertTrue("Client registry not updated.", infoList.get(0).getBootstrapTime() < infoList.get(0).getUpdateTime()); @@ -375,6 +384,7 @@ public void testTwoFactories() { .setBootstrapUrls(SERVER_LOCAL_URL + serverPorts[0]) .setClientContextName(CLIENT_CONTEXT_NAME) + .enableDefaultClient(false) .setClientRegistryUpdateIntervalInSecs(CLIENT_REGISTRY_REFRESH_INTERVAL) .setEnableLazy(false); SocketStoreClientFactory socketFactory1 = new SocketStoreClientFactory(clientConfig); @@ -385,6 +395,7 @@ public void testTwoFactories() { .setBootstrapUrls(SERVER_LOCAL_URL + serverPorts[0]) .setClientContextName(CLIENT_CONTEXT_NAME2) + .enableDefaultClient(false) .setClientRegistryUpdateIntervalInSecs(CLIENT_REGISTRY_REFRESH_INTERVAL) .setEnableLazy(false); SocketStoreClientFactory socketFactory2 = new SocketStoreClientFactory(clientConfig2); @@ -395,11 +406,11 @@ public void testTwoFactories() { client1.put("k1", "v1"); client2.put("k2", "v2"); - Iterator>> it = adminClient.fetchEntries(0, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + Iterator>> it = adminClient.bulkFetchOps.fetchEntries(0, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); ArrayList infoList = getClientRegistryContent(it); assertNotNull("Client version is null", infoList.get(0).getReleaseVersion()); @@ -437,11 +448,11 @@ public void testTwoFactories() { infoList.get(0).getBootstrapTime() >= infoList.get(1).getBootstrapTime()); } - it = adminClient.fetchEntries(1, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + it = adminClient.bulkFetchOps.fetchEntries(1, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); infoList = getClientRegistryContent(it); assertNotNull("Client version is null", infoList.get(0).getReleaseVersion()); @@ -484,11 +495,11 @@ public void testTwoFactories() { } catch(InterruptedException e) {} // now the periodical update has gone through, it shall be higher than // the bootstrap time - it = adminClient.fetchEntries(1, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + it = adminClient.bulkFetchOps.fetchEntries(1, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); infoList = getClientRegistryContent(it); assertTrue("Client registry not updated.", infoList.get(0).getBootstrapTime() < infoList.get(0).getUpdateTime()); @@ -514,6 +525,7 @@ public void testOneServerFailure() { .setBootstrapUrls(SERVER_LOCAL_URL + serverPorts[1]) .setClientContextName(CLIENT_CONTEXT_NAME) + .enableDefaultClient(false) .setClientRegistryUpdateIntervalInSecs(CLIENT_REGISTRY_REFRESH_INTERVAL) .setEnableLazy(false); SocketStoreClientFactory socketFactory1 = new SocketStoreClientFactory(clientConfig); @@ -524,6 +536,7 @@ public void testOneServerFailure() { .setBootstrapUrls(SERVER_LOCAL_URL + serverPorts[1]) .setClientContextName(CLIENT_CONTEXT_NAME2) + .enableDefaultClient(false) .setClientRegistryUpdateIntervalInSecs(CLIENT_REGISTRY_REFRESH_INTERVAL) .setEnableLazy(false); SocketStoreClientFactory socketFactory2 = new SocketStoreClientFactory(clientConfig2); @@ -534,11 +547,11 @@ public void testOneServerFailure() { client1.put("k1", "v1"); client2.put("k2", "v2"); - Iterator>> it = adminClient.fetchEntries(1, - 
SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + Iterator>> it = adminClient.bulkFetchOps.fetchEntries(1, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); ArrayList infoList = getClientRegistryContent(it); assertNotNull("Client version is null", infoList.get(0).getReleaseVersion()); @@ -581,11 +594,11 @@ public void testOneServerFailure() { } catch(InterruptedException e) {} // now the periodical update has gone through, it shall be higher than // the bootstrap time - it = adminClient.fetchEntries(1, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + it = adminClient.bulkFetchOps.fetchEntries(1, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); infoList = getClientRegistryContent(it); assertTrue("Client registry not updated.", infoList.get(0).getBootstrapTime() < infoList.get(0).getUpdateTime()); @@ -609,6 +622,7 @@ public void testRepeatRegistrationSameFactory() { .setBootstrapUrls(SERVER_LOCAL_URL + serverPorts[1]) .setClientContextName(CLIENT_CONTEXT_NAME) + .enableDefaultClient(false) .setClientRegistryUpdateIntervalInSecs(CLIENT_REGISTRY_REFRESH_INTERVAL) .setEnableLazy(false); SocketStoreClientFactory socketFactory1 = new SocketStoreClientFactory(clientConfig); @@ -619,6 +633,7 @@ public void testRepeatRegistrationSameFactory() { .setBootstrapUrls(SERVER_LOCAL_URL + serverPorts[1]) .setClientContextName(CLIENT_CONTEXT_NAME2) + .enableDefaultClient(false) .setClientRegistryUpdateIntervalInSecs(CLIENT_REGISTRY_REFRESH_INTERVAL) .setEnableLazy(false); SocketStoreClientFactory socketFactory2 = new SocketStoreClientFactory(clientConfig2); @@ -633,11 +648,11 @@ public void testRepeatRegistrationSameFactory() { } - Iterator>> it = adminClient.fetchEntries(1, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + Iterator>> it = adminClient.bulkFetchOps.fetchEntries(1, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); ArrayList infoList = getClientRegistryContent(it); assertEquals("Incrrect # of entries created in client registry", 6, infoList.size()); @@ -662,6 +677,7 @@ public void testRepeatRegistrationDifferentFactories() { .setBootstrapUrls(SERVER_LOCAL_URL + serverPorts[1]) .setClientContextName(CLIENT_CONTEXT_NAME) + .enableDefaultClient(false) .setClientRegistryUpdateIntervalInSecs(CLIENT_REGISTRY_REFRESH_INTERVAL) .setEnableLazy(false); SocketStoreClientFactory socketFactory1 = new SocketStoreClientFactory(clientConfig); @@ -672,6 +688,7 @@ public void testRepeatRegistrationDifferentFactories() { .setBootstrapUrls(SERVER_LOCAL_URL + serverPorts[1]) .setClientContextName(CLIENT_CONTEXT_NAME2) + .enableDefaultClient(false) .setClientRegistryUpdateIntervalInSecs(CLIENT_REGISTRY_REFRESH_INTERVAL) .setEnableLazy(false); SocketStoreClientFactory socketFactory2 = new SocketStoreClientFactory(clientConfig2); @@ -682,11 +699,11 @@ public void testRepeatRegistrationDifferentFactories() { client1.put("k1", "v1"); client2.put("k2", "v2"); - Iterator>> it = adminClient.fetchEntries(1, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + Iterator>> it = adminClient.bulkFetchOps.fetchEntries(1, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + 
emptyPartitionList, + null, + false); ArrayList infoList = getClientRegistryContent(it); assertEquals("Incrrect # of entries created in client registry", 2, infoList.size()); @@ -732,11 +749,11 @@ public void testRepeatRegistrationDifferentFactories() { // now the periodical update has gone through, it shall be higher // than // the bootstrap time - it = adminClient.fetchEntries(1, - SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), - emptyPartitionList, - null, - false); + it = adminClient.bulkFetchOps.fetchEntries(1, + SystemStoreConstants.SystemStoreName.voldsys$_client_registry.name(), + emptyPartitionList, + null, + false); infoList = getClientRegistryContent(it); assertTrue("Client registry not updated.", diff --git a/test/unit/voldemort/client/EndToEndRebootstrapTest.java b/test/unit/voldemort/client/EndToEndRebootstrapTest.java index a2ee510627..d33fbb2020 100644 --- a/test/unit/voldemort/client/EndToEndRebootstrapTest.java +++ b/test/unit/voldemort/client/EndToEndRebootstrapTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -30,7 +30,6 @@ import org.junit.Test; import voldemort.ServerTestUtils; -import voldemort.TestUtils; import voldemort.VoldemortAdminTool; import voldemort.VoldemortException; import voldemort.client.protocol.admin.AdminClient; @@ -73,26 +72,17 @@ public class EndToEndRebootstrapTest { @Before public void setUp() throws Exception { - cluster = ServerTestUtils.getLocalCluster(2, new int[][] { { 0, 1, 2, 3 }, { 4, 5, 6, 7 } }); - servers = new VoldemortServer[2]; - servers[0] = ServerTestUtils.startVoldemortServer(socketStoreFactory, - ServerTestUtils.createServerConfig(true, - 0, - TestUtils.createTempDir() - .getAbsolutePath(), - null, - storesXmlfile, - new Properties()), - cluster); - servers[1] = ServerTestUtils.startVoldemortServer(socketStoreFactory, - ServerTestUtils.createServerConfig(true, - 1, - TestUtils.createTempDir() - .getAbsolutePath(), - null, - storesXmlfile, - new Properties()), - cluster); + final int numServers = 2; + servers = new VoldemortServer[numServers]; + int partitionMap[][] = { { 0, 1, 2, 3 }, { 4, 5, 6, 7 } }; + cluster = ServerTestUtils.startVoldemortCluster(numServers, + servers, + partitionMap, + socketStoreFactory, + true, // useNio + null, + storesXmlfile, + new Properties()); socketUrl = servers[0].getIdentityNode().getSocketUrl().toString(); bootStrapUrls = new String[1]; @@ -128,8 +118,9 @@ public void setUp() throws Exception { @After public void tearDown() throws Exception { - servers[0].stop(); - servers[1].stop(); + for(VoldemortServer server: servers) { + ServerTestUtils.stopVoldemortServer(server); + } } /* @@ -152,6 +143,7 @@ public void testEndToEndRebootstrap() { String newBootstrapTime = ""; AdminClient adminClient = new AdminClient(bootStrapUrls[0], new AdminClientConfig(), + new ClientConfig(), CLIENT_ZONE_ID); try { diff --git a/test/unit/voldemort/client/SocketStoreClientFactoryMbeanTest.java b/test/unit/voldemort/client/SocketStoreClientFactoryMbeanTest.java index e785ee5641..7888aeed54 100644 --- a/test/unit/voldemort/client/SocketStoreClientFactoryMbeanTest.java +++ b/test/unit/voldemort/client/SocketStoreClientFactoryMbeanTest.java @@ -106,7 +106,7 @@ public void testMultipleDistinctClientsOnSingleFactory() { bootStrap(clients, 10); checkMbeanIdCount(CLIENT_DOMAIN, 
"ClientThreadPool*", 1, true); - checkMbeanIdCount(CLIENT_DOMAIN, "ZenStoreClient*", 2, true); + checkMbeanIdCount(CLIENT_DOMAIN, "*StoreClient*", 2, true); checkMbeanIdCount(CLUSTER_FAILUREDETECTOR_DOMAIN, "ThresholdFailureDetector*", 1, true); checkMbeanIdCount(PIPELINE_ROUTED_STATS_DOMAIN, "*", 2, true); checkMbeanIdCount(CLIENT_REQUEST_DOMAIN, "aggregated*", 1, true); @@ -135,7 +135,7 @@ public void testMultipleIndistinctClientsOnSingleFactory() { bootStrap(clients, 10); checkMbeanIdCount(CLIENT_DOMAIN, "ClientThreadPool*", 1, true); - checkMbeanIdCount(CLIENT_DOMAIN, "ZenStoreClient*", 2, true); + checkMbeanIdCount(CLIENT_DOMAIN, "*StoreClient*", 2, true); checkMbeanIdCount(CLUSTER_FAILUREDETECTOR_DOMAIN, "ThresholdFailureDetector*", 1, true); checkMbeanIdCount(PIPELINE_ROUTED_STATS_DOMAIN, "*", 2, true); checkMbeanIdCount(CLIENT_REQUEST_DOMAIN, "aggregated*", 1, true); @@ -163,7 +163,7 @@ public void testMultipleDistinctClientsOnMultipleFactories() { bootStrap(clients, 10); checkMbeanIdCount(CLIENT_DOMAIN, "ClientThreadPool*", 2, true); - checkMbeanIdCount(CLIENT_DOMAIN, "ZenStoreClient*", 2, true); + checkMbeanIdCount(CLIENT_DOMAIN, "*StoreClient*", 2, true); checkMbeanIdCount(CLUSTER_FAILUREDETECTOR_DOMAIN, "ThresholdFailureDetector*", 2, true); checkMbeanIdCount(PIPELINE_ROUTED_STATS_DOMAIN, "*", 2, true); checkMbeanIdCount(CLIENT_REQUEST_DOMAIN, "aggregated*", 2, true); @@ -198,7 +198,7 @@ public void testMultipleInDistinctClientsOnMultipleFactories() { bootStrap(clients, 10); checkMbeanIdCount(CLIENT_DOMAIN, "ClientThreadPool*", 2, true); - checkMbeanIdCount(CLIENT_DOMAIN, "ZenStoreClient*", 2, true); + checkMbeanIdCount(CLIENT_DOMAIN, "*StoreClient*", 2, true); checkMbeanIdCount(CLUSTER_FAILUREDETECTOR_DOMAIN, "ThresholdFailureDetector*", 2, true); checkMbeanIdCount(PIPELINE_ROUTED_STATS_DOMAIN, "*", 4, true); checkMbeanIdCount(CLIENT_REQUEST_DOMAIN, "aggregated*", 2, true); diff --git a/test/unit/voldemort/client/protocol/admin/QueryKeyResultTest.java b/test/unit/voldemort/client/protocol/admin/QueryKeyResultTest.java new file mode 100644 index 0000000000..cbfbe375a7 --- /dev/null +++ b/test/unit/voldemort/client/protocol/admin/QueryKeyResultTest.java @@ -0,0 +1,67 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ +package voldemort.client.protocol.admin; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Test; + +import voldemort.TestUtils; +import voldemort.utils.ByteArray; +import voldemort.versioning.Versioned; + +public class QueryKeyResultTest { + + @Test + public void testStandardCtor() { + ByteArray key = new ByteArray("key".getBytes()); + List> values = new ArrayList>(0); + + Versioned value1 = TestUtils.getVersioned(TestUtils.randomBytes(10), 1, 1, 1); + values.add(value1); + + Versioned value2 = TestUtils.getVersioned(TestUtils.randomBytes(10), 1, 1, 2); + values.add(value2); + + QueryKeyResult queryKeyResult = new QueryKeyResult(key, values); + + assertTrue(queryKeyResult.hasValues()); + assertEquals(values, queryKeyResult.getValues()); + + assertFalse(queryKeyResult.hasException()); + assertEquals(null, queryKeyResult.getException()); + } + + @Test + public void testExceptionCtor() { + ByteArray key = new ByteArray("key".getBytes()); + + Exception e = new Exception(); + QueryKeyResult queryKeyResult = new QueryKeyResult(key, e); + + assertFalse(queryKeyResult.hasValues()); + assertEquals(null, queryKeyResult.getValues()); + + assertTrue(queryKeyResult.hasException()); + assertEquals(e, queryKeyResult.getException()); + } + +} diff --git a/test/unit/voldemort/client/protocol/admin/StreamingClientTest.java b/test/unit/voldemort/client/protocol/admin/StreamingClientTest.java new file mode 100644 index 0000000000..6226cc1f1e --- /dev/null +++ b/test/unit/voldemort/client/protocol/admin/StreamingClientTest.java @@ -0,0 +1,246 @@ +package voldemort.client.protocol.admin; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Properties; +import java.util.concurrent.Callable; + +import org.apache.commons.io.FileUtils; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import voldemort.ServerTestUtils; +import voldemort.TestUtils; +import voldemort.client.RoutingTier; +import voldemort.cluster.Cluster; +import voldemort.cluster.Node; +import voldemort.routing.RoutingStrategy; +import voldemort.routing.RoutingStrategyFactory; +import voldemort.routing.RoutingStrategyType; +import voldemort.serialization.DefaultSerializerFactory; +import voldemort.serialization.Serializer; +import voldemort.serialization.SerializerDefinition; +import voldemort.serialization.SerializerFactory; +import voldemort.server.VoldemortServer; +import voldemort.store.StoreDefinition; +import voldemort.store.StoreDefinitionBuilder; +import voldemort.store.compress.CompressionStrategy; +import voldemort.store.compress.CompressionStrategyFactory; +import voldemort.store.memory.InMemoryStorageConfiguration; +import voldemort.store.socket.SocketStoreFactory; +import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; +import voldemort.utils.ByteArray; +import voldemort.utils.Props; +import voldemort.versioning.Versioned; +import voldemort.xml.StoreDefinitionsMapper; + +import com.google.common.collect.Lists; + +/* + * Starts a streaming session and inserts some keys Using fetchKeys we check if + * they keys made it to the responsible node + */ +public class StreamingClientTest { + + private static long startTime; + 
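+    // A minimal usage sketch of the streaming API this test exercises, mirroring the
+    // calls made further down (the two callables are the no-op checkpoint/recovery
+    // callbacks passed by the test itself):
+    //   Props props = new Props();
+    //   props.put("streaming.platform.bootstrapURL", SERVER_LOCAL_URL + serverPorts[0]);
+    //   StreamingClient streamer = new StreamingClient(new StreamingClientConfig(props));
+    //   streamer.initStreamingSession(TEST_STORE_NAME, checkpointCallable, recoveryCallable, true);
+    //   streamer.streamingPut(new ByteArray(key.getBytes()), Versioned.value(value.getBytes()));
+    //   streamer.commitToVoldemort();
+    //   streamer.closeStreamingSession();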
public static final String SERVER_LOCAL_URL = "tcp://localhost:"; + public static final String TEST_STORE_NAME = "test-store-streaming-1"; + public static final String STORES_XML_FILE = "test/common/voldemort/config/stores.xml"; + + public static final int TOTAL_SERVERS = 2; + + private static int NUM_KEYS_1 = 4000; + private static SocketStoreFactory socketStoreFactory = new ClientRequestExecutorPool(TOTAL_SERVERS, + 10000, + 100000, + 32 * 1024); + private static VoldemortServer[] servers = null; + private static int[] serverPorts = null; + private static Cluster cluster = ServerTestUtils.getLocalCluster(2, new int[][] { + { 0, 1, 2, 3 }, { 4, 5, 6, 7 } }); + private static AdminClient adminClient; + + private static SerializerFactory serializerFactory = new DefaultSerializerFactory(); + + private static StoreDefinition storeDef; + + @BeforeClass + public static void testSetup() { + + if(null == servers) { + servers = new VoldemortServer[TOTAL_SERVERS]; + serverPorts = new int[TOTAL_SERVERS]; + + storeDef = new StoreDefinitionBuilder().setName(TEST_STORE_NAME) + .setType(InMemoryStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) + .setReplicationFactor(2) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(2) + .setRequiredWrites(2) + .build(); + + File tempStoreXml = new File(TestUtils.createTempDir(), "stores.xml"); + try { + FileUtils.writeStringToFile(tempStoreXml, + new StoreDefinitionsMapper().writeStoreList(Lists.newArrayList(storeDef))); + } catch(IOException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } + + for(int i = 0; i < TOTAL_SERVERS; i++) { + try { + servers[i] = ServerTestUtils.startVoldemortServer(socketStoreFactory, + ServerTestUtils.createServerConfig(true, + i, + TestUtils.createTempDir() + .getAbsolutePath(), + null, + tempStoreXml.getAbsolutePath(), + new Properties()), + cluster); + } catch(IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + serverPorts[i] = servers[i].getIdentityNode().getSocketPort(); + } + adminClient = ServerTestUtils.getAdminClient(cluster); + } + + startTime = System.currentTimeMillis(); + + } + + @AfterClass + public static void testCleanup() { + // Teardown for data used by the unit tests + } + + @Test + public void testStreaming() { + + Props property = new Props(); + property.put("streaming.platform.bootstrapURL", SERVER_LOCAL_URL + serverPorts[0]); + StreamingClientConfig config = new StreamingClientConfig(property); + + StreamingClient streamer = new StreamingClient(config); + + streamer.initStreamingSession(TEST_STORE_NAME, new Callable() { + + @Override + public Object call() throws Exception { + + return null; + } + }, new Callable() { + + @Override + public Object call() throws Exception { + + return null; + } + }, true); + + for(int i = 0; i < NUM_KEYS_1; i++) { + String key = i + ""; + String value = key; + + Versioned outputValue = Versioned.value(value.getBytes()); + // adminClient.streamingPut(new ByteArray(key.getBytes()), + // outputValue); + streamer.streamingPut(new ByteArray(key.getBytes()), outputValue); + } + streamer.commitToVoldemort(); + streamer.closeStreamingSession(); + assertEquals(verifyKeysExist(), true); + + } + + /* + * Checks if each node has the keys it is reponsible for returns false + * otherwise + */ + public boolean verifyKeysExist() 
{ + RoutingStrategyFactory factory = new RoutingStrategyFactory(); + RoutingStrategy storeRoutingStrategy = factory.updateRoutingStrategy(storeDef, + adminClient.getAdminClientCluster()); + + HashMap> expectedNodeIdToKeys; + expectedNodeIdToKeys = new HashMap(); + Collection nodesInCluster = adminClient.getAdminClientCluster().getNodes(); + for(Node node: nodesInCluster) { + ArrayList keysForNode = new ArrayList(); + expectedNodeIdToKeys.put(node.getId(), keysForNode); + } + for(int i = 0; i < NUM_KEYS_1; i++) { + String key = i + ""; + String value = key; + List nodeList = storeRoutingStrategy.routeRequest(key.getBytes()); + for(Node node: nodeList) { + ArrayList keysForNode = expectedNodeIdToKeys.get(node.getId()); + keysForNode.add(key); + } + } + + ArrayList fetchedKeysForNode = new ArrayList(); + for(Node node: nodesInCluster) { + + List partitionIdList = Lists.newArrayList(); + partitionIdList.addAll(node.getPartitionIds()); + + Iterator keyIteratorRef = null; + keyIteratorRef = adminClient.bulkFetchOps.fetchKeys(node.getId(), + TEST_STORE_NAME, + partitionIdList, + null, + false); + + final SerializerDefinition serializerDef = storeDef.getKeySerializer(); + final SerializerFactory serializerFactory = new DefaultSerializerFactory(); + @SuppressWarnings("unchecked") + final Serializer serializer = (Serializer) serializerFactory.getSerializer(serializerDef); + + final CompressionStrategy keysCompressionStrategy; + if(serializerDef != null && serializerDef.hasCompression()) { + keysCompressionStrategy = new CompressionStrategyFactory().get(serializerDef.getCompression()); + } else { + keysCompressionStrategy = null; + } + final Iterator keyIterator = keyIteratorRef; + while(keyIterator.hasNext()) { + + byte[] keyBytes = keyIterator.next().get(); + try { + Object keyObject = serializer.toObject((null == keysCompressionStrategy) ? keyBytes + : keysCompressionStrategy.inflate(keyBytes)); + fetchedKeysForNode.add((String) keyObject); + + } catch(IOException e) { + + e.printStackTrace(); + } + } + + } + + ArrayList keysForNode = expectedNodeIdToKeys.get(0); + if(!fetchedKeysForNode.containsAll(keysForNode)) + return false; + else + return true; + } +} diff --git a/test/unit/voldemort/client/rebalance/AbstractRebalanceTest.java b/test/unit/voldemort/client/rebalance/AbstractRebalanceTest.java index 0d4d0aad3d..acf18a6800 100644 --- a/test/unit/voldemort/client/rebalance/AbstractRebalanceTest.java +++ b/test/unit/voldemort/client/rebalance/AbstractRebalanceTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2012 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -75,8 +75,10 @@ import voldemort.utils.ByteArray; import voldemort.utils.ByteUtils; import voldemort.utils.KeyLocationValidation; +import voldemort.utils.NodeUtils; import voldemort.utils.Pair; import voldemort.utils.RebalanceUtils; +import voldemort.utils.StoreInstance; import voldemort.utils.Utils; import voldemort.versioning.ObsoleteVersionException; import voldemort.versioning.VectorClock; @@ -89,7 +91,6 @@ public abstract class AbstractRebalanceTest { private static final Logger logger = Logger.getLogger(AbstractRebalanceTest.class.getName()); - protected static int NUM_KEYS = 20; protected static int NUM_RO_CHUNKS_PER_BUCKET = 10; protected static String testStoreNameRW = "test"; protected static String testStoreNameRW2 = "test2"; @@ -221,6 +222,7 @@ public void tearDown() { socketStoreFactory = null; } + // TODO: Any way to not throw exception from here? protected abstract Cluster startServers(Cluster cluster, String StoreDefXmlFile, List nodeToStart, @@ -263,9 +265,12 @@ public void checkConsistentMetadata(Cluster targetCluster, List serverL } } - protected int getNumKeys() { - return NUM_KEYS; - } + /** + * This method determines the "size" of the test to run... + * + * @return + */ + protected abstract int getNumKeys(); @Test(timeout = 600000) public void testRORWRebalance() throws Exception { @@ -504,11 +509,11 @@ public void testRebalanceCleanPrimary() throws Exception { movedPartitions.add(3); AdminClient admin = rebalanceClient.getAdminClient(); Iterator keys = null; - keys = admin.fetchKeys(1, - rwStoreDefWithReplication.getName(), - movedPartitions, - null, - false); + keys = admin.bulkFetchOps.fetchKeys(1, + rwStoreDefWithReplication.getName(), + movedPartitions, + null, + false); int keyIndex = 0; while(keys.hasNext() && keyIndex < 20) { checkKeysNegative[keyIndex++] = keys.next(); @@ -517,11 +522,11 @@ public void testRebalanceCleanPrimary() throws Exception { List stablePartitions = new ArrayList(); stablePartitions.add(1); Iterator keys2 = null; - keys2 = admin.fetchKeys(1, - rwStoreDefWithReplication.getName(), - stablePartitions, - null, - false); + keys2 = admin.bulkFetchOps.fetchKeys(1, + rwStoreDefWithReplication.getName(), + stablePartitions, + null, + false); int keyIndex2 = 0; while(keys2.hasNext() && keyIndex2 < 20) { checkKeysPositive[keyIndex2++] = keys2.next(); @@ -537,7 +542,7 @@ public void testRebalanceCleanPrimary() throws Exception { // Do the cleanup operation for(int i = 0; i < 3; i++) { - admin.repairJob(i); + admin.storeMntOps.repairJob(i); } boolean cleanNode = true; @@ -606,11 +611,11 @@ public void testRebalanceCleanSecondary() throws Exception { movedPartitions.add(3); AdminClient admin = rebalanceClient.getAdminClient(); Iterator keys = null; - keys = admin.fetchKeys(1, - rwStoreDefWithReplication.getName(), - movedPartitions, - null, - false); + keys = admin.bulkFetchOps.fetchKeys(1, + rwStoreDefWithReplication.getName(), + movedPartitions, + null, + false); int keyIndex = 0; while(keys.hasNext() && keyIndex < 20) { checkKeysNegative[keyIndex++] = keys.next(); @@ -620,11 +625,11 @@ public void testRebalanceCleanSecondary() throws Exception { List stablePartitions = new ArrayList(); stablePartitions.add(3); Iterator keys2 = null; - keys2 = admin.fetchKeys(0, - rwStoreDefWithReplication.getName(), - stablePartitions, - null, - false); + keys2 = admin.bulkFetchOps.fetchKeys(0, + rwStoreDefWithReplication.getName(), + stablePartitions, + null, + false); int keyIndex2 = 0; while(keys2.hasNext() && keyIndex2 < 20) { 
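                // Sample up to 20 keys from the stable partition on node 0; unlike the
                // moved-partition keys collected above, these are expected to survive the
                // repair job run later in this test.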
checkKeysPositive[keyIndex2++] = keys2.next(); @@ -640,7 +645,7 @@ public void testRebalanceCleanSecondary() throws Exception { // Do the cleanup operation for(int i = 0; i < 3; i++) { - admin.repairJob(i); + admin.storeMntOps.repairJob(i); } boolean cleanNode = true; @@ -1103,7 +1108,7 @@ protected void populateData(Cluster cluster, cluster); for(Entry entry: testEntries.entrySet()) { ByteArray keyBytes = new ByteArray(ByteUtils.getBytes(entry.getKey(), "UTF-8")); - List preferenceNodes = RebalanceUtils.getNodeIds(routing.routeRequest(keyBytes.get())); + List preferenceNodes = NodeUtils.getNodeIds(routing.routeRequest(keyBytes.get())); // Go over every node for(int nodeId: preferenceNodes) { @@ -1210,7 +1215,7 @@ private void checkGetEntries(Node node, List partitions = routing.getPartitionList(keyBytes.get()); - if(RebalanceUtils.checkKeyBelongsToPartition(partitions, + if(StoreInstance.checkKeyBelongsToPartition(partitions, node.getPartitionIds(), flattenedPresentTuples)) { List> values = store.get(keyBytes, null); diff --git a/test/unit/voldemort/client/rebalance/AdminRebalanceTest.java b/test/unit/voldemort/client/rebalance/AdminRebalanceTest.java index 685a763e9a..2f97eb0e4e 100644 --- a/test/unit/voldemort/client/rebalance/AdminRebalanceTest.java +++ b/test/unit/voldemort/client/rebalance/AdminRebalanceTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2012 LinkedIn, Inc + * Copyright 2012-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -72,6 +72,7 @@ import voldemort.store.socket.SocketStoreFactory; import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; import voldemort.utils.ByteArray; +import voldemort.utils.NodeUtils; import voldemort.utils.Pair; import voldemort.utils.RebalanceUtils; import voldemort.utils.Utils; @@ -329,7 +330,7 @@ private VoldemortServer getServer(int nodeId) { public void shutDown() throws IOException { if(adminClient != null) - adminClient.stop(); + adminClient.close(); for(VoldemortServer server: servers) { if(server != null) ServerTestUtils.stopVoldemortServer(server); @@ -389,7 +390,7 @@ public void testRebalanceNodeRW() throws IOException { } try { - adminClient.rebalanceNode(plans.get(0)); + adminClient.rebalanceOps.rebalanceNode(plans.get(0)); fail("Should have thrown an exception since not in rebalancing state"); } catch(VoldemortException e) {} @@ -401,7 +402,7 @@ public void testRebalanceNodeRW() throws IOException { } try { - adminClient.rebalanceNode(plans.get(0)); + adminClient.rebalanceOps.rebalanceNode(plans.get(0)); fail("Should have thrown an exception since no steal info"); } catch(VoldemortException e) { @@ -409,9 +410,10 @@ public void testRebalanceNodeRW() throws IOException { // Put a plan different from the plan that we actually want to // execute + int incorrectStealerId = (plans.get(0).getStealerId() + 1) % 3; getServer(plans.get(0).getStealerId()).getMetadataStore() .put(MetadataStore.REBALANCING_STEAL_INFO, - new RebalancerState(Lists.newArrayList(new RebalancePartitionsInfo(100, + new RebalancerState(Lists.newArrayList(new RebalancePartitionsInfo(incorrectStealerId, plans.get(0) .getDonorId(), plans.get(0) @@ -423,7 +425,7 @@ public void testRebalanceNodeRW() throws IOException { 0)))); try { - adminClient.rebalanceNode(plans.get(0)); + adminClient.rebalanceOps.rebalanceNode(plans.get(0)); fail("Should have thrown an exception since the two plans eventhough have the same donor are 
different"); } catch(VoldemortException e) { @@ -444,20 +446,20 @@ public void testRebalanceNodeRW() throws IOException { // Actually run it try { for(RebalancePartitionsInfo currentPlan: plans) { - int asyncId = adminClient.rebalanceNode(currentPlan); + int asyncId = adminClient.rebalanceOps.rebalanceNode(currentPlan); // Try submitting the same job again, should throw // AlreadyRebalancingException try { - adminClient.rebalanceNode(currentPlan); + adminClient.rebalanceOps.rebalanceNode(currentPlan); fail("Should have thrown an exception since it is already rebalancing"); } catch(AlreadyRebalancingException e) {} assertNotSame("Got a valid rebalanceAsyncId", -1, asyncId); - getAdminClient().waitForCompletion(currentPlan.getStealerId(), - asyncId, - 300, - TimeUnit.SECONDS); + getAdminClient().rpcOps.waitForCompletion(currentPlan.getStealerId(), + asyncId, + 300, + TimeUnit.SECONDS); // Test that plan has been removed from the list assertFalse(getServer(currentPlan.getStealerId()).getMetadataStore() @@ -579,12 +581,12 @@ public void testRebalanceNodeRW2() throws IOException { // Actually run it try { for(RebalancePartitionsInfo currentPlan: plans) { - int asyncId = adminClient.rebalanceNode(currentPlan); + int asyncId = adminClient.rebalanceOps.rebalanceNode(currentPlan); assertNotSame("Got a valid rebalanceAsyncId", -1, asyncId); - getAdminClient().waitForCompletion(currentPlan.getStealerId(), - asyncId, - 300, - TimeUnit.SECONDS); + getAdminClient().rpcOps.waitForCompletion(currentPlan.getStealerId(), + asyncId, + 300, + TimeUnit.SECONDS); // Test that plan has been removed from the list assertFalse(getServer(currentPlan.getStealerId()).getMetadataStore() @@ -748,12 +750,12 @@ public void testRebalanceNodeRO() throws IOException { // Actually run it try { for(RebalancePartitionsInfo currentPlan: plans) { - int asyncId = adminClient.rebalanceNode(currentPlan); + int asyncId = adminClient.rebalanceOps.rebalanceNode(currentPlan); assertNotSame("Got a valid rebalanceAsyncId", -1, asyncId); - getAdminClient().waitForCompletion(currentPlan.getStealerId(), - asyncId, - 300, - TimeUnit.SECONDS); + getAdminClient().rpcOps.waitForCompletion(currentPlan.getStealerId(), + asyncId, + 300, + TimeUnit.SECONDS); // Test that plan has been removed from the list assertFalse(getServer(currentPlan.getStealerId()).getMetadataStore() @@ -823,14 +825,14 @@ public void testRebalanceNodeRO() throws IOException { .build())); try { - adminClient.rebalanceStateChange(cluster, - targetCluster, - plans, - true, - true, - false, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(cluster, + targetCluster, + plans, + true, + true, + false, + true, + true); fail("Should have thrown an exception since one node doesn't have the store"); } catch(VoldemortException e) {} @@ -842,14 +844,14 @@ public void testRebalanceNodeRO() throws IOException { checkRO(cluster); // Test 2) All passes scenario - adminClient.rebalanceStateChange(cluster, - targetCluster, - plans, - true, - true, - false, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(cluster, + targetCluster, + plans, + true, + true, + false, + true, + true); checkRO(targetCluster); @@ -866,11 +868,11 @@ public void testRebalanceNodeRO() throws IOException { // Actually run it try { - int asyncId = adminClient.rebalanceNode(plans.get(0)); - getAdminClient().waitForCompletion(plans.get(0).getStealerId(), - asyncId, - 300, - TimeUnit.SECONDS); + int asyncId = adminClient.rebalanceOps.rebalanceNode(plans.get(0)); + 
getAdminClient().rpcOps.waitForCompletion(plans.get(0).getStealerId(), + asyncId, + 300, + TimeUnit.SECONDS); fail("Should throw an exception"); } catch(Exception e) {} } finally { @@ -902,12 +904,12 @@ public void testRebalanceNodeRORW() throws IOException, InterruptedException { // Actually run it try { for(RebalancePartitionsInfo currentPlan: plans) { - int asyncId = adminClient.rebalanceNode(currentPlan); + int asyncId = adminClient.rebalanceOps.rebalanceNode(currentPlan); assertNotSame("Got a valid rebalanceAsyncId", -1, asyncId); - getAdminClient().waitForCompletion(currentPlan.getStealerId(), - asyncId, - 300, - TimeUnit.SECONDS); + getAdminClient().rpcOps.waitForCompletion(currentPlan.getStealerId(), + asyncId, + 300, + TimeUnit.SECONDS); // Test that plan has been removed from the list assertFalse(getServer(currentPlan.getStealerId()).getMetadataStore() @@ -934,14 +936,14 @@ public void testRebalanceNodeRORW() throws IOException, InterruptedException { 0)); try { - adminClient.rebalanceStateChange(cluster, - targetCluster, - plans, - true, - true, - true, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(cluster, + targetCluster, + plans, + true, + true, + true, + true, + true); fail("Should have thrown an exception since we added state before hand"); } catch(VoldemortRebalancingException e) {} @@ -982,14 +984,14 @@ public void testRebalanceNodeRORW() throws IOException, InterruptedException { .build())); try { - adminClient.rebalanceStateChange(cluster, - targetCluster, - plans, - true, - true, - true, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(cluster, + targetCluster, + plans, + true, + true, + true, + true, + true); fail("Should have thrown an exception since we added state before hand"); } catch(VoldemortRebalancingException e) {} @@ -1015,14 +1017,14 @@ public void testRebalanceNodeRORW() throws IOException, InterruptedException { storeDef4)); // Test 3) Everything should work - adminClient.rebalanceStateChange(cluster, - targetCluster, - plans, - true, - true, - true, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(cluster, + targetCluster, + plans, + true, + true, + true, + true, + true); List nodesChecked = Lists.newArrayList(); for(RebalancePartitionsInfo plan: plans) { @@ -1035,7 +1037,7 @@ public void testRebalanceNodeRORW() throws IOException, InterruptedException { targetCluster); } - List allNodes = Lists.newArrayList(RebalanceUtils.getNodeIds(Lists.newArrayList(cluster.getNodes()))); + List allNodes = Lists.newArrayList(NodeUtils.getNodeIds(Lists.newArrayList(cluster.getNodes()))); allNodes.removeAll(nodesChecked); // Check all other nodes @@ -1086,14 +1088,14 @@ public void testRebalanceStateChange() throws IOException { startFourNodeRW(); // Test 1) Normal case where-in all are up - adminClient.rebalanceStateChange(cluster, - targetCluster, - plans, - false, - false, - true, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(cluster, + targetCluster, + plans, + false, + false, + true, + true, + true); List nodesChecked = Lists.newArrayList(); for(RebalancePartitionsInfo plan: plans) { @@ -1102,7 +1104,7 @@ public void testRebalanceStateChange() throws IOException { new RebalancerState(Lists.newArrayList(plan))); } - List allNodes = Lists.newArrayList(RebalanceUtils.getNodeIds(Lists.newArrayList(cluster.getNodes()))); + List allNodes = Lists.newArrayList(NodeUtils.getNodeIds(Lists.newArrayList(cluster.getNodes()))); allNodes.removeAll(nodesChecked); // Check all other nodes @@ -1126,14 +1128,14 @@ 
public void testRebalanceStateChange() throws IOException { 0)); try { - adminClient.rebalanceStateChange(cluster, - targetCluster, - plans, - false, - false, - true, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(cluster, + targetCluster, + plans, + false, + false, + true, + true, + true); fail("Should have thrown an exception since we added state before hand"); } catch(VoldemortRebalancingException e) {} @@ -1154,14 +1156,14 @@ public void testRebalanceStateChange() throws IOException { servers[3] = null; try { - adminClient.rebalanceStateChange(cluster, - targetCluster, - plans, - false, - false, - true, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(cluster, + targetCluster, + plans, + false, + false, + true, + true, + true); fail("Should have thrown an exception since we added state before hand"); } catch(VoldemortRebalancingException e) {} @@ -1185,14 +1187,14 @@ public void testClusterAndRebalanceStateChange() throws IOException { startFourNodeRW(); // Test 1) Normal case where-in all are up - adminClient.rebalanceStateChange(cluster, - targetCluster, - plans, - false, - true, - true, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(cluster, + targetCluster, + plans, + false, + true, + true, + true, + true); List nodesChecked = Lists.newArrayList(); for(RebalancePartitionsInfo plan: plans) { @@ -1203,7 +1205,7 @@ public void testClusterAndRebalanceStateChange() throws IOException { targetCluster); } - List allNodes = Lists.newArrayList(RebalanceUtils.getNodeIds(Lists.newArrayList(cluster.getNodes()))); + List allNodes = Lists.newArrayList(NodeUtils.getNodeIds(Lists.newArrayList(cluster.getNodes()))); allNodes.removeAll(nodesChecked); // Check all other nodes @@ -1228,14 +1230,14 @@ public void testClusterAndRebalanceStateChange() throws IOException { 0)); try { - adminClient.rebalanceStateChange(cluster, - targetCluster, - plans, - false, - true, - true, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(cluster, + targetCluster, + plans, + false, + true, + true, + true, + true); fail("Should have thrown an exception since we added state before hand"); } catch(VoldemortRebalancingException e) {} @@ -1257,14 +1259,14 @@ public void testClusterAndRebalanceStateChange() throws IOException { servers[3] = null; try { - adminClient.rebalanceStateChange(cluster, - targetCluster, - plans, - false, - true, - true, - true, - true); + adminClient.rebalanceOps.rebalanceStateChange(cluster, + targetCluster, + plans, + false, + true, + true, + true, + true); fail("Should have thrown an exception since we added state before hand"); } catch(VoldemortRebalancingException e) {} @@ -1309,7 +1311,9 @@ private void buildROStore(StoreDefinition storeDef, int numChunks) throws IOExce generateROFiles(numChunks, 1200, 1000, tuples, tempDir); // Build for store one - adminClient.swapStore(entry.getKey(), storeDef.getName(), tempDir.getAbsolutePath()); + adminClient.readonlyOps.swapStore(entry.getKey(), + storeDef.getName(), + tempDir.getAbsolutePath()); } } diff --git a/test/unit/voldemort/client/rebalance/RebalanceTest.java b/test/unit/voldemort/client/rebalance/RebalanceTest.java index 9f0f06123c..9b4c445ea6 100644 --- a/test/unit/voldemort/client/rebalance/RebalanceTest.java +++ b/test/unit/voldemort/client/rebalance/RebalanceTest.java @@ -46,13 +46,16 @@ @RunWith(Parameterized.class) public class RebalanceTest extends AbstractRebalanceTest { - Map serverMap = new HashMap(); + private final int NUM_KEYS = 20; + + Map serverMap; private 
final boolean useNio; private final boolean useDonorBased; public RebalanceTest(boolean useNio, boolean useDonorBased) { this.useNio = useNio; this.useDonorBased = useDonorBased; + this.serverMap = new HashMap(); } @Parameters @@ -81,8 +84,16 @@ protected Cluster getCurrentCluster(int nodeId) { } } - // This method may be susceptible to BindException issues due to TOCTOU - // problem with getLocalCluster. + @Override + protected int getNumKeys() { + return NUM_KEYS; + } + + // This method is susceptible to BindException issues due to TOCTOU + // problem with getLocalCluster (which is used to construct cluster that is + // passed in). + // TODO: Refactor AbstractRebalanceTest to take advantage of + // ServerTestUtils.startVoldemortCluster. @Override protected Cluster startServers(Cluster cluster, String storeXmlFile, diff --git a/test/unit/voldemort/cluster/failuredetector/ThresholdFailureDetectorTest.java b/test/unit/voldemort/cluster/failuredetector/ThresholdFailureDetectorTest.java index 25aa19ebc5..7a3461f417 100644 --- a/test/unit/voldemort/cluster/failuredetector/ThresholdFailureDetectorTest.java +++ b/test/unit/voldemort/cluster/failuredetector/ThresholdFailureDetectorTest.java @@ -24,7 +24,6 @@ import static voldemort.cluster.failuredetector.FailureDetectorUtils.create; import static voldemort.cluster.failuredetector.MutableStoreVerifier.create; -import java.net.ConnectException; import java.net.NoRouteToHostException; import java.net.UnknownHostException; @@ -63,13 +62,6 @@ protected Time createTime() throws Exception { public void testCatastrophicErrors() throws Exception { Node node = Iterables.get(cluster.getNodes(), 8); - failureDetector.recordException(node, - 0, - new UnreachableStoreException("intentionalerror", - new ConnectException("intentionalerror"))); - assertEquals(false, failureDetector.isAvailable(node)); - failureDetector.waitForAvailability(node); - failureDetector.recordException(node, 0, new UnreachableStoreException("intentionalerror", @@ -177,7 +169,7 @@ public void testChangeMetadata() throws Exception { failureDetector.recordException(node, 0, new UnreachableStoreException("intentionalerror", - new ConnectException("intentionalerror"))); + new UnknownHostException("intentionalerror"))); /** * Update the failure detector state with the new cluster diff --git a/test/unit/voldemort/scheduled/DataCleanupJobTest.java b/test/unit/voldemort/scheduled/DataCleanupJobTest.java index 60dd9a4346..bc4f88aa0b 100644 --- a/test/unit/voldemort/scheduled/DataCleanupJobTest.java +++ b/test/unit/voldemort/scheduled/DataCleanupJobTest.java @@ -16,16 +16,32 @@ package voldemort.scheduled; +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertTrue; +import static junit.framework.Assert.fail; + import java.io.File; +import java.io.StringReader; +import java.util.Arrays; +import java.util.Calendar; +import java.util.Collection; import java.util.Date; +import java.util.GregorianCalendar; import java.util.List; - -import junit.framework.TestCase; +import java.util.Map; +import java.util.Random; import org.apache.commons.io.FileDeleteStrategy; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; import voldemort.MockTime; import voldemort.TestUtils; +import voldemort.VoldemortTestConstants; import voldemort.common.service.SchedulerService; import voldemort.server.VoldemortConfig; import 
voldemort.server.scheduler.DataCleanupJob; @@ -33,22 +49,37 @@ import voldemort.store.StorageEngine; import voldemort.store.StoreDefinition; import voldemort.store.bdb.BdbStorageConfiguration; +import voldemort.store.retention.RetentionEnforcingStore; import voldemort.utils.ByteArray; import voldemort.utils.EventThrottler; import voldemort.utils.Props; import voldemort.utils.SystemTime; import voldemort.utils.Time; +import voldemort.utils.Utils; import voldemort.versioning.VectorClock; import voldemort.versioning.Versioned; +import voldemort.xml.StoreDefinitionsMapper; -public class DataCleanupJobTest extends TestCase { +@RunWith(Parameterized.class) +public class DataCleanupJobTest { private MockTime time; private StorageEngine engine; private File storeDir; private BdbStorageConfiguration bdbStorage; + private boolean prefixPartitionId; + + public DataCleanupJobTest(boolean prefixPartitionId) { + this.prefixPartitionId = prefixPartitionId; + } - @Override + @Parameters + public static Collection modes() { + Object[][] data = new Object[][] { { true }, { false } }; + return Arrays.asList(data); + } + + @Before public void setUp() throws Exception { time = new MockTime(); storeDir = TestUtils.createTempDir(); @@ -62,15 +93,16 @@ public void setUp() throws Exception { voldemortConfig.setBdbCacheSize(1024 * 1024); voldemortConfig.setBdbOneEnvPerStore(true); voldemortConfig.setBdbDataDirectory(storeDir.toURI().getPath()); + voldemortConfig.setBdbPrefixKeysWithPartitionId(prefixPartitionId); bdbStorage = new BdbStorageConfiguration(voldemortConfig); StoreDefinition defA = TestUtils.makeStoreDefinition("cleanupTestStore"); - engine = bdbStorage.getStore(defA); + engine = bdbStorage.getStore(defA, TestUtils.makeSingleNodeRoutingStrategy()); } - @Override - protected void tearDown() throws Exception { - super.tearDown(); + @After + public void tearDown() throws Exception { + try { if(engine != null) engine.close(); @@ -81,6 +113,7 @@ protected void tearDown() throws Exception { } } + @Test public void testCleanupFrequency() { SchedulerService scheduler = new SchedulerService(1, time); @@ -145,6 +178,7 @@ public void testCleanupFrequency() { } } + @Test public void testCleanupCleansUp() { time.setTime(123); put("a", "b", "c"); @@ -166,6 +200,127 @@ public void testCleanupCleansUp() { assertContains("a", "d", "e", "f"); } + public void testCleanupStartTime() { + // Make sure the default is always the next day. 
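+        // (Assumption: Utils.getDayOfTheWeekFromNow(1) returns tomorrow's day-of-week,
+        // normalized modulo 7, which is what the assertion below compares against.)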
+ GregorianCalendar cal = new GregorianCalendar(); + assertEquals("Default is not tomorrow", + Utils.getDayOfTheWeekFromNow(1), + (cal.get(Calendar.DAY_OF_WEEK) + 1) % 7); + + // When starting the server any day in the week from SUN to FRI and + // targeting a saturday, should always start on the next saturday + GregorianCalendar expectedStart = TestUtils.getCalendar(2012, + Calendar.SEPTEMBER, + 29, + 0, + 0, + 0); + Random rand = new Random(); + for(int day = Calendar.SUNDAY; day <= Calendar.FRIDAY; day++) { + GregorianCalendar serverStartTime = TestUtils.getCalendar(2012, + Calendar.SEPTEMBER, + 22 + day, + rand.nextInt(24), + rand.nextInt(60), + rand.nextInt(60)); + GregorianCalendar computedStart = Utils.getCalendarForNextRun(serverStartTime, + Calendar.SATURDAY, + 0); + assertEquals("Expected :" + expectedStart.getTimeInMillis() + " Computed: " + + computedStart.getTimeInMillis(), + expectedStart.getTimeInMillis(), + computedStart.getTimeInMillis()); + } + + // Targeting saturday, 00:00 and starting on a friday 23:59:59 should + // start the next saturday + GregorianCalendar serverStartTime = TestUtils.getCalendar(2012, + Calendar.SEPTEMBER, + 28, + 23, + 59, + 59); + GregorianCalendar computedStart = Utils.getCalendarForNextRun(serverStartTime, + Calendar.SATURDAY, + 0); + assertEquals("Expected :" + expectedStart.getTimeInMillis() + " Computed: " + + computedStart.getTimeInMillis(), + expectedStart.getTimeInMillis(), + computedStart.getTimeInMillis()); + + // If we start past the start hour on the target day, it should start + // the next week + serverStartTime = TestUtils.getCalendar(2012, Calendar.SEPTEMBER, 29, 1, 0, 1); + computedStart = Utils.getCalendarForNextRun(serverStartTime, Calendar.SATURDAY, 0); + assertEquals(Calendar.SATURDAY, computedStart.get(Calendar.DAY_OF_WEEK)); + assertEquals(serverStartTime.get(Calendar.DAY_OF_YEAR) + 7, + computedStart.get(Calendar.DAY_OF_YEAR)); + } + + private void runRetentionEnforcingStoreTest(boolean onlineDeletes) throws InterruptedException { + + time.setTime(System.currentTimeMillis()); + StoreDefinition retentionStoreDef = new StoreDefinitionsMapper().readStoreList(new StringReader(VoldemortTestConstants.getStoreDefinitionsWithRetentionXml())) + .get(0); + RetentionEnforcingStore store = new RetentionEnforcingStore(engine, + retentionStoreDef, + onlineDeletes, + time); + // do a bunch of puts + store.put(new ByteArray("k1".getBytes()), new Versioned("v1".getBytes()), null); + store.put(new ByteArray("k2".getBytes()), new Versioned("v2".getBytes()), null); + long writeMs = System.currentTimeMillis(); + + // wait for a bit and then do more puts + Thread.sleep(2000); + + store.put(new ByteArray("k3".getBytes()), new Versioned("v3".getBytes()), null); + store.put(new ByteArray("k4".getBytes()), new Versioned("v4".getBytes()), null); + + // move time forward just enough such that some keys will have expired. 
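+        // k1/k2 were written at writeMs, so setting the mock clock one millisecond past
+        // their retention window expires them; k3/k4 were written ~2 seconds later and
+        // should therefore still be within retention when the gets below run.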
+ time.setTime(writeMs + retentionStoreDef.getRetentionDays() * Time.MS_PER_DAY + 1); + assertEquals("k1 should have expired", 0, store.get(new ByteArray("k1".getBytes()), null) + .size()); + assertEquals("k2 should have expired", 0, store.get(new ByteArray("k2".getBytes()), null) + .size()); + + assertTrue("k3 should not have expired", store.get(new ByteArray("k3".getBytes()), null) + .size() > 0); + assertTrue("k4 should not have expired", store.get(new ByteArray("k4".getBytes()), null) + .size() > 0); + // get all with k1, k4 should return a map with k4 alone + Map>> getAllResult = store.getAll(Arrays.asList(new ByteArray("k1".getBytes()), + new ByteArray("k4".getBytes())), + null); + assertEquals("map should contain one element only", 1, getAllResult.size()); + assertEquals("k1 should not be present", + false, + getAllResult.containsKey(new ByteArray("k1".getBytes()))); + assertEquals("k4 should be present", + true, + getAllResult.containsKey(new ByteArray("k4".getBytes()))); + + // if online deletes are not configured, we should see the deleted keys + // in the base bdb store, so the datacleanup job can go and delete them + assertEquals("k1 should be present", + !onlineDeletes, + engine.get(new ByteArray("k1".getBytes()), null).size() > 0); + assertEquals("k2 should be present", + !onlineDeletes, + engine.get(new ByteArray("k2".getBytes()), null).size() > 0); + + // delete everything for next run + engine.truncate(); + } + + public void testRetentionEnforcingStore() throws InterruptedException { + runRetentionEnforcingStoreTest(false); + } + + public void testRetentionEnforcingStoreOnlineDeletes() throws InterruptedException { + runRetentionEnforcingStoreTest(true); + } + private void put(String... items) { for(String item: items) { VectorClock clock = null; diff --git a/test/unit/voldemort/scheduled/StreamingSlopPusherTest.java b/test/unit/voldemort/scheduled/StreamingSlopPusherTest.java index e44dddb6d5..ab5e1075a6 100644 --- a/test/unit/voldemort/scheduled/StreamingSlopPusherTest.java +++ b/test/unit/voldemort/scheduled/StreamingSlopPusherTest.java @@ -93,9 +93,12 @@ public void setUp() throws Exception { } } - // This method may be susceptible to BindException issues due to TOCTOU - // problem with getLocalCluster. - private void startServers(int... nodeIds) { + // This method is susceptible to BindException issues due to TOCTOU + // problem with getLocalCluster. It is not obvious how to change this set of + // unit tests to better protect against BindException risk since subsets of + // servers are started and stopped. And, since there are multiple + // invocations of startServers within one test in some cases. + private void startServers(int... nodeIds) throws IOException { for(int nodeId: nodeIds) { if(nodeId < NUM_SERVERS) { servers[nodeId] = ServerTestUtils.startVoldemortServer(socketStoreFactory, diff --git a/test/unit/voldemort/server/gossip/GossiperTest.java b/test/unit/voldemort/server/gossip/GossiperTest.java index 041190223a..cb4336e943 100644 --- a/test/unit/voldemort/server/gossip/GossiperTest.java +++ b/test/unit/voldemort/server/gossip/GossiperTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2012 LinkedIn, Inc + * Copyright 2012-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -38,6 +38,7 @@ import voldemort.Attempt; import voldemort.ServerTestUtils; import voldemort.TestUtils; +import voldemort.client.ClientConfig; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; import voldemort.cluster.Cluster; @@ -149,7 +150,7 @@ public void tearDown() { } private AdminClient getAdminClient(Cluster newCluster) { - return new AdminClient(newCluster, new AdminClientConfig()); + return new AdminClient(newCluster, new AdminClientConfig(), new ClientConfig()); } private Cluster attemptStartAdditionalServer() throws IOException { @@ -214,8 +215,8 @@ public void testGossiper() throws Exception { // Get the new cluster.xml AdminClient localAdminClient = getAdminClient(newCluster); - Versioned versionedClusterXML = localAdminClient.getRemoteMetadata(3, - MetadataStore.CLUSTER_KEY); + Versioned versionedClusterXML = localAdminClient.metadataMgmtOps.getRemoteMetadata(3, + MetadataStore.CLUSTER_KEY); // Increment the version, let what would be the "donor node" know about // it to seed the Gossip. @@ -223,8 +224,12 @@ public void testGossiper() throws Exception { ((VectorClock) version).incrementVersion(3, ((VectorClock) version).getTimestamp() + 1); ((VectorClock) version).incrementVersion(0, ((VectorClock) version).getTimestamp() + 1); - localAdminClient.updateRemoteMetadata(0, MetadataStore.CLUSTER_KEY, versionedClusterXML); - localAdminClient.updateRemoteMetadata(3, MetadataStore.CLUSTER_KEY, versionedClusterXML); + localAdminClient.metadataMgmtOps.updateRemoteMetadata(0, + MetadataStore.CLUSTER_KEY, + versionedClusterXML); + localAdminClient.metadataMgmtOps.updateRemoteMetadata(3, + MetadataStore.CLUSTER_KEY, + versionedClusterXML); try { Thread.sleep(500); diff --git a/test/unit/voldemort/server/socket/ClientRequestExecutorPoolTest.java b/test/unit/voldemort/server/socket/ClientRequestExecutorPoolTest.java index 9807b94f9a..cde7882cbb 100644 --- a/test/unit/voldemort/server/socket/ClientRequestExecutorPoolTest.java +++ b/test/unit/voldemort/server/socket/ClientRequestExecutorPoolTest.java @@ -16,13 +16,14 @@ package voldemort.server.socket; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; -import junit.framework.TestCase; - import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -45,7 +46,7 @@ * */ @RunWith(Parameterized.class) -public class ClientRequestExecutorPoolTest extends TestCase { +public class ClientRequestExecutorPoolTest { private int port; private int maxConnectionsPerNode = 3; @@ -64,7 +65,6 @@ public static Collection configs() { return Arrays.asList(new Object[][] { { true }, { false } }); } - @Override @Before public void setUp() { this.port = ServerTestUtils.findFreePort(); @@ -88,7 +88,6 @@ public void setUp() { this.server.start(); } - @Override @After public void tearDown() { this.pool.close(); diff --git a/test/unit/voldemort/store/bdb/BdbCachePartitioningTest.java b/test/unit/voldemort/store/bdb/BdbCachePartitioningTest.java index 5f4fa0d110..e4d56b37bb 100644 --- a/test/unit/voldemort/store/bdb/BdbCachePartitioningTest.java +++ b/test/unit/voldemort/store/bdb/BdbCachePartitioningTest.java @@ -16,11 +16,21 @@ package voldemort.store.bdb; -import java.io.File; +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertTrue; +import static junit.framework.Assert.fail; 
-import junit.framework.TestCase; +import java.io.File; +import java.util.Arrays; +import java.util.Collection; import org.apache.commons.io.FileDeleteStrategy; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; import voldemort.TestUtils; import voldemort.server.VoldemortConfig; @@ -38,27 +48,32 @@ * checks that BDB cache partitioning works and caches stay within limits * */ -public class BdbCachePartitioningTest extends TestCase { +@RunWith(Parameterized.class) +public class BdbCachePartitioningTest { private File bdbMasterDir; private BdbStorageConfiguration bdbStorage; + private boolean prefixPartitionId; + + public BdbCachePartitioningTest(boolean prefixPartitionId) { + this.prefixPartitionId = prefixPartitionId; + } - @Override - protected void setUp() throws Exception { - super.setUp(); + @Parameters + public static Collection modes() { + Object[][] data = new Object[][] { { true }, { false } }; + return Arrays.asList(data); + } + + @Before + public void setUp() throws Exception { bdbMasterDir = TestUtils.createTempDir(); FileDeleteStrategy.FORCE.delete(bdbMasterDir); } - @Override - protected void tearDown() throws Exception { - super.tearDown(); - try { - if(bdbStorage != null) - bdbStorage.close(); - } finally { - FileDeleteStrategy.FORCE.delete(bdbMasterDir); - } + @After + public void tearDown() throws Exception { + FileDeleteStrategy.FORCE.delete(bdbMasterDir); } private EnvironmentStats getStats(Environment environment) { @@ -81,6 +96,7 @@ private long getCacheSize(StoreDefinition storeDef) { * conditions), stores will stay within their limits, no matter how much * disproportinate traffic you throw at it */ + @Test public void testStaticPrivateCaches() { int totalCache = 20 * ByteUtils.BYTES_PER_MB; // total cache size @@ -89,96 +105,113 @@ public void testStaticPrivateCaches() { int shareC = totalCache - shareA - shareB; // the rest, 5 MB int numRecords = 40; - // lets use all the default values. - Props props = new Props(); - props.put("node.id", 1); - props.put("voldemort.home", "test/common/voldemort/config"); - VoldemortConfig voldemortConfig = new VoldemortConfig(props); - voldemortConfig.setBdbCacheSize(totalCache); - voldemortConfig.setBdbOneEnvPerStore(true); - voldemortConfig.setBdbDataDirectory(bdbMasterDir.toURI().getPath()); - - bdbStorage = new BdbStorageConfiguration(voldemortConfig); - StoreDefinition defA = TestUtils.makeStoreDefinition("storeA", shareA - / (ByteUtils.BYTES_PER_MB)); - BdbStorageEngine storeA = (BdbStorageEngine) bdbStorage.getStore(defA); - - StoreDefinition defB = TestUtils.makeStoreDefinition("storeB", shareB - / (ByteUtils.BYTES_PER_MB)); - BdbStorageEngine storeB = (BdbStorageEngine) bdbStorage.getStore(defB); - - StoreDefinition defC = TestUtils.makeStoreDefinition("storeC"); - BdbStorageEngine storeC = (BdbStorageEngine) bdbStorage.getStore(defC); - - // before any traffic, the cache will not have grown - assertTrue(Math.abs(shareA - getCacheSize(defA)) > ByteUtils.BYTES_PER_MB); - assertTrue(Math.abs(shareB - getCacheSize(defB)) > ByteUtils.BYTES_PER_MB); - - // sharedCacheSize reading 0 confirms that the store has a private cache - assertEquals(0, getStats(bdbStorage.getEnvironment(defA)).getSharedCacheTotalBytes()); - assertEquals(0, getStats(bdbStorage.getEnvironment(defB)).getSharedCacheTotalBytes()); - - // load data into the stores; each store is guaranteed to be ~ 40MB. 
- // Data won't fit in memory - byte[] value = new byte[ByteUtils.BYTES_PER_MB]; - for(int i = 0; i < numRecords; i++) { - storeA.put(TestUtils.toByteArray("testKey" + i), new Versioned(value), null); - storeB.put(TestUtils.toByteArray("testKey" + i), new Versioned(value), null); - storeC.put(TestUtils.toByteArray("testKey" + i), new Versioned(value), null); - } - - // we will bring all of that data into the cache, by doing a keywalk. - // This should expand the cache as much as possible - long cacheSizeA = Long.MIN_VALUE; - long cacheSizeB = Long.MIN_VALUE; - long cacheSizeC = Long.MIN_VALUE; - - for(int cycle = 0; cycle < 10; cycle++) { + BdbStorageEngine storeA = null, storeB = null, storeC = null; + try { + // lets use all the default values. + Props props = new Props(); + props.put("node.id", 1); + props.put("voldemort.home", "test/common/voldemort/config"); + VoldemortConfig voldemortConfig = new VoldemortConfig(props); + voldemortConfig.setBdbCacheSize(totalCache); + voldemortConfig.setBdbOneEnvPerStore(true); + voldemortConfig.setBdbDataDirectory(bdbMasterDir.toURI().getPath()); + voldemortConfig.setBdbPrefixKeysWithPartitionId(prefixPartitionId); + + bdbStorage = new BdbStorageConfiguration(voldemortConfig); + StoreDefinition defA = TestUtils.makeStoreDefinition("storeA", + shareA / (ByteUtils.BYTES_PER_MB)); + storeA = (BdbStorageEngine) bdbStorage.getStore(defA, + TestUtils.makeSingleNodeRoutingStrategy()); + + StoreDefinition defB = TestUtils.makeStoreDefinition("storeB", + shareB / (ByteUtils.BYTES_PER_MB)); + storeB = (BdbStorageEngine) bdbStorage.getStore(defB, + TestUtils.makeSingleNodeRoutingStrategy()); + + StoreDefinition defC = TestUtils.makeStoreDefinition("storeC"); + storeC = (BdbStorageEngine) bdbStorage.getStore(defC, + TestUtils.makeSingleNodeRoutingStrategy()); + + // before any traffic, the cache will not have grown + assertTrue("Store A grew without traffic", + Math.abs(shareA - getCacheSize(defA)) > ByteUtils.BYTES_PER_MB); + assertTrue("Store B grew without traffic", + Math.abs(shareB - getCacheSize(defB)) > ByteUtils.BYTES_PER_MB); + + // sharedCacheSize reading 0 confirms that the store has a private + // cache + assertEquals("Store A has non zero shared cache", + 0, + getStats(bdbStorage.getEnvironment(defA)).getSharedCacheTotalBytes()); + assertEquals("Store B has non zero shared cache", + 0, + getStats(bdbStorage.getEnvironment(defB)).getSharedCacheTotalBytes()); + + // load data into the stores; each store is guaranteed to be ~ 40MB. + // Data won't fit in memory + byte[] value = new byte[ByteUtils.BYTES_PER_MB]; for(int i = 0; i < numRecords; i++) { - long cycleCacheSizeA = getAndCheckCacheSize(storeA, defA, "testKey" + i); - long cycleCacheSizeB = getAndCheckCacheSize(storeB, defB, "testKey" + i); - long cycleCacheSizeC = getAndCheckCacheSize(storeC, defC, "testKey" + i); - // record the maximum cache size, each store every grew to - cacheSizeA = (cycleCacheSizeA > cacheSizeA) ? cycleCacheSizeA : cacheSizeA; - cacheSizeB = (cycleCacheSizeB > cacheSizeB) ? cycleCacheSizeB : cacheSizeB; - cacheSizeC = (cycleCacheSizeC > cacheSizeC) ? cycleCacheSizeC : cacheSizeC; + storeA.put(TestUtils.toByteArray("testKey" + i), new Versioned(value), null); + storeB.put(TestUtils.toByteArray("testKey" + i), new Versioned(value), null); + storeC.put(TestUtils.toByteArray("testKey" + i), new Versioned(value), null); } - } - - // check that they are certainly less than expected limits. 
- assertTrue(cacheSizeA <= shareA); - assertTrue(cacheSizeB <= shareB); - assertTrue(cacheSizeC <= shareC); - // check that they are not exceedingly high than their limits. Small - // overflows are okay. But should not be more than a 1MB - assertTrue(Math.abs(cacheSizeA - shareA) <= ByteUtils.BYTES_PER_MB); - assertTrue(Math.abs(cacheSizeB - shareB) <= ByteUtils.BYTES_PER_MB); - assertTrue(Math.abs(cacheSizeC - shareC) <= ByteUtils.BYTES_PER_MB); - - // try doing reads on store C alone, for which we have no reservations. - // This simulates a spike on one store - long cacheSizeCNow = Long.MIN_VALUE; - for(int cycle = 0; cycle < 10; cycle++) { - for(int i = 0; i < numRecords; i++) { - long cycleCacheSizeCNow = getAndCheckCacheSize(storeC, defC, "testkey" + i); - // record the maximum cache size, each store grew to - cacheSizeCNow = (cycleCacheSizeCNow > cacheSizeCNow) ? cycleCacheSizeCNow - : cacheSizeCNow; + // we will bring all of that data into the cache, by doing a + // keywalk. + // This should expand the cache as much as possible + long cacheSizeA = Long.MIN_VALUE; + long cacheSizeB = Long.MIN_VALUE; + long cacheSizeC = Long.MIN_VALUE; + + for(int cycle = 0; cycle < 10; cycle++) { + for(int i = 0; i < numRecords; i++) { + long cycleCacheSizeA = getAndCheckCacheSize(storeA, defA, "testKey" + i); + long cycleCacheSizeB = getAndCheckCacheSize(storeB, defB, "testKey" + i); + long cycleCacheSizeC = getAndCheckCacheSize(storeC, defC, "testKey" + i); + // record the maximum cache size, each store ever grew to + cacheSizeA = (cycleCacheSizeA > cacheSizeA) ? cycleCacheSizeA : cacheSizeA; + cacheSizeB = (cycleCacheSizeB > cacheSizeB) ? cycleCacheSizeB : cacheSizeB; + cacheSizeC = (cycleCacheSizeC > cacheSizeC) ? cycleCacheSizeC : cacheSizeC; + } } - } - assertTrue(cacheSizeCNow <= shareC); + // check that they are certainly less than expected limits. Small + // overflows are okay. But should not be more than 1MB + assertTrue("Store A not within limits", cacheSizeA <= (shareA + ByteUtils.BYTES_PER_MB)); + assertTrue("Store B not within limits", cacheSizeB <= (shareB + ByteUtils.BYTES_PER_MB)); + assertTrue("Store C not within limits", cacheSizeC <= (shareC + ByteUtils.BYTES_PER_MB)); + + // try doing reads on store C alone, for which we have no + // reservations. + // This simulates a spike on one store + long cacheSizeCNow = Long.MIN_VALUE; + for(int cycle = 0; cycle < 10; cycle++) { + for(int i = 0; i < numRecords; i++) { + long cycleCacheSizeCNow = getAndCheckCacheSize(storeC, defC, "testkey" + i); + // record the maximum cache size, each store grew to + cacheSizeCNow = (cycleCacheSizeCNow > cacheSizeCNow) ?
cycleCacheSizeCNow + : cacheSizeCNow; + } + } - storeA.close(); - storeB.close(); - storeC.close(); + assertTrue("Store C not within limits after spike", + cacheSizeCNow <= (shareC + ByteUtils.BYTES_PER_MB)); + } finally { + if(storeA != null) + storeA.close(); + if(storeB != null) + storeB.close(); + if(storeC != null) + storeC.close(); + bdbStorage.close(); + } } /** * Tests that any reservation that would violate the minimum shared cache * will fail, during server startup and dynamic updates */ + @Test public void testMinimumSharedCache() { int totalCache = 20 * ByteUtils.BYTES_PER_MB; // total cache size int shareA = 10 * ByteUtils.BYTES_PER_MB;// A reserves 10MB @@ -192,33 +225,40 @@ public void testMinimumSharedCache() { voldemortConfig.setBdbOneEnvPerStore(true); voldemortConfig.setBdbDataDirectory(bdbMasterDir.toURI().getPath()); voldemortConfig.setBdbMinimumSharedCache(15 * ByteUtils.BYTES_PER_MB); + voldemortConfig.setBdbPrefixKeysWithPartitionId(prefixPartitionId); BdbStorageEngine storeA = null; bdbStorage = new BdbStorageConfiguration(voldemortConfig); - assertEquals(0, bdbStorage.getReservedCacheSize()); + assertEquals("Reserved cache size not zero", 0, bdbStorage.getReservedCacheSize()); try { StoreDefinition defA = TestUtils.makeStoreDefinition("storeA", shareA / ByteUtils.BYTES_PER_MB); - storeA = (BdbStorageEngine) bdbStorage.getStore(defA); + storeA = (BdbStorageEngine) bdbStorage.getStore(defA, + TestUtils.makeSingleNodeRoutingStrategy()); fail("Should have thrown exception since minSharedCache will be violated"); } catch(StorageInitializationException sie) { // should come here. } // failing operations should not alter reserved cache size - assertEquals(0, bdbStorage.getReservedCacheSize()); + assertEquals("failure somehow altered the reservedCacheSize", + 0, + bdbStorage.getReservedCacheSize()); voldemortConfig.setBdbMinimumSharedCache(10 * ByteUtils.BYTES_PER_MB); bdbStorage = new BdbStorageConfiguration(voldemortConfig); try { StoreDefinition defA = TestUtils.makeStoreDefinition("storeA", shareA / ByteUtils.BYTES_PER_MB); - storeA = (BdbStorageEngine) bdbStorage.getStore(defA); + storeA = (BdbStorageEngine) bdbStorage.getStore(defA, + TestUtils.makeSingleNodeRoutingStrategy()); } catch(StorageInitializationException sie) { // should not come here. fail("minSharedCache shouldn't have been violated"); } - assertEquals(shareA, bdbStorage.getReservedCacheSize()); + assertEquals("store A's share does not match up with reserved cache size", + shareA, + bdbStorage.getReservedCacheSize()); long reserveCacheSize = bdbStorage.getReservedCacheSize(); // now, try increasing the reservation dynamically and it should fail @@ -229,13 +269,16 @@ public void testMinimumSharedCache() { } catch(StorageInitializationException sie) { // should come here.
} - // this failure cannot alter the reservedCacheSize - assertEquals(reserveCacheSize, bdbStorage.getReservedCacheSize()); + assertEquals("failure somehow altered the reservedCacheSize", + reserveCacheSize, + bdbStorage.getReservedCacheSize()); if(storeA != null) storeA.close(); + bdbStorage.close(); } + @Test public void testDynamicReservations() { int totalCache = 20 * ByteUtils.BYTES_PER_MB; // total cache size int shareA = 10 * ByteUtils.BYTES_PER_MB;// A reserves 10MB @@ -251,13 +294,16 @@ public void testDynamicReservations() { voldemortConfig.setBdbOneEnvPerStore(true); voldemortConfig.setBdbDataDirectory(bdbMasterDir.toURI().getPath()); voldemortConfig.setBdbMinimumSharedCache(5 * ByteUtils.BYTES_PER_MB); + voldemortConfig.setBdbPrefixKeysWithPartitionId(prefixPartitionId); bdbStorage = new BdbStorageConfiguration(voldemortConfig); StoreDefinition defA = TestUtils.makeStoreDefinition("storeA", shareA / (1024 * 1024)); - BdbStorageEngine storeA = (BdbStorageEngine) bdbStorage.getStore(defA); + BdbStorageEngine storeA = (BdbStorageEngine) bdbStorage.getStore(defA, + TestUtils.makeSingleNodeRoutingStrategy()); StoreDefinition defB = TestUtils.makeStoreDefinition("storeB"); - BdbStorageEngine storeB = (BdbStorageEngine) bdbStorage.getStore(defB); + BdbStorageEngine storeB = (BdbStorageEngine) bdbStorage.getStore(defB, + TestUtils.makeSingleNodeRoutingStrategy()); // load data into the stores; each store is guaranteed to be ~ 40MB. // Data won't fit in memory @@ -281,8 +327,8 @@ public void testDynamicReservations() { } } - assertTrue(Math.abs(cacheSizeA - shareA) <= ByteUtils.BYTES_PER_MB); - assertTrue(Math.abs(cacheSizeB - shareB) <= ByteUtils.BYTES_PER_MB); + assertTrue("Store A not within limits ", cacheSizeA <= (shareA + ByteUtils.BYTES_PER_MB)); + assertTrue("Store B not within limits", cacheSizeB <= (shareB + ByteUtils.BYTES_PER_MB)); // 2. dynamically grow the cache to 15MB and watch B shrink. shareA = 15 * ByteUtils.BYTES_PER_MB; @@ -303,8 +349,8 @@ public void testDynamicReservations() { } } - assertTrue(Math.abs(cacheSizeA - shareA) <= ByteUtils.BYTES_PER_MB); - assertTrue(Math.abs(cacheSizeB - shareB) <= ByteUtils.BYTES_PER_MB); + assertTrue("Store A not within limits ", cacheSizeA <= (shareA + ByteUtils.BYTES_PER_MB)); + assertTrue("Store B not within limits ", cacheSizeB <= (shareB + ByteUtils.BYTES_PER_MB)); // 3. dynamically shrink it back to 10MB and watch B expand again. shareA = 10 * ByteUtils.BYTES_PER_MB; @@ -327,11 +373,12 @@ public void testDynamicReservations() { // check that they are not exceedingly high than their limits. Small // overflows are expected. But should not be more than a 1MB - assertTrue(Math.abs(cacheSizeA - shareA) <= ByteUtils.BYTES_PER_MB); - assertTrue(Math.abs(cacheSizeB - shareB) <= ByteUtils.BYTES_PER_MB); + assertTrue("Store A not within limits ", cacheSizeA <= (shareA + ByteUtils.BYTES_PER_MB)); + assertTrue("Store B not within limits ", cacheSizeB <= (shareB + ByteUtils.BYTES_PER_MB)); storeA.close(); storeB.close(); + bdbStorage.close(); } } diff --git a/test/unit/voldemort/store/bdb/BdbPartitionListIteratorTest.java b/test/unit/voldemort/store/bdb/BdbPartitionListIteratorTest.java new file mode 100644 index 0000000000..98c8e84b56 --- /dev/null +++ b/test/unit/voldemort/store/bdb/BdbPartitionListIteratorTest.java @@ -0,0 +1,172 @@ +/* + * Copyright 2008-2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.store.bdb; + +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.fail; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.NoSuchElementException; +import java.util.Set; + +import org.apache.commons.io.FileDeleteStrategy; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import voldemort.TestUtils; +import voldemort.routing.RoutingStrategy; +import voldemort.server.VoldemortConfig; +import voldemort.store.PartitionListIterator; +import voldemort.store.StoreDefinition; +import voldemort.utils.ByteArray; +import voldemort.utils.Props; +import voldemort.versioning.Versioned; + +/** + * Tests the PartitionListIterator used in pidscan based rebalancing + * + */ +public class BdbPartitionListIteratorTest { + + private File bdbMasterDir; + private BdbStorageConfiguration bdbStorage; + private BdbStorageEngine store; + private RoutingStrategy strategy; + private HashMap> partitionEntries; + + @Before + public void setUp() throws Exception { + bdbMasterDir = TestUtils.createTempDir(); + FileDeleteStrategy.FORCE.delete(bdbMasterDir); + + // lets use all the default values. + Props props = new Props(); + props.put("node.id", 1); + props.put("voldemort.home", "test/common/voldemort/config"); + VoldemortConfig voldemortConfig = new VoldemortConfig(props); + voldemortConfig.setBdbCacheSize(10 * 1024 * 1024); + voldemortConfig.setBdbOneEnvPerStore(true); + voldemortConfig.setBdbDataDirectory(bdbMasterDir.toURI().getPath()); + voldemortConfig.setBdbPrefixKeysWithPartitionId(true); + bdbStorage = new BdbStorageConfiguration(voldemortConfig); + StoreDefinition defA = TestUtils.makeStoreDefinition("storeA"); + store = (BdbStorageEngine) bdbStorage.getStore(defA, + (strategy = TestUtils.makeSingleNodeRoutingStrategy())); + + // load some data for non odd partitions, and note down how much data we + // have for each partition. 
+ partitionEntries = new HashMap>(); + int numEntries = 0; + while(numEntries++ < 10000) { + String key = "entry_" + numEntries; + int p = strategy.getMasterPartition(key.getBytes()); + // omit odd partitions + if(p % 2 == 1) + continue; + + if(!partitionEntries.containsKey(p)) + partitionEntries.put(p, new HashSet()); + + store.put(new ByteArray(key.getBytes()), new Versioned(key.getBytes()), null); + partitionEntries.get(p).add(key); + } + } + + @After + public void tearDown() throws Exception { + store.close(); + bdbStorage.close(); + FileDeleteStrategy.FORCE.delete(bdbMasterDir); + } + + @Test + public void testEmptyPartitionList() { + + PartitionListIterator plistItr = new PartitionListIterator(store, new ArrayList()); + assertEquals("Empty list cannot have a next element", false, plistItr.hasNext()); + try { + plistItr.next(); + fail("Should have thrown an exception for next()"); + } catch(NoSuchElementException ne) { + + } finally { + plistItr.close(); + } + } + + @Test + public void testEmptyPartition() { + + PartitionListIterator plistItr = new PartitionListIterator(store, Arrays.asList(1)); + assertEquals("No data loaded for odd partitions, so hasNext() should be false", + false, + plistItr.hasNext()); + try { + plistItr.next(); + fail("Should have thrown an exception for next()"); + } catch(NoSuchElementException ne) { + + } finally { + plistItr.close(); + } + } + + @Test + public void testSingletonPartitionList() { + PartitionListIterator plistItr = new PartitionListIterator(store, Arrays.asList(4)); + Set pentries = new HashSet(); + while(plistItr.hasNext()) { + pentries.add(new String(plistItr.next().getFirst().get())); + } + plistItr.close(); + assertEquals(partitionEntries.get(4), pentries); + } + + @Test + public void testPartitionListWithEmptyPartitions() { + PartitionListIterator plistItr = new PartitionListIterator(store, Arrays.asList(2, + 3, + 4, + 5, + 6)); + HashMap> retrievedPartitionEntries = new HashMap>(); + while(plistItr.hasNext()) { + String key = new String(plistItr.next().getFirst().get()); + int p = strategy.getMasterPartition(key.getBytes()); + + if(!retrievedPartitionEntries.containsKey(p)) + retrievedPartitionEntries.put(p, new HashSet()); + retrievedPartitionEntries.get(p).add(key); + } + plistItr.close(); + + // should only have retrieved entries for even partitions + assertEquals(3, retrievedPartitionEntries.size()); + for(Integer p: Arrays.asList(2, 3, 4, 5, 6)) { + if(p % 2 == 0) { + assertEquals(partitionEntries.get(p), retrievedPartitionEntries.get(p)); + } else { + assertEquals(false, retrievedPartitionEntries.containsKey(p)); + } + } + } +} diff --git a/test/unit/voldemort/store/bdb/BdbSplitStorageEngineTest.java b/test/unit/voldemort/store/bdb/BdbSplitStorageEngineTest.java index 3a68734397..cfc863a728 100644 --- a/test/unit/voldemort/store/bdb/BdbSplitStorageEngineTest.java +++ b/test/unit/voldemort/store/bdb/BdbSplitStorageEngineTest.java @@ -16,11 +16,22 @@ package voldemort.store.bdb; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotSame; + import java.io.File; +import java.util.Arrays; +import java.util.Collection; -import junit.framework.TestCase; +import junit.framework.Assert; import org.apache.commons.io.FileDeleteStrategy; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; import voldemort.TestUtils; import voldemort.server.VoldemortConfig; @@ 
-37,29 +48,38 @@ import com.sleepycat.je.StatsConfig; /** - * checks that + * checks BDB runtime behavior relating to operating multiple environments * * */ -public class BdbSplitStorageEngineTest extends TestCase { +@RunWith(Parameterized.class) +public class BdbSplitStorageEngineTest { private File bdbMasterDir; private BdbStorageConfiguration bdbStorage; private static long CACHE_SIZE = (long) Math.min(Runtime.getRuntime().maxMemory() * 0.30, 32 * 1000 * 1000); + private boolean prefixPartitionId; + + public BdbSplitStorageEngineTest(boolean prefixPartitionId) { + this.prefixPartitionId = prefixPartitionId; + } - @Override - protected void setUp() throws Exception { - super.setUp(); + @Parameters + public static Collection modes() { + Object[][] data = new Object[][] { { true }, { false } }; + return Arrays.asList(data); + } + @Before + public void setUp() throws Exception { bdbMasterDir = TestUtils.createTempDir(); FileDeleteStrategy.FORCE.delete(bdbMasterDir); } - @Override - protected void tearDown() throws Exception { - super.tearDown(); + @After + public void tearDown() throws Exception { try { if(bdbStorage != null) bdbStorage.close(); @@ -68,6 +88,7 @@ protected void tearDown() throws Exception { } } + @Test public void testNoMultipleEnvironment() { // lets use all the default values. Props props = new Props(); @@ -77,10 +98,13 @@ public void testNoMultipleEnvironment() { voldemortConfig.setBdbCacheSize(1 * 1024 * 1024); voldemortConfig.setBdbDataDirectory(bdbMasterDir.toURI().getPath()); voldemortConfig.setBdbOneEnvPerStore(false); + voldemortConfig.setBdbPrefixKeysWithPartitionId(prefixPartitionId); bdbStorage = new BdbStorageConfiguration(voldemortConfig); - BdbStorageEngine storeA = (BdbStorageEngine) bdbStorage.getStore(TestUtils.makeStoreDefinition("storeA")); - BdbStorageEngine storeB = (BdbStorageEngine) bdbStorage.getStore(TestUtils.makeStoreDefinition("storeB")); + BdbStorageEngine storeA = (BdbStorageEngine) bdbStorage.getStore(TestUtils.makeStoreDefinition("storeA"), + TestUtils.makeSingleNodeRoutingStrategy()); + BdbStorageEngine storeB = (BdbStorageEngine) bdbStorage.getStore(TestUtils.makeStoreDefinition("storeB"), + TestUtils.makeSingleNodeRoutingStrategy()); storeA.put(TestUtils.toByteArray("testKey1"), new Versioned("value".getBytes()), @@ -105,7 +129,7 @@ public void testNoMultipleEnvironment() { storeA.close(); storeB.close(); - assertEquals("common BDB file should exists.", true, (bdbMasterDir.exists())); + Assert.assertEquals("common BDB file should exists.", true, (bdbMasterDir.exists())); assertNotSame("StoreA BDB file should not exists.", true, (new File(bdbMasterDir + "/" + "storeA").exists())); @@ -113,6 +137,7 @@ public void testNoMultipleEnvironment() { + "storeB").exists())); } + @Test public void testMultipleEnvironment() { // lets use all the default values. 
Props props = new Props(); @@ -122,10 +147,13 @@ public void testMultipleEnvironment() { voldemortConfig.setBdbCacheSize(1 * 1024 * 1024); voldemortConfig.setBdbOneEnvPerStore(true); voldemortConfig.setBdbDataDirectory(bdbMasterDir.toURI().getPath()); + voldemortConfig.setBdbPrefixKeysWithPartitionId(prefixPartitionId); bdbStorage = new BdbStorageConfiguration(voldemortConfig); - BdbStorageEngine storeA = (BdbStorageEngine) bdbStorage.getStore(TestUtils.makeStoreDefinition("storeA")); - BdbStorageEngine storeB = (BdbStorageEngine) bdbStorage.getStore(TestUtils.makeStoreDefinition("storeB")); + BdbStorageEngine storeA = (BdbStorageEngine) bdbStorage.getStore(TestUtils.makeStoreDefinition("storeA"), + TestUtils.makeSingleNodeRoutingStrategy()); + BdbStorageEngine storeB = (BdbStorageEngine) bdbStorage.getStore(TestUtils.makeStoreDefinition("storeB"), + TestUtils.makeSingleNodeRoutingStrategy()); storeA.put(TestUtils.toByteArray("testKey1"), new Versioned("value".getBytes()), @@ -156,6 +184,7 @@ public void testMultipleEnvironment() { + "storeB").exists())); } + @Test public void testUnsharedCache() throws DatabaseException { EnvironmentConfig environmentConfig = new EnvironmentConfig(); environmentConfig = new EnvironmentConfig(); @@ -176,6 +205,7 @@ public void testUnsharedCache() throws DatabaseException { assertEquals("MaxCacheSize < 2 * CACHE_SIZE", true, maxCacheSize < 2 * CACHE_SIZE); } + @Test public void testSharedCache() throws DatabaseException { EnvironmentConfig environmentConfig = new EnvironmentConfig(); environmentConfig.setDurability(Durability.COMMIT_NO_SYNC); @@ -201,10 +231,11 @@ private long getMaxCacheUsage(EnvironmentConfig environmentConfig, DatabaseConfi } Environment environmentA = new Environment(dirA, environmentConfig); Database databaseA = environmentA.openDatabase(null, "storeA", databaseConfig); - BdbStorageEngine storeA = new BdbStorageEngine("storeA", - environmentA, - databaseA, - new BdbRuntimeConfig()); + BdbStorageEngine storeA = BdbStorageEngineTest.makeBdbStorageEngine("storeA", + environmentA, + databaseA, + new BdbRuntimeConfig(), + this.prefixPartitionId); File dirB = new File(bdbMasterDir + "/" + "storeB"); if(!dirB.exists()) { @@ -212,10 +243,11 @@ private long getMaxCacheUsage(EnvironmentConfig environmentConfig, DatabaseConfi } Environment environmentB = new Environment(dirB, environmentConfig); Database databaseB = environmentB.openDatabase(null, "storeB", databaseConfig); - BdbStorageEngine storeB = new BdbStorageEngine("storeB", - environmentB, - databaseB, - new BdbRuntimeConfig()); + BdbStorageEngine storeB = BdbStorageEngineTest.makeBdbStorageEngine("storeB", + environmentB, + databaseB, + new BdbRuntimeConfig(), + this.prefixPartitionId); long maxCacheUsage = 0; for(int i = 0; i <= 4; i++) { diff --git a/test/unit/voldemort/store/bdb/BdbStorageEngineTest.java b/test/unit/voldemort/store/bdb/BdbStorageEngineTest.java index d0289282fc..80bf2b2c7b 100644 --- a/test/unit/voldemort/store/bdb/BdbStorageEngineTest.java +++ b/test/unit/voldemort/store/bdb/BdbStorageEngineTest.java @@ -18,6 +18,7 @@ import java.io.File; import java.util.Arrays; +import java.util.Collection; import java.util.List; import java.util.Random; import java.util.concurrent.CountDownLatch; @@ -28,6 +29,12 @@ import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.io.FileDeleteStrategy; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import 
org.junit.runners.Parameterized.Parameters; import voldemort.TestUtils; import voldemort.server.protocol.admin.AsyncOperationStatus; @@ -47,6 +54,12 @@ import com.sleepycat.je.EnvironmentConfig; import com.sleepycat.je.LockMode; +/** + * Tests the BDB storage engine. Note that this class uses junit4 style test + * methods, though the base class extends TestCase junit 3 style + * + */ +@RunWith(Parameterized.class) public class BdbStorageEngineTest extends AbstractStorageEngineTest { private static final LockMode LOCK_MODE = LockMode.DEFAULT; @@ -58,9 +71,21 @@ public class BdbStorageEngineTest extends AbstractStorageEngineTest { private BdbStorageEngine store; private DatabaseConfig databaseConfig; private BdbRuntimeConfig runtimeConfig; + private boolean prefixPartitionId; + + public BdbStorageEngineTest(boolean prefixPartitionId) { + this.prefixPartitionId = prefixPartitionId; + } + + @Parameters + public static Collection modes() { + Object[][] data = new Object[][] { { true }, { false } }; + return Arrays.asList(data); + } @Override - protected void setUp() throws Exception { + @Before + public void setUp() throws Exception { super.setUp(); this.envConfig = new EnvironmentConfig(); this.envConfig.setDurability(Durability.COMMIT_NO_SYNC); @@ -71,15 +96,36 @@ protected void setUp() throws Exception { this.databaseConfig = new DatabaseConfig(); databaseConfig.setAllowCreate(true); databaseConfig.setTransactional(true); - databaseConfig.setSortedDuplicates(true); + databaseConfig.setSortedDuplicates(false); this.database = environment.openDatabase(null, "test", databaseConfig); this.runtimeConfig = new BdbRuntimeConfig(); runtimeConfig.setLockMode(LOCK_MODE); - this.store = new BdbStorageEngine("test", this.environment, this.database, runtimeConfig); + this.store = makeBdbStorageEngine("test", + this.environment, + this.database, + runtimeConfig, + this.prefixPartitionId); + } + + protected static BdbStorageEngine makeBdbStorageEngine(String name, + Environment environment, + Database database, + BdbRuntimeConfig config, + boolean prefixPartitionId) { + if(prefixPartitionId) { + return new PartitionPrefixedBdbStorageEngine(name, + environment, + database, + config, + TestUtils.makeSingleNodeRoutingStrategy()); + } else { + return new BdbStorageEngine(name, environment, database, config); + } } @Override - protected void tearDown() throws Exception { + @After + public void tearDown() throws Exception { super.tearDown(); try { store.close(); @@ -94,6 +140,7 @@ public StorageEngine getStorageEngine() { return store; } + @Test public void testPersistence() throws Exception { this.store.put(new ByteArray("abc".getBytes()), new Versioned("cdef".getBytes()), @@ -102,39 +149,54 @@ public void testPersistence() throws Exception { this.environment.close(); this.environment = new Environment(this.tempDir, envConfig); this.database = environment.openDatabase(null, "test", databaseConfig); - this.store = new BdbStorageEngine("test", this.environment, this.database, runtimeConfig); + this.store = makeBdbStorageEngine("test", + this.environment, + this.database, + runtimeConfig, + this.prefixPartitionId); List> vals = store.get(new ByteArray("abc".getBytes()), null); assertEquals(1, vals.size()); TestUtils.bytesEqual("cdef".getBytes(), vals.get(0).getValue()); } + @Test public void testEquals() { String name = "someName"; - assertEquals(new BdbStorageEngine(name, environment, database, runtimeConfig), - new BdbStorageEngine(name, environment, database, runtimeConfig)); + 
assertEquals(makeBdbStorageEngine(name, + environment, + database, + runtimeConfig, + this.prefixPartitionId), + makeBdbStorageEngine(name, + environment, + database, + runtimeConfig, + this.prefixPartitionId)); } + @Test public void testNullConstructorParameters() { try { - new BdbStorageEngine(null, environment, database, runtimeConfig); + makeBdbStorageEngine(null, environment, database, runtimeConfig, this.prefixPartitionId); } catch(IllegalArgumentException e) { return; } fail("No exception thrown for null name."); try { - new BdbStorageEngine("name", null, database, runtimeConfig); + makeBdbStorageEngine("name", null, database, runtimeConfig, this.prefixPartitionId); } catch(IllegalArgumentException e) { return; } fail("No exception thrown for null environment."); try { - new BdbStorageEngine("name", environment, null, runtimeConfig); + makeBdbStorageEngine("name", environment, null, runtimeConfig, this.prefixPartitionId); } catch(IllegalArgumentException e) { return; } fail("No exception thrown for null database."); } + @Test public void testConcurrentReadAndPut() throws Exception { ExecutorService executor = Executors.newFixedThreadPool(10); final CountDownLatch latch = new CountDownLatch(10); @@ -174,6 +236,7 @@ public void run() { assertFalse("Should not have seen any empty results", returnedEmpty.get()); } + @Test public void testSimultaneousIterationAndModification() throws Exception { // start a thread to do modifications ExecutorService executor = Executors.newFixedThreadPool(2); @@ -215,6 +278,7 @@ public void run() { assertTrue(executor.awaitTermination(5, TimeUnit.SECONDS)); } + @Test public void testNativeBackup() throws Exception { File backupToDir = File.createTempFile("bdb-storage", "bkp"); backupToDir.delete(); @@ -235,7 +299,6 @@ public void testNativeBackup() throws Exception { } finally { deleteDir(backupToDir); } - } private static void assertArrayEquals(Object[] expected, Object[] actual) { diff --git a/test/unit/voldemort/store/bdb/PartitionPrefixedBdbStorageEngineTest.java b/test/unit/voldemort/store/bdb/PartitionPrefixedBdbStorageEngineTest.java new file mode 100644 index 0000000000..c664a58a08 --- /dev/null +++ b/test/unit/voldemort/store/bdb/PartitionPrefixedBdbStorageEngineTest.java @@ -0,0 +1,240 @@ +/* + * Copyright 2008-2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.store.bdb; + +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.fail; + +import java.io.File; +import java.io.StringReader; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.commons.io.FileDeleteStrategy; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import voldemort.ServerTestUtils; +import voldemort.TestUtils; +import voldemort.VoldemortTestConstants; +import voldemort.cluster.Cluster; +import voldemort.routing.RoutingStrategy; +import voldemort.routing.RoutingStrategyFactory; +import voldemort.server.VoldemortConfig; +import voldemort.store.StoreBinaryFormat; +import voldemort.store.StoreDefinition; +import voldemort.utils.ByteArray; +import voldemort.utils.ByteUtils; +import voldemort.utils.ClosableIterator; +import voldemort.utils.Pair; +import voldemort.utils.Props; +import voldemort.versioning.Versioned; +import voldemort.xml.ClusterMapper; +import voldemort.xml.StoreDefinitionsMapper; + +/** + * Tests for the BDB storage engine prefixing the partition id to the keys, to + * enable efficient partition scans + * + * + * + */ +public class PartitionPrefixedBdbStorageEngineTest { + + private File bdbMasterDir; + private BdbStorageConfiguration bdbStorage; + + @Before + public void setUp() throws Exception { + bdbMasterDir = TestUtils.createTempDir(); + FileDeleteStrategy.FORCE.delete(bdbMasterDir); + // lets use all the default values. + Props props = new Props(); + props.put("node.id", 1); + props.put("voldemort.home", "test/common/voldemort/config"); + VoldemortConfig voldemortConfig = new VoldemortConfig(props); + voldemortConfig.setBdbCacheSize(10 * 1024 * 1024); + voldemortConfig.setBdbOneEnvPerStore(true); + voldemortConfig.setBdbDataDirectory(bdbMasterDir.toURI().getPath()); + voldemortConfig.setBdbPrefixKeysWithPartitionId(true); + bdbStorage = new BdbStorageConfiguration(voldemortConfig); + } + + @After + public void tearDown() throws Exception { + try { + if(bdbStorage != null) + bdbStorage.close(); + } finally { + FileDeleteStrategy.FORCE.delete(bdbMasterDir); + } + } + + @Test + public void testPartitionToByteArrayConversion() { + // check the conversions used in the partition scan code path + for(int i = 0; i <= ClusterMapper.MAX_PARTITIONID; i++) { + byte[] pkey = StoreBinaryFormat.makePartitionKey(i); + int j = StoreBinaryFormat.extractPartition(pkey); + assertEquals(i, j); + } + + byte[] key = "abcdefghijklmnopqrstuvwxyz".getBytes(); + // check the conversions used in the other code path + byte[] prefixedkey = StoreBinaryFormat.makePrefixedKey(key, 20); + int partition = StoreBinaryFormat.extractPartition(prefixedkey); + assertEquals(partition, 20); + assertEquals(0, ByteUtils.compare(key, StoreBinaryFormat.extractKey(prefixedkey))); + } + + @Test + public void testHashConsistencyAcrossRoutingStrategies() { + // check that as long as the cluster.xml is the same, a key will hash to + // the same partition, immaterial of whether it is zone or consistent + // routing strategy + + StoreDefinitionsMapper mapper = new StoreDefinitionsMapper(); + List storeDefs = mapper.readStoreList(new StringReader(VoldemortTestConstants.getTwoStoresWithZonesXml())); + + StoreDefinition consistentStore = storeDefs.get(0); + StoreDefinition zoneStore = storeDefs.get(1); + + assertEquals(consistentStore.getName(), "cstore"); + assertEquals(zoneStore.getName(), "zstore"); + + Cluster cluster = 
VoldemortTestConstants.getEightNodeClusterWithZones(); + RoutingStrategy cStrategy = new RoutingStrategyFactory().updateRoutingStrategy(consistentStore, + cluster); + RoutingStrategy zStrategy = new RoutingStrategyFactory().updateRoutingStrategy(zoneStore, + cluster); + BdbStorageEngine cPrefixedBdbStore = (BdbStorageEngine) bdbStorage.getStore(consistentStore, + cStrategy); + BdbStorageEngine zPrefixedBdbStore = (BdbStorageEngine) bdbStorage.getStore(zoneStore, + zStrategy); + HashMap kvpairs = ServerTestUtils.createRandomKeyValuePairs(10000); + for(ByteArray key: kvpairs.keySet()) { + assertEquals(cStrategy.getPartitionList(key.get()).get(0), + zStrategy.getPartitionList(key.get()).get(0)); + + cPrefixedBdbStore.put(key, new Versioned(kvpairs.get(key)), null); + zPrefixedBdbStore.put(key, new Versioned(kvpairs.get(key)), null); + } + + for(ByteArray key: kvpairs.keySet()) { + assertEquals("Values read back does not match up", + 0, + ByteUtils.compare(cPrefixedBdbStore.get(key, null).get(0).getValue(), + zPrefixedBdbStore.get(key, null).get(0).getValue())); + } + cPrefixedBdbStore.close(); + zPrefixedBdbStore.close(); + } + + private Set getKeys(ClosableIterator itr) { + HashSet keySet = new HashSet(); + while(itr.hasNext()) { + keySet.add(new String(itr.next().get())); + } + itr.close(); + return keySet; + } + + private Set getEntries(ClosableIterator>> itr) { + HashSet keySet = new HashSet(); + while(itr.hasNext()) { + Pair> entry = itr.next(); + ByteArray key = entry.getFirst(); + byte[] value = entry.getSecond().getValue(); + + String skey = new String(key.get()); + int keyId = Integer.parseInt(skey.replaceAll("key", "")); + assertEquals(0, ByteUtils.compare(value, ("value" + keyId).getBytes())); + + keySet.add(skey); + } + itr.close(); + return keySet; + } + + @Test + public void testPartitionScan() { + + StoreDefinition storedef = TestUtils.makeStoreDefinition("storeA"); + RoutingStrategy strategy = TestUtils.makeSingleNodeRoutingStrategy(); + BdbStorageEngine prefixedBdbStore = (BdbStorageEngine) bdbStorage.getStore(storedef, + strategy); + try { + // insert a bunch of records + HashMap> partitionToKeysMap = new HashMap>(); + for(int i = 0; i < 10000; i++) { + String key = "key" + i; + byte[] bkey = key.getBytes(); + + int partition = strategy.getPartitionList(bkey).get(0); + if(!partitionToKeysMap.containsKey(partition)) + partitionToKeysMap.put(partition, new HashSet()); + partitionToKeysMap.get(partition).add(key); + + prefixedBdbStore.put(new ByteArray(bkey), + new Versioned(("value" + i).getBytes()), + null); + } + + // check if they are properly retrieved by that partition id + for(int p = 0; p < strategy.getNumReplicas(); p++) { + + // verify keys + Set keys = getKeys(prefixedBdbStore.keys(p)); + assertEquals(partitionToKeysMap.get(p).size(), keys.size()); + assertEquals(partitionToKeysMap.get(p), keys); + + // verify values + keys = getEntries(prefixedBdbStore.entries(p)); + assertEquals(partitionToKeysMap.get(p).size(), keys.size()); + assertEquals(partitionToKeysMap.get(p), keys); + } + + // make sure the entries() code path does not break. 
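Aside (not part of the patch): the partition-scan tests here rely on StoreBinaryFormat prepending a 2-byte partition id to every key, so that all keys of a partition form one contiguous range that keys(p) and entries(p) can scan without touching the rest of the database. The sketch below only illustrates that idea; the class is hypothetical and the exact byte layout (two bytes, big-endian) is an assumption for illustration, not Voldemort's actual StoreBinaryFormat implementation.

public final class PartitionPrefixSketch {

    // Prepend a 2-byte partition id (assumed big-endian here) to the key.
    static byte[] makePrefixedKey(byte[] key, int partition) {
        byte[] prefixed = new byte[key.length + 2];
        prefixed[0] = (byte) ((partition >> 8) & 0xff);
        prefixed[1] = (byte) (partition & 0xff);
        System.arraycopy(key, 0, prefixed, 2, key.length);
        return prefixed;
    }

    // Recover the partition id from the first two bytes.
    static int extractPartition(byte[] prefixedKey) {
        return ((prefixedKey[0] & 0xff) << 8) | (prefixedKey[1] & 0xff);
    }

    // Strip the prefix to get the original key back.
    static byte[] extractKey(byte[] prefixedKey) {
        byte[] key = new byte[prefixedKey.length - 2];
        System.arraycopy(prefixedKey, 2, key, 0, key.length);
        return key;
    }
}

With keys stored this way, a partition scan can fetch exactly one key range per requested partition, which is the behavior testPartitionScan verifies against plain keys() and entries().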
+ HashSet allKeys = new HashSet(); + for(Integer p: partitionToKeysMap.keySet()) { + Set pkeys = partitionToKeysMap.get(p); + int originalSize = allKeys.size(); + allKeys.removeAll(pkeys); + // this is to make sure the pkeys have 0 overlap + assertEquals(allKeys.size(), originalSize); + allKeys.addAll(pkeys); + } + // this makes sure all the data we put in is what you get out, not a + // byte less or more + Set keys = getKeys(prefixedBdbStore.keys()); + assertEquals(allKeys.size(), keys.size()); + assertEquals(allKeys, keys); + + keys = getEntries(prefixedBdbStore.entries()); + assertEquals(allKeys.size(), keys.size()); + assertEquals(allKeys, keys); + + } catch(Exception e) { + fail("Should not have thrown any exceptions" + e.getMessage()); + } finally { + prefixedBdbStore.close(); + } + } +} diff --git a/test/unit/voldemort/store/memory/CacheStorageEngineTest.java b/test/unit/voldemort/store/memory/CacheStorageEngineTest.java index d8bf271ede..55d4522184 100644 --- a/test/unit/voldemort/store/memory/CacheStorageEngineTest.java +++ b/test/unit/voldemort/store/memory/CacheStorageEngineTest.java @@ -41,7 +41,8 @@ public void setUp() throws Exception { @Override public StorageEngine getStorageEngine() { - return new CacheStorageConfiguration().getStore(TestUtils.makeStoreDefinition("test")); + return new CacheStorageConfiguration().getStore(TestUtils.makeStoreDefinition("test"), + TestUtils.makeSingleNodeRoutingStrategy()); } public void testNoPressureBehavior() { diff --git a/test/unit/voldemort/store/readonly/swapper/StoreSwapperTest.java b/test/unit/voldemort/store/readonly/swapper/StoreSwapperTest.java index aaaaf9c7bf..78bc3f4247 100644 --- a/test/unit/voldemort/store/readonly/swapper/StoreSwapperTest.java +++ b/test/unit/voldemort/store/readonly/swapper/StoreSwapperTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2011-2012 LinkedIn, Inc + * Copyright 2011-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -136,7 +136,7 @@ public void setUp() throws IOException { @After public void tearDown() throws IOException { - adminClient.stop(); + adminClient.close(); for(VoldemortServer server: servers) { ServerTestUtils.stopVoldemortServer(server); } @@ -245,12 +245,13 @@ public void testFetchSwapWithoutRollback(StoreSwapper swapper) throws Exception File temporaryDir = createTempROFolder(); // Retrieve all the current versions - long currentVersion = adminClient.getROCurrentVersion(0, Lists.newArrayList(STORE_NAME)) - .get(STORE_NAME); + long currentVersion = adminClient.readonlyOps.getROCurrentVersion(0, + Lists.newArrayList(STORE_NAME)) + .get(STORE_NAME); for(int nodeId = 1; nodeId < NUM_NODES; nodeId++) { - long newVersion = adminClient.getROCurrentVersion(nodeId, - Lists.newArrayList(STORE_NAME)) - .get(STORE_NAME); + long newVersion = adminClient.readonlyOps.getROCurrentVersion(nodeId, + Lists.newArrayList(STORE_NAME)) + .get(STORE_NAME); if(newVersion != currentVersion) fail("Current version (on " + nodeId + ") = " + newVersion + " is not equal to others"); @@ -281,8 +282,9 @@ public void testFetchSwapWithoutRollback(StoreSwapper swapper) throws Exception // ... 
check if "currentVersion + 3 " is NOT deleted for(int nodeId = 0; nodeId < NUM_NODES; nodeId++) { - long maxVersion = adminClient.getROMaxVersion(nodeId, Lists.newArrayList(STORE_NAME)) - .get(STORE_NAME); + long maxVersion = adminClient.readonlyOps.getROMaxVersion(nodeId, + Lists.newArrayList(STORE_NAME)) + .get(STORE_NAME); assertTrue(maxVersion == (currentVersion + 3)); } @@ -295,12 +297,13 @@ public void testFetchSwap(StoreSwapper swapper) throws Exception { File temporaryDir = createTempROFolder(); // Retrieve all the current versions - long currentVersion = adminClient.getROCurrentVersion(0, Lists.newArrayList(STORE_NAME)) - .get(STORE_NAME); + long currentVersion = adminClient.readonlyOps.getROCurrentVersion(0, + Lists.newArrayList(STORE_NAME)) + .get(STORE_NAME); for(int nodeId = 1; nodeId < NUM_NODES; nodeId++) { - long newVersion = adminClient.getROCurrentVersion(nodeId, - Lists.newArrayList(STORE_NAME)) - .get(STORE_NAME); + long newVersion = adminClient.readonlyOps.getROCurrentVersion(nodeId, + Lists.newArrayList(STORE_NAME)) + .get(STORE_NAME); if(newVersion != currentVersion) fail("Current version (on " + nodeId + ") = " + newVersion + " is not equal to others"); @@ -373,9 +376,9 @@ public void testFetchSwap(StoreSwapper swapper) throws Exception { for(int nodeId = 0; nodeId < NUM_NODES; nodeId++) { versionToNode.put(nodeId, - adminClient.getROCurrentVersion(nodeId, - Lists.newArrayList(STORE_NAME)) - .get(STORE_NAME)); + adminClient.readonlyOps.getROCurrentVersion(nodeId, + Lists.newArrayList(STORE_NAME)) + .get(STORE_NAME)); } servers[1].getMetadataStore().put(MetadataStore.SERVER_STATE_KEY, @@ -388,9 +391,9 @@ public void testFetchSwap(StoreSwapper swapper) throws Exception { // Check that latest is not currentVersion + 4 for(int nodeId = 0; nodeId < NUM_NODES; nodeId++) { - long currentNodeVersion = adminClient.getROCurrentVersion(nodeId, - Lists.newArrayList(STORE_NAME)) - .get(STORE_NAME); + long currentNodeVersion = adminClient.readonlyOps.getROCurrentVersion(nodeId, + Lists.newArrayList(STORE_NAME)) + .get(STORE_NAME); assertTrue(currentNodeVersion != (currentVersion + 4)); assertEquals(currentNodeVersion, (long) versionToNode.get(nodeId)); } @@ -403,9 +406,9 @@ public void testFetchSwap(StoreSwapper swapper) throws Exception { swapper.swapStoreData(STORE_NAME, temporaryDir.getAbsolutePath(), currentVersion + 5); for(int nodeId = 0; nodeId < NUM_NODES; nodeId++) { - long currentNodeVersion = adminClient.getROCurrentVersion(nodeId, - Lists.newArrayList(STORE_NAME)) - .get(STORE_NAME); + long currentNodeVersion = adminClient.readonlyOps.getROCurrentVersion(nodeId, + Lists.newArrayList(STORE_NAME)) + .get(STORE_NAME); assertTrue(currentNodeVersion == (currentVersion + 5)); } } diff --git a/test/unit/voldemort/store/rebalancing/RebootstrappingStoreTest.java b/test/unit/voldemort/store/rebalancing/RebootstrappingStoreTest.java index 3d65074c88..ec3804beb8 100644 --- a/test/unit/voldemort/store/rebalancing/RebootstrappingStoreTest.java +++ b/test/unit/voldemort/store/rebalancing/RebootstrappingStoreTest.java @@ -107,15 +107,15 @@ public void rebalance() { AdminClient adminClient = RebalanceUtils.createTempAdminClient(config, cluster, 4); HashMap> replicaToPartitionList = Maps.newHashMap(); replicaToPartitionList.put(0, ImmutableList.of(0, 1)); - int req = adminClient.migratePartitions(0, - 1, - STORE_NAME, - replicaToPartitionList, - null, - null, - false); - adminClient.waitForCompletion(1, req, 5, TimeUnit.SECONDS); - Versioned versionedCluster = 
adminClient.getRemoteCluster(0); + int req = adminClient.storeMntOps.migratePartitions(0, + 1, + STORE_NAME, + replicaToPartitionList, + null, + null, + false); + adminClient.rpcOps.waitForCompletion(1, req, 5, TimeUnit.SECONDS); + Versioned versionedCluster = adminClient.metadataMgmtOps.getRemoteCluster(0); Node node0 = versionedCluster.getValue().getNodeById(0); Node node1 = versionedCluster.getValue().getNodeById(1); Node newNode0 = new Node(node0.getId(), @@ -130,7 +130,10 @@ public void rebalance() { node1.getSocketPort(), node1.getAdminPort(), ImmutableList.of(0, 1)); - long deleted = adminClient.deletePartitions(0, STORE_NAME, ImmutableList.of(0, 1), null); + long deleted = adminClient.storeMntOps.deletePartitions(0, + STORE_NAME, + ImmutableList.of(0, 1), + null); assert deleted > 0; Cluster newCluster = new Cluster(cluster.getName(), ImmutableList.of(newNode0, newNode1), @@ -139,7 +142,7 @@ public void rebalance() { VectorClock clock = (VectorClock) versionedCluster.getVersion(); clock.incrementVersion(node.getId(), System.currentTimeMillis()); - adminClient.updateRemoteCluster(node.getId(), newCluster, clock); + adminClient.metadataMgmtOps.updateRemoteCluster(node.getId(), newCluster, clock); } } diff --git a/test/unit/voldemort/store/routed/ReadRepairerTest.java b/test/unit/voldemort/store/routed/ReadRepairerTest.java index 61e41981c8..343d277a83 100644 --- a/test/unit/voldemort/store/routed/ReadRepairerTest.java +++ b/test/unit/voldemort/store/routed/ReadRepairerTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -189,6 +189,7 @@ public void testMissingKeysAreAddedToNodeWhenDoingReadRepair() throws Exception /** * See Issue 92: ReadRepairer.getRepairs should not return duplicates. 
*/ + @Test public void testNoDuplicates() throws Exception { List> values = asList(getValue(1, 1, new int[] { 1, 2 }), getValue(2, 1, new int[] { 1, 2 }), @@ -198,12 +199,14 @@ public void testNoDuplicates() throws Exception { assertEquals(getValue(3, 1, new int[] { 1, 2 }), repairs.get(0)); } + @Test public void testSingleSuccessor() throws Exception { assertVariationsEqual(singletonList(getValue(1, 1, new int[] { 1, 1 })), asList(getValue(1, 1, new int[] { 1 }), getValue(2, 1, new int[] { 1, 1 }))); } + @Test public void testAllConcurrent() throws Exception { assertVariationsEqual(asList(getValue(1, 1, new int[] { 2 }), getValue(1, 1, new int[] { 3 }), @@ -216,6 +219,7 @@ public void testAllConcurrent() throws Exception { getValue(3, 1, new int[] { 3 }))); } + @Test public void testTwoAncestorsToOneSuccessor() throws Exception { int[] expected = new int[] { 1, 1, 2, 2 }; assertVariationsEqual(asList(getValue(2, 1, expected), getValue(3, 1, expected)), @@ -224,6 +228,7 @@ public void testTwoAncestorsToOneSuccessor() throws Exception { getValue(3, 1, new int[] { 2 }))); } + @Test public void testOneAcestorToTwoSuccessors() throws Exception { int[] expected = new int[] { 1, 1, 2, 2 }; assertVariationsEqual(asList(getValue(2, 1, expected), getValue(3, 1, expected)), @@ -232,6 +237,7 @@ public void testOneAcestorToTwoSuccessors() throws Exception { getValue(3, 1, new int[] { 2 }))); } + @Test public void testEqualObsoleteVersions() throws Exception { int[] expected = new int[] { 1, 1 }; assertVariationsEqual(asList(getValue(1, 1, expected), @@ -243,6 +249,7 @@ public void testEqualObsoleteVersions() throws Exception { getValue(4, 1, expected))); } + @Test public void testDiamondPattern() throws Exception { int[] expected = new int[] { 1, 1, 2, 2 }; assertVariationsEqual(asList(getValue(1, 1, expected), @@ -254,6 +261,7 @@ public void testDiamondPattern() throws Exception { getValue(4, 1, expected))); } + @Test public void testConcurrentToOneDoesNotImplyConcurrentToAll() throws Exception { assertVariationsEqual(asList(getValue(1, 1, new int[] { 1, 3, 3 }), getValue(1, 1, new int[] { 1, 2 }), @@ -264,6 +272,7 @@ public void testConcurrentToOneDoesNotImplyConcurrentToAll() throws Exception { getValue(3, 1, new int[] { 1, 3, 3 }))); } + @Test public void testLotsOfVersions() throws Exception { assertVariationsEqual(asList(getValue(1, 1, new int[] { 1, 2, 2, 3 }), getValue(1, 1, new int[] { 1, 2, 3, 3 }), @@ -323,6 +332,14 @@ public void testMultipleKeys() { assertEquals("There should be no repairs.", 0, repairs.size()); } + /** + * Testing helper method to construct node-values out of thin air. + * + * @param nodeId The node ID + * @param value The value (an integer) + * @param version The version (vector of integers passed to getClock()) + * @return + */ private NodeValue getValue(int nodeId, int value, int[] version) { return new NodeValue(nodeId, Integer.toString(value), diff --git a/test/unit/voldemort/store/stats/HistogramTest.java b/test/unit/voldemort/store/stats/HistogramTest.java index c4a759e8c1..956fe4296c 100644 --- a/test/unit/voldemort/store/stats/HistogramTest.java +++ b/test/unit/voldemort/store/stats/HistogramTest.java @@ -1,14 +1,30 @@ +/* + * Copyright 2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + package voldemort.store.stats; +import static org.junit.Assert.assertEquals; + import org.junit.Before; import org.junit.Test; -import static org.junit.Assert.assertEquals; - public class HistogramTest { - + private Histogram histogram; - + @Before public void setUp() { histogram = new Histogram(10, 5); @@ -29,20 +45,66 @@ public void setUp() { histogram.insert(66); histogram.insert(76); } - + @Test - public void testAverage() { + public void test50thQuartile() { assertEquals(histogram.getQuantile(0.50), 30); } - + @Test public void test95thQuartile() { assertEquals(histogram.getQuantile(0.95), 45); } - + @Test public void test99thQuartile() { assertEquals(histogram.getQuantile(0.99), 45); } - + + @Test + public void testResetHistogram() { + + Histogram resetingHistogram = new Histogram(10, 1, 10); + // tests that the functionality is still working + for(long data = 0; data < 5; data++) { + for(int loop = 0; loop <= data; loop++) { + resetingHistogram.insert(data); + } + } + assertEquals(3, resetingHistogram.getQuantile(0.50)); + assertEquals(4, resetingHistogram.getQuantile(0.99)); + assertEquals(2.67, resetingHistogram.getAverage(), 0.01); + + // tests that once enough time passes, old data will be discarded + try { + Thread.sleep(10); + } catch(InterruptedException ie) {} + + assertEquals(0, resetingHistogram.getQuantile(0.50)); + assertEquals(0, resetingHistogram.getQuantile(0.99)); + assertEquals(0.0, resetingHistogram.getAverage(), 0.0); + } + + @Test + public void testUpperBoundaryCondition() { + Histogram h = new Histogram(100, 1); + h.insert(98); + h.insert(99); + h.insert(100); // Should bucket with 99 + h.insert(101); // Should bucket with 99 + + assertEquals(h.getQuantile(0.24), 98); + assertEquals(h.getQuantile(0.26), 99); + } + + @Test + public void testLowerBoundaryCondition() { + Histogram h = new Histogram(100, 1); + h.insert(-1); // Should not be bucketed + h.insert(0); + h.insert(1); + + assertEquals(h.getQuantile(0.49), 0); + assertEquals(h.getQuantile(0.51), 1); + } } diff --git a/test/unit/voldemort/store/stats/SimpleCounterTest.java b/test/unit/voldemort/store/stats/SimpleCounterTest.java new file mode 100644 index 0000000000..df7dc67079 --- /dev/null +++ b/test/unit/voldemort/store/stats/SimpleCounterTest.java @@ -0,0 +1,171 @@ +package voldemort.store.stats; + +import static org.junit.Assert.assertEquals; + +import java.util.Arrays; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import org.junit.Before; +import org.junit.Test; + +import voldemort.utils.Time; + +public class SimpleCounterTest { + + final private static int COUNTER_RESET_INTERVAL_MS = 100; + private SimpleCounter simpleCounter; + + @Before + public void setUp() { + simpleCounter = new SimpleCounter(COUNTER_RESET_INTERVAL_MS); + } + + private static void sleepForResetInterval(long startTimeMs) { + try { + long stopTimeMs = startTimeMs + COUNTER_RESET_INTERVAL_MS; + long sleepDurationMs = stopTimeMs - System.currentTimeMillis() + 1; + if 
(sleepDurationMs > 0) { + Thread.sleep(sleepDurationMs); + } else { + System.err.println("Sleep duration determined to be negative. "+ + "GC or something must have interfered with this test. "+ + "If test failed, try again since result is bogus. "+ + "If test passed, then timing still worked out for test."); + } + } catch(InterruptedException e) { + e.printStackTrace(); + } + } + + @Test + public void testSingleThread() { + // Interval 0 + assertEquals(0.0, simpleCounter.getAvgEventValue(), 0.0); + assertEquals(0.0, simpleCounter.getEventRate(), 0.0); + + // Interval 1 - add some samples + long startTimeMs = System.currentTimeMillis(); + for(int i = 0; i < 10; i++) + simpleCounter.count(); + sleepForResetInterval(startTimeMs); + + // Interval 2 + startTimeMs = System.currentTimeMillis(); + for(int i = 0; i < 10; i++) + simpleCounter.count(100); + // verify the stats returned are for the first interval + assertEquals(0.0, simpleCounter.getAvgEventValue(), 0.0); + assertEquals(10 / ((COUNTER_RESET_INTERVAL_MS * 1.0) / Time.MS_PER_SECOND), + simpleCounter.getEventRate(), + 0.0); + sleepForResetInterval(startTimeMs); + + // Interval 3 + // verify the stats returned are for the second interval and that + // multiple calls during the current interval will always provide the + // same result + startTimeMs = System.currentTimeMillis(); + for(int i = 0; i < 10; i++) { + assertEquals(100.0, simpleCounter.getAvgEventValue(), 0.0); + assertEquals(10 / ((COUNTER_RESET_INTERVAL_MS * 1.0) / Time.MS_PER_SECOND), + simpleCounter.getEventRate(), + 0.0); + } + sleepForResetInterval(startTimeMs); + + // No activity + assertEquals(0.0, simpleCounter.getAvgEventValue(), 0.0); + assertEquals(0.0, simpleCounter.getEventRate(), 0.0); + } + + @Test + public void testMultipleThreads() throws InterruptedException { + ExecutorService executorService = null; + try { + final int NUM_THREADS = 5; + final int NUM_OPS = 10000; + + long startTimeMs = System.currentTimeMillis(); + executorService = Executors.newFixedThreadPool(NUM_THREADS); + final CountDownLatch latch1 = new CountDownLatch(NUM_THREADS); + final CountDownLatch latch0 = new CountDownLatch(1); + + for(int i = 0; i < NUM_THREADS; i++) { + final int threadId = i; + executorService.submit(new Runnable() { + + public void run() { + try { + latch0.await(); + for(int j = 0; j < NUM_OPS; j++) { + simpleCounter.count(100 * (threadId + 1)); + } + } catch(InterruptedException e) { + e.printStackTrace(); + } finally { + latch1.countDown(); + } + } + }); + } + latch0.countDown(); + latch1.await(); + // one more sleep so we expire the current interval where all the + // action happened + sleepForResetInterval(startTimeMs); + + startTimeMs = System.currentTimeMillis(); + assertEquals(300.0, simpleCounter.getAvgEventValue(), 0.0); + assertEquals((NUM_OPS * NUM_THREADS) + / ((COUNTER_RESET_INTERVAL_MS * 1.0) / Time.MS_PER_SECOND), + simpleCounter.getEventRate(), + 0.0); + sleepForResetInterval(startTimeMs); + + // Run for a long period spanning multiple intervals and see if we + // observe consistent metrics + final ConcurrentLinkedQueue observedEventRate = new ConcurrentLinkedQueue(); + final ConcurrentLinkedQueue observedEventValueAvg = new ConcurrentLinkedQueue(); + final int NUM_INTERVALS = 30; + final CountDownLatch latch2 = new CountDownLatch(NUM_THREADS); + for(int i = 0; i < NUM_THREADS; i++) { + executorService.submit(new Runnable() { + + public void run() { + try { + for(int interval = 0; interval < NUM_INTERVALS; interval++) { + long startTimeMs =
System.currentTimeMillis(); + for(int j = 0; j < NUM_OPS; j++) { + simpleCounter.count(100); + } + sleepForResetInterval(startTimeMs); + } + observedEventRate.add(simpleCounter.getEventRate()); + observedEventValueAvg.add(simpleCounter.getAvgEventValue()); + } finally { + latch2.countDown(); + } + } + }); + } + + latch2.await(); + Object[] actualEventRates = new Object[NUM_THREADS]; + Object[] actualEventValueAvgs = new Object[NUM_THREADS]; + for(int i = 0; i < NUM_THREADS; i++) { + actualEventRates[i] = (NUM_OPS * NUM_THREADS) + / ((COUNTER_RESET_INTERVAL_MS * 1.0) / Time.MS_PER_SECOND); + actualEventValueAvgs[i] = 100.0; + } + assertEquals(Arrays.equals(observedEventRate.toArray(), actualEventRates), true); + assertEquals(Arrays.equals(observedEventValueAvg.toArray(), actualEventValueAvgs), true); + + } finally { + if(executorService != null) + executorService.shutdown(); + } + } +} diff --git a/test/unit/voldemort/store/system/AsyncMetadataVersionManagerTest.java b/test/unit/voldemort/store/system/AsyncMetadataVersionManagerTest.java index 69a186126b..be730842e5 100644 --- a/test/unit/voldemort/store/system/AsyncMetadataVersionManagerTest.java +++ b/test/unit/voldemort/store/system/AsyncMetadataVersionManagerTest.java @@ -28,7 +28,6 @@ import org.junit.Test; import voldemort.ServerTestUtils; -import voldemort.TestUtils; import voldemort.client.SystemStore; import voldemort.client.SystemStoreRepository; import voldemort.client.scheduler.AsyncMetadataVersionManager; @@ -71,27 +70,17 @@ public class AsyncMetadataVersionManagerTest { @Before public void setUp() throws Exception { - cluster = ServerTestUtils.getLocalCluster(2, new int[][] { { 0, 1, 2, 3 }, { 4, 5, 6, 7 } }); - servers = new VoldemortServer[2]; - - servers[0] = ServerTestUtils.startVoldemortServer(socketStoreFactory, - ServerTestUtils.createServerConfig(true, - 0, - TestUtils.createTempDir() - .getAbsolutePath(), - null, - storesXmlfile, - new Properties()), - cluster); - servers[1] = ServerTestUtils.startVoldemortServer(socketStoreFactory, - ServerTestUtils.createServerConfig(true, - 1, - TestUtils.createTempDir() - .getAbsolutePath(), - null, - storesXmlfile, - new Properties()), - cluster); + final int numServers = 2; + servers = new VoldemortServer[numServers]; + int partitionMap[][] = { { 0, 1, 2, 3 }, { 4, 5, 6, 7 } }; + cluster = ServerTestUtils.startVoldemortCluster(numServers, + servers, + partitionMap, + socketStoreFactory, + true, // useNio + null, + storesXmlfile, + new Properties()); socketUrl = servers[0].getIdentityNode().getSocketUrl().toString(); @@ -108,8 +97,9 @@ public void setUp() throws Exception { @After public void tearDown() throws Exception { - servers[0].stop(); - servers[1].stop(); + for(VoldemortServer server: servers) { + ServerTestUtils.stopVoldemortServer(server); + } } /* diff --git a/test/unit/voldemort/store/system/SystemStoreTest.java b/test/unit/voldemort/store/system/SystemStoreTest.java index a2ca3eef0b..b47dc79c54 100644 --- a/test/unit/voldemort/store/system/SystemStoreTest.java +++ b/test/unit/voldemort/store/system/SystemStoreTest.java @@ -26,7 +26,6 @@ import org.junit.Test; import voldemort.ServerTestUtils; -import voldemort.TestUtils; import voldemort.client.AbstractStoreClientFactory; import voldemort.client.ClientConfig; import voldemort.client.SocketStoreClientFactory; @@ -61,27 +60,20 @@ public class SystemStoreTest { @Before public void setUp() throws Exception { + final int numServers = 2; + servers = new VoldemortServer[numServers]; + int partitionMap[][] = { { 0, 1, 2, 3 
}, { 4, 5, 6, 7 } }; + cluster = ServerTestUtils.getLocalCluster(2, new int[][] { { 0, 1, 2, 3 }, { 4, 5, 6, 7 } }); servers = new VoldemortServer[2]; - - servers[0] = ServerTestUtils.startVoldemortServer(socketStoreFactory, - ServerTestUtils.createServerConfig(true, - 0, - TestUtils.createTempDir() - .getAbsolutePath(), - null, - storesXmlfile, - new Properties()), - cluster); - servers[1] = ServerTestUtils.startVoldemortServer(socketStoreFactory, - ServerTestUtils.createServerConfig(true, - 1, - TestUtils.createTempDir() - .getAbsolutePath(), - null, - storesXmlfile, - new Properties()), - cluster); + cluster = ServerTestUtils.startVoldemortCluster(numServers, + servers, + partitionMap, + socketStoreFactory, + true, // useNio + null, + storesXmlfile, + new Properties()); socketUrl = servers[0].getIdentityNode().getSocketUrl().toString(); @@ -93,13 +85,13 @@ public void setUp() throws Exception { bootStrapUrls = new String[1]; bootStrapUrls[0] = socketUrl; clusterXml = ((AbstractStoreClientFactory) socketFactory).bootstrapMetadataWithRetries(MetadataStore.CLUSTER_KEY); - } @After public void tearDown() throws Exception { - servers[0].stop(); - servers[1].stop(); + for(VoldemortServer server: servers) { + ServerTestUtils.stopVoldemortServer(server); + } } @Test diff --git a/test/unit/voldemort/utils/ByteUtilsTest.java b/test/unit/voldemort/utils/ByteUtilsTest.java index 7fee09a7cd..979fa26cd1 100644 --- a/test/unit/voldemort/utils/ByteUtilsTest.java +++ b/test/unit/voldemort/utils/ByteUtilsTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2009 LinkedIn, Inc + * Copyright 2008-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -21,13 +21,15 @@ import junit.framework.TestCase; +import org.apache.commons.codec.DecoderException; + public class ByteUtilsTest extends TestCase { public void testCat() { assertTrue("Concatenation of empty arrays is not empty", Arrays.equals(new byte[0], ByteUtils.cat(new byte[0], new byte[0]))); - assertTrue("Concatenation of no arrays is not empty", Arrays.equals(new byte[0], - ByteUtils.cat())); + assertTrue("Concatenation of no arrays is not empty", + Arrays.equals(new byte[0], ByteUtils.cat())); assertTrue("Concatenation of arrays incorrect.", Arrays.equals("abcdefg".getBytes(), ByteUtils.cat("ab".getBytes(), "".getBytes(), @@ -94,14 +96,14 @@ public void testReadWriteBytes() { assertEquals("Read value not equal to written value.", 5, ByteUtils.readBytes(bytes, 0, 8)); long value = System.currentTimeMillis(); ByteUtils.writeBytes(bytes, value, 0, 8); - assertEquals("Read value not equal to written value.", value, ByteUtils.readBytes(bytes, - 0, - 8)); + assertEquals("Read value not equal to written value.", + value, + ByteUtils.readBytes(bytes, 0, 8)); bytes = new byte[24]; ByteUtils.writeBytes(bytes, value, 8, 8); - assertEquals("Read value not equal to written value.", value, ByteUtils.readBytes(bytes, - 8, - 8)); + assertEquals("Read value not equal to written value.", + value, + ByteUtils.readBytes(bytes, 8, 8)); } public void testGetNumberOfRequiredBytes() { @@ -113,14 +115,156 @@ public void testGetNumberOfRequiredBytes() { } public void testToString() { + assertEquals("00", ByteUtils.toHexString(new byte[] { 0 })); assertEquals("010203", ByteUtils.toHexString(new byte[] { 1, 2, 3 })); - assertEquals("afadae", ByteUtils.toHexString(new byte[] { (byte) 0xaf, (byte) 0xad, - (byte) 0xae })); - assertEquals("00000001" + "00000010" + 
"00000011", ByteUtils.toBinaryString(new byte[] { 1, - 2, 3 })); - assertEquals("10101111" + "10101101" + "10101110", ByteUtils.toBinaryString(new byte[] { - (byte) 0xaf, (byte) 0xad, (byte) 0xae })); + assertEquals("afadae", + ByteUtils.toHexString(new byte[] { (byte) 0xaf, (byte) 0xad, (byte) 0xae })); + assertEquals("afadae", + ByteUtils.toHexString(new byte[] { (byte) 0xaf, (byte) 0xad, (byte) 0xae })); + assertEquals("000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff", + ByteUtils.toHexString(new byte[] { (byte) 0, (byte) 1, (byte) 2, (byte) 3, + (byte) 4, (byte) 5, (byte) 6, (byte) 7, (byte) 8, (byte) 9, (byte) 10, + (byte) 11, (byte) 12, (byte) 13, (byte) 14, (byte) 15, (byte) 16, + (byte) 17, (byte) 18, (byte) 19, (byte) 20, (byte) 21, (byte) 22, + (byte) 23, (byte) 24, (byte) 25, (byte) 26, (byte) 27, (byte) 28, + (byte) 29, (byte) 30, (byte) 31, (byte) 32, (byte) 33, (byte) 34, + (byte) 35, (byte) 36, (byte) 37, (byte) 38, (byte) 39, (byte) 40, + (byte) 41, (byte) 42, (byte) 43, (byte) 44, (byte) 45, (byte) 46, + (byte) 47, (byte) 48, (byte) 49, (byte) 50, (byte) 51, (byte) 52, + (byte) 53, (byte) 54, (byte) 55, (byte) 56, (byte) 57, (byte) 58, + (byte) 59, (byte) 60, (byte) 61, (byte) 62, (byte) 63, (byte) 64, + (byte) 65, (byte) 66, (byte) 67, (byte) 68, (byte) 69, (byte) 70, + (byte) 71, (byte) 72, (byte) 73, (byte) 74, (byte) 75, (byte) 76, + (byte) 77, (byte) 78, (byte) 79, (byte) 80, (byte) 81, (byte) 82, + (byte) 83, (byte) 84, (byte) 85, (byte) 86, (byte) 87, (byte) 88, + (byte) 89, (byte) 90, (byte) 91, (byte) 92, (byte) 93, (byte) 94, + (byte) 95, (byte) 96, (byte) 97, (byte) 98, (byte) 99, (byte) 100, + (byte) 101, (byte) 102, (byte) 103, (byte) 104, (byte) 105, + (byte) 106, (byte) 107, (byte) 108, (byte) 109, (byte) 110, + (byte) 111, (byte) 112, (byte) 113, (byte) 114, (byte) 115, + (byte) 116, (byte) 117, (byte) 118, (byte) 119, (byte) 120, + (byte) 121, (byte) 122, (byte) 123, (byte) 124, (byte) 125, + (byte) 126, (byte) 127, (byte) 128, (byte) 129, (byte) 130, + (byte) 131, (byte) 132, (byte) 133, (byte) 134, (byte) 135, + (byte) 136, (byte) 137, (byte) 138, (byte) 139, (byte) 140, + (byte) 141, (byte) 142, (byte) 143, (byte) 144, (byte) 145, + (byte) 146, (byte) 147, (byte) 148, (byte) 149, (byte) 150, + (byte) 151, (byte) 152, (byte) 153, (byte) 154, (byte) 155, + (byte) 156, (byte) 157, (byte) 158, (byte) 159, (byte) 160, + (byte) 161, (byte) 162, (byte) 163, (byte) 164, (byte) 165, + (byte) 166, (byte) 167, (byte) 168, (byte) 169, (byte) 170, + (byte) 171, (byte) 172, (byte) 173, (byte) 174, (byte) 175, + (byte) 176, (byte) 177, (byte) 178, (byte) 179, (byte) 180, + (byte) 181, (byte) 182, (byte) 183, (byte) 184, (byte) 185, + (byte) 186, (byte) 187, (byte) 188, (byte) 189, (byte) 190, + (byte) 191, (byte) 192, (byte) 193, (byte) 194, (byte) 195, + (byte) 196, (byte) 197, (byte) 198, (byte) 199, (byte) 200, + (byte) 201, (byte) 202, (byte) 203, (byte) 204, (byte) 205, + (byte) 206, (byte) 207, (byte) 208, (byte) 209, (byte) 210, + (byte) 211, (byte) 212, (byte) 213, (byte) 214, (byte) 215, + (byte) 216, 
(byte) 217, (byte) 218, (byte) 219, (byte) 220, + (byte) 221, (byte) 222, (byte) 223, (byte) 224, (byte) 225, + (byte) 226, (byte) 227, (byte) 228, (byte) 229, (byte) 230, + (byte) 231, (byte) 232, (byte) 233, (byte) 234, (byte) 235, + (byte) 236, (byte) 237, (byte) 238, (byte) 239, (byte) 240, + (byte) 241, (byte) 242, (byte) 243, (byte) 244, (byte) 245, + (byte) 246, (byte) 247, (byte) 248, (byte) 249, (byte) 250, + (byte) 251, (byte) 252, (byte) 253, (byte) 254, (byte) 255 })); + + assertEquals("00000000", ByteUtils.toBinaryString(new byte[] { 0 })); + assertEquals("00000001" + "00000010" + "00000011", + ByteUtils.toBinaryString(new byte[] { 1, 2, 3 })); + assertEquals("10101111" + "10101101" + "10101110", + ByteUtils.toBinaryString(new byte[] { (byte) 0xaf, (byte) 0xad, (byte) 0xae })); + + } + + public void specificFromHexStringTest(byte[] lhsBytes, String rhsString) { + try { + byte[] rhsBytes = ByteUtils.fromHexString(rhsString); + int offset = 0; + for(byte b: lhsBytes) { + assertEquals(b, rhsBytes[offset]); + ++offset; + } + offset = 0; + for(byte b: rhsBytes) { + assertEquals(b, lhsBytes[offset]); + ++offset; + } + } catch(DecoderException de) { + // DecoderException not expected... + assertTrue(false); + } + } + public void testFromHexString() { + specificFromHexStringTest(new byte[] { 0 }, "00"); + specificFromHexStringTest(new byte[] { 1, 2, 3 }, "010203"); + specificFromHexStringTest(new byte[] { (byte) 0xaf, (byte) 0xad, (byte) 0xae }, "afadae"); + // Use following commands to determine test input: + // $ seq -s ", " -f "(byte) %g" 0 255 + // $ printf '%02x' $(seq 0 255) + specificFromHexStringTest(new byte[] { (byte) 0, (byte) 1, (byte) 2, (byte) 3, (byte) 4, + (byte) 5, (byte) 6, (byte) 7, (byte) 8, (byte) 9, + (byte) 10, (byte) 11, (byte) 12, (byte) 13, (byte) 14, + (byte) 15, (byte) 16, (byte) 17, (byte) 18, (byte) 19, + (byte) 20, (byte) 21, (byte) 22, (byte) 23, (byte) 24, + (byte) 25, (byte) 26, (byte) 27, (byte) 28, (byte) 29, + (byte) 30, (byte) 31, (byte) 32, (byte) 33, (byte) 34, + (byte) 35, (byte) 36, (byte) 37, (byte) 38, (byte) 39, + (byte) 40, (byte) 41, (byte) 42, (byte) 43, (byte) 44, + (byte) 45, (byte) 46, (byte) 47, (byte) 48, (byte) 49, + (byte) 50, (byte) 51, (byte) 52, (byte) 53, (byte) 54, + (byte) 55, (byte) 56, (byte) 57, (byte) 58, (byte) 59, + (byte) 60, (byte) 61, (byte) 62, (byte) 63, (byte) 64, + (byte) 65, (byte) 66, (byte) 67, (byte) 68, (byte) 69, + (byte) 70, (byte) 71, (byte) 72, (byte) 73, (byte) 74, + (byte) 75, (byte) 76, (byte) 77, (byte) 78, (byte) 79, + (byte) 80, (byte) 81, (byte) 82, (byte) 83, (byte) 84, + (byte) 85, (byte) 86, (byte) 87, (byte) 88, (byte) 89, + (byte) 90, (byte) 91, (byte) 92, (byte) 93, (byte) 94, + (byte) 95, (byte) 96, (byte) 97, (byte) 98, (byte) 99, + (byte) 100, (byte) 101, (byte) 102, (byte) 103, + (byte) 104, (byte) 105, (byte) 106, (byte) 107, + (byte) 108, (byte) 109, (byte) 110, (byte) 111, + (byte) 112, (byte) 113, (byte) 114, (byte) 115, + (byte) 116, (byte) 117, (byte) 118, (byte) 119, + (byte) 120, (byte) 121, (byte) 122, (byte) 123, + (byte) 124, (byte) 125, (byte) 126, (byte) 127, + (byte) 128, (byte) 129, (byte) 130, (byte) 131, + (byte) 132, (byte) 133, (byte) 134, (byte) 135, + (byte) 136, (byte) 137, (byte) 138, (byte) 139, + (byte) 140, (byte) 141, (byte) 142, (byte) 143, + (byte) 144, (byte) 145, (byte) 146, (byte) 147, + (byte) 148, (byte) 149, (byte) 150, (byte) 151, + (byte) 152, (byte) 153, (byte) 154, (byte) 155, + (byte) 156, (byte) 157, (byte) 158, (byte) 159, + (byte) 160, (byte) 
161, (byte) 162, (byte) 163, + (byte) 164, (byte) 165, (byte) 166, (byte) 167, + (byte) 168, (byte) 169, (byte) 170, (byte) 171, + (byte) 172, (byte) 173, (byte) 174, (byte) 175, + (byte) 176, (byte) 177, (byte) 178, (byte) 179, + (byte) 180, (byte) 181, (byte) 182, (byte) 183, + (byte) 184, (byte) 185, (byte) 186, (byte) 187, + (byte) 188, (byte) 189, (byte) 190, (byte) 191, + (byte) 192, (byte) 193, (byte) 194, (byte) 195, + (byte) 196, (byte) 197, (byte) 198, (byte) 199, + (byte) 200, (byte) 201, (byte) 202, (byte) 203, + (byte) 204, (byte) 205, (byte) 206, (byte) 207, + (byte) 208, (byte) 209, (byte) 210, (byte) 211, + (byte) 212, (byte) 213, (byte) 214, (byte) 215, + (byte) 216, (byte) 217, (byte) 218, (byte) 219, + (byte) 220, (byte) 221, (byte) 222, (byte) 223, + (byte) 224, (byte) 225, (byte) 226, (byte) 227, + (byte) 228, (byte) 229, (byte) 230, (byte) 231, + (byte) 232, (byte) 233, (byte) 234, (byte) 235, + (byte) 236, (byte) 237, (byte) 238, (byte) 239, + (byte) 240, (byte) 241, (byte) 242, (byte) 243, + (byte) 244, (byte) 245, (byte) 246, (byte) 247, + (byte) 248, (byte) 249, (byte) 250, (byte) 251, + (byte) 252, (byte) 253, (byte) 254, (byte) 255 }, + "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff"); } public void testNthByte() { @@ -143,8 +287,9 @@ public void testNthByteRead(String theByte, String theLong, int nth) { public void testMd5() { String test = "alskdjflsajflksdjldfsdf"; MessageDigest digest = ByteUtils.getDigest("MD5"); - assertEquals(0, ByteUtils.compare(ByteUtils.md5(test.getBytes()), - digest.digest(test.getBytes()))); + assertEquals(0, + ByteUtils.compare(ByteUtils.md5(test.getBytes()), + digest.digest(test.getBytes()))); } public void testSha1() { diff --git a/test/unit/voldemort/utils/ConsistencyCheckTest.java b/test/unit/voldemort/utils/ConsistencyCheckTest.java new file mode 100644 index 0000000000..379dd50899 --- /dev/null +++ b/test/unit/voldemort/utils/ConsistencyCheckTest.java @@ -0,0 +1,538 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.utils; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +import org.junit.Before; +import org.junit.Test; + +import voldemort.ServerTestUtils; +import voldemort.client.ClientConfig; +import voldemort.client.protocol.admin.AdminClient; +import voldemort.client.protocol.admin.AdminClientConfig; +import voldemort.cluster.Cluster; +import voldemort.cluster.Node; +import voldemort.routing.RoutingStrategy; +import voldemort.routing.RoutingStrategyFactory; +import voldemort.server.VoldemortServer; +import voldemort.store.StoreDefinition; +import voldemort.store.socket.SocketStoreFactory; +import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; +import voldemort.utils.ConsistencyCheck.ClusterNode; +import voldemort.utils.ConsistencyCheck.HashedValue; +import voldemort.utils.ConsistencyCheck.KeyFetchTracker; +import voldemort.utils.ConsistencyCheck.Reporter; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Version; +import voldemort.versioning.Versioned; + +public class ConsistencyCheckTest { + + final String STORE_NAME = "consistency-check"; + final String STORES_XML = "test/common/voldemort/config/consistency-stores.xml"; + + Node n1 = new Node(1, "localhost", 10000, 10001, 10002, 0, new ArrayList()); + Node n1_dup = new Node(1, "localhost", 10000, 10001, 10002, 0, new ArrayList()); + Node n2 = new Node(2, "localhost", 10000, 10001, 10002, 0, new ArrayList()); + Node n3 = new Node(3, "localhost", 10000, 10001, 10002, 0, new ArrayList()); + Node n4 = new Node(4, "localhost", 10000, 10001, 10002, 0, new ArrayList()); + ClusterNode cn0_1 = new ClusterNode(0, n1); + ClusterNode cn0_1_dup = new ClusterNode(0, n1); + ClusterNode cn1_1dup = new ClusterNode(1, n1_dup); + ClusterNode cn0_2 = new ClusterNode(0, n2); + ClusterNode cn0_3 = new ClusterNode(0, n3); + ClusterNode cn0_4 = new ClusterNode(0, n4); + ClusterNode cn1_2 = new ClusterNode(1, n2); // 1.1 + + byte[] value1 = { 0, 1, 2, 3, 4 }; + byte[] value2 = { 0, 1, 2, 3, 5 }; + byte[] value3 = { 0, 1, 2, 3, 6 }; + byte[] value4 = { 0, 1, 2, 3, 7 }; + Versioned versioned1 = new Versioned(value1); + Versioned versioned2 = new Versioned(value2); + Version hv1 = new ConsistencyCheck.HashedValue(versioned1); + Version hv1_dup = new ConsistencyCheck.HashedValue(versioned1); + Version hv2 = new ConsistencyCheck.HashedValue(versioned2); + + long now = System.currentTimeMillis(); + Version vc1 = new VectorClock(now - Time.MS_PER_DAY); + Version vc2 = new VectorClock(now); + Version hv3 = new ConsistencyCheck.HashedValue(new Versioned(value1)); + Version vc3 = new VectorClock(now - Time.MS_PER_HOUR * 24 + 500 * Time.MS_PER_SECOND); + + // make set + Set setFourNodes = new HashSet(); + Set setThreeNodes = new HashSet(); + + @Before + public void setUp() { + setFourNodes.add(cn0_1); + setFourNodes.add(cn0_2); + setFourNodes.add(cn0_3); + setFourNodes.add(cn0_4); + setThreeNodes.add(cn0_1); + setThreeNodes.add(cn0_2); + setThreeNodes.add(cn0_3); + } + + @Test + public void testClusterNode() { + + // test getter + assertEquals(cn0_1.getNode(), n1); + assertEquals(cn1_1dup.getNode(), n1_dup); + assertEquals(cn0_2.getNode(), n2); + 
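// test prefix id getter + 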
assertEquals(new Integer(0), cn0_1.getPrefixId()); + assertEquals(new Integer(1), cn1_1dup.getPrefixId()); + assertEquals(new Integer(0), cn0_2.getPrefixId()); + + // test equals function + assertTrue(cn0_1.equals(cn0_1_dup)); + assertFalse(cn1_1dup.equals(cn0_1)); + assertFalse(cn0_2.equals(cn0_1)); + assertFalse(cn0_2.equals(cn1_1dup)); + + // test toString function + assertEquals("0.1", cn0_1.toString()); + assertEquals("1.1", cn1_1dup.toString()); + assertEquals("0.2", cn0_2.toString()); + } + + @Test + public void testHashedValue() { + + assertTrue(hv1.equals(hv1_dup)); + assertEquals(hv1.hashCode(), hv1_dup.hashCode()); + assertFalse(hv1.hashCode() == hv2.hashCode()); + assertFalse(hv1.equals(hv2)); + assertFalse(hv1.equals(null)); + assertFalse(hv1.equals(new Versioned(null))); + assertFalse(hv1.equals(new Integer(0))); + + assertEquals(versioned1.getVersion(), ((ConsistencyCheck.HashedValue) hv1).getInner()); + assertEquals(((ConsistencyCheck.HashedValue) hv1).getValueHash(), hv1.hashCode()); + } + + @Test + public void testRetentionChecker() { + ConsistencyCheck.RetentionChecker rc1 = new ConsistencyCheck.RetentionChecker(0); + ConsistencyCheck.RetentionChecker rc2 = new ConsistencyCheck.RetentionChecker(1); + + assertFalse(rc1.isExpired(vc1)); + assertFalse(rc1.isExpired(vc2)); + assertFalse(rc1.isExpired(hv3)); + assertFalse(rc1.isExpired(vc3)); + assertTrue(rc2.isExpired(vc1)); + assertFalse(rc2.isExpired(vc2)); + assertFalse(rc2.isExpired(hv3)); + assertTrue(rc2.isExpired(vc3)); + } + + @Test + public void testDetermineConsistencyVectorClock() { + Map> versionNodeSetMap = new HashMap>(); + int replicationFactor = 4; + + // Version is vector clock + Version v1 = new VectorClock(); + ((VectorClock) v1).incrementVersion(1, 100000001); + ((VectorClock) v1).incrementVersion(2, 100000003); + Version v2 = new VectorClock(); + ((VectorClock) v2).incrementVersion(1, 100000001); + ((VectorClock) v2).incrementVersion(3, 100000002); + Version v3 = new VectorClock(); + ((VectorClock) v3).incrementVersion(1, 100000001); + ((VectorClock) v3).incrementVersion(4, 100000001); + + // FULL: simple + versionNodeSetMap.put(v1, setFourNodes); + assertEquals(ConsistencyCheck.ConsistencyLevel.FULL, + ConsistencyCheck.determineConsistency(versionNodeSetMap, replicationFactor)); + + // FULL: three versions + versionNodeSetMap.clear(); + versionNodeSetMap.put(v1, setFourNodes); + versionNodeSetMap.put(v2, setFourNodes); + versionNodeSetMap.put(v3, setFourNodes); + assertEquals(ConsistencyCheck.ConsistencyLevel.FULL, + ConsistencyCheck.determineConsistency(versionNodeSetMap, replicationFactor)); + + // LATEST_CONSISTENCY: two versions + versionNodeSetMap.clear(); + versionNodeSetMap.put(v1, setFourNodes); + versionNodeSetMap.put(v2, setThreeNodes); + assertEquals(ConsistencyCheck.ConsistencyLevel.LATEST_CONSISTENT, + ConsistencyCheck.determineConsistency(versionNodeSetMap, replicationFactor)); + + // INCONSISTENT: one version + versionNodeSetMap.clear(); + versionNodeSetMap.put(v1, setThreeNodes); + assertEquals(ConsistencyCheck.ConsistencyLevel.INCONSISTENT, + ConsistencyCheck.determineConsistency(versionNodeSetMap, replicationFactor)); + + // INCONSISTENT: non-latest consistent + versionNodeSetMap.clear(); + versionNodeSetMap.put(v1, setThreeNodes); + versionNodeSetMap.put(v2, setFourNodes); + assertEquals(ConsistencyCheck.ConsistencyLevel.INCONSISTENT, + ConsistencyCheck.determineConsistency(versionNodeSetMap, replicationFactor)); + + // INCONSISTENT: three versions + versionNodeSetMap.clear(); + 
versionNodeSetMap.put(v1, setThreeNodes); + versionNodeSetMap.put(v2, setFourNodes); + versionNodeSetMap.put(v3, setThreeNodes); + assertEquals(ConsistencyCheck.ConsistencyLevel.INCONSISTENT, + ConsistencyCheck.determineConsistency(versionNodeSetMap, replicationFactor)); + } + + public void testDetermineConsistencyHashValue() { + Map> versionNodeSetMap = new HashMap>(); + int replicationFactor = 4; + + // vector clocks + Version v1 = new VectorClock(); + ((VectorClock) v1).incrementVersion(1, 100000001); + ((VectorClock) v1).incrementVersion(2, 100000003); + Version v2 = new VectorClock(); + ((VectorClock) v2).incrementVersion(1, 100000001); + ((VectorClock) v2).incrementVersion(3, 100000002); + Version v3 = new VectorClock(); + ((VectorClock) v3).incrementVersion(1, 100000001); + ((VectorClock) v3).incrementVersion(4, 100000001); + + // Version is HashedValue + Versioned versioned1 = new Versioned(value1, v1); + Versioned versioned2 = new Versioned(value2, v2); + Versioned versioned3 = new Versioned(value3, v3); + Version hv1 = new ConsistencyCheck.HashedValue(versioned1); + Version hv2 = new ConsistencyCheck.HashedValue(versioned2); + Version hv3 = new ConsistencyCheck.HashedValue(versioned3); + + // FULL + // one version + versionNodeSetMap.clear(); + versionNodeSetMap.put(hv1, setFourNodes); + assertEquals(ConsistencyCheck.ConsistencyLevel.FULL, + ConsistencyCheck.determineConsistency(versionNodeSetMap, replicationFactor)); + + // three versions + versionNodeSetMap.clear(); + versionNodeSetMap.put(hv1, setFourNodes); + versionNodeSetMap.put(hv2, setFourNodes); + versionNodeSetMap.put(hv3, setFourNodes); + assertEquals(ConsistencyCheck.ConsistencyLevel.FULL, + ConsistencyCheck.determineConsistency(versionNodeSetMap, replicationFactor)); + + // LATEST_CONSISTENT: not possible since timestamp is ignored + + // INCONSISTENT + versionNodeSetMap.clear(); + versionNodeSetMap.put(hv1, setThreeNodes); + assertEquals(ConsistencyCheck.ConsistencyLevel.INCONSISTENT, + ConsistencyCheck.determineConsistency(versionNodeSetMap, replicationFactor)); + + versionNodeSetMap.clear(); + versionNodeSetMap.put(hv1, setFourNodes); + versionNodeSetMap.put(hv2, setThreeNodes); + assertEquals(ConsistencyCheck.ConsistencyLevel.INCONSISTENT, + ConsistencyCheck.determineConsistency(versionNodeSetMap, replicationFactor)); + } + + @Test + public void testCleanInlegibleKeys() { + // versions + Version v1 = new VectorClock(); + ((VectorClock) v1).incrementVersion(1, 100000001); + ((VectorClock) v1).incrementVersion(2, 100000003); + Version v2 = new VectorClock(); + ((VectorClock) v2).incrementVersion(1, 100000002); + + // setup + Map>> map = new HashMap>>(); + Map> nodeSetMap = new HashMap>(); + Set oneNodeSet = new HashSet(); + oneNodeSet.add(cn0_1); + Set twoNodeSet = new HashSet(); + twoNodeSet.add(cn0_1); + twoNodeSet.add(cn0_2); + int requiredWrite = 2; + ByteArray key1 = new ByteArray(value1); + + // delete one key + map.clear(); + nodeSetMap.clear(); + nodeSetMap.put(v1, oneNodeSet); + map.put(key1, nodeSetMap); + + assertEquals(1, map.size()); + ConsistencyCheck.cleanIneligibleKeys(map, requiredWrite); + assertEquals(0, map.size()); + + // delete one version out of two versions + map.clear(); + nodeSetMap.clear(); + nodeSetMap.put(v1, oneNodeSet); + nodeSetMap.put(v2, twoNodeSet); + map.put(key1, nodeSetMap); + + assertEquals(2, map.get(key1).size()); + ConsistencyCheck.cleanIneligibleKeys(map, requiredWrite); + assertEquals(1, map.size()); + assertEquals(1, map.get(key1).size()); + + } + + @Test + public void 
testKeyVersionToString() { + byte[] keyBytes = { 0, 1, 2, 17, 4 }; + ByteArray key = new ByteArray(keyBytes); + long now = System.currentTimeMillis(); + Version v1 = new VectorClock(now); + Version v2 = new VectorClock(now + 1); + Versioned versioned = new Versioned(value1, v1); + + // make Prefix Nodes + Set set = new HashSet(); + set.add(cn0_1); + set.add(cn1_2); + set.add(cn0_3); + + // test vector clock + Map> mapVector = new HashMap>(); + mapVector.put(v1, set); + ((VectorClock) v1).incrementVersion(1, now); + String sVector = ConsistencyCheck.keyVersionToString(key, mapVector, "testStore", 99); + assertEquals("BAD_KEY,testStore,99,0001021104," + set.toString().replace(", ", ";") + "," + + now + ",[1:1]", sVector); + + // test two lines + ((VectorClock) v2).incrementVersion(1, now); + ((VectorClock) v2).incrementVersion(1, now + 1); + mapVector.put(v2, set); + String sVector2 = ConsistencyCheck.keyVersionToString(key, mapVector, "testStore", 99); + String s1 = "BAD_KEY,testStore,99,0001021104," + set.toString().replace(", ", ";") + "," + + now + ",[1:1]"; + + String s2 = "BAD_KEY,testStore,99,0001021104," + set.toString().replace(", ", ";") + "," + + (now + 1) + ",[1:2]"; + assertTrue(sVector2.equals(s1 + s2) || sVector2.equals(s2 + s1)); + + // test value hash + Version v3 = new HashedValue(versioned); + Map> mapHashed = new HashMap>(); + mapHashed.put(v3, set); + assertEquals("BAD_KEY,testStore,99,0001021104," + set.toString().replace(", ", ";") + "," + + now + ",[1:1],-1172398097", + ConsistencyCheck.keyVersionToString(key, mapHashed, "testStore", 99)); + + } + + @Test + public void testKeyFetchTracker() { + KeyFetchTracker tracker = new KeyFetchTracker(4); + tracker.recordFetch(cn0_1, new ByteArray(value1)); + tracker.recordFetch(cn0_2, new ByteArray(value1)); + tracker.recordFetch(cn0_3, new ByteArray(value1)); + tracker.recordFetch(cn0_4, new ByteArray(value1)); + tracker.recordFetch(cn0_1, new ByteArray(value2)); + tracker.recordFetch(cn0_2, new ByteArray(value2)); + tracker.recordFetch(cn0_3, new ByteArray(value2)); + assertNull(tracker.nextFinished()); + tracker.recordFetch(cn0_4, new ByteArray(value2)); + assertEquals(new ByteArray(value1), tracker.nextFinished()); + assertNull(tracker.nextFinished()); + // multiple fetch on same node same key + tracker.recordFetch(cn0_1, new ByteArray(value3)); + tracker.recordFetch(cn0_2, new ByteArray(value3)); + tracker.recordFetch(cn0_3, new ByteArray(value3)); + tracker.recordFetch(cn0_4, new ByteArray(value3)); + tracker.recordFetch(cn0_4, new ByteArray(value3)); + tracker.recordFetch(cn0_4, new ByteArray(value3)); + assertEquals(new ByteArray(value2), tracker.nextFinished()); + + tracker.recordFetch(cn0_1, new ByteArray(value4)); + tracker.recordFetch(cn0_2, new ByteArray(value4)); + tracker.recordFetch(cn0_3, new ByteArray(value4)); + + assertNull(tracker.nextFinished()); + + tracker.finishAll(); + assertEquals(new ByteArray(value3), tracker.nextFinished()); + assertEquals(new ByteArray(value4), tracker.nextFinished()); + assertNull(tracker.nextFinished()); + } + + @Test + public void testOnePartitionEndToEnd() throws Exception { + long now = System.currentTimeMillis(); + + // setup four nodes with one store and one partition + final SocketStoreFactory socketStoreFactory = new ClientRequestExecutorPool(2, + 10000, + 100000, + 32 * 1024); + VoldemortServer[] servers = new VoldemortServer[4]; + int partitionMap[][] = { { 0 }, { 1 }, { 2 }, { 3 } }; + Cluster cluster = ServerTestUtils.startVoldemortCluster(4, + servers, + partitionMap, 
+ socketStoreFactory, + true, + null, + STORES_XML, + new Properties()); + + Node node = cluster.getNodeById(0); + String bootstrapUrl = "tcp://" + node.getHost() + ":" + node.getSocketPort(); + AdminClient adminClient = new AdminClient(bootstrapUrl, + new AdminClientConfig(), + new ClientConfig()); + + byte[] value = { 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + + // make versions + VectorClock vc1 = new VectorClock(); + VectorClock vc2 = new VectorClock(); + VectorClock vc3 = new VectorClock(); + vc1.incrementVersion(0, now); // [0:1] + vc2.incrementVersion(1, now - 5000); // [1:1] + vc3.incrementVersion(0, now - 89000000); // [0:1], over a day old + + ArrayList>> n0store = new ArrayList>>(); + ArrayList>> n1store = new ArrayList>>(); + ArrayList>> n2store = new ArrayList>>(); + ArrayList>> n3store = new ArrayList>>(); + ArrayList keysHashedToPar0 = new ArrayList(); + + // find store + Versioned> storeDefinitions = adminClient.metadataMgmtOps.getRemoteStoreDefList(0); + List StoreDefitions = storeDefinitions.getValue(); + StoreDefinition storeDefinition = null; + for(StoreDefinition def: StoreDefitions) { + if(def.getName().equals(STORE_NAME)) { + storeDefinition = def; + break; + } + } + assertNotNull("No such store found: " + STORE_NAME, storeDefinition); + + RoutingStrategy router = new RoutingStrategyFactory().updateRoutingStrategy(storeDefinition, + cluster); + while(keysHashedToPar0.size() < 7) { + // generate random key + Map map = ServerTestUtils.createRandomKeyValuePairs(1); + ByteArray key = map.keySet().iterator().next(); + key.get()[0] = (byte) keysHashedToPar0.size(); + Integer masterPartition = router.getMasterPartition(key.get()); + if(masterPartition == 0) { + keysHashedToPar0.add(key); + } else { + continue; + } + } + ByteArray k6 = keysHashedToPar0.get(6); + ByteArray k5 = keysHashedToPar0.get(5); + ByteArray k4 = keysHashedToPar0.get(4); + ByteArray k3 = keysHashedToPar0.get(3); + ByteArray k2 = keysHashedToPar0.get(2); + ByteArray k1 = keysHashedToPar0.get(1); + ByteArray k0 = keysHashedToPar0.get(0); + + // insert K6 into node 0,1,2 + Versioned v6 = new Versioned(value, vc1); + n0store.add(Pair.create(k6, v6)); + n1store.add(Pair.create(k6, v6)); + n2store.add(Pair.create(k6, v6)); + + // insert K6(conflicting but not latest version) into node 0,1,2,3 + Versioned v6ConflictEarly = new Versioned(value, vc2); + n0store.add(Pair.create(k6, v6ConflictEarly)); + n1store.add(Pair.create(k6, v6ConflictEarly)); + n2store.add(Pair.create(k6, v6ConflictEarly)); + n3store.add(Pair.create(k6, v6ConflictEarly)); + + // insert K4,K5 into four nodes + Versioned v5 = new Versioned(value, vc1); + Versioned v4 = new Versioned(value, vc1); + n0store.add(Pair.create(k5, v5)); + n1store.add(Pair.create(k5, v5)); + n2store.add(Pair.create(k5, v5)); + n3store.add(Pair.create(k5, v5)); + n0store.add(Pair.create(k4, v4)); + n1store.add(Pair.create(k4, v4)); + n2store.add(Pair.create(k4, v4)); + n3store.add(Pair.create(k4, v4)); + + // insert K3 into node 0,1,2 + Versioned v3 = new Versioned(value, vc2); + n0store.add(Pair.create(k3, v3)); + n1store.add(Pair.create(k3, v3)); + n2store.add(Pair.create(k3, v3)); + + // insert K3(conflicting but latest version) into node 0,1,2,3 + Versioned v3ConflictLate = new Versioned(value, vc1); + n0store.add(Pair.create(k3, v3ConflictLate)); + n1store.add(Pair.create(k3, v3ConflictLate)); + n2store.add(Pair.create(k3, v3ConflictLate)); + n3store.add(Pair.create(k3, v3ConflictLate)); + + // insert K2 into node 0,1 + Versioned v2 = new Versioned(value, vc1); + 
n0store.add(Pair.create(k2, v2)); + n1store.add(Pair.create(k2, v2)); + + // insert K1 into node 0 + Versioned v1 = new Versioned(value, vc1); + n0store.add(Pair.create(k1, v1)); + + // insert K0(out of retention) into node 0,1,2 + Versioned v0 = new Versioned(value, vc3); + n0store.add(Pair.create(k0, v0)); + + // stream to store + adminClient.streamingOps.updateEntries(0, STORE_NAME, n0store.iterator(), null); + adminClient.streamingOps.updateEntries(1, STORE_NAME, n1store.iterator(), null); + adminClient.streamingOps.updateEntries(2, STORE_NAME, n2store.iterator(), null); + adminClient.streamingOps.updateEntries(3, STORE_NAME, n3store.iterator(), null); + + // should have FULL:2(K4,K5), LATEST_CONSISTENT:1(K3), + // INCONSISTENT:2(K6,K2), ignored(K1,K0) + List urls = new ArrayList(); + urls.add(bootstrapUrl); + ConsistencyCheck checker = new ConsistencyCheck(urls, STORE_NAME, 0, null); + Reporter reporter = null; + checker.connect(); + reporter = checker.execute(); + + assertEquals(7 - 2, reporter.numTotalKeys); + assertEquals(3, reporter.numGoodKeys); + } +} diff --git a/test/unit/voldemort/utils/ConsistencyFixTest.java b/test/unit/voldemort/utils/ConsistencyFixTest.java new file mode 100644 index 0000000000..12de83e96c --- /dev/null +++ b/test/unit/voldemort/utils/ConsistencyFixTest.java @@ -0,0 +1,339 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.utils; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Properties; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.RejectedExecutionHandler; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; + +import org.junit.Test; + +import voldemort.ServerTestUtils; +import voldemort.TestUtils; +import voldemort.cluster.Cluster; +import voldemort.cluster.Node; +import voldemort.server.VoldemortServer; +import voldemort.store.socket.SocketStoreFactory; +import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; +import voldemort.utils.ConsistencyFix.BadKey; +import voldemort.utils.ConsistencyFix.BadKeyOrphanReader; +import voldemort.utils.ConsistencyFix.BadKeyReader; +import voldemort.utils.ConsistencyFix.BadKeyStatus; +import voldemort.utils.ConsistencyFix.BadKeyWriter; +import voldemort.utils.ConsistencyFix.Stats; +import voldemort.versioning.VectorClock; + +public class ConsistencyFixTest { + + final static String STORE_NAME = "consistency-fix"; + final static String STORES_XML = "test/common/voldemort/config/consistency-stores.xml"; + + /** + * + * @return bootstrap url + */ + public static String setUpCluster() { + // setup four nodes with one store and one partition + final SocketStoreFactory socketStoreFactory = new ClientRequestExecutorPool(2, + 10000, + 100000, + 32 * 1024); + VoldemortServer[] servers = new VoldemortServer[4]; + int partitionMap[][] = { { 0 }, { 1 }, { 2 }, { 3 } }; + try { + Cluster cluster = ServerTestUtils.startVoldemortCluster(4, + servers, + partitionMap, + socketStoreFactory, + true, + null, + STORES_XML, + new Properties()); + Node node = cluster.getNodeById(0); + return "tcp://" + node.getHost() + ":" + node.getSocketPort(); + } catch(IOException e) { + e.printStackTrace(); + fail("Unexpected exception"); + } + + return null; + } + + public void badKeyReaderWriteBadKeys(String fileName, boolean addWhiteSpace) { + // Write file of "bad keys" with messy white space + try { + BufferedWriter fileWriter = new BufferedWriter(new FileWriter(fileName)); + if(addWhiteSpace) + fileWriter.write("\n\n\t\t\n\n\t\n"); + for(int i = 0; i < 1000; ++i) { + byte[] keyb = TestUtils.randomBytes(10); + fileWriter.write(ByteUtils.toHexString(keyb) + "\n"); + if(addWhiteSpace) { + + if(i % 5 == 0) { + fileWriter.write("\n\n\t\t\n\n\t\n"); + } + if(i % 7 == 0) { + fileWriter.write("\t"); + } + } + } + if(addWhiteSpace) + fileWriter.write("\n\n\t\t\n\n\t\n"); + fileWriter.close(); + } catch(IOException e) { + e.printStackTrace(); + fail("Unexpected exception"); + } + } + + public void badKeyReaderWriteOrphanKeys(String fileName, boolean addWhiteSpace) { + // Write file of "orphans" with some white space between entries. 
+ /*- + * Example entry: + 6473333333646464,2 + 00,version(1:1, 2:1) ts:1357257858674 + 00,version(1:1, 3:1) ts:1357257858684 + */ + try { + BufferedWriter fileWriter = new BufferedWriter(new FileWriter(fileName)); + if(addWhiteSpace) + fileWriter.write("\n\n\t\t\n\n\t\n"); + for(int i = 0; i < 1000; ++i) { + int numValues = (i % 3) + 1; + byte[] keyb = TestUtils.randomBytes(10); + String keyLine = ByteUtils.toHexString(keyb) + "," + numValues; + System.out.println("keyLine: " + keyLine); + fileWriter.write(keyLine + "\n"); + for(int j = 0; j < numValues; j++) { + int valLength = (j + 10) * (j + 1); + String value = ByteUtils.toHexString(TestUtils.randomBytes(valLength)); + VectorClock vectorClock = TestUtils.getClock(j); + String valueLine = value + "," + vectorClock.toString(); + System.out.println("valueLine: " + valueLine); + fileWriter.write(valueLine + "\n"); + } + if(addWhiteSpace) { + if(i % 5 == 0) { + fileWriter.write("\n\n\t\t\n\n\t\n"); + } + } + } + if(addWhiteSpace) + fileWriter.write("\n\n\t\t\n\n\t\n"); + fileWriter.close(); + } catch(IOException e) { + e.printStackTrace(); + fail("Unexpected exception"); + } + } + + /** + * + * @param orphan true for testing orphan, false for testing normal... + */ + public void badKeyReaderHelper(boolean orphan) { + String tmpDir = TestUtils.createTempDir().getAbsolutePath(); + String fileName = tmpDir + "BadKeyFile"; + if(orphan) { + badKeyReaderWriteOrphanKeys(fileName, true); + } else { + badKeyReaderWriteBadKeys(fileName, true); + } + + // Get cluster bootstrap url + String url = setUpCluster(); + + // Construct ConsistencyFix with parseOnly true + ConsistencyFix consistencyFix = new ConsistencyFix(url, STORE_NAME, 100, 100, false, true); + + // Do set up for BadKeyReader akin to consistencyFix.execute... + int parallelism = 1; + BlockingQueue blockingQ = new ArrayBlockingQueue(parallelism); + RejectedExecutionHandler rejectedExecutionHandler = new ThreadPoolExecutor.CallerRunsPolicy(); + ExecutorService consistencyFixWorkers = new ThreadPoolExecutor(parallelism, + parallelism, + 0L, + TimeUnit.MILLISECONDS, + blockingQ, + rejectedExecutionHandler); + BlockingQueue badKeyQOut = new ArrayBlockingQueue(10000); + + ExecutorService badKeyReaderService = Executors.newSingleThreadExecutor(); + CountDownLatch allBadKeysReadLatch = new CountDownLatch(1); + + // Submit file of bad keys to (appropriate) BadKeyReader + BadKeyReader bkr = null; + if(orphan) { + bkr = new BadKeyOrphanReader(allBadKeysReadLatch, + fileName, + consistencyFix, + consistencyFixWorkers, + badKeyQOut); + } else { + bkr = new BadKeyReader(allBadKeysReadLatch, + fileName, + consistencyFix, + consistencyFixWorkers, + badKeyQOut); + } + badKeyReaderService.submit(bkr); + + // Wait for file to be processed. + try { + allBadKeysReadLatch.await(); + + badKeyReaderService.shutdown(); + consistencyFixWorkers.shutdown(); + } catch(InterruptedException e) { + e.printStackTrace(); + fail("Unexpected exception"); + } + consistencyFix.close(); + + // Make sure everything worked as expected. 
+ assertFalse(bkr.hasException()); + assertEquals(0, badKeyQOut.size()); + } + + @Test + public void testBadKeyReader() { + badKeyReaderHelper(false); + } + + @Test + public void testBadKeyOrphanReader() { + badKeyReaderHelper(true); + } + + @Test + public void testBadKeyResult() { + BadKey badKey = new BadKey("0101", "0101\n"); + ConsistencyFix.BadKeyStatus bkr1 = new BadKeyStatus(badKey, ConsistencyFix.Status.SUCCESS); + assertFalse(bkr1.isPoison()); + assertEquals(bkr1.getBadKey().getKeyInHexFormat(), "0101"); + assertEquals(bkr1.getBadKey().getReaderInput(), "0101\n"); + assertEquals(bkr1.getStatus(), ConsistencyFix.Status.SUCCESS); + + ConsistencyFix.BadKeyStatus bkr2 = new BadKeyStatus(); + assertTrue(bkr2.isPoison()); + assertEquals(bkr2.getBadKey(), null); + assertEquals(bkr2.getStatus(), null); + } + + @Test + public void testBadKeyWriter() { + String tmpDir = TestUtils.createTempDir().getAbsolutePath(); + String fileName = tmpDir + "BadKeyFile"; + + // Set up bad key writer + BlockingQueue bq = new ArrayBlockingQueue(5); + ExecutorService badKeyWriterService = Executors.newSingleThreadExecutor(); + + BadKeyWriter badKeyWriter = new BadKeyWriter(fileName, bq); + badKeyWriterService.submit(badKeyWriter); + + // Enqueue stuff for bad key writer to write + try { + for(int i = 0; i < 100; ++i) { + BadKey badKey = new BadKey(Integer.toHexString(i), Integer.toHexString(i) + "\n"); + + bq.put(new BadKeyStatus(badKey, ConsistencyFix.Status.REPAIR_EXCEPTION)); + } + // Poison bad key writer + bq.put(new BadKeyStatus()); + } catch(InterruptedException e) { + e.printStackTrace(); + fail("Unexpected exception"); + } + + // wait for bad key writer to shutdown + badKeyWriterService.shutdown(); + try { + badKeyWriterService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); + } catch(InterruptedException e) { + e.printStackTrace(); + fail("Unexpected exception"); + } + + assertFalse(badKeyWriter.hasException()); + + // Read output file & verify. 
+ try { + BufferedReader fileReader = new BufferedReader(new FileReader(fileName)); + + int i = 0; + for(String keyLine = fileReader.readLine(); keyLine != null; keyLine = fileReader.readLine()) { + assertEquals(keyLine, Integer.toHexString(i)); + i++; + } + } catch(FileNotFoundException fnfe) { + fnfe.printStackTrace(); + fail("Unexpected exception"); + } catch(IOException ioe) { + ioe.printStackTrace(); + fail("Unexpected exception"); + } + } + + @Test + public void testConsistencyFixSetupTeardown() { + String url = setUpCluster(); + + ConsistencyFix consistencyFix = new ConsistencyFix(url, STORE_NAME, 100, 100, false, false); + + consistencyFix.close(); + } + + @Test + public void testStats() throws InterruptedException { + ConsistencyFix.Stats stats = new Stats(1000); + + long lastTimeMs = stats.lastTimeMs; + TimeUnit.MILLISECONDS.sleep(2); + for(int i = 0; i < 1001; ++i) { + stats.incrementFixCount(); + } + assertTrue(stats.fixCount == 1001); + assertTrue(stats.startTimeMs < stats.lastTimeMs); + assertTrue(lastTimeMs < System.currentTimeMillis()); + + } + + @Test + public void testStatus() { + ConsistencyFix.Status status = ConsistencyFix.Status.SUCCESS; + assertEquals(status.toString(), "success"); + } +} diff --git a/test/unit/voldemort/utils/ConsistencyFixWorkerTest.java b/test/unit/voldemort/utils/ConsistencyFixWorkerTest.java new file mode 100644 index 0000000000..ca2d3b0fd2 --- /dev/null +++ b/test/unit/voldemort/utils/ConsistencyFixWorkerTest.java @@ -0,0 +1,177 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.utils; + +import static org.junit.Assert.assertTrue; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Test; + +import voldemort.TestUtils; +import voldemort.client.protocol.admin.AdminClient; +import voldemort.client.protocol.admin.QueryKeyResult; +import voldemort.store.routed.NodeValue; +import voldemort.utils.ConsistencyFix.BadKey; +import voldemort.versioning.Versioned; + +public class ConsistencyFixWorkerTest { + + public void testRepair(int putNodes[], boolean orphan) { + byte[] bKey = TestUtils.randomBytes(10); + String hexKey = ByteUtils.toHexString(bKey); + ByteArray baKey = new ByteArray(bKey); + + BadKey badKey; + QueryKeyResult queryKeyResult; + if(!orphan) { + badKey = new BadKey(hexKey, hexKey + "\n"); + queryKeyResult = null; + } else { + StringBuilder orphanInput = new StringBuilder(); + orphanInput.append(hexKey + "," + "1\n"); + List> values = new ArrayList>(0); + int arbitraryNodeId = 2; + Versioned versioned = TestUtils.getVersioned(TestUtils.randomBytes(25), + arbitraryNodeId); + orphanInput.append(ByteUtils.toHexString(versioned.getValue())); + orphanInput.append("," + versioned.toString() + "\n"); + values.add(versioned); + + badKey = new BadKey(hexKey, orphanInput.toString()); + queryKeyResult = new QueryKeyResult(baKey, values); + } + + Versioned value = TestUtils.getVersioned(TestUtils.randomBytes(25), 0); + + String url = ConsistencyFixTest.setUpCluster(); + ConsistencyFix consistencyFix = new ConsistencyFix(url, + ConsistencyFixTest.STORE_NAME, + 100, + 100, + false, + false); + + AdminClient adminClient = consistencyFix.getAdminClient(); + + System.out.println("Initial get"); + for(int i = 0; i < 4; ++i) { + List> results; + results = adminClient.storeOps.getNodeKey(ConsistencyFixTest.STORE_NAME, i, baKey); + assertTrue(results.size() == 0); + } + + System.out.println("Puts"); + for(int putNode: putNodes) { + NodeValue nodeKeyValue; + nodeKeyValue = new NodeValue(putNode, baKey, value); + adminClient.storeOps.putNodeKeyValue(ConsistencyFixTest.STORE_NAME, nodeKeyValue); + } + + // Construct normal consistency fix worker + ConsistencyFixWorker consistencyFixWorker = null; + if(!orphan) { + consistencyFixWorker = new ConsistencyFixWorker(badKey, consistencyFix, null); + } else { + consistencyFixWorker = new ConsistencyFixWorker(badKey, + consistencyFix, + null, + queryKeyResult); + } + consistencyFixWorker.run(); + + System.out.println("Second get"); + int expectedNumVersions = 0; + if(putNodes.length > 0) { + expectedNumVersions++; + } + if(orphan) { + expectedNumVersions++; + } + for(int i = 0; i < 4; ++i) { + System.out.println("Node : " + i); + List> results; + results = adminClient.storeOps.getNodeKey(ConsistencyFixTest.STORE_NAME, i, baKey); + for(Versioned v: results) { + System.out.println("\t" + v.getVersion()); + } + + assertTrue(results.size() == expectedNumVersions); + } + } + + @Test + public void repairPutOne() { + int putNodes[] = { 1 }; + testRepair(putNodes, false); + } + + @Test + public void repairPutTwo() { + int putNodes[] = { 0, 3 }; + testRepair(putNodes, false); + } + + @Test + public void repairPutThree() { + int putNodes[] = { 1, 2, 3 }; + testRepair(putNodes, false); + } + + @Test + public void repairPutFour() { + int putNodes[] = { 0, 1, 2, 3 }; + testRepair(putNodes, false); + } + + @Test + public void repairPutZero() { + int putNodes[] = {}; + testRepair(putNodes, false); + } + + @Test + public void orphanPutOne() { + int putNodes[] = { 1 }; + testRepair(putNodes, true); + } + + 
@Test + public void orphanPutTwo() { + int putNodes[] = { 0, 3 }; + testRepair(putNodes, true); + } + + @Test + public void orphanPutThree() { + int putNodes[] = { 1, 2, 3 }; + testRepair(putNodes, true); + } + + @Test + public void orphanPutFour() { + int putNodes[] = { 0, 1, 2, 3 }; + testRepair(putNodes, true); + } + + @Test + public void orphanPutZero() { + int putNodes[] = {}; + testRepair(putNodes, true); + } +} diff --git a/test/unit/voldemort/utils/RebalanceUtilsTest.java b/test/unit/voldemort/utils/RebalanceUtilsTest.java index ccf5b6f9cf..2cf2cebd14 100644 --- a/test/unit/voldemort/utils/RebalanceUtilsTest.java +++ b/test/unit/voldemort/utils/RebalanceUtilsTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2011 LinkedIn, Inc + * Copyright 2011-2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -43,12 +43,12 @@ public void testGetNodeIds() { List nodes = Lists.newArrayList(); // Test with empty node list - assertEquals(RebalanceUtils.getNodeIds(nodes).size(), 0); + assertEquals(NodeUtils.getNodeIds(nodes).size(), 0); // Add one node nodes.add(new Node(0, "localhost", 1, 2, 3, new ArrayList())); - assertEquals(RebalanceUtils.getNodeIds(nodes).size(), 1); - assertEquals(RebalanceUtils.getNodeIds(nodes).get(0).intValue(), 0); + assertEquals(NodeUtils.getNodeIds(nodes).size(), 1); + assertEquals(NodeUtils.getNodeIds(nodes).get(0).intValue(), 0); } public void testGetClusterWithNewNodes() { @@ -78,4 +78,153 @@ public void testGetClusterWithNewNodes() { assertEquals(generatedCluster.getNodeById(3).getPartitionIds().size(), 0); } + + public void testRemoveItemsToSplitListEvenly() { + // input of size 5 + List input = new ArrayList(); + System.out.println("Input of size 5"); + for(int i = 0; i < 5; ++i) { + input.add(i); + } + + List output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 1); + assertEquals(output.size(), 2); + assertEquals(output.get(0), new Integer(1)); + assertEquals(output.get(1), new Integer(3)); + System.out.println("1 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 2); + assertEquals(output.size(), 1); + assertEquals(output.get(0), new Integer(2)); + System.out.println("2 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 3); + assertEquals(output.size(), 1); + assertEquals(output.get(0), new Integer(2)); + System.out.println("3 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 4); + assertEquals(output.size(), 1); + assertEquals(output.get(0), new Integer(2)); + System.out.println("4 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 5); + assertEquals(output.size(), 0); + System.out.println("5 : " + output); + + // input of size 10 + input.clear(); + System.out.println("Input of size 10"); + for(int i = 0; i < 10; ++i) { + input.add(i); + } + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 1); + assertEquals(output.size(), 5); + assertEquals(output.get(0), new Integer(1)); + assertEquals(output.get(4), new Integer(9)); + System.out.println("1 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 2); + assertEquals(output.size(), 3); + assertEquals(output.get(0), new Integer(2)); + assertEquals(output.get(2), new Integer(8)); + System.out.println("2 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 3); + 
assertEquals(output.size(), 2); + assertEquals(output.get(0), new Integer(3)); + assertEquals(output.get(1), new Integer(7)); + System.out.println("3 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 4); + assertEquals(output.size(), 2); + assertEquals(output.get(0), new Integer(3)); + assertEquals(output.get(1), new Integer(7)); + System.out.println("4 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 5); + assertEquals(output.size(), 1); + assertEquals(output.get(0), new Integer(5)); + System.out.println("5 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 6); + assertEquals(output.size(), 1); + assertEquals(output.get(0), new Integer(5)); + System.out.println("6 : " + output); + + // input of size 20 + input.clear(); + System.out.println("Input of size 20"); + for(int i = 0; i < 20; ++i) { + input.add(i); + } + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 1); + assertEquals(output.size(), 10); + assertEquals(output.get(0), new Integer(1)); + assertEquals(output.get(9), new Integer(19)); + System.out.println("1 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 2); + assertEquals(output.size(), 6); + assertEquals(output.get(0), new Integer(2)); + assertEquals(output.get(5), new Integer(17)); + System.out.println("2 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 3); + assertEquals(output.size(), 5); + assertEquals(output.get(0), new Integer(3)); + assertEquals(output.get(4), new Integer(17)); + System.out.println("3 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 4); + assertEquals(output.size(), 4); + assertEquals(output.get(0), new Integer(4)); + assertEquals(output.get(3), new Integer(16)); + System.out.println("4 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 5); + assertEquals(output.size(), 3); + assertEquals(output.get(0), new Integer(5)); + assertEquals(output.get(2), new Integer(15)); + System.out.println("5 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 6); + assertEquals(output.size(), 2); + assertEquals(output.get(0), new Integer(6)); + assertEquals(output.get(1), new Integer(13)); + System.out.println("6 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 7); + assertEquals(output.size(), 2); + assertEquals(output.get(0), new Integer(6)); + assertEquals(output.get(1), new Integer(13)); + System.out.println("7 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 9); + assertEquals(output.size(), 2); + assertEquals(output.get(0), new Integer(6)); + assertEquals(output.get(1), new Integer(13)); + System.out.println("9 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 10); + assertEquals(output.size(), 1); + assertEquals(output.get(0), new Integer(10)); + System.out.println("10 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 11); + assertEquals(output.size(), 1); + assertEquals(output.get(0), new Integer(10)); + System.out.println("11 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 19); + assertEquals(output.size(), 1); + assertEquals(output.get(0), new Integer(10)); + System.out.println("19 : " + output); + + output = RebalanceClusterUtils.removeItemsToSplitListEvenly(input, 20); + 
assertEquals(output.size(), 0); + System.out.println("20 : " + output); + } } diff --git a/test/unit/voldemort/utils/ServerTestUtilsTest.java b/test/unit/voldemort/utils/ServerTestUtilsTest.java index 5608a6e605..8864c1866a 100644 --- a/test/unit/voldemort/utils/ServerTestUtilsTest.java +++ b/test/unit/voldemort/utils/ServerTestUtilsTest.java @@ -41,38 +41,47 @@ public class ServerTestUtilsTest { 100000, 32 * 1024); + @Test public void testStartVoldemortCluster() throws IOException { int numServers = 8; VoldemortServer[] servers = new VoldemortServer[numServers]; int partitionMap[][] = { { 0 }, { 1 }, { 2 }, { 3 }, { 4 }, { 5 }, { 6 }, { 7 } }; - ServerTestUtils.startVoldemortCluster(numServers, - servers, - partitionMap, - socketStoreFactory, - true, - null, - storesXmlfile, - new Properties()); + Cluster cluster = ServerTestUtils.startVoldemortCluster(numServers, + servers, + partitionMap, + socketStoreFactory, + true, + null, + storesXmlfile, + new Properties()); + assertTrue(cluster != null); } - @Test - public void startMultipleVoldemortClusters() throws IOException { + // ********************************************************************** + // * START : "commented out" tests + // These tests helped to find the root cause of the BindException problem when + // clusters were started. These tests were used in debugging and stress + // testing and should not be part of our general junit tests. The @Test + // annotation is therefore commented out. The debugging methods themselves + // are not commented out so that they can be kept up to date with other code + // changes. + + // @Test + public void stressTestStartVoldemortCluster() throws IOException { for(int i = 0; i < 10; i++) { testStartVoldemortCluster(); } } - // ********************************************************************** - // * START : TESTS THAT HELPED FIND ROOT CAUSE OF BindException PROBLEM * - // @Test public void startMultipleVoldemortServers() throws IOException { - Cluster cluster = ServerTestUtils.getLocalCluster(8, new int[][] { { 0 }, { 1 }, { 2 }, - { 3 }, { 4 }, { 5 }, { 6 }, { 7 } }); + Cluster cluster = ServerTestUtils.getLocalCluster(16, new int[][] { { 0 }, { 1 }, { 2 }, + { 3 }, { 4 }, { 5 }, { 6 }, { 7 }, { 8 }, { 9 }, { 10 }, { 11 }, { 12 }, { 13 }, + { 14 }, { 15 } }); - VoldemortServer[] servers = new VoldemortServer[8]; + VoldemortServer[] servers = new VoldemortServer[16]; - for(int i = 0; i < 8; i++) { + for(int i = 0; i < 16; i++) { servers[i] = ServerTestUtils.startVoldemortServer(socketStoreFactory, ServerTestUtils.createServerConfig(true, i, @@ -86,6 +95,14 @@ public void startMultipleVoldemortServers() throws IOException { assertTrue(true); } + // @Test + public void startMultipleVoldemortServersUnsafe5() throws IOException { + for(int i = 0; i < 5; i++) { + startMultipleVoldemortServers(); + } + assertTrue(true); + } + + // @Test public void startMultipleVoldemortServers10() { for(int i = 0; i < 10; i++) { @@ -180,6 +197,6 @@ public void testFindFreePorts100() throws Exception { } } - // ** END : TESTS THAT HELPED FIND ROOT CAUSE OF BindException PROBLEM ** + // * END : "commented out" tests // ********************************************************************** } diff --git a/test/unit/voldemort/utils/pool/KeyedResourcePoolContentionTest.java b/test/unit/voldemort/utils/pool/KeyedResourcePoolContentionTest.java new file mode 100644 index 0000000000..c565f4c0ed --- /dev/null +++ b/test/unit/voldemort/utils/pool/KeyedResourcePoolContentionTest.java @@ -0,0 +1,139 @@ +/* + * Copyright 2012
LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package voldemort.utils.pool; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.util.Random; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; + +import org.junit.Before; +import org.junit.Test; + +public class KeyedResourcePoolContentionTest extends KeyedResourcePoolTestBase { + + protected static int POOL_SIZE = 5; + protected static long TIMEOUT_MS = 500; + + @Before + public void setUp() { + factory = new TestResourceFactory(); + config = new ResourcePoolConfig().setMaxPoolSize(POOL_SIZE) + .setTimeout(TIMEOUT_MS, TimeUnit.MILLISECONDS); + this.pool = new KeyedResourcePool(factory, config); + } + + // This method was helpful when developing contendForResources + public void printStats(String key) { + System.err.println(""); + System.err.println("getCreated: " + this.factory.getCreated()); + System.err.println("getDestroyed: " + this.factory.getDestroyed()); + System.err.println("getTotalResourceCount(key): " + this.pool.getTotalResourceCount(key)); + System.err.println("getTotalResourceCount(): " + this.pool.getTotalResourceCount()); + System.err.println("getCheckedInResourcesCount(key): " + + this.pool.getCheckedInResourcesCount(key)); + System.err.println("getCheckedInResourceCount(): " + this.pool.getCheckedInResourceCount()); + } + + @Test + public void contendForResources() throws Exception { + int numCheckers = POOL_SIZE * 2; + int numChecks = 10 * 1000; + String key = "Key"; + float invalidationRate = (float) 0.25; + CountDownLatch waitForThreads = new CountDownLatch(numCheckers); + CountDownLatch waitForCheckers = new CountDownLatch(numCheckers); + for(int i = 0; i < numCheckers; ++i) { + new Thread(new Checkers(waitForThreads, + waitForCheckers, + key, + numChecks, + invalidationRate)).start(); + } + + try { + waitForCheckers.await(); + assertEquals(this.pool.getCheckedInResourceCount(), this.pool.getTotalResourceCount()); + } catch(InterruptedException e) { + e.printStackTrace(); + } + + } + + public class Checkers implements Runnable { + + private final CountDownLatch startSignal; + private final CountDownLatch doneSignal; + + private final String key; + private final int checks; + + private Random random; + private float invalidationRate; + + Checkers(CountDownLatch startSignal, + CountDownLatch doneSignal, + String key, + int checks, + float invalidationRate) { + this.startSignal = startSignal; + this.doneSignal = doneSignal; + + this.key = key; + this.checks = checks; + + this.random = new Random(); + this.invalidationRate = invalidationRate; + } + + @Override + public void run() { + startSignal.countDown(); + try { + startSignal.await(); + } catch(InterruptedException e) { + e.printStackTrace(); + } + + try { + TestResource tr = null; + for(int i = 0; i < checks; ++i) { + tr = pool.checkout(key); + assertTrue(tr.isValid()); + + // Invalid some resources (except on last 
checkin) + float f = random.nextFloat(); + if(f < invalidationRate && i != checks - 1) { + tr.invalidate(); + } + Thread.yield(); + + pool.checkin(key, tr); + Thread.yield(); + + // if(i % 1000 == 0) { printStats(key); } + } + } catch(Exception e) { + System.err.println(e.toString()); + fail(e.toString()); + } + doneSignal.countDown(); + } + } +} diff --git a/test/unit/voldemort/utils/pool/KeyedResourcePoolRaceTest.java b/test/unit/voldemort/utils/pool/KeyedResourcePoolRaceTest.java new file mode 100644 index 0000000000..0ec0f9a2dd --- /dev/null +++ b/test/unit/voldemort/utils/pool/KeyedResourcePoolRaceTest.java @@ -0,0 +1,83 @@ +/* + * Copyright 2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package voldemort.utils.pool; + +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +import org.junit.Before; +import org.junit.Test; + +public class KeyedResourcePoolRaceTest extends KeyedResourcePoolTestBase { + + protected static final int POOL_SIZE = 100; + protected static final long TIMEOUT_MS = 500; + protected static final long NUM_TESTS = 250; + + @Before + public void setUp() { + factory = new TestResourceFactory(); + config = new ResourcePoolConfig().setMaxPoolSize(POOL_SIZE) + .setTimeout(TIMEOUT_MS, TimeUnit.MILLISECONDS); + this.pool = new KeyedResourcePool(factory, config); + } + + // See http://code.google.com/p/project-voldemort/issues/detail?id=276 + @Test + public void testAttemptGrow() { + ExecutorService service = Executors.newFixedThreadPool(POOL_SIZE); + for(int i = 0; i < NUM_TESTS; i++) { + final CountDownLatch checkouts = new CountDownLatch(POOL_SIZE); + List> tasks = new ArrayList>(POOL_SIZE); + for(int t = 0; t < POOL_SIZE; t++) { + tasks.add(new Callable() { + + @Override + public Boolean call() throws Exception { + try { + TestResource resource = pool.checkout("a"); + checkouts.countDown(); + checkouts.await(); + resource.invalidate(); + pool.checkin("a", resource); + return true; + } catch(Exception e) { + checkouts.countDown(); + throw e; + } + } + }); + } + try { + List> futures = service.invokeAll(tasks); + for(Future future: futures) { + assertTrue(future.get()); + } + } catch(Exception e) { + fail("Unexpected exception - " + e.getMessage()); + } + } + } +} diff --git a/test/unit/voldemort/utils/pool/KeyedResourcePoolTest.java b/test/unit/voldemort/utils/pool/KeyedResourcePoolTest.java index cc6625d857..cf94885a5f 100644 --- a/test/unit/voldemort/utils/pool/KeyedResourcePoolTest.java +++ b/test/unit/voldemort/utils/pool/KeyedResourcePoolTest.java @@ -22,27 +22,19 @@ import java.util.LinkedList; import java.util.Queue; -import java.util.Random; -import java.util.concurrent.CountDownLatch; import 
java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; import org.junit.Before; import org.junit.Test; import voldemort.utils.Time; -public class KeyedResourcePoolTest { +public class KeyedResourcePoolTest extends KeyedResourcePoolTestBase { protected static int POOL_SIZE = 5; protected static long TIMEOUT_MS = 500; - protected TestResourceFactory factory; - protected KeyedResourcePool pool; - protected ResourcePoolConfig config; - @Before public void setUp() { factory = new TestResourceFactory(); @@ -101,9 +93,9 @@ public void testExceptionOnDestroy() throws Exception { assertEquals(1, this.pool.getCheckedInResourceCount()); this.pool.checkout("a"); - assertEquals(2, this.factory.getCreated()); - assertEquals(2, this.pool.getTotalResourceCount()); - assertEquals(1, this.pool.getCheckedInResourceCount()); + assertEquals(1, this.factory.getCreated()); + assertEquals(1, this.pool.getTotalResourceCount()); + assertEquals(0, this.pool.getCheckedInResourceCount()); for(int i = 0; i < POOL_SIZE - 1; i++) { checkedOut = this.pool.checkout("a"); @@ -266,189 +258,4 @@ public void testMaxInvalidCreations() throws Exception { } } - // This method was helpful when developing contendForResources - public void printStats(String key) { - System.err.println(""); - System.err.println("getCreated: " + this.factory.getCreated()); - System.err.println("getDestroyed: " + this.factory.getDestroyed()); - System.err.println("getTotalResourceCount(key): " + this.pool.getTotalResourceCount(key)); - System.err.println("getTotalResourceCount(): " + this.pool.getTotalResourceCount()); - System.err.println("getCheckedInResourcesCount(key): " - + this.pool.getCheckedInResourcesCount(key)); - System.err.println("getCheckedInResourceCount(): " + this.pool.getCheckedInResourceCount()); - } - - @Test - public void contendForResources() throws Exception { - int numCheckers = POOL_SIZE * 2; - int numChecks = 10 * 1000; - String key = "Key"; - float invalidationRate = (float) 0.25; - CountDownLatch waitForThreads = new CountDownLatch(numCheckers); - CountDownLatch waitForCheckers = new CountDownLatch(numCheckers); - for(int i = 0; i < numCheckers; ++i) { - new Thread(new Checkers(waitForThreads, - waitForCheckers, - key, - numChecks, - invalidationRate)).start(); - } - - try { - waitForCheckers.await(); - assertEquals(POOL_SIZE, this.pool.getTotalResourceCount()); - assertEquals(POOL_SIZE, this.pool.getCheckedInResourceCount()); - } catch(InterruptedException e) { - e.printStackTrace(); - } - - } - - public class Checkers implements Runnable { - - private final CountDownLatch startSignal; - private final CountDownLatch doneSignal; - - private final String key; - private final int checks; - - private Random random; - private float invalidationRate; - - Checkers(CountDownLatch startSignal, - CountDownLatch doneSignal, - String key, - int checks, - float invalidationRate) { - this.startSignal = startSignal; - this.doneSignal = doneSignal; - - this.key = key; - this.checks = checks; - - this.random = new Random(); - this.invalidationRate = invalidationRate; - } - - public void run() { - startSignal.countDown(); - try { - startSignal.await(); - } catch(InterruptedException e) { - e.printStackTrace(); - } - - try { - TestResource tr = null; - for(int i = 0; i < checks; ++i) { - tr = pool.checkout(key); - assertTrue(tr.isValid()); - - // Invalid some resources (except on last checkin) - float f = 
random.nextFloat(); - if(f < invalidationRate && i != checks - 1) { - tr.invalidate(); - } - Thread.yield(); - - pool.checkin(key, tr); - Thread.yield(); - - // if(i % 1000 == 0) { printStats(key); } - } - } catch(Exception e) { - System.err.println(e.toString()); - fail(e.toString()); - } - doneSignal.countDown(); - } - } - - protected static class TestResource { - - private String value; - private AtomicBoolean isValid; - private AtomicBoolean isDestroyed; - - public TestResource(String value) { - this.value = value; - this.isValid = new AtomicBoolean(true); - this.isDestroyed = new AtomicBoolean(false); - } - - public boolean isValid() { - return isValid.get(); - } - - public void invalidate() { - this.isValid.set(false); - } - - public boolean isDestroyed() { - return isDestroyed.get(); - } - - public void destroy() { - this.isDestroyed.set(true); - } - - @Override - public String toString() { - return "TestResource(" + value + ")"; - } - - } - - protected static class TestResourceFactory implements ResourceFactory { - - private final AtomicInteger created = new AtomicInteger(0); - private final AtomicInteger destroyed = new AtomicInteger(0); - private Exception createException; - private Exception destroyException; - private boolean isCreatedValid = true; - - public TestResource create(String key) throws Exception { - if(createException != null) - throw createException; - TestResource r = new TestResource(Integer.toString(created.getAndIncrement())); - if(!isCreatedValid) - r.invalidate(); - return r; - } - - public void destroy(String key, TestResource obj) throws Exception { - if(destroyException != null) - throw destroyException; - destroyed.incrementAndGet(); - obj.destroy(); - } - - public boolean validate(String key, TestResource value) { - return value.isValid(); - } - - public int getCreated() { - return this.created.get(); - } - - public int getDestroyed() { - return this.destroyed.get(); - } - - public void setDestroyException(Exception e) { - this.destroyException = e; - } - - public void setCreateException(Exception e) { - this.createException = e; - } - - public void setCreatedValid(boolean isValid) { - this.isCreatedValid = isValid; - } - - public void close() {} - - } - } diff --git a/test/unit/voldemort/utils/pool/KeyedResourcePoolTestBase.java b/test/unit/voldemort/utils/pool/KeyedResourcePoolTestBase.java new file mode 100644 index 0000000000..d0e5e38daf --- /dev/null +++ b/test/unit/voldemort/utils/pool/KeyedResourcePoolTestBase.java @@ -0,0 +1,181 @@ +/* + * Copyright 2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ +package voldemort.utils.pool; + +import static org.junit.Assert.assertFalse; + +import java.util.Queue; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +public class KeyedResourcePoolTestBase { + + protected TestResourceFactory factory; + protected KeyedResourcePool pool; + protected ResourcePoolConfig config; + + protected static class TestResource { + + private String value; + private AtomicBoolean isValid; + private AtomicBoolean isDestroyed; + + public TestResource(String value) { + this.value = value; + this.isValid = new AtomicBoolean(true); + this.isDestroyed = new AtomicBoolean(false); + } + + public boolean isValid() { + return isValid.get(); + } + + public void invalidate() { + this.isValid.set(false); + } + + public boolean isDestroyed() { + return isDestroyed.get(); + } + + public void destroy() { + this.isDestroyed.set(true); + } + + @Override + public String toString() { + return "TestResource(" + value + ")"; + } + + } + + protected static class TestResourceFactory implements ResourceFactory { + + private final AtomicInteger created = new AtomicInteger(0); + private final AtomicInteger destroyed = new AtomicInteger(0); + private Exception createException; + private Exception destroyException; + private boolean isCreatedValid = true; + + @Override + public TestResource create(String key) throws Exception { + if(createException != null) + throw createException; + TestResource r = new TestResource(Integer.toString(created.getAndIncrement())); + if(!isCreatedValid) + r.invalidate(); + return r; + } + + @Override + public void destroy(String key, TestResource obj) throws Exception { + if(destroyException != null) + throw destroyException; + destroyed.incrementAndGet(); + obj.destroy(); + } + + @Override + public boolean validate(String key, TestResource value) { + return value.isValid(); + } + + public int getCreated() { + return this.created.get(); + } + + public int getDestroyed() { + return this.destroyed.get(); + } + + public void setDestroyException(Exception e) { + this.destroyException = e; + } + + public void setCreateException(Exception e) { + this.createException = e; + } + + public void setCreatedValid(boolean isValid) { + this.isCreatedValid = isValid; + } + + @Override + public void close() {} + + } + + // TestResourceRequest is only needed for the QueuedResourcePool tests, but it + // is easier/cleaner to define here with the other test resources.
+ protected static class TestResourceRequest implements AsyncResourceRequest { + + private AtomicBoolean usedResource; + private AtomicBoolean handledTimeout; + private AtomicBoolean handledException; + + static AtomicInteger usedResourceCount = new AtomicInteger(0); + static AtomicInteger handledTimeoutCount = new AtomicInteger(0); + static AtomicInteger handledExceptionCount = new AtomicInteger(0); + + long deadlineNs; + final Queue doneQueue; + + TestResourceRequest(long deadlineNs, Queue doneQueue) { + this.usedResource = new AtomicBoolean(false); + this.handledTimeout = new AtomicBoolean(false); + this.handledException = new AtomicBoolean(false); + this.deadlineNs = deadlineNs; + this.doneQueue = doneQueue; + } + + @Override + public void useResource(TestResource tr) { + // System.err.println("useResource " + + // Thread.currentThread().getName()); + assertFalse(this.handledTimeout.get()); + assertFalse(this.handledException.get()); + usedResource.set(true); + usedResourceCount.getAndIncrement(); + doneQueue.add(tr); + } + + @Override + public void handleTimeout() { + // System.err.println("handleTimeout " + + // Thread.currentThread().getName()); + assertFalse(this.usedResource.get()); + assertFalse(this.handledException.get()); + handledTimeout.set(true); + handledTimeoutCount.getAndIncrement(); + } + + @Override + public void handleException(Exception e) { + // System.err.println("handleException " + + // Thread.currentThread().getName()); + assertFalse(this.usedResource.get()); + assertFalse(this.handledTimeout.get()); + handledException.set(true); + handledExceptionCount.getAndIncrement(); + } + + @Override + public long getDeadlineNs() { + return deadlineNs; + } + } + +} diff --git a/test/unit/voldemort/utils/pool/QueuedKeyedResourcePoolContentionTest.java b/test/unit/voldemort/utils/pool/QueuedKeyedResourcePoolContentionTest.java new file mode 100644 index 0000000000..83ea3a23de --- /dev/null +++ b/test/unit/voldemort/utils/pool/QueuedKeyedResourcePoolContentionTest.java @@ -0,0 +1,209 @@ +/* + * Copyright 2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ +package voldemort.utils.pool; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.util.Queue; +import java.util.Random; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; + +import org.junit.Before; +import org.junit.Test; + +public class QueuedKeyedResourcePoolContentionTest extends KeyedResourcePoolContentionTest { + + protected QueuedKeyedResourcePool queuedPool; + + @Before + @Override + public void setUp() { + super.setUp(); + this.queuedPool = new QueuedKeyedResourcePool(factory, config); + super.pool = queuedPool; + + TestResourceRequest.usedResourceCount.set(0); + TestResourceRequest.handledTimeoutCount.set(0); + TestResourceRequest.handledExceptionCount.set(0); + } + + @Test + public void contendForQueue() throws Exception { + // Over ride some set up + super.config = new ResourcePoolConfig().setMaxPoolSize(POOL_SIZE) + .setTimeout(TIMEOUT_MS * 50, TimeUnit.MILLISECONDS); + this.queuedPool = new QueuedKeyedResourcePool(factory, config); + super.pool = queuedPool; + + int numEnqueuers = POOL_SIZE * 2; + int numEnqueues = 10 * 1000; + String key = "Key"; + float invalidationRate = (float) 0.25; + CountDownLatch waitForThreads = new CountDownLatch(numEnqueuers); + CountDownLatch waitForEnqueuers = new CountDownLatch(numEnqueuers); + for(int i = 0; i < numEnqueuers; ++i) { + new Thread(new Enqueuers(waitForThreads, + waitForEnqueuers, + key, + numEnqueues, + invalidationRate)).start(); + } + + try { + waitForEnqueuers.await(); + assertEquals(POOL_SIZE, this.queuedPool.getTotalResourceCount()); + assertEquals(POOL_SIZE, this.queuedPool.getCheckedInResourceCount()); + assertEquals(0, this.queuedPool.getRegisteredResourceRequestCount()); + + assertEquals(numEnqueuers * numEnqueues, TestResourceRequest.usedResourceCount.get()); + assertEquals(0, TestResourceRequest.handledTimeoutCount.get()); + assertEquals(0, TestResourceRequest.handledExceptionCount.get()); + + } catch(InterruptedException e) { + e.printStackTrace(); + } + + } + + @Test + public void contendForQueueAndPool() throws Exception { + // Over ride some set up + super.config = new ResourcePoolConfig().setMaxPoolSize(POOL_SIZE) + .setTimeout(TIMEOUT_MS * 100, TimeUnit.MILLISECONDS); + this.queuedPool = new QueuedKeyedResourcePool(factory, config); + super.pool = queuedPool; + + int numEnqueuers = POOL_SIZE; + int numCheckers = POOL_SIZE; + int numEnqueues = 10 * 1000; + String key = "Key"; + float invalidationRate = (float) 0.25; + CountDownLatch waitForThreadsStart = new CountDownLatch(numEnqueuers + numCheckers); + CountDownLatch waitForThreadsEnd = new CountDownLatch(numEnqueuers + numCheckers); + for(int i = 0; i < numEnqueuers; ++i) { + new Thread(new Enqueuers(waitForThreadsStart, + waitForThreadsEnd, + key, + numEnqueues, + invalidationRate)).start(); + } + for(int i = 0; i < numCheckers; ++i) { + new Thread(new Checkers(waitForThreadsStart, + waitForThreadsEnd, + key, + numEnqueues, + invalidationRate)).start(); + } + + try { + waitForThreadsEnd.await(); + assertEquals(this.queuedPool.getCheckedInResourceCount(), + this.queuedPool.getTotalResourceCount()); + assertEquals(0, this.queuedPool.getRegisteredResourceRequestCount()); + + assertEquals(numEnqueuers * numEnqueues, TestResourceRequest.usedResourceCount.get()); + assertEquals(0, TestResourceRequest.handledTimeoutCount.get()); + assertEquals(0, 
TestResourceRequest.handledExceptionCount.get()); + + } catch(InterruptedException e) { + e.printStackTrace(); + } + + } + + public class Enqueuers implements Runnable { + + private final CountDownLatch startSignal; + private final CountDownLatch doneSignal; + + private final String key; + private final int enqueues; + private int used; + Queue resources; + + private Random random; + private float invalidationRate; + + Enqueuers(CountDownLatch startSignal, + CountDownLatch doneSignal, + String key, + int enqueues, + float invalidationRate) { + this.startSignal = startSignal; + this.doneSignal = doneSignal; + + this.key = key; + this.enqueues = enqueues; + this.used = 0; + resources = new ConcurrentLinkedQueue(); + + this.random = new Random(); + this.invalidationRate = invalidationRate; + } + + private void processAtMostOneEnqueuedResource() throws Exception { + TestResource tr = resources.poll(); + if(tr != null) { + this.used++; + assertTrue(tr.isValid()); + + // Invalidate some resources (except on last few check ins) + float f = random.nextFloat(); + if(f < invalidationRate && this.used < this.enqueues - POOL_SIZE) { + tr.invalidate(); + } + Thread.yield(); + + queuedPool.checkin(key, tr); + Thread.yield(); + } + } + + @Override + public void run() { + startSignal.countDown(); + try { + startSignal.await(); + } catch(InterruptedException e) { + e.printStackTrace(); + } + + try { + for(int i = 0; i < enqueues; ++i) { + long deadlineNs = System.nanoTime() + + TimeUnit.MILLISECONDS.toNanos(config.getTimeout(TimeUnit.NANOSECONDS)); + + queuedPool.registerResourceRequest(key, new TestResourceRequest(deadlineNs, + resources)); + Thread.yield(); + + processAtMostOneEnqueuedResource(); + } + while(this.used < enqueues) { + processAtMostOneEnqueuedResource(); + Thread.yield(); + } + } catch(Exception e) { + fail(e.toString()); + } + doneSignal.countDown(); + } + } +} diff --git a/test/unit/voldemort/utils/pool/QueuedKeyedResourcePoolTest.java b/test/unit/voldemort/utils/pool/QueuedKeyedResourcePoolTest.java index ebc9d55706..3ccac1f0c4 100644 --- a/test/unit/voldemort/utils/pool/QueuedKeyedResourcePoolTest.java +++ b/test/unit/voldemort/utils/pool/QueuedKeyedResourcePoolTest.java @@ -16,18 +16,11 @@ package voldemort.utils.pool; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; import java.util.LinkedList; import java.util.Queue; -import java.util.Random; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; import org.junit.Before; import org.junit.Test; @@ -264,220 +257,4 @@ public void testExceptionInQueue() throws Exception { assertEquals(0, TestResourceRequest.handledTimeoutCount.get()); assertEquals(1, TestResourceRequest.handledExceptionCount.get()); } - - @Test - public void contendForQueue() throws Exception { - // Over ride some set up - super.config = new ResourcePoolConfig().setMaxPoolSize(POOL_SIZE) - .setTimeout(TIMEOUT_MS * 50, TimeUnit.MILLISECONDS); - this.queuedPool = new QueuedKeyedResourcePool(factory, config); - super.pool = queuedPool; - - int numEnqueuers = POOL_SIZE * 2; - int numEnqueues = 10 * 1000; - String key = "Key"; - float invalidationRate = (float) 0.25; - CountDownLatch waitForThreads = new CountDownLatch(numEnqueuers); - CountDownLatch waitForEnqueuers = new 
CountDownLatch(numEnqueuers); - for(int i = 0; i < numEnqueuers; ++i) { - new Thread(new Enqueuers(waitForThreads, - waitForEnqueuers, - key, - numEnqueues, - invalidationRate)).start(); - } - - try { - waitForEnqueuers.await(); - assertEquals(POOL_SIZE, this.queuedPool.getTotalResourceCount()); - assertEquals(POOL_SIZE, this.queuedPool.getCheckedInResourceCount()); - assertEquals(0, this.queuedPool.getRegisteredResourceRequestCount()); - - assertEquals(numEnqueuers * numEnqueues, TestResourceRequest.usedResourceCount.get()); - assertEquals(0, TestResourceRequest.handledTimeoutCount.get()); - assertEquals(0, TestResourceRequest.handledExceptionCount.get()); - - } catch(InterruptedException e) { - e.printStackTrace(); - } - - } - - @Test - public void contendForQueueAndPool() throws Exception { - // Over ride some set up - super.config = new ResourcePoolConfig().setMaxPoolSize(POOL_SIZE) - .setTimeout(TIMEOUT_MS * 100, TimeUnit.MILLISECONDS); - this.queuedPool = new QueuedKeyedResourcePool(factory, config); - super.pool = queuedPool; - - int numEnqueuers = POOL_SIZE; - int numCheckers = POOL_SIZE; - int numEnqueues = 10 * 1000; - String key = "Key"; - float invalidationRate = (float) 0.25; - CountDownLatch waitForThreadsStart = new CountDownLatch(numEnqueuers + numCheckers); - CountDownLatch waitForThreadsEnd = new CountDownLatch(numEnqueuers + numCheckers); - for(int i = 0; i < numEnqueuers; ++i) { - new Thread(new Enqueuers(waitForThreadsStart, - waitForThreadsEnd, - key, - numEnqueues, - invalidationRate)).start(); - } - for(int i = 0; i < numCheckers; ++i) { - new Thread(new Checkers(waitForThreadsStart, - waitForThreadsEnd, - key, - numEnqueues, - invalidationRate)).start(); - } - - try { - waitForThreadsEnd.await(); - assertEquals(POOL_SIZE, this.queuedPool.getTotalResourceCount()); - assertEquals(POOL_SIZE, this.queuedPool.getCheckedInResourceCount()); - assertEquals(0, this.queuedPool.getRegisteredResourceRequestCount()); - - assertEquals(numEnqueuers * numEnqueues, TestResourceRequest.usedResourceCount.get()); - assertEquals(0, TestResourceRequest.handledTimeoutCount.get()); - assertEquals(0, TestResourceRequest.handledExceptionCount.get()); - - } catch(InterruptedException e) { - e.printStackTrace(); - } - - } - - public class Enqueuers implements Runnable { - - private final CountDownLatch startSignal; - private final CountDownLatch doneSignal; - - private final String key; - private final int enqueues; - private int used; - Queue resources; - - private Random random; - private float invalidationRate; - - Enqueuers(CountDownLatch startSignal, - CountDownLatch doneSignal, - String key, - int enqueues, - float invalidationRate) { - this.startSignal = startSignal; - this.doneSignal = doneSignal; - - this.key = key; - this.enqueues = enqueues; - this.used = 0; - resources = new ConcurrentLinkedQueue(); - - this.random = new Random(); - this.invalidationRate = invalidationRate; - } - - private void processAtMostOneEnqueuedResource() throws Exception { - TestResource tr = resources.poll(); - if(tr != null) { - this.used++; - assertTrue(tr.isValid()); - - // Invalidate some resources (except on last few check ins) - float f = random.nextFloat(); - if(f < invalidationRate && this.used < this.enqueues - POOL_SIZE) { - tr.invalidate(); - } - Thread.yield(); - - queuedPool.checkin(key, tr); - Thread.yield(); - } - } - - public void run() { - startSignal.countDown(); - try { - startSignal.await(); - } catch(InterruptedException e) { - e.printStackTrace(); - } - - try { - for(int i = 0; i < 
enqueues; ++i) { - long deadlineNs = System.nanoTime() - + TimeUnit.MILLISECONDS.toNanos(config.getTimeout(TimeUnit.NANOSECONDS)); - - queuedPool.registerResourceRequest(key, new TestResourceRequest(deadlineNs, - resources)); - Thread.yield(); - - processAtMostOneEnqueuedResource(); - } - while(this.used < enqueues) { - processAtMostOneEnqueuedResource(); - Thread.yield(); - } - } catch(Exception e) { - fail(e.toString()); - } - doneSignal.countDown(); - } - } - - protected static class TestResourceRequest implements AsyncResourceRequest { - - private AtomicBoolean usedResource; - private AtomicBoolean handledTimeout; - private AtomicBoolean handledException; - - static AtomicInteger usedResourceCount = new AtomicInteger(0); - static AtomicInteger handledTimeoutCount = new AtomicInteger(0); - static AtomicInteger handledExceptionCount = new AtomicInteger(0); - - long deadlineNs; - final Queue doneQueue; - - TestResourceRequest(long deadlineNs, Queue doneQueue) { - this.usedResource = new AtomicBoolean(false); - this.handledTimeout = new AtomicBoolean(false); - this.handledException = new AtomicBoolean(false); - this.deadlineNs = deadlineNs; - this.doneQueue = doneQueue; - } - - public void useResource(TestResource tr) { - // System.err.println("useResource " + - // Thread.currentThread().getName()); - assertFalse(this.handledTimeout.get()); - assertFalse(this.handledException.get()); - usedResource.set(true); - usedResourceCount.getAndIncrement(); - doneQueue.add(tr); - } - - public void handleTimeout() { - // System.err.println("handleTimeout " + - // Thread.currentThread().getName()); - assertFalse(this.usedResource.get()); - assertFalse(this.handledException.get()); - handledTimeout.set(true); - handledTimeoutCount.getAndIncrement(); - } - - public void handleException(Exception e) { - // System.err.println("handleException " + - // Thread.currentThread().getName()); - assertFalse(this.usedResource.get()); - assertFalse(this.handledTimeout.get()); - handledException.set(true); - handledExceptionCount.getAndIncrement(); - } - - public long getDeadlineNs() { - return deadlineNs; - } - } }
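The pool tests above all drive the same life cycle: check a resource out, optionally invalidate it, and check it back in, with the Checkers and Enqueuers runnables doing this from many threads at once. As a reading aid only (not part of the patch), here is a minimal single-threaded sketch of that cycle. It assumes it would sit next to the existing tests, reusing the TestResourceFactory/TestResource helpers from KeyedResourcePoolTestBase and the imports those tests already have, with the stripped generic parameters restored as KeyedResourcePool<String, TestResource>; the method name is purely illustrative.

    // Minimal single-threaded sketch of the cycle the contention tests run concurrently.
    // Assumes the KeyedResourcePoolTestBase helpers; not part of the patch itself.
    @Test
    public void checkoutCheckinLifeCycleSketch() throws Exception {
        TestResourceFactory factory = new TestResourceFactory();
        ResourcePoolConfig config = new ResourcePoolConfig().setMaxPoolSize(5)
                                                            .setTimeout(500, TimeUnit.MILLISECONDS);
        KeyedResourcePool<String, TestResource> pool = new KeyedResourcePool<String, TestResource>(factory,
                                                                                                   config);

        // The first checkout for a key asks the factory to create a resource.
        TestResource tr = pool.checkout("a");
        assertTrue(tr.isValid());

        // The contention tests invalidate a random fraction of resources before
        // handing them back; here we always do.
        tr.invalidate();
        pool.checkin("a", tr);

        // The same invariant contendForResources asserts once every worker has
        // checked its resource back in.
        assertEquals(pool.getCheckedInResourceCount(), pool.getTotalResourceCount());
    }

The point of the concurrent versions is that this invariant still holds after many threads have raced on checkout, invalidation, and checkin, which is exactly what the new ContentionTest and RaceTest classes verify.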