Skip to content

Commit

Permalink
remove some common values found in clinvar, which do not add useful i…
Browse files Browse the repository at this point in the history
…nformation
  • Loading branch information
akotlar committed Aug 6, 2018
1 parent 1582af8 commit 7df8409
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 20 deletions.
1 change: 1 addition & 0 deletions install/install-perl-libs.sh
Expand Up @@ -28,6 +28,7 @@ cpanm install MouseX::ConfigFromFile
cpanm install MouseX::Getopt
cpanm install Archive::Extract
cpanm install DBI
cpanm install String::Strip
# Needed for fetching SQL (Utils::SqlWriter::Connection)
cpanm install DBD::mysql
cpanm install IO/FDPass.pm
Expand Down
45 changes: 30 additions & 15 deletions lib/Seq/Tracks/Build.pm
Expand Up @@ -21,6 +21,9 @@ use Seq::Tracks::Base::Types;
use Seq::Tracks::Build::LocalFilesPaths;
use Seq::Output::Delimiters;

# Faster than regex trim
use String::Strip qw/StripLTSpace/;

extends 'Seq::Tracks::Base';
# All builders need getReadFh
with 'Seq::Role::IO';
Expand Down Expand Up @@ -109,17 +112,6 @@ has fieldMap => (is => 'ro', isa => 'HashRef', lazy => 1, default => sub {
return \%data;
});

# TODO: config output;
# Private, read-only attribute holding a precompiled pattern that matches a
# value consisting solely of the empty-field character reported by
# Seq::Output::Delimiters, optionally surrounded by whitespace.
# init_arg => undef prevents the pattern from being supplied via the constructor.
has _emptyFieldRegex => (is => 'ro', isa => 'RegexpRef', init_arg => undef, default => sub {
  my $delim = Seq::Output::Delimiters->new();

  my $emptyField = $delim->emptyFieldChar;

  # \Q...\E makes the empty-field character match literally; without it a
  # regex metacharacter (for example '.' or '|') would silently change what
  # this pattern accepts.
  return qr/^\s*\Q$emptyField\E\s*$/;
});

################################ Constructor ################################
sub BUILD {
my $self = shift;
Expand All @@ -137,7 +129,7 @@ sub BUILD {

my $d = Seq::Output::Delimiters->new();
$self->{_cleanDelims} = $d->cleanDelims;

$self->{_missChar} = $d->emptyFieldChar;
# Commit, sync, and remove any databases opened
# This is useful because locking may occur if there is an open transaction
# before fork(), and to make sure that any database meta data is properly
Expand Down Expand Up @@ -200,7 +192,7 @@ sub coerceFeatureType {

if(!looks_like_number($val)) {
$_[0]->{_cleanDelims}->($val);
$_[0]->coerceUndefinedValues($val);
$_[0]->_stripAndCoerceUndef($val);
}

if( defined $type && defined $val ) {
Expand Down Expand Up @@ -445,13 +437,36 @@ sub makeMergeFunc {
);
}

sub coerceUndefinedValues {
sub _stripAndCoerceUndef {
#my ($self, $dataStr) = @_;
state $c1 = 'no assertion criteria provided';
state $c2 = 'not provided';
state $c3 = 'see cases';
state $c4 = 'na';
state $c5 = '.';

# Don't waste storage space on NA. In Bystro, undef values equal NA (or whatever
# Output.pm chooses to represent missing data as).
StripLTSpace($_[1]);

# Common missing values; 'not provided' and 'see cases' are ClinVar-specific
my $lc = lc($_[1]);

if(length($lc) <= 8) {
if($lc eq $c5
|| $lc eq $c4
|| $lc eq $_[0]->{_missChar}
) {
$_[1] = undef;
return undef;
}
return $_[1];
}

if($_[1] =~ /^\s*NA\s*$/i || $_[1] =~/^\s*$/ || $_[1] =~/^\s*\.\s*$/ || $_[1] =~ $_[0]->_emptyFieldRegex) {
if($lc eq $c3
|| $lc eq $c2
|| $lc eq $c1
) {
$_[1] = undef;
return undef;
}
Expand Down
44 changes: 39 additions & 5 deletions t/tracks/build/coerceUndefinedValues.t
Expand Up @@ -34,31 +34,65 @@ my $seq = Seq::Tracks::Build->new({
my $delims = Seq::Output::Delimiters->new();

my $test='NA';
my $res = $seq->coerceUndefinedValues($test);
my $res = $seq->_stripAndCoerceUndef($test);

ok(!defined $test && !defined $res, "Modifies passed value, and sets NA to undef");

$test='.';
$res = $seq->coerceUndefinedValues($test);
$res = $seq->_stripAndCoerceUndef($test);

ok(!defined $test && !defined $res, "Sets . to undef");

$test='see cases';
$res = $seq->_stripAndCoerceUndef($test);

ok(!defined $test && !defined $res, "'see cases' is not a valid value");

$test='see cases ';
$res = $seq->_stripAndCoerceUndef($test);

ok(!defined $test && !defined $res, "'see cases' with trailing whitespace not a valid value");

$test=' see cases';
$res = $seq->_stripAndCoerceUndef($test);

ok(!defined $test && !defined $res, "'see cases' with leading whitespace not a valid value");

$test=' see cases ';
$res = $seq->_stripAndCoerceUndef($test);

ok(!defined $test && !defined $res, "'see cases' with leading/trailing whitespace is not a valid value");

$test='not provided';
$res = $seq->_stripAndCoerceUndef($test);

ok(!defined $test && !defined $res, "'not provided' is not a valid value");

$test='no assertion criteria provided';
$res = $seq->_stripAndCoerceUndef($test);

ok(!defined $test && !defined $res, "'no assertion criteria provided' is not a valid value");


$test=$delims->emptyFieldChar;
$res = $seq->coerceUndefinedValues($test);
$res = $seq->_stripAndCoerceUndef($test);

ok(!defined $test && !defined $res, "Sets the emptyFieldChar to undef");

$test=' NA ';
$res = $seq->coerceUndefinedValues($test);
$res = $seq->_stripAndCoerceUndef($test);

ok(!defined $test && !defined $res, "Whitespace doesnt affect coercion");

my $expected = 'NA / Some value';
$test='NA / Some value';
$res = $seq->coerceUndefinedValues($test);
$res = $seq->_stripAndCoerceUndef($test);

ok($test eq $res && $res eq $expected, "Doesn't clear valued statements");

$test = " SOMETHING NOT NULL ";
$seq->_stripAndCoerceUndef($test);
ok($test eq "SOMETHING NOT NULL", "_stripAndCoerceUndef also strips leading/trailing spaces");

done_testing();
1;

0 comments on commit 7df8409

Please sign in to comment.