@@ -273,7 +273,6 @@ void SQLiteConstructor::run(){
for (int i=0 ;i<keep_rc->size ();i++){
cout << keep_rc->at (i);
}cout << endl;
exit (0 );
/*
* reduce genome sequences
*/
@@ -641,7 +640,7 @@ DBSeq SQLiteConstructor::add_higher_taxa(string taxon_id,vector<DBSeq> seqs){
*/
vector<DBSeq> * keep_seqs2 = new vector<DBSeq>();
vector<bool > * keep_rc2 = new vector<bool >();
get_same_seqs_pthreads_SWPS3 (seqs_fn2,keep_seqs2,keep_rc2);
get_same_seqs_openmp_SWPS3 (seqs_fn2,keep_seqs2,keep_rc2);
// take keep_seqs and the known_seqs and get the distances and get the best
vector<int > scores;
SBMatrix mat = swps3_readSBMatrix ( " EDNAFULL" );
@@ -798,175 +797,6 @@ vector <DBSeq> SQLiteConstructor::include_gis_from_file(vector<DBSeq> seqs){
}
/*
 * Runs the external bl2seq command on the files "seq1" and "seq2" and
 * summarises its output.
 *
 * inseq1 : its sequence length is the denominator of the coverage value
 * inseq2 : not read by the current implementation
 * rc     : out-parameter; set to true when field 8 of the parsed output is
 *          greater than field 9, false otherwise. Only written when a
 *          result is returned.
 *
 * Returns two values, [highest field-2 value / 100, summed field-3 values /
 * length of inseq1], or an empty vector when the command could not be
 * started or produced no usable output. Callers test ret.size() > 1.
 *
 * NOTE(review): sc1 and sc2 are never filled, so both files are written
 * from empty vectors -- confirm whether inseq1/inseq2 were meant to be
 * added to them.
 */
vector<double> SQLiteConstructor::get_blast_score_and_rc(Sequence inseq1, DBSeq inseq2, bool * rc){
    vector<double> retvalues;
    FastaUtil seqwriter1;
    FastaUtil seqwriter2;
    vector<Sequence> sc1;
    vector<Sequence> sc2;
    const string fn1 = " seq1" ;
    const string fn2 = " seq2" ;
    seqwriter1.writeFileFromVector(fn1, sc1);
    seqwriter2.writeFileFromVector(fn2, sc2);
    double coverage = 0;
    double identity = 0;
    const char * cmd = " bl2seq -i seq1 -j seq2 -p blastn -D 1" ;
    FILE * fp = popen(cmd, " r" );
    if (fp == NULL){
        // the command could not be started: report "no result" instead of
        // handing a null stream to fgets
        return retvalues;
    }
    char buff[1000];
    vector<string> tokens;
    while (fgets(buff, sizeof buff, fp) != NULL){
        string line(buff);
        if (line.find(" #" ) != string::npos){
            continue; // lines containing the marker are not parsed
        }
        string del(" \t " );
        // NOTE(review): whether Tokenize clears or appends to tokens is not
        // visible here; the indices below are used exactly as before.
        Tokenize(line, tokens, del);
        if (tokens.size() < 4){
            continue; // too few fields to read the identity/coverage columns
        }
        coverage = coverage + strtod(tokens[3].c_str(), NULL);
        const double line_identity = strtod(tokens[2].c_str(), NULL);
        if (line_identity > identity){
            identity = line_identity;
        }
    }
    pclose(fp);
    if (tokens.size() < 10){
        // nothing parsed, or not enough fields to compare fields 8 and 9
        return retvalues;
    }
    *rc = strtod(tokens[8].c_str(), NULL) > strtod(tokens[9].c_str(), NULL);
    cout << *rc;
    retvalues.push_back(identity/100.0);
    retvalues.push_back(coverage/static_cast<int>(inseq1.get_sequence().size()));
    return retvalues;
}
// vector< vector<DBSeq> >
/*
 * Serial filter: scores every sequence in seqs against every entry of
 * known_seqs with get_blast_score_and_rc and keeps those whose best
 * identity and coverage reach the identity / coverage thresholds held
 * by this object.
 *
 * seqs      : candidate sequences
 * keep_seqs : receives the accepted candidates, in input order
 * keep_rc   : receives, for each accepted candidate, the rc flag that
 *             get_blast_score_and_rc reported for its best hit
 *
 * A hit only replaces the current best when it improves identity AND
 * coverage at the same time. The loop index is printed every 100
 * candidates as a progress report.
 */
void SQLiteConstructor::get_same_seqs(vector<DBSeq> seqs, vector<DBSeq> * keep_seqs, vector<bool> * keep_rc){
    const int report_every = 100;
    for (size_t cand = 0; cand < seqs.size(); ++cand){
        if (cand % report_every == 0){
            cout << cand << endl;
        }
        // best hit found so far for this candidate
        double best_identity = 0;
        double best_coverage = 0;
        bool best_rc = false;
        for (size_t k = 0; k < known_seqs->size(); ++k){
            bool hit_rc = false;
            vector<double> scores = get_blast_score_and_rc(known_seqs->at(k), seqs[cand], &hit_rc);
            if (scores.size() <= 1){
                continue; // no usable result for this pair
            }
            const bool improves_both = scores[0] > best_identity && scores[1] > best_coverage;
            if (improves_both){
                best_identity = scores[0];
                best_coverage = scores[1];
                best_rc = hit_rc;
            }
        }
        const bool passes = best_identity >= identity && best_coverage >= coverage;
        if (!passes){
            continue;
        }
        keep_seqs->push_back(seqs[cand]);
        keep_rc->push_back(best_rc);
    }
}
/*
 * pthreads version of get_same_seqs.
 *
 * Splits seqs into numthreads contiguous slices (the last slice also takes
 * the remainder), hands each slice to Same_seq_pthread_go in its own
 * joinable thread, waits for all threads, and then appends every thread's
 * keep_seqs / keep_rc to the output vectors in thread order.
 *
 * seqs      : candidate sequences
 * keep_seqs : receives the sequences kept by the worker threads
 * keep_rc   : receives the matching flags, parallel to keep_seqs
 *
 * The process exits with -1 when a thread cannot be created or joined.
 */
void SQLiteConstructor::get_same_seqs_pthreads(vector<DBSeq> seqs, vector<DBSeq> * keep_seqs, vector<bool> * keep_rc){
    if (numthreads < 1){
        return; // no workers: nothing to split or run
    }
    // one work package per thread; a vector is used because variable-length
    // arrays are not standard C++. It is never resized, so the element
    // addresses passed to the threads stay valid.
    vector<struct thread_data> thread_data_array(numthreads);
    const size_t chunk = seqs.size()/numthreads;
    for (int i=0; i<numthreads; i++){
        const size_t first = i*chunk;
        // every slice but the last has exactly chunk elements
        const size_t last = ((i+1) < numthreads) ? chunk*(i+1) : seqs.size();
        vector<DBSeq> st_seqs(seqs.begin()+first, seqs.begin()+last);
        cout << " spliting: " << st_seqs.size() << endl;
        thread_data_array[i].thread_id = i;
        thread_data_array[i].seqs = st_seqs;
        thread_data_array[i].coverage = coverage;
        thread_data_array[i].identity = identity;
        thread_data_array[i].reports = 100;
        thread_data_array[i].known_seqs = known_seqs;
        // start each worker with empty result containers
        thread_data_array[i].keep_seqs = vector<DBSeq>();
        thread_data_array[i].keep_rc = vector<bool>();
    }
    vector<pthread_t> threads(numthreads);
    void * status = NULL;
    int rc;
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    for (int i=0; i<numthreads; i++){
        cout << " thread: " << i << endl;
        rc = pthread_create(&threads[i], &attr, Same_seq_pthread_go, (void *) &thread_data_array[i]);
        if (rc){
            printf(" ERROR; return code from pthread_create() is %d\n " , rc);
            exit(-1);
        }
    }
    pthread_attr_destroy(&attr);
    for (int i=0; i<numthreads; i++){
        cout << " joining: " << i << endl;
        // capture the join result; previously the stale pthread_create
        // value was tested, so join failures were never noticed
        rc = pthread_join(threads[i], &status);
        if (rc){
            printf(" ERROR; return code from pthread_join() is %d\n " , rc);
            exit(-1);
        }
        printf(" Completed join with thread %d status= %ld\n " , i, (long)status);
    }
    /*
     * bring em back and combine for keep_seqs and keep_rc
     */
    for (int i=0; i<numthreads; i++){
        for (size_t j=0; j<thread_data_array[i].keep_seqs.size(); j++){
            keep_seqs->push_back(thread_data_array[i].keep_seqs[j]);
            keep_rc->push_back(thread_data_array[i].keep_rc[j]);
        }
    }
}
/*
* OPENMP version
@@ -1052,6 +882,7 @@ void SQLiteConstructor::remove_duplicates_SWPS3(vector<DBSeq> * keep_seqs, vecto
}
vector<int > remove ;
#pragma omp parallel for shared(remove)
for (unsigned int i=0 ;i<unique_ids.size ();i++){
mycount = 0 ;
mycount = (int ) count (ids.begin (),ids.end (), unique_ids[i]);
@@ -1650,8 +1481,29 @@ void SQLiteConstructor::add_seqs_from_file_to_dbseqs_vector(string filename,vect
}
// DEPRECATED!!!!
// THESE ARE HERE FOR HISTORICAL REASONS
/*
* things that are basically deprecated but keeping them just in case
@@ -1836,3 +1688,173 @@ void SQLiteConstructor::remove_duplicates(vector<DBSeq> * keep_seqs, vector<bool
keep_rc->erase (keep_rc->begin ()+remove [i]);
}
}
/*
 * DEPRECATED helper: runs the external bl2seq command on the files "seq1"
 * and "seq2" and summarises its output.
 *
 * inseq1 : its sequence length is the denominator of the coverage value
 * inseq2 : not read by the current implementation
 * rc     : out-parameter; set to true when field 8 of the parsed output is
 *          greater than field 9, false otherwise. Only written when a
 *          result is returned.
 *
 * Returns two values, [highest field-2 value / 100, summed field-3 values /
 * length of inseq1], or an empty vector when the command could not be
 * started or produced no usable output. Callers test ret.size() > 1.
 *
 * NOTE(review): sc1 and sc2 are never filled, so both files are written
 * from empty vectors -- confirm whether inseq1/inseq2 were meant to be
 * added to them.
 */
vector<double> SQLiteConstructor::get_blast_score_and_rc(Sequence inseq1, DBSeq inseq2, bool * rc){
    vector<double> retvalues;
    FastaUtil seqwriter1;
    FastaUtil seqwriter2;
    vector<Sequence> sc1;
    vector<Sequence> sc2;
    const string fn1 = " seq1" ;
    const string fn2 = " seq2" ;
    seqwriter1.writeFileFromVector(fn1, sc1);
    seqwriter2.writeFileFromVector(fn2, sc2);
    double coverage = 0;
    double identity = 0;
    const char * cmd = " bl2seq -i seq1 -j seq2 -p blastn -D 1" ;
    FILE * fp = popen(cmd, " r" );
    if (fp == NULL){
        // the command could not be started: report "no result" instead of
        // handing a null stream to fgets
        return retvalues;
    }
    char buff[1000];
    vector<string> tokens;
    while (fgets(buff, sizeof buff, fp) != NULL){
        string line(buff);
        if (line.find(" #" ) != string::npos){
            continue; // lines containing the marker are not parsed
        }
        string del(" \t " );
        // NOTE(review): whether Tokenize clears or appends to tokens is not
        // visible here; the indices below are used exactly as before.
        Tokenize(line, tokens, del);
        if (tokens.size() < 4){
            continue; // too few fields to read the identity/coverage columns
        }
        coverage = coverage + strtod(tokens[3].c_str(), NULL);
        const double line_identity = strtod(tokens[2].c_str(), NULL);
        if (line_identity > identity){
            identity = line_identity;
        }
    }
    pclose(fp);
    if (tokens.size() < 10){
        // nothing parsed, or not enough fields to compare fields 8 and 9
        return retvalues;
    }
    *rc = strtod(tokens[8].c_str(), NULL) > strtod(tokens[9].c_str(), NULL);
    cout << *rc;
    retvalues.push_back(identity/100.0);
    retvalues.push_back(coverage/static_cast<int>(inseq1.get_sequence().size()));
    return retvalues;
}
// vector< vector<DBSeq> >
/*
 * DEPRECATED serial filter: scores every sequence in seqs against every
 * entry of known_seqs with get_blast_score_and_rc and keeps those whose
 * best identity and coverage reach the identity / coverage thresholds
 * held by this object.
 *
 * seqs      : candidate sequences
 * keep_seqs : receives the accepted candidates, in input order
 * keep_rc   : receives, for each accepted candidate, the rc flag that
 *             get_blast_score_and_rc reported for its best hit
 *
 * A hit only replaces the current best when it improves identity AND
 * coverage at the same time. The loop index is printed every 100
 * candidates as a progress report.
 */
void SQLiteConstructor::get_same_seqs(vector<DBSeq> seqs, vector<DBSeq> * keep_seqs, vector<bool> * keep_rc){
    const int report_every = 100;
    for (size_t cand = 0; cand < seqs.size(); ++cand){
        if (cand % report_every == 0){
            cout << cand << endl;
        }
        // best hit found so far for this candidate
        double best_identity = 0;
        double best_coverage = 0;
        bool best_rc = false;
        for (size_t k = 0; k < known_seqs->size(); ++k){
            bool hit_rc = false;
            vector<double> scores = get_blast_score_and_rc(known_seqs->at(k), seqs[cand], &hit_rc);
            if (scores.size() <= 1){
                continue; // no usable result for this pair
            }
            const bool improves_both = scores[0] > best_identity && scores[1] > best_coverage;
            if (improves_both){
                best_identity = scores[0];
                best_coverage = scores[1];
                best_rc = hit_rc;
            }
        }
        const bool passes = best_identity >= identity && best_coverage >= coverage;
        if (!passes){
            continue;
        }
        keep_seqs->push_back(seqs[cand]);
        keep_rc->push_back(best_rc);
    }
}
/*
 * DEPRECATED pthreads version of get_same_seqs.
 *
 * Splits seqs into numthreads contiguous slices (the last slice also takes
 * the remainder), hands each slice to Same_seq_pthread_go in its own
 * joinable thread, waits for all threads, and then appends every thread's
 * keep_seqs / keep_rc to the output vectors in thread order.
 *
 * seqs      : candidate sequences
 * keep_seqs : receives the sequences kept by the worker threads
 * keep_rc   : receives the matching flags, parallel to keep_seqs
 *
 * The process exits with -1 when a thread cannot be created or joined.
 */
void SQLiteConstructor::get_same_seqs_pthreads(vector<DBSeq> seqs, vector<DBSeq> * keep_seqs, vector<bool> * keep_rc){
    if (numthreads < 1){
        return; // no workers: nothing to split or run
    }
    // one work package per thread; a vector is used because variable-length
    // arrays are not standard C++. It is never resized, so the element
    // addresses passed to the threads stay valid.
    vector<struct thread_data> thread_data_array(numthreads);
    const size_t chunk = seqs.size()/numthreads;
    for (int i=0; i<numthreads; i++){
        const size_t first = i*chunk;
        // every slice but the last has exactly chunk elements
        const size_t last = ((i+1) < numthreads) ? chunk*(i+1) : seqs.size();
        vector<DBSeq> st_seqs(seqs.begin()+first, seqs.begin()+last);
        cout << " spliting: " << st_seqs.size() << endl;
        thread_data_array[i].thread_id = i;
        thread_data_array[i].seqs = st_seqs;
        thread_data_array[i].coverage = coverage;
        thread_data_array[i].identity = identity;
        thread_data_array[i].reports = 100;
        thread_data_array[i].known_seqs = known_seqs;
        // start each worker with empty result containers
        thread_data_array[i].keep_seqs = vector<DBSeq>();
        thread_data_array[i].keep_rc = vector<bool>();
    }
    vector<pthread_t> threads(numthreads);
    void * status = NULL;
    int rc;
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    for (int i=0; i<numthreads; i++){
        cout << " thread: " << i << endl;
        rc = pthread_create(&threads[i], &attr, Same_seq_pthread_go, (void *) &thread_data_array[i]);
        if (rc){
            printf(" ERROR; return code from pthread_create() is %d\n " , rc);
            exit(-1);
        }
    }
    pthread_attr_destroy(&attr);
    for (int i=0; i<numthreads; i++){
        cout << " joining: " << i << endl;
        // capture the join result; previously the stale pthread_create
        // value was tested, so join failures were never noticed
        rc = pthread_join(threads[i], &status);
        if (rc){
            printf(" ERROR; return code from pthread_join() is %d\n " , rc);
            exit(-1);
        }
        printf(" Completed join with thread %d status= %ld\n " , i, (long)status);
    }
    /*
     * bring em back and combine for keep_seqs and keep_rc
     */
    for (int i=0; i<numthreads; i++){
        for (size_t j=0; j<thread_data_array[i].keep_seqs.size(); j++){
            keep_seqs->push_back(thread_data_array[i].keep_seqs[j]);
            keep_rc->push_back(thread_data_array[i].keep_rc[j]);
        }
    }
}