@@ -273,7 +273,6 @@ void SQLiteConstructor::run(){
for(int i=0;i<keep_rc->size();i++){
cout << keep_rc->at(i);
}cout << endl;
exit(0);
/*
* reduce genome sequences
*/
@@ -641,7 +640,7 @@ DBSeq SQLiteConstructor::add_higher_taxa(string taxon_id,vector<DBSeq> seqs){
*/
vector<DBSeq> * keep_seqs2 = new vector<DBSeq>();
vector<bool> * keep_rc2 = new vector<bool>();
get_same_seqs_pthreads_SWPS3(seqs_fn2,keep_seqs2,keep_rc2);
get_same_seqs_openmp_SWPS3(seqs_fn2,keep_seqs2,keep_rc2);
//take keep_seqs and the known_seqs and get the distances and get the best
vector<int> scores;
SBMatrix mat = swps3_readSBMatrix( "EDNAFULL" );
@@ -798,175 +797,6 @@ vector <DBSeq> SQLiteConstructor::include_gis_from_file(vector<DBSeq> seqs){
}


/*
 * Runs `bl2seq` (blastn, tabular output) on the files "seq1" and "seq2" and
 * summarises the reported hits.
 *
 * inseq1 - only its sequence length is used, as the denominator of the
 *          coverage value
 * inseq2 - not read by this function
 * rc     - out-parameter; set to true when column 8 of the parsed hit is
 *          greater than column 9 (subject start > subject end in the BLAST
 *          tabular layout, i.e. a reverse-strand hit), false otherwise.
 *          Left untouched when an empty vector is returned.
 *
 * Returns an empty vector when bl2seq could not be started or no usable hit
 * line was parsed; otherwise two values:
 *   [0] highest value seen in column 2 (percent identity) divided by 100
 *   [1] sum of column 3 (alignment length) divided by the length of inseq1
 */
vector<double> SQLiteConstructor::get_blast_score_and_rc(Sequence inseq1, DBSeq inseq2, bool * rc){
    vector<double> retvalues;
    FastaUtil seqwriter1;
    FastaUtil seqwriter2;
    // NOTE(review): sc1 and sc2 are written out empty -- inseq1 / inseq2 are
    // never added to them, so bl2seq is handed whatever these writes produce
    // from empty vectors. Confirm whether the sequences should be pushed here.
    vector<Sequence> sc1;
    vector<Sequence> sc2;
    const string fn1 = "seq1";
    const string fn2 = "seq2";
    seqwriter1.writeFileFromVector(fn1,sc1);
    seqwriter2.writeFileFromVector(fn2,sc2);
    double coverage = 0;
    double identity = 0;

    const char * cmd = "bl2seq -i seq1 -j seq2 -p blastn -D 1";
    FILE *fp = popen(cmd, "r");
    if (fp == NULL){
        // The pipe could not be opened: report "no hit" instead of handing a
        // null stream to fgets/pclose.
        return retvalues;
    }
    char buff[1000];
    vector<string> tokens;
    const string del("\t");
    while (fgets(buff, sizeof buff, fp) != NULL){
        const string line(buff);
        if (line.find("#") != string::npos){
            continue; // comment/header line of the tabular report
        }
        // NOTE(review): tokens is reused across lines without being cleared;
        // whether Tokenize appends or replaces decides which hit line the
        // indices below refer to -- verify against Tokenize.
        Tokenize(line, tokens, del);
        if (tokens.size() < 4){
            continue; // not enough columns to read identity / length
        }
        coverage = coverage + strtod(tokens[3].c_str(),NULL);
        const double ide = strtod(tokens[2].c_str(),NULL);
        if (ide > identity){
            identity = ide;
        }
    }
    pclose(fp);
    if (tokens.size() < 10){
        // Columns 8 and 9 are required to decide the orientation.
        return retvalues;
    }
    *rc = strtod(tokens[8].c_str(),NULL) > strtod(tokens[9].c_str(),NULL);
    cout << *rc;
    retvalues.push_back(identity/100.0);
    retvalues.push_back(coverage/(int)inseq1.get_sequence().size());
    return retvalues;
}

//vector< vector<DBSeq> >
/*
 * Serial filter: compares every candidate in seqs against every entry of
 * known_seqs via get_blast_score_and_rc and keeps the candidates whose best
 * identity and coverage both reach the member thresholds identity / coverage.
 *
 * seqs      - candidate sequences
 * keep_seqs - out: receives each accepted candidate
 * keep_rc   - out: receives, in step with keep_seqs, the orientation flag of
 *             the hit that set the candidate's best scores
 *
 * Prints the candidate index to stdout every 100 candidates.
 */
void SQLiteConstructor::get_same_seqs(vector<DBSeq> seqs, vector<DBSeq> * keep_seqs, vector<bool> * keep_rc){
    const int report_every = 100;
    for (size_t qi = 0; qi < seqs.size(); ++qi){
        if (qi % report_every == 0){
            cout << qi << endl;
        }
        // Best scores seen for this candidate so far.
        double best_ide = 0;
        double best_cov = 0;
        bool best_rc = false;
        for (size_t ki = 0; ki < known_seqs->size(); ++ki){
            bool hit_rc = false;
            const vector<double> scores = get_blast_score_and_rc(known_seqs->at(ki), seqs[qi], &hit_rc);
            if (scores.size() <= 1){
                continue; // no usable result for this pair
            }
            // A hit only replaces the current best when it improves identity
            // AND coverage at the same time.
            if (scores[0] > best_ide && scores[1] > best_cov){
                best_ide = scores[0];
                best_cov = scores[1];
                best_rc = hit_rc;
            }
        }
        const bool passes = (best_ide >= identity && best_cov >= coverage);
        if (passes){
            keep_seqs->push_back(seqs[qi]);
            keep_rc->push_back(best_rc);
        }
    }
}


/*
 * Threaded variant of get_same_seqs: splits seqs into numthreads contiguous
 * slices, runs Same_seq_pthread_go on each slice in its own joinable pthread,
 * and appends every thread's keep_seqs / keep_rc to the output vectors in
 * thread order.
 *
 * seqs      - candidate sequences
 * keep_seqs - out: receives the sequences each worker kept
 * keep_rc   - out: receives the matching flags, in step with keep_seqs
 *
 * Does nothing when numthreads is less than 1. Terminates the process with
 * exit(-1) when a thread cannot be created or joined.
 */
void SQLiteConstructor::get_same_seqs_pthreads(vector<DBSeq> seqs, vector<DBSeq> * keep_seqs, vector<bool> * keep_rc){
    if (numthreads < 1){
        return; // nothing to run
    }

    // Every slice has seqs.size()/numthreads elements; the last slice also
    // takes the remainder.
    const size_t chunk = seqs.size() / numthreads;

    // std::vector instead of a variable-length array: VLAs of class types are
    // a compiler extension, not standard C++. The vector is never resized
    // after this point, so the element addresses handed to the threads stay
    // valid.
    vector<thread_data> thread_data_array(numthreads);

    for (int i = 0; i < numthreads; i++){
        const size_t first = i * chunk;
        const size_t last = ((i + 1) < numthreads) ? first + chunk : seqs.size();
        vector<DBSeq> st_seqs(seqs.begin() + first, seqs.begin() + last);
        cout << "spliting: " << st_seqs.size() << endl;
        thread_data_array[i].thread_id = i;
        thread_data_array[i].seqs = st_seqs;
        thread_data_array[i].coverage = coverage;
        thread_data_array[i].identity = identity;
        thread_data_array[i].reports = 100;
        thread_data_array[i].known_seqs = known_seqs;
        thread_data_array[i].keep_seqs.clear();
        thread_data_array[i].keep_rc.clear();
    }

    vector<pthread_t> threads(numthreads);
    void *status;
    int rc;
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    for (int i = 0; i < numthreads; i++){
        cout << "thread: " << i << endl;
        rc = pthread_create(&threads[i], &attr, Same_seq_pthread_go, (void *) &thread_data_array[i]);
        if (rc){
            printf("ERROR; return code from pthread_create() is %d\n", rc);
            exit(-1);
        }
    }
    pthread_attr_destroy(&attr);
    for (int i = 0; i < numthreads; i++){
        cout << "joining: " << i << endl;
        // Store the join result: previously rc still held the value from
        // pthread_create, so a failed join was never detected.
        rc = pthread_join(threads[i], &status);
        if (rc){
            printf("ERROR; return code from pthread_join() is %d\n", rc);
            exit(-1);
        }
        printf("Completed join with thread %d status= %ld\n",i, (long)status);
    }

    // Collect the per-thread results in thread order.
    for (int i = 0; i < numthreads; i++){
        for (size_t j = 0; j < thread_data_array[i].keep_seqs.size(); j++){
            keep_seqs->push_back(thread_data_array[i].keep_seqs[j]);
            keep_rc->push_back(thread_data_array[i].keep_rc[j]);
        }
    }
}

/*
* OPENMP version
@@ -1052,6 +882,7 @@ void SQLiteConstructor::remove_duplicates_SWPS3(vector<DBSeq> * keep_seqs, vecto
}

vector<int> remove;
#pragma omp parallel for shared(remove)
for(unsigned int i=0;i<unique_ids.size();i++){
mycount = 0;
mycount = (int) count (ids.begin(),ids.end(), unique_ids[i]);
@@ -1650,8 +1481,29 @@ void SQLiteConstructor::add_seqs_from_file_to_dbseqs_vector(string filename,vect
}



// DEPRECATED
// The code below is kept for historical reasons only.





















/*
* things that are basically deprecated but keeping them just in case
@@ -1836,3 +1688,173 @@ void SQLiteConstructor::remove_duplicates(vector<DBSeq> * keep_seqs, vector<bool
keep_rc->erase(keep_rc->begin()+remove[i]);
}
}

/*
 * Runs `bl2seq` (blastn, tabular output) on the files "seq1" and "seq2" and
 * summarises the reported hits.
 *
 * inseq1 - only its sequence length is used, as the denominator of the
 *          coverage value
 * inseq2 - not read by this function
 * rc     - out-parameter; set to true when column 8 of the parsed hit is
 *          greater than column 9 (subject start > subject end in the BLAST
 *          tabular layout, i.e. a reverse-strand hit), false otherwise.
 *          Left untouched when an empty vector is returned.
 *
 * Returns an empty vector when bl2seq could not be started or no usable hit
 * line was parsed; otherwise two values:
 *   [0] highest value seen in column 2 (percent identity) divided by 100
 *   [1] sum of column 3 (alignment length) divided by the length of inseq1
 */
vector<double> SQLiteConstructor::get_blast_score_and_rc(Sequence inseq1, DBSeq inseq2, bool * rc){
    vector<double> retvalues;
    FastaUtil seqwriter1;
    FastaUtil seqwriter2;
    // NOTE(review): sc1 and sc2 are written out empty -- inseq1 / inseq2 are
    // never added to them, so bl2seq is handed whatever these writes produce
    // from empty vectors. Confirm whether the sequences should be pushed here.
    vector<Sequence> sc1;
    vector<Sequence> sc2;
    const string fn1 = "seq1";
    const string fn2 = "seq2";
    seqwriter1.writeFileFromVector(fn1,sc1);
    seqwriter2.writeFileFromVector(fn2,sc2);
    double coverage = 0;
    double identity = 0;

    const char * cmd = "bl2seq -i seq1 -j seq2 -p blastn -D 1";
    FILE *fp = popen(cmd, "r");
    if (fp == NULL){
        // The pipe could not be opened: report "no hit" instead of handing a
        // null stream to fgets/pclose.
        return retvalues;
    }
    char buff[1000];
    vector<string> tokens;
    const string del("\t");
    while (fgets(buff, sizeof buff, fp) != NULL){
        const string line(buff);
        if (line.find("#") != string::npos){
            continue; // comment/header line of the tabular report
        }
        // NOTE(review): tokens is reused across lines without being cleared;
        // whether Tokenize appends or replaces decides which hit line the
        // indices below refer to -- verify against Tokenize.
        Tokenize(line, tokens, del);
        if (tokens.size() < 4){
            continue; // not enough columns to read identity / length
        }
        coverage = coverage + strtod(tokens[3].c_str(),NULL);
        const double ide = strtod(tokens[2].c_str(),NULL);
        if (ide > identity){
            identity = ide;
        }
    }
    pclose(fp);
    if (tokens.size() < 10){
        // Columns 8 and 9 are required to decide the orientation.
        return retvalues;
    }
    *rc = strtod(tokens[8].c_str(),NULL) > strtod(tokens[9].c_str(),NULL);
    cout << *rc;
    retvalues.push_back(identity/100.0);
    retvalues.push_back(coverage/(int)inseq1.get_sequence().size());
    return retvalues;
}

//vector< vector<DBSeq> >
/*
 * Serial filter: compares every candidate in seqs against every entry of
 * known_seqs via get_blast_score_and_rc and keeps the candidates whose best
 * identity and coverage both reach the member thresholds identity / coverage.
 *
 * seqs      - candidate sequences
 * keep_seqs - out: receives each accepted candidate
 * keep_rc   - out: receives, in step with keep_seqs, the orientation flag of
 *             the hit that set the candidate's best scores
 *
 * Prints the candidate index to stdout every 100 candidates.
 */
void SQLiteConstructor::get_same_seqs(vector<DBSeq> seqs, vector<DBSeq> * keep_seqs, vector<bool> * keep_rc){
    const int report_every = 100;
    for (size_t qi = 0; qi < seqs.size(); ++qi){
        if (qi % report_every == 0){
            cout << qi << endl;
        }
        // Best scores seen for this candidate so far.
        double best_ide = 0;
        double best_cov = 0;
        bool best_rc = false;
        for (size_t ki = 0; ki < known_seqs->size(); ++ki){
            bool hit_rc = false;
            const vector<double> scores = get_blast_score_and_rc(known_seqs->at(ki), seqs[qi], &hit_rc);
            if (scores.size() <= 1){
                continue; // no usable result for this pair
            }
            // A hit only replaces the current best when it improves identity
            // AND coverage at the same time.
            if (scores[0] > best_ide && scores[1] > best_cov){
                best_ide = scores[0];
                best_cov = scores[1];
                best_rc = hit_rc;
            }
        }
        const bool passes = (best_ide >= identity && best_cov >= coverage);
        if (passes){
            keep_seqs->push_back(seqs[qi]);
            keep_rc->push_back(best_rc);
        }
    }
}


/*
 * Threaded variant of get_same_seqs: splits seqs into numthreads contiguous
 * slices, runs Same_seq_pthread_go on each slice in its own joinable pthread,
 * and appends every thread's keep_seqs / keep_rc to the output vectors in
 * thread order.
 *
 * seqs      - candidate sequences
 * keep_seqs - out: receives the sequences each worker kept
 * keep_rc   - out: receives the matching flags, in step with keep_seqs
 *
 * Does nothing when numthreads is less than 1. Terminates the process with
 * exit(-1) when a thread cannot be created or joined.
 */
void SQLiteConstructor::get_same_seqs_pthreads(vector<DBSeq> seqs, vector<DBSeq> * keep_seqs, vector<bool> * keep_rc){
    if (numthreads < 1){
        return; // nothing to run
    }

    // Every slice has seqs.size()/numthreads elements; the last slice also
    // takes the remainder.
    const size_t chunk = seqs.size() / numthreads;

    // std::vector instead of a variable-length array: VLAs of class types are
    // a compiler extension, not standard C++. The vector is never resized
    // after this point, so the element addresses handed to the threads stay
    // valid.
    vector<thread_data> thread_data_array(numthreads);

    for (int i = 0; i < numthreads; i++){
        const size_t first = i * chunk;
        const size_t last = ((i + 1) < numthreads) ? first + chunk : seqs.size();
        vector<DBSeq> st_seqs(seqs.begin() + first, seqs.begin() + last);
        cout << "spliting: " << st_seqs.size() << endl;
        thread_data_array[i].thread_id = i;
        thread_data_array[i].seqs = st_seqs;
        thread_data_array[i].coverage = coverage;
        thread_data_array[i].identity = identity;
        thread_data_array[i].reports = 100;
        thread_data_array[i].known_seqs = known_seqs;
        thread_data_array[i].keep_seqs.clear();
        thread_data_array[i].keep_rc.clear();
    }

    vector<pthread_t> threads(numthreads);
    void *status;
    int rc;
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    for (int i = 0; i < numthreads; i++){
        cout << "thread: " << i << endl;
        rc = pthread_create(&threads[i], &attr, Same_seq_pthread_go, (void *) &thread_data_array[i]);
        if (rc){
            printf("ERROR; return code from pthread_create() is %d\n", rc);
            exit(-1);
        }
    }
    pthread_attr_destroy(&attr);
    for (int i = 0; i < numthreads; i++){
        cout << "joining: " << i << endl;
        // Store the join result: previously rc still held the value from
        // pthread_create, so a failed join was never detected.
        rc = pthread_join(threads[i], &status);
        if (rc){
            printf("ERROR; return code from pthread_join() is %d\n", rc);
            exit(-1);
        }
        printf("Completed join with thread %d status= %ld\n",i, (long)status);
    }

    // Collect the per-thread results in thread order.
    for (int i = 0; i < numthreads; i++){
        for (size_t j = 0; j < thread_data_array[i].keep_seqs.size(); j++){
            keep_seqs->push_back(thread_data_array[i].keep_seqs[j]);
            keep_rc->push_back(thread_data_array[i].keep_rc[j]);
        }
    }
}