Large diffs are not rendered by default.

@@ -35,6 +35,8 @@ using namespace std;
#include "sequence.h"

#include "DBSeq.h"
#include "tree.h"
#include "node.h"

class SQLiteConstructor {
private:
@@ -65,6 +67,9 @@ class SQLiteConstructor {
bool updateFILE;
string updatef;
bool ncbi_saturation;
bool usertree;
string usertreefile;
Tree * userguidetree;
vector<Sequence> * known_seqs;
vector<DBSeq> use_only_names_from_file(vector<DBSeq> seqs);
DBSeq add_higher_taxa(string taxon_id,vector<DBSeq> seqs);
@@ -77,11 +82,14 @@ class SQLiteConstructor {
void remove_duplicates_SWPS3(vector<DBSeq> * keep_seqs);
void reduce_genomes(vector<DBSeq> * keep_seqs);
void get_seqs_for_names(string name_id, vector<DBSeq> * seqs, vector<DBSeq> * temp_seqs);
void get_seqs_for_nodes(Node * node, vector<DBSeq> * seqs, vector<DBSeq> * temp_seqs);
vector<string> get_final_children(string name_id);
vector<string> get_final_children_node(Node * node);
void make_mafft_multiple_alignment(vector<DBSeq> * inseqs);
double calculate_MAD_quicktree();
double calculate_MAD_quicktree_sample(vector<DBSeq> * inseqs);
void saturation_tests(vector<string> name_ids, vector<string> names, vector<DBSeq> * keep_seqs);
int get_single_to_group_seq_score(Sequence & inseq,vector<Sequence> & ginseqs);
void write_gi_numbers(vector<DBSeq> *);
void add_seqs_from_file_to_dbseqs_vector(string filename,vector<DBSeq> * keep_seqs, map<string,string> & taxgimap);
public:
@@ -94,6 +102,7 @@ class SQLiteConstructor {
void set_exclude_names_from_file(string filename);
void set_exclude_gi_from_file(string filename);
void set_include_gi_from_file(string filename);
void set_user_guide_tree(string filename);
void run();
string get_cladename();
vector <string> get_search();
@@ -102,6 +111,7 @@ class SQLiteConstructor {
double get_coverage();
double get_identity();
int get_numthreads();
Tree * get_user_guide_tree_obj();
};


@@ -53,6 +53,8 @@ using namespace std;

#include "SQLiteProfiler.h"

#define SIXES 666666

template <class T>
inline std::string to_string (const T& t)
{
@@ -68,9 +70,11 @@ SQLiteProfiler::SQLiteProfiler(string gn, string cn, string dbs,bool autom,bool
db = dbs;
automated = autom;
updatedb = updb;
usertree = false;
profilefoldername = gene_name+"_PROFILE/";
}

//TODO: add the ability to have the usertree inform these alignments
void SQLiteProfiler::prelimalign(){
// if temp directory doesn't exist
mkdir(profilefoldername.c_str(),S_IRWXU | S_IRWXG | S_IROTH | S_IWOTH);
@@ -292,6 +296,9 @@ void SQLiteProfiler::prelimalign(){
}
}

/*
* TODO: probably better to have an operating-system-independent one
*/
int SQLiteProfiler::count_seqs(string dirc, string file_name){
string cmd = "grep -c \\> ";
cmd += dirc+"/";
@@ -310,31 +317,24 @@ int SQLiteProfiler::count_seqs(string dirc, string file_name){
return intReturn;
}

/*
 * Registers a user-supplied guide tree with the profiler.
 *
 * tree: the guide tree to use. The pointer itself is stored in
 *       userguidetree; no copy of the tree is made here.
 *
 * The usertree flag is raised only when tree is non-null. run() tests
 * usertree before taking the user-tree path, and that path dereferences
 * userguidetree, so a null tree must not switch the flag on.
 */
void SQLiteProfiler::set_user_guide_tree(Tree * tree){
    userguidetree = tree;
    usertree = (tree != 0);
    if(usertree == false){
        std::cout << "warning: user guide tree is null, ignoring it" << std::endl;
    }
}

void SQLiteProfiler::run(){
if(updatedb == false){//standard
file_names = vector<string>();
cout << "getting file names" << endl;
getdir(profilefoldername.c_str(),file_names);
if(file_names.size() > 1){
//get guide tree
//if the guide tree is not there, then use the ncbi tree
string guiname = gene_name+".guide";
bool flag = false;
fstream fin;
fin.open(guiname.c_str(),ios::in);
if( fin.is_open() ){
flag=true;
}
fin.close();
//end test for guide tree
map<string,string> numnames;
map<string,string> namesnum;
vector< vector<double> > numlist;
//TODO: add incomplete user guide tree
if(flag == true){
//TODO: make sure that this works with update , incomplete user guide tree
if(usertree == true){
cout << "user guide tree" << endl;
Tree * tree = read_user_guide_tree(guiname);
create_distances(file_names,tree,&numnames,&namesnum,&numlist);
create_distances_user_tree(file_names,&numnames,&namesnum,&numlist);
}else{//use ncbi tree
cout << "ncbi guide tree" << endl;
create_distances(cladename,file_names,&numnames,&namesnum,&numlist);
@@ -381,39 +381,6 @@ string SQLiteProfiler::get_profilekey_value(string profile_string){
return match_string;
}

/*
 * Reads a user guide tree and gathers the profile files it does not cover.
 *
 * filen: path of the tree file. Only its first line is handed to
 *        TreeReader::readTree; any further lines are ignored.
 *
 * Every entry of file_names for which the external-node lookup throws is
 * treated as an orphan: use_orphan is set to true and the lines of that
 * file are appended to the file profilefoldername+"orphan".
 *
 * returns: the tree produced by TreeReader::readTree, or a null pointer
 *          when no line could be read from filen (missing or empty file).
 */
Tree * SQLiteProfiler::read_user_guide_tree(string filen){
    TreeReader nw;

    // Only the first line is used, so there is no need to buffer the file.
    std::ifstream treefile(filen.c_str());
    std::string treeline;
    if(!getline(treefile, treeline)){
        // Previously this indexed an empty vector (undefined behaviour).
        std::cout << "could not read a guide tree from " << filen << std::endl;
        return 0;
    }
    treefile.close();

    Tree * tree = nw.readTree(treeline);

    // Collect the names that are not tips of the guide tree.
    std::vector<std::string> orphans;
    for(std::size_t i=0;i<file_names.size();i++){
        try{
            // Presumably getExternalNode throws an int for an unknown
            // name -- TODO confirm against tree.h.
            tree->getExternalNode(file_names[i])->getName();
        }catch(int e){
            orphans.push_back(file_names[i]);
            use_orphan = true;
        }
    }

    // Concatenate all orphan files into a single "orphan" file.
    std::string orphfilename = profilefoldername+"orphan";
    std::ofstream orphanout(orphfilename.c_str());
    for(std::size_t i=0;i<orphans.size();i++){
        // NOTE(review): the orphan is opened by its bare name, not
        // relative to profilefoldername -- confirm this is intended.
        std::ifstream orphanin(orphans[i].c_str());
        std::string seqline;
        while(getline(orphanin,seqline)){
            // getline strips the line terminator, so put it back;
            // otherwise all lines run together in the output.
            orphanout << seqline << "\n";
        }
    }
    orphanout.close();
    return tree;
}

void SQLiteProfiler::get_children(string in_id, vector<string> * in_ids, vector<string> * in_keepids){
Database conn(db);
string sql = "SELECT ncbi_id FROM taxonomy WHERE parent_ncbi_id = "+in_id;
@@ -564,74 +531,40 @@ void SQLiteProfiler::create_distances(string clade_name, vector<string> names,ma
for(int i=0;i<names.size();i++){
Database conn(db);
sql = "SELECT ncbi_id FROM taxonomy WHERE name = '"+names[i]+"' and name_class = 'scientific name';";
//cout << sql << endl;
Query query2(conn);
query2.get_result(sql);
// StoreQueryResult R = query2.store();
string nameid = get_right_one(allids, query2);
sql = "SELECT parent_ncbi_id FROM taxonomy WHERE ncbi_id = "+nameid+" and name_class = 'scientific name';";
Query query4(conn);
query4.get_result(sql);
// StoreQueryResult R2 = query4.store();
string parentid = get_right_one(allids,query4);
vector<string> route;
while(parentid != cladeid){
//cout << "loop" << endl;
route.push_back(parentid);
cout << "nameid " << nameid << endl;
cout << "parentid1 " << parentid << endl;
// cout << "nameid " << nameid << endl;
// cout << "parentid1 " << parentid << endl;
nameid = parentid;
sql = "SELECT parent_ncbi_id FROM taxonomy WHERE ncbi_id = "+nameid+" and name_class = 'scientific name';";
//cout << sql << endl;
Query query5(conn);
query5.get_result(sql);
// StoreQueryResult R3 = query5.store();
parentid = get_right_one(allids,query5);
//cout << "parentid2 " << parentid << endl;
}
route.push_back(parentid);
/*
* add using the left right for the route
*
string left,right;
sql = "SELECT left_value,right_value FROM taxon WHERE taxon.taxon_id = "+nameid;
Query query5 = conn.query(sql);
StoreQueryResult R3 = query5.store();
left = R3[0][0].c_str();
right = R3[0][1].c_str();

vector<string> route;
sql = "SELECT taxon.taxon_id FROM taxon WHERE left_value < "+left;
sql += "AND right_value > ";
sql += right;
sql+= "AND left_value >= ";
sql+= cladeleft;
sql+= "AND right_value <= ";
sql += claderight;
sql += "ORDER BY left_value DESC;";
Query query4 = conn.query(sql);
StoreQueryResult R2 = query4.store();
for(int j=0;j<R2.size();j++){
route.push_back(R2[j][0].c_str());
}
*
* end add
*/
vector<double> tdistance;
for(int j=0;j<names.size();j++){
if(j!=i){
sql = "SELECT ncbi_id FROM taxonomy WHERE name = '"+names[j]+"' and name_class = 'scientific name';";
Query query5(conn);
query5.get_result(sql);
// StoreQueryResult R3 = query5.store();
string jnameid = get_right_one(allids,query5);
double distance = 0;
while((int)count(route.begin(),route.end(),jnameid) == 0){
distance += 1;
sql = "SELECT parent_ncbi_id FROM taxonomy WHERE ncbi_id = "+jnameid+" and name_class = 'scientific name';";
Query query6(conn);
query6.get_result(sql);
// StoreQueryResult R4 = query6.store();
jnameid = get_right_one(allids,query6);
}
for(int k=0;k<route.size();k++){
@@ -640,7 +573,7 @@ void SQLiteProfiler::create_distances(string clade_name, vector<string> names,ma
}
tdistance.push_back(distance);
}else{
tdistance.push_back(666666);
tdistance.push_back(SIXES);
}
}
std::ostringstream stm;
@@ -650,28 +583,33 @@ void SQLiteProfiler::create_distances(string clade_name, vector<string> names,ma
numlist->push_back(tdistance);
cout << "distances complete: "<< names[i] << endl;
}
//distances = list(tdistance)
}

//user tree one
void SQLiteProfiler::create_distances(vector<string> names,Tree * tree,map<string,string> * numnames
void SQLiteProfiler::create_distances_user_tree(vector<string> file_names,map<string,string> * numnames
,map<string,string> * namesnum, vector< vector<double> > * numlist){
for(int i=0;i<names.size();i++){
Node * nd1 = tree->getExternalNode(names[i]);
//get the list of nodes for which distances are required
vector<Node *> nodesfordist;
for(int i=0;i<file_names.size();i++){
for(int j=0;j<userguidetree->getNodeCount();j++){
if (userguidetree->getNode(j)->getName()==file_names[i])
nodesfordist.push_back(userguidetree->getNode(j));
}
}
for(int i=0;i<nodesfordist.size();i++){
vector<double> tdistance;
for(int j=0;j<names.size();j++){
for(int j=0;j<nodesfordist.size();j++){
if (i == j){
tdistance.push_back(666666);
tdistance.push_back(SIXES);
}else{
Node * nd2 = tree->getExternalNode(names[j]);
double distance = get_distance_between_two_nodes(tree,nd1,nd2);
double distance = get_distance_between_two_nodes(userguidetree,nodesfordist[i],nodesfordist[j]);
tdistance.push_back(distance);
}
}
std::ostringstream stm;
stm << i;
numnames->insert( pair<string,string>(stm.str(),names[i]) );
namesnum->insert( pair<string,string>(names[i],stm.str()) );
numnames->insert( pair<string,string>(stm.str(),nodesfordist[i]->getName()) );
namesnum->insert( pair<string,string>(nodesfordist[i]->getName(),stm.str()) );
numlist->push_back(tdistance);
}
}
@@ -691,7 +629,7 @@ void SQLiteProfiler::get_shortest_distance_with_dicts(vector<string> names, map<
for(int j=0;j<file_names.size();j++){
string nameid2 = namesnum[file_names[j]];
double distance = tdistance[atoi(nameid2.c_str())];
if(distance < shortestdistance && distance != 666666){
if(distance < shortestdistance && distance != SIXES){
shortestdistance = distance;
*shortestnameone = names[i];
keepD = true;
@@ -704,7 +642,7 @@ void SQLiteProfiler::get_shortest_distance_with_dicts(vector<string> names, map<
shortestnametwo->clear();
for(int j=0;j<file_names.size();j++){
//int ct = (int) count(shortestnametwo->begin(),shortestnametwo->end(),file_names[j]);
if(distances[j]==shortestdistance && distances[j]!=666666){
if(distances[j]==shortestdistance && distances[j]!=SIXES){
shortestnametwo->push_back(file_names[j]);
cout << "f " <<file_names[j]<<endl;
}
@@ -45,15 +45,16 @@ class SQLiteProfiler{
bool use_orphan;
bool automated;
bool updatedb;
Tree * read_user_guide_tree(string filen);
Tree * userguidetree;
bool usertree;
void get_children(string in_id, vector<string> * in_ids, vector<string> * in_keepids);
vector<string> get_final_children(string id);
int count_seqs(string dirc, string file_name);
string get_right_one(vector<string> allids,Query & res);
vector<string> get_left_right_children(string id);
void create_distances(string clade_name, vector<string> names,map<string,string> * numnames,
map<string,string>* namesnum, vector< vector<double> > * numlist);
void create_distances(vector<string> names,Tree * tree,map<string,string> * numnames
void create_distances_user_tree(vector<string> names,map<string,string> * numnames
,map<string,string> * namesnum, vector< vector<double> > * numlist);
void get_shortest_distance_with_dicts(vector<string> names,map<string,string> numnames,map<string,string> namesnum,
vector< vector<double> > numlist, string * shortestnameone, vector<string> * shortestnametwo);
@@ -80,6 +81,7 @@ class SQLiteProfiler{
SQLiteProfiler(string gn, string cn, string dbs, bool autom,bool updb);
void prelimalign();
void run();
void set_user_guide_tree(Tree * tree);
};

#endif /* MQPROFILER_H_ */
@@ -34,13 +34,14 @@ using namespace std;
#include "SQLiteDBController.h"
#include "SmithWatermanGotoh.h"

#include "tree.h"
#include "sequence.h"
#include "omp.h"
#include "SWPS3_matrix.h"

int main(int argc, char* argv[]){
if(argc != 3){
cout << "PHLAWD 2.0a" << endl;
cout << "PHLAWD 3.0a" << endl;
cout << "you need more arguments." << endl;
cout << "usage: PHLAWD task configfile" << endl;
cout << "possible tasks include:" << endl;
@@ -109,6 +110,9 @@ int main(int argc, char* argv[]){
bool updateFILE = false;
string updatef = "";
string maskurl = "";
bool usertree = false;//guide tree
string usertreefile = "";//guide tree
Tree * usertreeobj;
//read file
ifstream ifs(argv[2]);
string line;
@@ -176,6 +180,10 @@ int main(int argc, char* argv[]){
updatef = tokens[1];
cout << "updateFILE" << endl;
cout << "updated file " << updatef << endl;
}else if(!strcmp(tokens[0].c_str(), "userguidetree")){
usertree = true;
usertreefile = tokens[1];
cout << "user guide treefile: "<< usertreefile <<endl;
}
}
ifs.close();
@@ -219,7 +227,14 @@ int main(int argc, char* argv[]){
cout << "using ITS mode: true" << endl;
cout << "warning: highly experimental" << endl;
}
if(usertree == true){
cout << "using user guide tree: "<< usertreefile <<endl;
a->set_user_guide_tree(usertreefile);
}
a->run();
if(usertree == true){
usertreeobj = a->get_user_guide_tree_obj();
}
delete(a);
}
/*
@@ -229,6 +244,9 @@ int main(int argc, char* argv[]){
SQLiteProfiler * b;
b = new SQLiteProfiler(gene,clade,db,automated,updateDB);
b->prelimalign();
if(usertree == true && asse == true){
b->set_user_guide_tree(usertreeobj);
}
b->run();
delete(b);
}