# Notebook 1: Preparing CSV files

This is the first part of the migration procedure: it prepares a set of CSV files that will be imported into the Neo4j database in the second part.
The program creates one CSV file for each entity type (node or relationship) of the future graph. There are 3 variables to configure in the configuration cell below:

- **SQLITE_DB_NAME** : the name of the SQLite database (that will be fetched from the current directory)
- **IMPORT_NEO_FOLDER** : Absolute path of the directory where CSV will be stored. Neo4j security has a default setting that local files can only be read from the Neo4j import directory, which is different based on the used operating system. [In Neo4j doc](https://neo4j.com/docs/cypher-manual/current/clauses/load-csv/#query-load-csv-introduction), it is explained how to change these settings and which are the default import directories.
- **CSV_SUB_DIR** : the name of the subdirectory where CSV will be stored.

In [None]:
! pip install neo4j --user
! pip install pandas --user

In [None]:
#imports
import sqlite3;
import pandas as pd;
from pandas.io.sql import DatabaseError
import os;
from sqlite3 import OperationalError
from datetime import datetime

In [None]:
# CONFIGURATION ----------------------------------------------------------------------------------------------------------
# Edit the three values below before running the rest of the notebook.

# Path of the SQLite database to read.
SQLITE_DB_NAME = "June 21/output UK/results.sqlite"

# Absolute path of the Neo4j import directory (example path).
IMPORT_NEO_FOLDER = "C:/Users/39391/.Neo4jDesktop/relate-data/dbmss/dbms-c4c5ade6-22d3-4366-8b33-f83252bd7d2c/import/"

# Name of the sub-directory of IMPORT_NEO_FOLDER that receives the CSV files.
CSV_SUB_DIR = "entities"

# ------------------------------------------------------------------------------------------------------------------------

In [None]:
# utils

# SQLite connection handed to every query issued by the notebook.
connector = sqlite3.connect(SQLITE_DB_NAME)
 

class StopExecution(Exception):
    """Exception raised by the notebook to abort when a mandatory entity cannot be loaded."""

    def _render_traceback_(self):
        """Return nothing (presumably so that IPython shows no traceback -- TODO confirm)."""
        return None
    
class Log:
    """Minimal console logger.

    Every message is printed to stdout as
    ``[HH:MM:SS][LEVEL][type][name] message``.
    The methods are static, so they can be called on the class
    (``Log.error(...)``) as well as on an instance.
    """

    @staticmethod
    def _emit(level, type, name, message):
        """Print one log line tagged with *level* and the current local time."""
        stamp = datetime.now().strftime("%H:%M:%S")
        print(f"[{stamp}][{level}][{type}][{name}] {message}")

    @staticmethod
    def error(type, name, message):
        """Print an ERROR line.

        type: category tag of the entity (e.g. "NODE", "REL").
        name: name of the entity the message refers to.
        message: free text appended after the tags.
        """
        Log._emit("ERROR", type, name, message)

    @staticmethod
    def warn(type, name, message):
        """Print a WARN line; parameters have the same meaning as in ``error``."""
        Log._emit("WARN", type, name, message)

def get_list_from_sqlite(query, connector, entity_name):
    """Run *query* on *connector* and return all result rows as a list of tuples.

    query: SQL text to execute.
    connector: open sqlite3 connection.
    entity_name: label used in the warning printed when the query fails.

    Best-effort: a sqlite3.OperationalError is reported through Log.warn and
    an empty list is returned instead of raising.
    """
    try:
        cursor = connector.cursor()
        try:
            cursor.execute(query)
            return cursor.fetchall()
        finally:
            # Release the cursor even when execute()/fetchall() raises.
            cursor.close()
    except OperationalError as e:
        Log.warn("GENERIC", entity_name, "A problem occur while searching for other elements of this type. " + str(e))
        return []
        

def get_data_frame_from_list(list, headers=None):
    """Build a pandas DataFrame from a list of row tuples.

    list: rows to convert (the name shadows the builtin; it is kept unchanged
        so existing callers are not affected).
    headers: optional column labels assigned to the resulting frame.

    Returns an empty DataFrame without columns when *list* is empty (headers
    are ignored in that case); otherwise one row per element of *list*.
    """
    if len(list) == 0:
        return pd.DataFrame([])
    df = pd.DataFrame(list)
    # Identity test: "!= None" is evaluated element-wise on array-like headers
    # (pandas Index, numpy array) and cannot be used as a condition.
    if headers is not None:
        df.columns = headers
    return df
 

### From SQLite to CSV tables (Part 1)
<span style="color:#9e9e9e">Exporting WEB_SITE, LANDING_NAME, NETWORK nodes and LAND, LOCATED, BELONG, CNAME relationships</span>

##### Creating tables (pandas dataframe) that fully describe each node and relationship type:

In [None]:
# SQL text for the NODE tables of part 1.

# WEB_SITE rows: (url_id, domain_name_id)
query_node_web_site = (
    "SELECT ws.url_id, wsdn.domain_name_id FROM web_site ws INNER JOIN web_site_domain_name wsdn on ws.url_id = wsdn.web_site_id WHERE url_id IS NOT NULL"
)

# LANDING_NAME rows: (domain_name, landing_url, landing_https)
query_node_landing_name = (
    "SELECT DISTINCT  ws.name_id AS domain_name, wsl.landing_url, wsl.landing_https FROM web_site_lands wsl LEFT JOIN web_server ws on ws.name_id = wsl.web_server_id WHERE landing_url IS NOT NULL"
)

# NETWORK rows: (ip_network_id, state); state is '-' when the joined rov row has none.
query_node_network = (
    "SELECT DISTINCT iad.ip_network_id, COALESCE(r.state,'-') AS state FROM ip_address_depends iad LEFT JOIN ip_range_rov irr on iad.ip_range_rov_id = irr.compressed_notation LEFT JOIN prefixes_table pt on irr.compressed_notation = pt.ip_range_rov_id LEFT JOIN rov r on r.id = pt.rov_id"
)

In [None]:
# SQL text for the RELATIONSHIP tables of part 1.

# LAND: from WEB_SITE to LANDING_NAME.
query_rel_land = (
    "SELECT ws.url_id, wsl.starting_https, wsl.landing_https, wsl.landing_url FROM web_site ws INNER JOIN web_site_lands wsl ON ws.url_id=wsl.web_site_id WHERE wsl.landing_url IS NOT NULL;"
)

# LOCATED: from LANDING_NAME to NETWORK (through the access / ip_address tables).
query_rel_located_ln = (
    "SELECT ws.name_id,iad.ip_address_id, iad.ip_network_id FROM web_site_lands wsl INNER JOIN web_server ws ON wsl.web_server_id=ws.name_id INNER JOIN domain_name dn on ws.name_id = dn.string INNER JOIN access a on dn.string = a.domain_name_id INNER JOIN ip_address ia on a.ip_address_id = ia.exploded_notation INNER JOIN ip_address_depends iad on ia.exploded_notation = iad.ip_address_id"
)

# BELONG: from LANDING_NAME to ZONE (through the direct_zone table).
query_rel_belong_ln = (
    "SELECT ws.name_id AS ln_name, z.name AS ln_zone FROM web_site_lands wsl INNER JOIN web_server ws on wsl.web_server_id = ws.name_id INNER JOIN domain_name dn on ws.name_id = dn.string INNER JOIN direct_zone dz on dn.string = dz.domain_name_id INNER JOIN zone z on dz.zone_id = z.name"
)

In [None]:
# Saving results in pandas dataframes

# NODES: mandatory entities -- if one of them cannot be loaded the run is aborted.
try:
    df_node_web_site = pd.read_sql_query(query_node_web_site, connector)
except DatabaseError as e:
    Log.error("NODE", "WEB_SITE", "An error occur while loading the current entity. Cannot continue. " + str(e))
    raise StopExecution
try:
    df_node_landing_name = pd.read_sql_query(query_node_landing_name, connector)
except DatabaseError as e:
    Log.error("NODE", "LANDING_NAME", "An error occur while loading the current entity. Cannot continue. " + str(e))
    raise StopExecution
try:
    df_node_network = pd.read_sql_query(query_node_network, connector)
except DatabaseError as e:
    Log.error("NODE", "NETWORK", "An error occur while loading the current entity. Cannot continue. " + str(e))
    raise StopExecution

# RELATIONSHIPS: optional entities -- on failure a warning is printed and an empty table
# (same columns as the query) is used, so that the following cells never find the
# variable undefined or still holding the value of a previous run.
try:
    df_rel_land = pd.read_sql_query(query_rel_land, connector)
except DatabaseError as e:
    Log.warn("REL", "LAND", "Cannot load the current relationship.. " + str(e))
    df_rel_land = pd.DataFrame(columns=["url_id", "starting_https", "landing_https", "landing_url"])
try:
    df_rel_located_ln = pd.read_sql_query(query_rel_located_ln, connector)
except DatabaseError as e:
    Log.warn("REL", "LOCATED_LANDING_NAME", "Cannot load the current relationship.. " + str(e))
    df_rel_located_ln = pd.DataFrame(columns=["name_id", "ip_address_id", "ip_network_id"])
try:
    df_rel_belong_ln = pd.read_sql_query(query_rel_belong_ln, connector)
except DatabaseError as e:
    Log.warn("REL", "BELONG_LANDING_NAME", "Cannot load the current relationship.. " + str(e))
    df_rel_belong_ln = pd.DataFrame(columns=["ln_name", "ln_zone"])


##### Deep search for other LANDING_NAME nodes and their related CNAME, BELONG and LOCATED relationships:

In [None]:
# First search: starting from the LANDING_NAME nodes already found, follow the CNAME RRs
# (alias table) to obtain new LANDING_NAME nodes and the CNAME relationships leading to them.
query_nodes_ln = " SELECT DISTINCT  a.alias_id AS domain_name, '-' AS landing_url, wsl.landing_https FROM web_site_lands wsl INNER JOIN web_server ws ON wsl.web_server_id=ws.name_id INNER JOIN domain_name dn ON ws.name_id=dn.string INNER JOIN alias a ON a.name_id=dn.string;"
query_rel_cname = "SELECT  ws.name_id AS name_1, a.alias_id AS name_2 FROM web_site_lands wsl INNER JOIN web_server ws ON wsl.web_server_id=ws.name_id INNER JOIN domain_name dn ON ws.name_id=dn.string INNER JOIN alias a ON a.name_id=dn.string;"
# Nodes and CNAME relationships found by the first search:
init_nodes_ln_list = get_list_from_sqlite(query_nodes_ln, connector, "LANDING_NAME")
init_rel_cname = get_list_from_sqlite(query_rel_cname, connector, "CNAME")
# LOCATED and BELONG relationships of all the new LANDING_NAME nodes are collected here:
init_rel_located = []
init_rel_belong = []

# Work list of LANDING_NAME nodes whose CNAME RRs still have to be followed.
buffer_list_of_ln_nodes = init_nodes_ln_list.copy()
# (name, https) pairs already expanded: each is processed once, so a CNAME cycle
# (a -> b -> a) cannot keep the loop running forever.
visited_ln_nodes = set()

while buffer_list_of_ln_nodes:
    node = buffer_list_of_ln_nodes.pop()
    visit_key = (node[0], str(node[2]))
    if visit_key in visited_ln_nodes:
        continue
    visited_ln_nodes.add(visit_key)
    # Values are embedded as single-quoted SQL string literals with embedded quotes doubled
    # (a double-quoted token is an identifier first in SQLite).
    ln_name_sql = str(node[0]).replace("'", "''")
    ln_https_sql = str(node[2]).replace("'", "''")
    # New LANDING_NAME nodes (and CNAME rels) reached from this node through the alias table:
    query_nodes_ln = "SELECT a.alias_id AS domain_name, '-' AS landing_url,'{}' AS landing_https  FROM alias a WHERE a.name_id='{}'".format(ln_https_sql, ln_name_sql)
    query_rel_cname = "SELECT '{}' AS name_1, al.alias_id AS name_2  FROM alias al WHERE al.name_id='{}'".format(ln_name_sql, ln_name_sql)
    # LOCATED (LANDING_NAME -> NETWORK) rows of this node, found through the access table.
    # The access table uses its own alias (a2) instead of reusing "a".
    query_rel_located = " SELECT a.alias_id AS name_id,iad.ip_address_id,  iad.ip_network_id  FROM alias a INNER JOIN domain_name dn on a.alias_id = dn.string INNER JOIN access a2 on dn.string = a2.domain_name_id INNER JOIN ip_address ia on a2.ip_address_id = ia.exploded_notation INNER JOIN ip_address_depends iad on ia.exploded_notation = iad.ip_address_id WHERE a.alias_id='{}';".format(ln_name_sql)
    # BELONG (LANDING_NAME -> ZONE) rows of this node:
    query_rel_belong = "SELECT a.alias_id AS ln_name, z.name AS ln_zone from alias a INNER JOIN domain_name dn ON a.alias_id = dn.string INNER JOIN direct_zone dz ON dn.string = dz.domain_name_id INNER JOIN zone z ON dz.zone_id = z.name where a.alias_id='{}';".format(ln_name_sql)
    # Run the node query once and use the result both as new work and as new nodes.
    new_ln_nodes = get_list_from_sqlite(query_nodes_ln, connector, "LANDING_NAME")
    buffer_list_of_ln_nodes.extend(new_ln_nodes)
    init_nodes_ln_list.extend(new_ln_nodes)
    init_rel_cname.extend(get_list_from_sqlite(query_rel_cname, connector, "CNAME"))
    init_rel_located.extend(get_list_from_sqlite(query_rel_located, connector, "LOCATED"))
    init_rel_belong.extend(get_list_from_sqlite(query_rel_belong, connector, "BELONG"))

# Store results in pandas dataframes:
df_rel_cname_ln = get_data_frame_from_list(init_rel_cname, ["name_1", "name_2"])
df_node_landing_name = pd.concat([df_node_landing_name, get_data_frame_from_list(init_nodes_ln_list, ["domain_name", "landing_url", "landing_https"])])
df_rel_located_ln = pd.concat([df_rel_located_ln, get_data_frame_from_list(init_rel_located, ["name_id", "ip_address_id", "ip_network_id"])])
df_rel_belong_ln = pd.concat([df_rel_belong_ln, get_data_frame_from_list(init_rel_belong, ["ln_name", "ln_zone"])])

        

##### Exporting entities in CSV files (in the directory specified in configuration strings):


In [None]:
# Exporting nodes and relationships in csv files

# Build the paths with os.path.join so that IMPORT_NEO_FOLDER works with or without a
# trailing separator.
csv_dir = os.path.join(IMPORT_NEO_FOLDER, CSV_SUB_DIR)
os.makedirs(csv_dir, exist_ok=True)

try:
    # Nodes:
    df_node_web_site.to_csv(os.path.join(csv_dir, "node_web_site.csv"), index=False)
    df_node_landing_name.to_csv(os.path.join(csv_dir, "node_landing_name.csv"), index=False)
    df_node_network.to_csv(os.path.join(csv_dir, "node_network.csv"), index=False)

    # Relationships:
    df_rel_land.to_csv(os.path.join(csv_dir, "rel_land.csv"), index=False)
    df_rel_cname_ln.to_csv(os.path.join(csv_dir, "rel_cname_ln.csv"), index=False)
    df_rel_located_ln.to_csv(os.path.join(csv_dir, "rel_located_ln.csv"), index=False)
    df_rel_belong_ln.to_csv(os.path.join(csv_dir, "rel_belong_ln.csv"), index=False)
except Exception as e:
    # Best-effort export: report what went wrong instead of discarding the exception.
    Log.error("SAVE_CSV", "", "A generic error occur while saving some entities in csv files. " + repr(e))

### From SQLite to CSV tables (Part 2)
<span style="color:#9e9e9e">Exporting NAME_SERVER, ZONE nodes and BELONG, LOCATED, CNAME relationships</span>

##### Creating tables (pandas dataframe) that fully describe each node and relationship type:

In [None]:
# SQL text for the NODE tables of part 2.

# NAME_SERVER rows: (name_id)
query_node_name_server = (
    "SELECT name_id FROM name_server WHERE name_id IS NOT NULL"
)

# ZONE rows: (name); the WHERE clause also filters on the number of dots in the name.
query_node_zone = (
    "SELECT name FROM zone WHERE name IS NOT NULL AND NOT(length(name)-length(replace(name,'.','')))<2"
)

In [None]:
# SQL text for the RELATIONSHIP tables of part 2.

# BELONG: from NAME_SERVER to ZONE (through the direct_zone table).
query_rel_belong = (
    "SELECT ns.name_id AS ns_name, z.name AS ns_zone FROM name_server ns INNER JOIN domain_name dn on ns.name_id = dn.string INNER JOIN direct_zone dz on dn.string = dz.domain_name_id INNER JOIN zone z on dz.zone_id = z.name;"
)

# LOCATED: from NAME_SERVER to NETWORK (through the access / ip_address tables).
query_rel_located = (
    "SELECT ns.name_id, a.ip_address_id,iad.ip_network_id FROM name_server ns INNER JOIN domain_name dn on dn.string = ns.name_id INNER JOIN access a on dn.string = a.domain_name_id INNER JOIN ip_address ia on a.ip_address_id = ia.exploded_notation INNER JOIN ip_address_depends iad on ia.exploded_notation = iad.ip_address_id;"
)

In [None]:
# Saving results in pandas dataframes

# NODES: mandatory entities -- if one of them cannot be loaded the run is aborted.
try:
    df_node_name_server = pd.read_sql_query(query_node_name_server, connector)
except DatabaseError as e:
    Log.error("NODE", "NAME_SERVER", "An error occur while loading the current entity. Cannot continue. " + str(e))
    raise StopExecution
try:
    df_node_zone = pd.read_sql_query(query_node_zone, connector)
except DatabaseError as e:
    Log.error("NODE", "ZONE", "An error occur while loading the current entity. Cannot continue. " + str(e))
    raise StopExecution

# RELATIONSHIPS: optional entities -- on failure a warning is printed and an empty table
# (same columns as the query) is used, so that the following cells never find the
# variable undefined or still holding the value of a previous run.
try:
    df_rel_belong = pd.read_sql_query(query_rel_belong, connector)
except DatabaseError as e:
    Log.warn("REL", "BELONG_NAME_SERVER", "Cannot load the current relationship.. " + str(e))
    df_rel_belong = pd.DataFrame(columns=["ns_name", "ns_zone"])
try:
    df_rel_located = pd.read_sql_query(query_rel_located, connector)
except DatabaseError as e:
    Log.warn("REL", "LOCATED_NAME_SERVER", "Cannot load the current relationship.. " + str(e))
    df_rel_located = pd.DataFrame(columns=["name_id", "ip_address_id", "ip_network_id"])

##### Deep search for other NAME_SERVER nodes and their related CNAME, BELONG and LOCATED relationships:

In [None]:
# First search: starting from the NAME_SERVER nodes already found, follow the CNAME RRs
# (alias table) to obtain new NAME_SERVER nodes and the CNAME relationships leading to them.
query_nodes_ns = "SELECT a.alias_id AS name_id FROM name_server ns INNER JOIN domain_name dn on dn.string = ns.name_id INNER JOIN alias a on dn.string = a.name_id"
query_rel_cnames = "SELECT ns.name_id AS name_1, a.alias_id AS name_2 FROM name_server ns INNER JOIN domain_name dn on dn.string = ns.name_id INNER JOIN alias a on dn.string = a.name_id"
# Nodes and CNAME relationships found by the first search:
init_nodes_ns_list = get_list_from_sqlite(query_nodes_ns, connector, "NAME_SERVER")
init_rel_cname_list = get_list_from_sqlite(query_rel_cnames, connector, "CNAME")
# LOCATED and BELONG relationships of all the new NAME_SERVER nodes are collected here:
init_rel_belong_list = []
init_rel_located_list = []

# Work list of NAME_SERVER nodes whose CNAME RRs still have to be followed.
buffer_list_of_ns_nodes = init_nodes_ns_list.copy()
# Names already expanded: each is processed once, so a CNAME cycle (a -> b -> a)
# cannot keep the loop running forever.
visited_ns_names = set()

while buffer_list_of_ns_nodes:
    node = buffer_list_of_ns_nodes.pop()
    if node[0] in visited_ns_names:
        continue
    visited_ns_names.add(node[0])
    # The name is embedded as a single-quoted SQL string literal with embedded quotes
    # doubled (a double-quoted token is an identifier first in SQLite).
    ns_name_sql = str(node[0]).replace("'", "''")
    # New NAME_SERVER nodes (and CNAME rels) reached from this node through the alias table:
    query_node_ns = "SELECT a.alias_id AS name_id FROM alias a WHERE a.name_id='{}';".format(ns_name_sql)
    query_rel_cnames = "SELECT a.name_id AS name_1, a.alias_id AS name_2 FROM alias a WHERE a.name_id='{}';".format(ns_name_sql)
    # BELONG (NAME_SERVER -> ZONE) rows of this node, found through the direct_zone table:
    query_rel_belongs = "SELECT a.alias_id AS ns_name, z.name AS ns_zone FROM alias a INNER JOIN domain_name dn ON a.alias_id=dn.string INNER JOIN direct_zone dz on dn.string = dz.domain_name_id INNER JOIN zone z on dz.zone_id = z.name WHERE a.alias_id='{}';".format(ns_name_sql)
    # LOCATED (NAME_SERVER -> NETWORK) rows of this node, found through the access table:
    query_rel_located = "SELECT a.alias_id AS name_id, a2.ip_address_id,iad.ip_network_id FROM alias a INNER JOIN domain_name dn on a.alias_id = dn.string INNER JOIN access a2 on dn.string = a2.domain_name_id INNER JOIN ip_address ia on a2.ip_address_id = ia.exploded_notation INNER JOIN ip_address_depends iad on ia.exploded_notation = iad.ip_address_id WHERE a.alias_id='{}';".format(ns_name_sql)
    # Run the node query once and use the result both as new work and as new nodes.
    new_ns_nodes = get_list_from_sqlite(query_node_ns, connector, "NAME_SERVER")
    buffer_list_of_ns_nodes.extend(new_ns_nodes)
    init_nodes_ns_list.extend(new_ns_nodes)
    init_rel_cname_list.extend(get_list_from_sqlite(query_rel_cnames, connector, "CNAME"))
    init_rel_belong_list.extend(get_list_from_sqlite(query_rel_belongs, connector, "BELONG"))
    init_rel_located_list.extend(get_list_from_sqlite(query_rel_located, connector, "LOCATED"))

# Store results in pandas dataframes:
df_node_name_server = pd.concat([df_node_name_server, get_data_frame_from_list(init_nodes_ns_list, ["name_id"])])
df_rel_cname = get_data_frame_from_list(init_rel_cname_list, ["name_1", "name_2"])
df_rel_belong = pd.concat([df_rel_belong, get_data_frame_from_list(init_rel_belong_list, ["ns_name", "ns_zone"])])
df_rel_located = pd.concat([df_rel_located, get_data_frame_from_list(init_rel_located_list, ["name_id", "ip_address_id", "ip_network_id"])])
 

##### Exporting entities in CSV files (in the directory specified in configuration strings):

In [None]:
# Exporting nodes and relationships in csv files

# Build the paths with os.path.join so that IMPORT_NEO_FOLDER works with or without a
# trailing separator.
csv_dir = os.path.join(IMPORT_NEO_FOLDER, CSV_SUB_DIR)
os.makedirs(csv_dir, exist_ok=True)

try:
    # Nodes:
    df_node_name_server.to_csv(os.path.join(csv_dir, "node_name_server.csv"), index=False)
    df_node_zone.to_csv(os.path.join(csv_dir, "node_zone.csv"), index=False)

    # Relationships:
    df_rel_belong.to_csv(os.path.join(csv_dir, "rel_belong_ns.csv"), index=False)
    df_rel_located.to_csv(os.path.join(csv_dir, "rel_located_ns.csv"), index=False)
    df_rel_cname.to_csv(os.path.join(csv_dir, "rel_cname_ns.csv"), index=False)
except Exception as e:
    # Best-effort export: report what went wrong instead of discarding the exception.
    Log.error("SAVE_CSV", "", "A generic error occur while saving some entities in csv files. " + repr(e))

### From SQLite to CSV tables (Part 3) 
 <span style="color:#9e9e9e">Exporting AUTONOMOUS_SYSTEM nodes and COMPOSED_BY, DEPEND, PARENT, MANAGED relationships</span>

##### Creating tables (pandas dataframe) that fully describe each node and relationship type:

In [None]:
# SQL text for the NODE table of part 3.
# AUTONOMOUS_SYSTEM rows: (number, description, country_code); country_code is the
# constant "none" for every row.
query_node_aut_sys = (
    'SELECT asy.number,asy.description,"none" AS country_code FROM autonomous_system asy'
)

In [None]:
# SQL text for the RELATIONSHIP tables of part 3.
# The zone queries filter on the number of dots in the zone name, computed as
# length(name) - length(name without dots).

# COMPOSED_BY: from ZONE to NAME_SERVER.
query_rel_composed_by = (
    "SELECT z.name AS zone_name, ns.name_id  AS ns_name FROM zone z INNER JOIN zone_composed zc on z.name = zc.zone_id INNER JOIN name_server ns on ns.name_id = zc.name_server_id WHERE NOT(length(z.name)-length(replace(z.name,'.','')))<2"
)

# DEPEND: from ZONE to ZONE. Self links are excluded, and so are the links selected by
# query_rel_parent below (the last NOT(...) is the negation of its extra condition).
query_rel_depend = (
    "SELECT z.name, zl.dependency_id FROM zone z INNER JOIN zone_links zl on z.name = zl.zone_id WHERE NOT(z.name=zl.dependency_id) AND NOT(length(z.name)-length(replace(z.name,'.','')))<2 AND NOT (z.name LIKE '%'||zl.dependency_id AND length(z.name)-length(replace(z.name,'.',''))-length(zl.dependency_id)+length(replace(zl.dependency_id,'.',''))=1)"
)

# MANAGED: from NETWORK to AUTONOMOUS_SYSTEM.
query_rel_managed = (
    "SELECT iad.ip_network_id,asy.number FROM ip_address_depends iad INNER JOIN ip_range_tsv irt on irt.compressed_notation = iad.ip_range_tsv_id INNER JOIN network_numbers nn on irt.compressed_notation = nn.ip_range_tsv_id INNER JOIN autonomous_system asy on asy.number = nn.autonomous_system_id"
)

# PARENT: zone links where the zone name ends with the dependency name and has exactly
# one more dot than it.
query_rel_parent = (
    "SELECT z.name, zl.dependency_id FROM zone z INNER JOIN zone_links zl on z.name = zl.zone_id WHERE NOT(z.name=zl.dependency_id) AND NOT(length(z.name)-length(replace(z.name,'.','')))<2 AND  z.name LIKE '%'||zl.dependency_id AND length(z.name)-length(replace(z.name,'.',''))-length(zl.dependency_id)+length(replace(zl.dependency_id,'.',''))=1"
)

In [None]:
# Saving results in pandas dataframes

# NODES: mandatory entity -- if it cannot be loaded the run is aborted.
try:
    df_node_aut_sys = pd.read_sql_query(query_node_aut_sys, connector)
except DatabaseError as e:
    Log.error("NODE", "AUT_SYSTEM", "An error occur while loading the current entity. Cannot continue. " + str(e))
    raise StopExecution

# RELATIONSHIPS: optional entities -- on failure a warning is printed and an empty table
# (same columns as the query) is used, so that the export cell never finds the variable
# undefined or still holding the value of a previous run.
try:
    df_rel_composed_by = pd.read_sql_query(query_rel_composed_by, connector)
except DatabaseError as e:
    Log.warn("REL", "COMPOSED_BY", "Cannot load the current relationship.. " + str(e))
    df_rel_composed_by = pd.DataFrame(columns=["zone_name", "ns_name"])
try:
    df_rel_depend = pd.read_sql_query(query_rel_depend, connector)
except DatabaseError as e:
    Log.warn("REL", "DEPEND", "Cannot load the current relationship.. " + str(e))
    df_rel_depend = pd.DataFrame(columns=["name", "dependency_id"])
try:
    df_rel_managed = pd.read_sql_query(query_rel_managed, connector)
except DatabaseError as e:
    Log.warn("REL", "MANAGED_BY", "Cannot load the current relationship.. " + str(e))
    df_rel_managed = pd.DataFrame(columns=["ip_network_id", "number"])
try:
    df_rel_parent = pd.read_sql_query(query_rel_parent, connector)
except DatabaseError as e:
    Log.warn("REL", "PARENT", "Cannot load the current relationship.. " + str(e))
    df_rel_parent = pd.DataFrame(columns=["name", "dependency_id"])

##### Exporting entities in CSV files (in the directory specified in configuration strings):

In [None]:
# Exporting nodes and relationships in csv files

# Build the paths with os.path.join so that IMPORT_NEO_FOLDER works with or without a
# trailing separator.
csv_dir = os.path.join(IMPORT_NEO_FOLDER, CSV_SUB_DIR)
os.makedirs(csv_dir, exist_ok=True)

try:
    # Nodes:
    df_node_aut_sys.to_csv(os.path.join(csv_dir, "node_aut_sys.csv"), index=False)
    print(df_node_aut_sys)

    # Relationships:
    df_rel_composed_by.to_csv(os.path.join(csv_dir, "rel_composed_by.csv"), index=False)
    df_rel_depend.to_csv(os.path.join(csv_dir, "rel_depend.csv"), index=False)
    df_rel_managed.to_csv(os.path.join(csv_dir, "rel_managed.csv"), index=False)
    df_rel_parent.to_csv(os.path.join(csv_dir, "rel_parent.csv"), index=False)
except Exception as e:
    # Best-effort export: report what went wrong instead of discarding the exception.
    Log.error("SAVE_CSV", "", "A generic error occur while saving some entities in csv files. " + repr(e))

### From SQLite to CSV tables (Part 4) 
 <span style="color:#9e9e9e">Exporting MAIL_DOMAIN, MAIL_SERVER nodes and BELONG, MAPPED_IN, CNAME, LOCATED relationships</span>

##### Creating tables (pandas dataframe) that fully describe each node and relationship type:

In [None]:
# SQL text for the NODE tables of part 4.

# MAIL_DOMAIN rows: (name_id)
query_node_mail_domain = (
    "SELECT md.name_id FROM mail_domain md WHERE md.name_id IS NOT NULL;"
)

# MAIL_SERVER rows: (name_id)
query_node_mail_server = (
    "SELECT ms.name_id FROM mail_server ms WHERE ms.name_id IS NOT NULL;"
)

In [None]:
# SQL text for the RELATIONSHIP tables of part 4.

# BELONG: from MAIL_DOMAIN to ZONE (LEFT JOINs: md_zone is NULL when no zone is joined).
query_rel_belong_md = (
    "SELECT md.name_id AS md_name, z.name AS md_zone FROM mail_domain md LEFT JOIN domain_name dn on md.name_id = dn.string LEFT JOIN direct_zone dz on dn.string = dz.domain_name_id LEFT JOIN zone z on z.name = dz.zone_id WHERE md.name_id IS NOT NULL;"
)

# BELONG: from MAIL_SERVER to ZONE (LEFT JOINs: ms_zone is NULL when no zone is joined).
query_rel_belong_ms = (
    "SELECT ms.name_id AS ms_name, z.name AS ms_zone FROM mail_server ms LEFT JOIN domain_name dn on ms.name_id = dn.string LEFT JOIN direct_zone dz on dn.string = dz.domain_name_id LEFT JOIN zone z on z.name = dz.zone_id WHERE ms.name_id IS NOT NULL;"
)

# MAPPED_IN: from MAIL_DOMAIN to MAIL_SERVER (through mail_domain_composed).
query_rel_mapped_in = (
    "SELECT md.name_id AS md_name, ms.name_id AS ms_name FROM mail_domain md INNER JOIN mail_domain_composed mdc on md.name_id = mdc.mail_domain_id INNER JOIN mail_server ms on ms.name_id = mdc.mail_server_id;"
)

# LOCATED: from MAIL_SERVER to NETWORK (through the access / ip_address tables).
query_rel_located = (
    "SELECT ms.name_id, a.ip_address_id,iad.ip_network_id  FROM mail_server ms INNER JOIN domain_name dn on ms.name_id = dn.string INNER JOIN access a on dn.string = a.domain_name_id INNER JOIN ip_address ia on a.ip_address_id = ia.exploded_notation INNER JOIN ip_address_depends iad on ia.exploded_notation = iad.ip_address_id;"
)

In [None]:
# Saving results in pandas dataframes

# NODES: mandatory entities -- if one of them cannot be loaded the run is aborted.
try:
    df_node_mail_domain = pd.read_sql_query(query_node_mail_domain, connector)
except DatabaseError as e:
    Log.error("NODE", "MAIL_DOMAIN", "An error occur while loading the current entity. Cannot continue. " + str(e))
    raise StopExecution
try:
    df_node_mail_server = pd.read_sql_query(query_node_mail_server, connector)
except DatabaseError as e:
    Log.error("NODE", "MAIL_SERVER", "An error occur while loading the current entity. Cannot continue. " + str(e))
    raise StopExecution

# RELATIONSHIPS: optional entities -- on failure a warning is printed and an empty table
# (same columns as the query) is used. This matters most for df_rel_located: the name is
# also assigned by the NAME_SERVER part, so without the fallback a failed load would leave
# the name-server rows in it and they would be exported as mail-server rows.
try:
    df_rel_belong_md = pd.read_sql_query(query_rel_belong_md, connector)
except DatabaseError as e:
    Log.warn("REL", "BELONG_MAIL_DOMAIN", "Cannot load the current relationship.. " + str(e))
    df_rel_belong_md = pd.DataFrame(columns=["md_name", "md_zone"])
try:
    df_rel_belong_ms = pd.read_sql_query(query_rel_belong_ms, connector)
except DatabaseError as e:
    Log.warn("REL", "BELONG_MAIL_SERVER", "Cannot load the current relationship.. " + str(e))
    df_rel_belong_ms = pd.DataFrame(columns=["ms_name", "ms_zone"])
try:
    df_rel_mapped_in = pd.read_sql_query(query_rel_mapped_in, connector)
except DatabaseError as e:
    Log.warn("REL", "MAPPED_IN", "Cannot load the current relationship.. " + str(e))
    df_rel_mapped_in = pd.DataFrame(columns=["md_name", "ms_name"])
try:
    df_rel_located = pd.read_sql_query(query_rel_located, connector)
except DatabaseError as e:
    Log.warn("REL", "LOCATED_MAIL_SERVER", "Cannot load the current relationship.. " + str(e))
    df_rel_located = pd.DataFrame(columns=["name_id", "ip_address_id", "ip_network_id"])


##### Deep search for other MAIL_SERVER nodes and their related CNAME, BELONG and LOCATED relationships:

In [None]:
# First search: starting from the MAIL_SERVER nodes already found, follow the CNAME RRs
# (alias table) to obtain new MAIL_SERVER nodes and the CNAME relationships leading to them.
query_nodes_ms = "SELECT a.alias_id AS name_id FROM mail_server ms INNER JOIN domain_name dn on dn.string = ms.name_id INNER JOIN alias a on dn.string = a.name_id;"
query_rel_cnames = "SELECT ms.name_id AS name_1, a.alias_id AS name_2 FROM mail_server ms INNER JOIN domain_name dn on dn.string = ms.name_id INNER JOIN alias a on dn.string = a.name_id"
# Nodes and CNAME relationships found by the first search:
init_nodes_ms_list = get_list_from_sqlite(query_nodes_ms, connector, "MAIL_SERVER")
init_rel_cname_list = get_list_from_sqlite(query_rel_cnames, connector, "CNAME")
# LOCATED and BELONG relationships of all the new MAIL_SERVER nodes are collected here:
init_rel_belong_list = []
init_rel_located_list = []

# Work list of MAIL_SERVER nodes whose CNAME RRs still have to be followed.
buffer_list_of_ms_nodes = init_nodes_ms_list.copy()
# Names already expanded: each is processed once, so a CNAME cycle (a -> b -> a)
# cannot keep the loop running forever.
visited_ms_names = set()

while buffer_list_of_ms_nodes:
    node = buffer_list_of_ms_nodes.pop()
    if node[0] in visited_ms_names:
        continue
    visited_ms_names.add(node[0])
    # The name is embedded as a single-quoted SQL string literal with embedded quotes
    # doubled (a double-quoted token is an identifier first in SQLite).
    ms_name_sql = str(node[0]).replace("'", "''")
    # New MAIL_SERVER nodes (and CNAME rels) reached from this node through the alias table:
    query_node_ms = "SELECT a.alias_id AS name_id FROM alias a WHERE a.name_id='{}';".format(ms_name_sql)
    query_rel_cnames = "SELECT a.name_id AS name_1, a.alias_id AS name_2 FROM alias a WHERE a.name_id='{}';".format(ms_name_sql)
    # BELONG (MAIL_SERVER -> ZONE) rows of this node, found through the direct_zone table:
    query_rel_belongs = "SELECT a.alias_id AS ms_name, z.name AS ms_zone FROM alias a INNER JOIN domain_name dn ON a.alias_id=dn.string INNER JOIN direct_zone dz on dn.string = dz.domain_name_id INNER JOIN zone z on dz.zone_id = z.name WHERE a.alias_id='{}';".format(ms_name_sql)
    # LOCATED (MAIL_SERVER -> NETWORK) rows of this node, found through the access table:
    query_rel_located = "SELECT a.alias_id AS name_id, a2.ip_address_id,iad.ip_network_id FROM alias a INNER JOIN domain_name dn on a.alias_id = dn.string INNER JOIN access a2 on dn.string = a2.domain_name_id INNER JOIN ip_address ia on a2.ip_address_id = ia.exploded_notation INNER JOIN ip_address_depends iad on ia.exploded_notation = iad.ip_address_id WHERE a.alias_id='{}';".format(ms_name_sql)
    # Run the node query once. The rows are added one by one (extend): appending the whole
    # result list as a single element would make the next iteration pop a list, not a row.
    new_ms_nodes = get_list_from_sqlite(query_node_ms, connector, "MAIL_SERVER")
    buffer_list_of_ms_nodes.extend(new_ms_nodes)
    init_nodes_ms_list.extend(new_ms_nodes)
    init_rel_cname_list.extend(get_list_from_sqlite(query_rel_cnames, connector, "CNAME"))
    init_rel_belong_list.extend(get_list_from_sqlite(query_rel_belongs, connector, "BELONG"))
    init_rel_located_list.extend(get_list_from_sqlite(query_rel_located, connector, "LOCATED"))

# Store results in pandas dataframes:
df_node_mail_server = pd.concat([df_node_mail_server, get_data_frame_from_list(init_nodes_ms_list, ["name_id"])])
df_rel_cname = get_data_frame_from_list(init_rel_cname_list, ["name_1", "name_2"])
# The merged BELONG table is stored back in df_rel_belong_ms, the frame written to
# rel_belong_ms.csv by the export cell.
df_rel_belong_ms = pd.concat([df_rel_belong_ms, get_data_frame_from_list(init_rel_belong_list, ["ms_name", "ms_zone"])])
df_rel_located = pd.concat([df_rel_located, get_data_frame_from_list(init_rel_located_list, ["name_id", "ip_address_id", "ip_network_id"])])

##### Exporting entities in CSV files (in the directory specified in configuration strings):

In [None]:
# Exporting nodes and relationships in csv files

# Build the paths with os.path.join so that IMPORT_NEO_FOLDER works with or without a
# trailing separator.
csv_dir = os.path.join(IMPORT_NEO_FOLDER, CSV_SUB_DIR)
os.makedirs(csv_dir, exist_ok=True)

try:
    # Nodes:
    df_node_mail_domain.to_csv(os.path.join(csv_dir, "node_mail_domain.csv"), index=False)
    df_node_mail_server.to_csv(os.path.join(csv_dir, "node_mail_server.csv"), index=False)

    # Relationships:
    df_rel_belong_ms.to_csv(os.path.join(csv_dir, "rel_belong_ms.csv"), index=False)
    df_rel_belong_md.to_csv(os.path.join(csv_dir, "rel_belong_md.csv"), index=False)
    df_rel_located.to_csv(os.path.join(csv_dir, "rel_located_ms.csv"), index=False)
    df_rel_cname.to_csv(os.path.join(csv_dir, "rel_cname_ms.csv"), index=False)
    df_rel_mapped_in.to_csv(os.path.join(csv_dir, "rel_mapped_in.csv"), index=False)
except Exception as e:
    # Best-effort export: report what went wrong instead of discarding the exception.
    Log.error("SAVE_CSV", "", "A generic error occur while saving some entities in csv files. " + repr(e))
 