In [43]:
from xml.etree import ElementTree
import numpy as np
from itertools import combinations
from bs4 import BeautifulSoup
import pandas as pd

In [5]:
# Parse the XML file of sequence records into an ElementTree.
tmvseqs = ElementTree.parse("./data/seqs/ledger.xml")

In [55]:
# Inspect the root element of the parsed document.
tmvseqs.getroot()

<Element 'DOCUMENT_ROOT' at 0x00000247FB924EF0>

In [14]:
# Collect the RECORD elements that are direct children of the document root.
seqs = tmvseqs.findall("RECORD")

In [16]:
# Print the accession number stored in each record, one per line.
for seq in seqs:
    accession_node = seq.find("ACCESSION_NO")
    print(accession_node.text)

OK377342
OK377341
OK377343
OK424595
OK362288


In [19]:
# Flatten each record's FASTA text into one contiguous string by deleting the
# newline and tab characters embedded in the XML element text.
layout_chars = str.maketrans("", "", "\n\t")
fasta = [seq.find("FASTA").text.translate(layout_chars) for seq in seqs]

In [22]:
# Sanity check: all sequences have the same length, which the element-wise
# comparison further down relies on.

list(map(len, fasta))

[2746, 2746, 2746, 2746, 2746]

In [51]:
# One NumPy array of single characters per sequence, so that sequences can be
# compared position by position.
seqarrays = [np.array(list(f)) for f in fasta]

In [52]:
# Every unordered pair of sequence arrays.
seqpairs = list(combinations(seqarrays, 2))

In [53]:
# Number of positions at which the two sequences of each pair differ
# (non-zero everywhere, so no two sequences are identical).

[(first != second).sum() for first, second in seqpairs]

[37, 44, 12, 5, 7, 39, 32, 42, 39, 7]

In [42]:
# Source page for the isolate table parsed below (ICTV report, genus Begomovirus):
# https://ictv.global/report/chapter/geminiviridae/geminiviridae/begomovirus

In [40]:
# The HTML was copied by hand from Firefox and saved locally, because
# (per the original note) the live page lazy-loads its content.

with open("./data/Genus Begomovirus ICTV.html", mode="r", encoding="utf8") as html:
    content = html.read()

# Parsing only needs the text already read, so it runs after the file is closed.
soup = BeautifulSoup(content, "html.parser")

In [67]:
# Take the first <table> carrying the "virus-isolates-table" CSS class.
isolate_tables = soup.find_all("table", class_="virus-isolates-table")
table = isolate_tables[0]

In [68]:
# len() of a Tag counts its direct children, not the rows of the table.
len(table.contents)

2

In [87]:
# Text of every header cell (<th>) found anywhere inside the table.
headers = [cell.get_text() for cell in table.find_all("th")]
headers

['',
 'Genus',
 'Subgenus',
 'Species',
 'Virus name',
 'Isolate',
 'Accession',
 'Available sequence',
 'Abbrev.']

In [88]:
# Column-oriented accumulator for the scraped table: one list per output
# column. The comprehension gives every column its own list object.
table_dict = {
    column: []
    for column in (
        "Species",
        "Virus name",
        "Isolate",
        "Accession",
        "Available sequence",
        "Abbrev",
    )
}

In [105]:
# The header cells all sit in the first <tr> of the table.
header_row = table.find_all("tr")[0]
[th.text for th in header_row.find_all("th")]

['',
 'Genus',
 'Subgenus',
 'Species',
 'Virus name',
 'Isolate',
 'Accession',
 'Available sequence',
 'Abbrev.']

In [108]:
# Display the accumulator's current contents.
table_dict

{'Species': [],
 'Virus name': [],
 'Isolate': [],
 'Accession': [],
 'Available sequence': [],
 'Abbrev': []}

In [110]:
# Except for the row with table headers, there seem to be two kinds of table rows:
# 1) <tr class="virus-row">
# 2) <tr class="alt-virus-row">
# Both are read the same way: each output column comes from the <td> that
# carries the CSS class listed here.
column_classes = {
    "Species": "col-species",
    "Virus name": "col-alternativeNameCSV",
    "Isolate": "col-isolate",
    "Accession": "col-accessionNumber",
    "Available sequence": "col-availableSequence",
    "Abbrev": "col-abbrev",
}

# Start from empty lists on every run, so that re-executing this cell does not
# append the same rows a second time.
table_dict = {column: [] for column in column_classes}

# [1:] skips the first <tr>, which is the header row.
for tr in table.find_all("tr")[1:]:
    for column, css_class in column_classes.items():
        # find() returns None when a row lacks the cell, so such a row fails
        # here with AttributeError instead of being silently skipped.
        table_dict[column].append(tr.find("td", attrs = {"class": css_class}).text)

In [113]:
# One DataFrame column per key of table_dict, one row per scraped table row.
begomovirus = pd.DataFrame(data=table_dict)

In [114]:
# Write the table as a comma-separated file with a header row and no index
# column. The separator, header and write mode are left at the pandas
# defaults (",", True, "w").
begomovirus.to_csv(
    "./data/begomovirus_ictv.csv",
    index=False,
)