In [1]:
from xml.etree import ElementTree
import numpy as np
from itertools import combinations
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import os

In [42]:
# Data source: ICTV report chapter for the genus Begomovirus.
# https://ictv.global/report/chapter/geminiviridae/geminiviridae/begomovirus

In [40]:
# The HTML was copied by hand from Firefox and saved locally, because the
# page lazy-loads its content.
with open("./data/Genus Begomovirus ICTV.html", mode="r", encoding="utf8") as html:
    content = html.read()

# Parse the saved markup with the standard-library HTML parser backend.
soup = BeautifulSoup(content, "html.parser")

In [67]:
# Keep the first <table> that carries the "virus-isolates-table" class.
matching_tables = soup.find_all("table", attrs={"class": "virus-isolates-table"})
table = matching_tables[0]

In [68]:
# len() of a BeautifulSoup Tag counts its direct children, not the table rows.
len(table)

2

In [87]:
# Text of every <th> cell found anywhere in the table, in document order.
headers = []
for header_cell in table.find_all("th"):
    headers.append(header_cell.text)
headers

['',
 'Genus',
 'Subgenus',
 'Species',
 'Virus name',
 'Isolate',
 'Accession',
 'Available sequence',
 'Abbrev.']

In [88]:
# Accumulator for the scraped table: one independent, initially empty list per
# output column. Note that the last key is "Abbrev" (no trailing dot), unlike
# the "Abbrev." header text on the page.
table_dict = {
    column: []
    for column in (
        "Species",
        "Virus name",
        "Isolate",
        "Accession",
        "Available sequence",
        "Abbrev",
    )
}

In [105]:
# Header cells of the first table row only.
header_row = table.find_all("tr")[0]
[cell.text for cell in header_row.find_all("th")]

['',
 'Genus',
 'Subgenus',
 'Species',
 'Virus name',
 'Isolate',
 'Accession',
 'Available sequence',
 'Abbrev.']

In [108]:
# Inspect the accumulator; every column list is still empty at this point.
table_dict

{'Species': [],
 'Virus name': [],
 'Isolate': [],
 'Accession': [],
 'Available sequence': [],
 'Abbrev': []}

In [110]:
# Except for the row with the table headers, there seem to be two kinds of
# table rows:
# 1) <tr class="virus-row">
# 2) <tr class="alt-virus-row">
# Both are read the same way: each wanted value is the text of a <td> whose
# CSS class identifies the column.

# Output column -> CSS class of the <td> holding its value.
column_classes = {
    "Species": "col-species",
    "Virus name": "col-alternativeNameCSV",
    "Isolate": "col-isolate",
    "Accession": "col-accessionNumber",
    "Available sequence": "col-availableSequence",
    "Abbrev": "col-abbrev",
}

# Start from fresh lists so that re-running this cell rebuilds the data
# instead of appending a second copy of every row.
table_dict = {column: [] for column in column_classes}

# The first <tr> holds the <th> headers, so it is skipped.
for tr in table.find_all("tr")[1:]:
    for column, css_class in column_classes.items():
        table_dict[column].append(tr.find("td", attrs={"class": css_class}).text)

In [113]:
# Turn the column-name -> values mapping into a DataFrame, one column per key.
begomovirus = pd.DataFrame(data=table_dict)

In [114]:
# Persist the scraped table as a comma-separated file with a header row and
# no index column; an existing file is overwritten.
begomovirus.to_csv(
    path_or_buf="./data/begomovirus_ictv.csv",
    sep=",",
    header=True,
    index=False,
    mode="w",
)

In [2]:
# Load the table previously written to ./data/begomovirus_ictv.csv.
ictv_begomo = pd.read_csv(filepath_or_buffer="./data/begomovirus_ictv.csv")

In [13]:
# Boolean numpy mask: True where "Virus name" contains "cassava", ignoring
# case. na=False makes rows with a missing name count as "no match" instead of
# raising, which a Python-level name.lower() would do on NaN.
ind = (
    ictv_begomo["Virus name"]
    .str.contains("cassava", case=False, na=False)
    .to_numpy()
)

In [18]:
# Write only the rows selected by the boolean mask `ind` (all columns kept) as
# a comma-separated file with a header row and no index column.
ictv_begomo[ind].to_csv(
    path_or_buf="./data/cassava_filtered.csv",
    sep=",",
    header=True,
    index=False,
    mode="w",
)

In [6]:
# Load the full table from ./data/begomovirus_ictv.csv.
begomo = pd.read_csv(filepath_or_buffer="./data/begomovirus_ictv.csv")

In [7]:
# Column labels of the reloaded table.
begomo.columns

Index(['Species', 'Virus name', 'Isolate', 'Accession', 'Available sequence',
       'Abbrev'],
      dtype='object')

In [8]:
# Rows whose "Species" contains "cassava" (case-insensitive), all columns.
is_cassava_species = begomo["Species"].str.contains("cassava", case=False)
begomo[is_cassava_species]

Unnamed: 0,Species,Virus name,Isolate,Accession,Available sequence,Abbrev
4,African cassava mosaic Burkina Faso virus,African cassava mosaic Burkina Faso virus,BF-Oua-127-08,DNA-A: HE616777;DNA-B: HE616778,Complete genome,ACMBFV
5,African cassava mosaic virus,African cassava mosaic virus,Cameroon/1998,DNA-A: J02057;DNA-B: J02058,Complete genome,ACMV
58,Cassava mosaic Madagascar virus,cassava mosaic Madagascar virus,Madgascar/Toliary/2006,DNA-A: HE617299;DNA-B: HE617300,Complete genome,CMMGV
147,East African cassava mosaic Cameroon virus,East African cassava mosaic Cameroon virus,Cameroon/1998,DNA-A: AF112354;DNA-B: AF112355,Complete genome,EACMCMV
148,East African cassava mosaic Kenya virus,East African cassava mosaic Kenya virus,Kenya/Mitaboni/K298/2002,DNA-A: AJ717572;DNA-B: AJ704971,Complete genome,EACMKV
149,East African cassava mosaic Malawi virus,East African cassava mosaic Malawi virus,Malawi/K/1996,DNA-A: AJ006460,Complete genome,EACMMV
150,East African cassava mosaic virus,East African cassava mosaic virus,Uganda/Severe 2/1997/Uganda,DNA-A: AF126806;DNA-B: AF126807,Complete genome,EACMV/UG
151,East African cassava mosaic virus,East African cassava mosaic virus,Kenya/Boa/K48/2001/Kenya,DNA-A: AJ717542;DNA-B: AJ704949,Complete genome,EACMV/KE
152,East African cassava mosaic virus,East African cassava mosaic virus,Malawi/8N/2007/Malawi,JX473582,Complete genome,EACMV/MW
153,East African cassava mosaic virus,East African cassava mosaic virus,Tanzania/1/2001/Tanzania,DNA-A: AY795983;DNA-B: AY795989,Complete genome,EACMV/TZ


In [12]:
# (rows, columns) of the subset whose "Species" contains "cassava",
# case-insensitively.
begomo[begomo["Species"].str.contains("cassava", case=False)].shape

(18, 6)

In [44]:
# Display the whole table (pandas shortens long frames in the rendered output).
begomo

Unnamed: 0,Species,Virus name,Isolate,Accession,Available sequence,Abbrev
0,Abutilon golden mosaic virus,Abutilon golden mosaic virus,Mexico/Yucatan/2007,DNA-A: KC430935,Complete genome,AbGMV
1,Abutilon mosaic Bolivia virus,Abutilon mosaic Bolivia virus,Bolivia/2007,DNA-A: HM585445;DNA-B: HM585446,Complete genome,AbMBoV
2,Abutilon mosaic Brazil virus,Abutilon mosaic Brazil virus,Brazil/BGV01A.1.C21,DNA-A: JF694480;DNA-B: JF694479,Complete genome,AbMBV
3,Abutilon mosaic virus,Abutilon mosaic virus,Germany,DNA-A: X15983;DNA-B: X15984,Complete genome,AbMV
4,African cassava mosaic Burkina Faso virus,African cassava mosaic Burkina Faso virus,BF-Oua-127-08,DNA-A: HE616777;DNA-B: HE616778,Complete genome,ACMBFV
...,...,...,...,...,...,...
624,Whitefly-associated begomovirus 4,whitefly-asssociated begomovirus 4,Guatemala-GtSq8-2012,DNA-A: KT099128,Complete genome,WfaBV4
625,Whitefly-associated begomovirus 6,whitefly-asssociated begomovirus 6,Puerto Rico-PR10-2010,DNA-A: KT099139,Complete genome,WfaBV6
626,Whitefly-associated begomovirus 7,whitefly-asssociated begomovirus 7,Spain-Sp5_4-2011,KT099156,Complete genome,WfaBV7
627,Wissadula golden mosaic virus,Wissadula golden mosaic virus,Jamaica/St Thomas/2005,DNA-A: DQ395343;DNA-B: EU158095,Complete genome,WGMV


In [3]:
# NCBI sequence viewer URL used to download a nuccore record (id=89330928) as a FASTA file:
# https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&save=file&log$=seqview&db=nuccore&report=fasta&id=89330928&conwithfeat=on&withparts=on&show-sequence=on&hide-cdd=on&ncbi_phid=CE8AC93650A0D641000000000A89088C

In [10]:
# Load the cassava-only subset from ./data/cassava_filtered.csv.
cmv = pd.read_csv(filepath_or_buffer="./data/cassava_filtered.csv")

In [13]:
# Column labels of the cassava subset.
cmv.columns

Index(['Species', 'Virus name', 'Isolate', 'Accession', 'Available sequence',
       'Abbrev'],
      dtype='object')

In [16]:
# Show only the abbreviation and accession columns, in that order.
cmv[["Abbrev", "Accession"]]

Unnamed: 0,Abbrev,Accession
0,ACMBFV,DNA-A: HE616777;DNA-B: HE616778
1,ACMV,DNA-A: J02057;DNA-B: J02058
2,CMMGV,DNA-A: HE617299;DNA-B: HE617300
3,EACMCMV,DNA-A: AF112354;DNA-B: AF112355
4,EACMKV,DNA-A: AJ717572;DNA-B: AJ704971
5,EACMMV,DNA-A: AJ006460
6,EACMV/UG,DNA-A: AF126806;DNA-B: AF126807
7,EACMV/KE,DNA-A: AJ717542;DNA-B: AJ704949
8,EACMV/MW,JX473582
9,EACMV/TZ,DNA-A: AY795983;DNA-B: AY795989


In [35]:
# First ';'-separated entry of each "Accession" value. Any "DNA-A: " style
# prefix is left in place.
[first_entry for first_entry, *_ in cmv["Accession"].str.split(";")]

['DNA-A: HE616777',
 'DNA-A: J02057',
 'DNA-A: HE617299',
 'DNA-A: AF112354',
 'DNA-A: AJ717572',
 'DNA-A: AJ006460',
 'DNA-A: AF126806',
 'DNA-A: AJ717542',
 'JX473582',
 'DNA-A: AY795983',
 'DNA-A: AF422174',
 'DNA-A: Z24758',
 'DNA-A: JF496657',
 'DNA-A: GQ924760',
 'DNA-A: AJ575819',
 'DNA-A: AF155806',
 'DNA-A: AJ314737',
 'DNA-A: AJ579307']

In [38]:
# Collect one accession id per FASTA file in the edited reference-sequence
# folder. The id is the part of the header line before the first '|', with
# the '>' marker removed.
refseq_dir = "./data/refseqs/edited/"

accessions = []

# os.listdir() returns names in arbitrary order; sort them so the result is
# the same on every run and every machine.
for fasta in sorted(os.listdir(refseq_dir)):
    fasta_path = os.path.join(refseq_dir, fasta)
    if not os.path.isfile(fasta_path):
        # Skip sub-directories, which open() cannot read.
        continue
    with open(fasta_path, "r") as file:
        # Only the header line is needed, so do not read the whole sequence.
        header = file.readline()
    accessions.append(header.split("|")[0].replace(">", "").strip())

In [42]:
# Accession ids collected from the FASTA files.
accessions

['HE616777',
 'J02057',
 'HE617299',
 'AF112354',
 'AJ717572',
 'AJ006460',
 'AJ717542',
 'JX473582',
 'AY795983',
 'AF126806',
 'AF422174',
 'Z24758',
 'GQ924760',
 'AJ575819',
 'JF496657',
 'AF155806',
 'AJ579307',
 'AJ314737']

In [43]:
# Display the cassava-filtered table.
cmv

Unnamed: 0,Species,Virus name,Isolate,Accession,Available sequence,Abbrev
0,African cassava mosaic Burkina Faso virus,African cassava mosaic Burkina Faso virus,BF-Oua-127-08,DNA-A: HE616777;DNA-B: HE616778,Complete genome,ACMBFV
1,African cassava mosaic virus,African cassava mosaic virus,Cameroon/1998,DNA-A: J02057;DNA-B: J02058,Complete genome,ACMV
2,Cassava mosaic Madagascar virus,cassava mosaic Madagascar virus,Madgascar/Toliary/2006,DNA-A: HE617299;DNA-B: HE617300,Complete genome,CMMGV
3,East African cassava mosaic Cameroon virus,East African cassava mosaic Cameroon virus,Cameroon/1998,DNA-A: AF112354;DNA-B: AF112355,Complete genome,EACMCMV
4,East African cassava mosaic Kenya virus,East African cassava mosaic Kenya virus,Kenya/Mitaboni/K298/2002,DNA-A: AJ717572;DNA-B: AJ704971,Complete genome,EACMKV
5,East African cassava mosaic Malawi virus,East African cassava mosaic Malawi virus,Malawi/K/1996,DNA-A: AJ006460,Complete genome,EACMMV
6,East African cassava mosaic virus,East African cassava mosaic virus,Uganda/Severe 2/1997/Uganda,DNA-A: AF126806;DNA-B: AF126807,Complete genome,EACMV/UG
7,East African cassava mosaic virus,East African cassava mosaic virus,Kenya/Boa/K48/2001/Kenya,DNA-A: AJ717542;DNA-B: AJ704949,Complete genome,EACMV/KE
8,East African cassava mosaic virus,East African cassava mosaic virus,Malawi/8N/2007/Malawi,JX473582,Complete genome,EACMV/MW
9,East African cassava mosaic virus,East African cassava mosaic virus,Tanzania/1/2001/Tanzania,DNA-A: AY795983;DNA-B: AY795989,Complete genome,EACMV/TZ
