In [None]:
import os, re, glob
import pandas as pd
import numpy as np
import PyAstronomy
from PyAstronomy import pyasl
from datetime import datetime as dt
import time

def capitalize_keys(dictionary):
    """Return a new dict whose keys are ``key.capitalize()`` of the input keys.

    ``str.capitalize`` upper-cases the first character and lower-cases every
    other character, so ``"Akwa-Ibom"`` becomes ``"Akwa-ibom"``.  The input
    mapping is not modified.  If two keys collapse to the same capitalised
    form, the value of the one iterated last is kept.
    """
    renamed = {}
    for original_key, value in dictionary.items():
        renamed[original_key.capitalize()] = value
    return renamed

# Load the state-transition history table.
# The handle is opened in a ``with`` block so it is closed as soon as the CSV
# has been parsed (previously it stayed open for the lifetime of the kernel).
# ``glob.glob(...)[0]`` raises IndexError when the file is missing.
with open(glob.glob("hMPXV.State.history.MJ.txt")[0], 'r') as f:
    ldf = pd.read_csv(f, sep=",", skiprows=0)

# Reference point (decimal year) that the "time" column is subtracted from.
# NOTE(review): presumably the date of the most recent sample - confirm.
maxdf = 2023.4123
ldf["RT"] = maxdf - ldf["time"].astype(float)

# Keep the original row number as an explicit "state" column.
MT = ldf.reset_index().rename(columns={"index": "state"})

# Decimal year -> "yyyy-mm-dd" string, then split into year / month parts and
# a "yyyy-mm" month key ("cd") used for grouping further down.
MT["DD"] = [PyAstronomy.pyasl.decimalYearGregorianDate(xx, "yyyy-mm-dd") for xx in MT["RT"]]
MT["year"] = [xx.split("-")[0] for xx in MT["DD"]]
MT["month"] = [xx.split("-")[1] for xx in MT["DD"]]
MT["cd"] = MT["year"] + "-" + MT["month"]

# Drop every transition that starts or ends in "cameroon".
# ``.copy()`` makes MT an independent frame so later column assignments on it
# do not trigger SettingWithCopyWarning.
MT = MT[(MT["startLocation"] != "cameroon") & (MT["endLocation"] != "cameroon")].copy()

In [None]:
# One row per (startLocation, endLocation) pair, holding the list of treeIds of
# every matching row in MT.
pdf = pd.DataFrame(MT.groupby(["startLocation", "endLocation"])["treeId"].apply(list)).reset_index()

# count               : number of rows for the pair across all trees
# mean                : count divided by the number of distinct trees
# distribution_counts : per-tree row counts (only trees with at least one row appear)
pdf["count"] = [len(x) for x in pdf["treeId"]]
pdf["mean"] = pdf["count"] / len(MT["treeId"].unique())
pdf["distribution_counts"] = [list(pd.Series(x).value_counts().values) for x in pdf["treeId"]]
pdf = pdf.sort_values("mean", ascending=False)

# Keep pairs averaging at least one event per tree.  ``.copy()`` detaches the
# result from the unfiltered frame so the column assignments below are safe.
pdf = pdf[pdf["mean"] >= 1].copy()
pdf["startLocation"] = pdf["startLocation"].str.replace("nigeria", "Nigeria")
pdf["endLocation"] = pdf["endLocation"].str.replace("nigeria", "Nigeria")

# Location -> class lookup read from a tab-separated file with "taxa" and "class" columns.
df = pd.read_csv("State_class.txt", sep="\t")
df["taxa"] = df["taxa"].str.replace("akwaibom", "Akwa-Ibom")
mmap = dict(zip(df["taxa"], df["class"]))

# Locations missing from mmap get NaN here.
pdf["start_region"] = pdf["startLocation"].map(mmap)
pdf["end_region"] = pdf["endLocation"].map(mmap)

# "rivers" as a start location gets its own class.  A single ``.loc`` call
# replaces the former chained indexing (``pdf["start_region"][mask] = ...``),
# which is not guaranteed to write through to ``pdf``.
pdf.loc[pdf["startLocation"] == "rivers", "start_region"] = "Rivers"

pdf["startLocation"] = [s.title() for s in pdf["startLocation"]]
pdf["endLocation"] = [s.title() for s in pdf["endLocation"]]

# Drop Nigeria -> Nigeria rows and anything whose start or end class is "Rest".
# NaN classes compare unequal to "Rest" and are therefore kept.
pdf = pdf[
    ((pdf["startLocation"] != "Nigeria") | (pdf["endLocation"] != "Nigeria"))
    & (pdf["start_region"] != "Rest")
    & (pdf["end_region"] != "Rest")
].copy()


In [None]:
# Work on an independent copy so adding "S_E" does not write into a slice of pdf.
ndf = pdf[["startLocation", "endLocation", "distribution_counts"]].copy()
ndf["S_E"] = ndf["startLocation"] + "_" + ndf["endLocation"]

# Length of the longest per-tree count list among the retained pairs.
# (np.max raises ValueError when no pair is left, as before.)
mx = np.max([len(counts) for counts in ndf["distribution_counts"]])

# "<Start>_<End>" -> per-tree counts, right-padded with zeros to a common
# length so the dict can become a rectangular DataFrame.
# New lists are built instead of appending in place, so the lists stored in
# ``pdf`` / ``ndf`` are no longer modified as a side effect.
# The former ``x if x >= 0 else 0`` pass was dropped: its result was assigned
# to the loop variable and discarded, so it never had any effect.
# NOTE(review): padding goes up to ``mx`` (longest list among these pairs), not
# up to the total number of trees - confirm that is the intended denominator.
DL = {
    pair: list(counts) + [0] * (int(mx) - len(counts))
    for pair, counts in zip(ndf["S_E"], ndf["distribution_counts"])
}
data = pd.DataFrame(DL)

# Class lookup keyed by capitalised location name, used to colour violins by
# start location.  "Rivers" is mapped to its own class.
capitalized_dict = capitalize_keys(mmap)
capitalized_dict["Rivers"] = "Rivers"

# str.capitalize lower-cases everything after the first character, turning
# "Akwa-Ibom" into "Akwa-ibom"; restore the spelling used by the labels.
old_key = 'Akwa-ibom'
new_key = 'Akwa-Ibom'

if old_key in capitalized_dict:
    capitalized_dict[new_key] = capitalized_dict.pop(old_key)


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import re
import matplotlib as mpl

from matplotlib.ticker import MultipleLocator


# Use a sans-serif font family for all figures drawn from here on.
mpl.rcParams.update({'font.family': 'sans-serif'})

# Class code -> hex colour.
# Insertion order matters: the legend loop further down walks ``dd.items()``
# in this order (skipping "Rest") and stacks the markers bottom to top.
dd = {
    'NC': "#960018",
    'NE': "#FE5A1D",
    'NW': "#FFD700",
    'SE': "#0070BB",
    'SW': "#138808",
    'S': "#16166B",
    "Rivers": "#273BE2",
    'Rest': "#C0C0C0",
}

categories = list(data.columns)
num_points = mx

# Violins are drawn in ascending order of their median: the i-th entry of
# ``sorted_categories`` sits at y-position i + 1 (violinplot's default positions).
sorted_categories = sorted(categories, key=lambda x: data[x].median())
data_list = [data[cat] for cat in sorted_categories]

plt.figure(figsize=(5, 5))

violins = plt.violinplot(data_list, showmeans=False, showmedians=True, widths=1, vert=False)

# Tick labels and face colours must follow the order the violins were drawn in.
# Previously they were taken from ``categories`` reversed, which matches the
# median-sorted order only by coincidence and could otherwise attach the wrong
# label / colour to a violin.
labels = list(sorted_categories)
# Column names look like "<Start>_<End>".
start_location = [re.sub('_(.*)', '', label) for label in labels]  # text before the first "_"
new_labels = [re.sub('(.*)_', '', label) for label in labels]      # text after the last "_"
plt.yticks(range(1, len(labels) + 1), new_labels)

# One colour per violin, looked up from the class of its start location.
colors = [dd[capitalized_dict[x]] for x in start_location]

yticks_locations, yticks_labels = plt.yticks()

# Faint horizontal band behind every row, coloured by the class of the row's
# end location (the tick label).  mmap is keyed by lower-case names except for
# "Akwa-Ibom", hence the lower-casing followed by the targeted substitution.
for tick_pos, tick_label in zip(yticks_locations, yticks_labels):
    region_key = re.sub("akwa-ibom", "Akwa-Ibom", tick_label.get_text().lower())
    plt.fill_between(
        [0, 45],
        tick_pos - 0.5,
        tick_pos + 0.5,
        color=dd[mmap[region_key]],
        alpha=0.1,
    )

# Violin bodies: fully opaque, filled with the start-location colour, black outline.
for idx, violin_body in enumerate(violins['bodies']):
    violin_body.set_alpha(1)
    violin_body.set_facecolor(colors[idx])
    violin_body.set_edgecolor('black')

# Centre bar, min/max caps and median marker: thin black lines.
for part_name in ('cbars', 'cmins', 'cmaxes', 'cmedians'):
    line_collection = violins[part_name]
    line_collection.set_color('black')
    line_collection.set_linewidth(0.5)


plt.ylabel('End location', fontsize=15)
plt.xlabel('Number of introductions', size=15)
plt.title('')

# Hand-placed legend: one marker per class (except "Rest") in a column at
# x = 45, starting at y = 3 and stepping up by 2 in palette order.
uu = 45
yy = 3
for k, v in dd.items():
    if k == "Rest":
        continue
    # A key ending in "S" has that "S" doubled for display ("S" is shown as "SS").
    k = re.sub("S$", "SS", k)
    plt.scatter(uu, yy, s=200, color=v, zorder=400, edgecolor='black')
    plt.text(uu + 1.5, yy - 0.5, k, size=13)
    yy += 2

plt.text(43, 17, "Start\nLocation", size=15)

ax = plt.gca()
# Dotted vertical guide lines at the major x ticks.
plt.grid(which='major', axis='x', linestyle=':', linewidth='1', color='gray', zorder=0)

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
# Minor x ticks at every integer.
ax.xaxis.set_minor_locator(MultipleLocator(1))
plt.xticks(fontsize=15)

# Written to the working directory; bbox_inches='tight' keeps the legend that
# sits outside the axes area.
plt.savefig("Total_jumps_states.png", bbox_inches='tight', dpi=1000)


In [None]:
# Lower-cased "<start>_<end>" names of the pairs shown in the violin figure;
# the monthly summary below is restricted to these.
keep = [xx.lower() for xx in data.columns]

# ``assign`` returns a new frame instead of writing a column into MT, which is
# a filtered frame at this point (direct assignment raises SettingWithCopyWarning).
MT = MT.assign(year=MT["year"].astype(int))
# Keep events dated after 1970 only.
# NOTE(review): presumably because the epoch-based date conversion used below
# could not handle earlier dates - confirm whether this cut-off is still wanted.
MT = MT[MT["year"] > 1970]

from datetime import datetime as dt
import time


def toYearFraction(date):
    """Convert a date to a decimal year (e.g. 2020-07-02 00:00 -> 2020.5).

    The fractional part is the time elapsed since 1 January of ``date.year``
    divided by the length of that year, so leap years are handled.

    The computation uses plain datetime arithmetic rather than
    ``time.mktime``: the result no longer depends on the machine's local
    timezone or daylight-saving rules, sub-second precision is kept, and
    dates before 1970 work on every platform.

    Parameters
    ----------
    date : datetime.datetime, datetime.date or pandas.Timestamp
        A plain ``date`` is treated as midnight of that day.  For a
        timezone-aware value the year boundaries are built with the same
        ``tzinfo``.

    Returns
    -------
    float
        ``date.year`` plus the elapsed fraction of that year.
    """
    if not isinstance(date, dt):
        # Plain date objects cannot be subtracted from datetimes; promote to midnight.
        date = dt(date.year, date.month, date.day)

    year = date.year
    startOfThisYear = dt(year=year, month=1, day=1, tzinfo=date.tzinfo)
    startOfNextYear = dt(year=year + 1, month=1, day=1, tzinfo=date.tzinfo)

    yearElapsed = (date - startOfThisYear).total_seconds()
    yearDuration = (startOfNextYear - startOfThisYear).total_seconds()
    fraction = yearElapsed / yearDuration
    return year + fraction



# One row per (startLocation, endLocation, month) with the list of treeIds of
# every matching row in MT.
pdf = pd.DataFrame(MT.groupby(["startLocation", "endLocation", "cd"])["treeId"].apply(list)).reset_index()
pdf["startLocation"] = pdf["startLocation"].str.replace("nigeria", "Nigeria")
pdf["endLocation"] = pdf["endLocation"].str.replace("nigeria", "Nigeria")

# count : rows for the pair in that month across all trees
# mean  : count divided by the number of distinct trees
pdf["count"] = [len(x) for x in pdf["treeId"]]
pdf["mean"] = pdf["count"] / len(MT["treeId"].unique())
pdf["distribution_counts"] = [list(pd.Series(x).value_counts().values) for x in pdf["treeId"]]
pdf = pdf.sort_values("mean", ascending=False)

# Lower-case "<start>_<end>" key, matched against the pairs kept for the violin figure.
pdf["C"] = pdf["startLocation"] + "_" + pdf["endLocation"]
pdf["C"] = pdf["C"].str.lower()
# ``.copy()`` detaches the filtered frame so the "cd" assignments below are safe.
pdf = pdf[pdf["C"].isin(keep)].copy()
pdf = pdf.sort_values("cd", ascending=True)

# "yyyy-mm" -> first day of the month -> decimal year.
pdf["cd"] = pdf["cd"] + "-01"
pdf["cd"] = pd.to_datetime(pd.Series(pdf["cd"]))
pdf["cd"] = [toYearFraction(xx) for xx in pdf["cd"].to_list()]

# Sorted distinct time points, already in decimal years.
# The previous code passed these floats through pd.to_datetime, which reads
# numbers as nanoseconds since 1970-01-01, and then converted the resulting
# 1970 timestamps back to year fractions - yielding values near 1970.0 instead
# of the real dates.
time_l = list(np.sort(pdf["cd"].unique()))


In [None]:
# NOTE: ``data`` is rebound from the violin DataFrame to a dict here, so the
# cells above that read ``data.columns`` cannot be re-run after this one
# without re-running the cell that builds the DataFrame.
data = {}

# Every distinct time point (decimal year), ascending.
tt = np.sort(pdf["cd"].unique())

# One series of "mean" values per pair, aligned on the full time grid with 0
# for months in which the pair has no row.  ``reindex`` replaces the former
# row-by-row ``pd.concat`` inside a nested loop (quadratic, object dtype).
# Each (C, cd) combination is expected to occur once, since pdf comes from a
# groupby on start / end / month; reindex raises if that is violated.
for cc in pdf["C"].unique():
    per_month = pdf.loc[pdf["C"] == cc].set_index("cd")["mean"]
    data[cc] = per_month.reindex(tt, fill_value=0).astype(float).reset_index(drop=True)

# The shared x axis is taken from the grid itself rather than from whichever
# per-pair frame the loop happened to leave behind.
data['time'] = np.nan_to_num(np.asarray(tt, dtype=float))

# Total of each pair's series; the "time" entry is not a pair and is excluded.
max_l = {key: np.sum(series) for key, series in data.items() if key != 'time'}

# Pairs ordered by decreasing total (ties keep their order of appearance).
sorted_keys = sorted(max_l, key=max_l.get, reverse=True)
sorted_data = {key: data[key] for key in sorted_keys}
tt = data['time']


# Class code -> colour for the timeline figure.
# Insertion order matters: the legend code reverses the keys and splits them
# into two columns by position (first five / remainder).
dd = {
    'Rest': "#C0C0C0",
    "Nigeria": "black",
    'NC': "#960018",
    'NE': "#FE5A1D",
    'NW': "#FFD700",
    'SE': "#0070BB",
    'SW': "#138808",
    'S': "#16166B",
    "Rivers": "#273BE2",
}

# Extra lower-case lookups needed because the timeline keys are lower-cased.
# This mutates the shared ``mmap``; code that reads mmap["rivers"] sees
# "Rivers" from here on.
mmap.update({"akwa-ibom": "S", 'rivers': "Rivers"})

In [None]:
import matplotlib.pyplot as plt

import numpy as np

# Convert every series to a plain float array (NaN -> 0) so layers can be
# stacked positionally.
for k in list(sorted_data):
    sorted_data[k] = np.nan_to_num(sorted_data[k].astype(float))

fig, ax = plt.subplots(1, 1, figsize=(10, 5))

kk = list(sorted_data.keys())

if kk:
    # upper[i] is the running total of layers 0..i, i.e. the top edge of layer i.
    # A single cumulative sum replaces re-adding all lower layers from scratch
    # for every layer (quadratic in the number of layers).
    upper = np.cumsum(np.vstack([sorted_data[k] for k in kk]), axis=0)

for i, k in enumerate(kk):
    # Keys look like "<start>_<end>"; the fill colour follows the end location's class.
    d = re.sub("(.*)_", "", k)
    if i == 0:
        # Bottom layer: filled from zero and always drawn at alpha 0.75.
        # NOTE(review): the start-location rule below is not applied to this
        # layer - confirm that is intended.
        plt.fill_between(tt, upper[0], color=dd[mmap[d]], alpha=0.75, edgecolor="black")
        continue
    # Layers starting in "rivers" are emphasised; all others are faded.
    if re.sub("_(.*)", "", k) == "rivers":
        aa = 0.75
    else:
        aa = 0.25
    plt.fill_between(tt, upper[i - 1], upper[i], color=dd[mmap[d]], alpha=aa, edgecolor="black")


# Palette keys in reverse insertion order.
rr = list(dd.keys())[::-1]

# "End Location" legend: two columns of markers, each starting at y = 4 and
# stepping down by 0.75.  The first five reversed keys go in the left column,
# the rest in the right one; "Rivers" (and "Not_specified", should it ever be
# a key) gets no entry here.
for uu, column_keys in ((2015.5, rr[:5]), (2016.2, rr[5:])):
    yy = 4
    for k in column_keys:
        if k == "Not_specified" or k == "Rivers":
            continue
        plt.scatter(uu, yy, s=200, color=dd[k], zorder=400, edgecolor='black')
        # The class code "S" is displayed as "SS".
        plt.text(uu + 0.15, yy - 0.08, re.sub("^S$", "SS", k), size=13)
        yy = yy - 0.75

# Legend headings.
plt.text(2015.4, 4.5, "End Location", size=15)
plt.text(2015.4, 8, "Start Location", size=15)

# "Start Location" legend: an opaque marker for Rivers, a faded one for the
# remaining start locations.
plt.scatter(2015.5, 7.4, s=200, color="#273BE2", zorder=400, edgecolor='black', alpha=1)
plt.text(2015.5 + 0.15, 7.4 - 0.08, "Rivers", size=13)
plt.scatter(2015.5, 6.5, s=200, color="#6CB4EE", zorder=400, edgecolor='black', alpha=0.25)
plt.text(2015.5 + 0.15, 6.5 - 0.08, "Remaining", size=13)

ax = plt.gca()
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.ylabel('Mean number of introductions', fontsize=15)

plt.xticks(fontsize=15)
# Dashed red vertical line at x = 2017.71, annotated just to its right.
plt.axvline(x=2017.71, color='r', linestyle='--')
plt.ylim(0, 10)
plt.text(2017.71 + 0.15, 2.8, "First case detected", size=18)

# Re-apply the current y ticks and their labels as fixed ticks / labels.
y_ticks = ax.get_yticks()
y_labels = ax.get_yticklabels()
ax.set_yticks(y_ticks)
ax.set_yticklabels(y_labels)

plt.xlim(2015, 2023.5)

# Written to the working directory.
plt.savefig("Timing_jumps.png", bbox_inches='tight', dpi=1000)
