In [1]:
import altair as alt
import pandas as pd

In [2]:
#Read data
df = pd.read_excel("Data/hgt_summary_altair.xlsx", sheet_name="methods")

In [3]:
#Assign label of each approach whether it is machine learning (ML), deep learning (DL) or others 
df["Approach"] = df["Processes"].astype(object).apply(lambda x: 'Machine Learning (ML)' if "machine" in x and "deep" not in x else ('Deep Learning (DL)' if "deep" in x else "Others"))

In [4]:
#trend of ML and DL applications in computational approaches for Horizontal Gene Transfer (HGT)
ml_dl_trend = df[["Year","Approach"]].groupby(["Year","Approach"])["Approach"].count()\
                                                                            .unstack()\
                                                                            .reset_index()\
                                                                            .sort_values(by=['Year'], ascending=True)\
                                                                            .set_index('Year')\
                                                                            .fillna(0)\
                                                                            .cumsum()

ml_dl_trend_new = ml_dl_trend.stack().reset_index().rename(columns={0:"Cumulative Count"})\
                            .sort_values(by=['Year','Approach'], ascending=[True, False])

In [5]:
ml_dl_trend_new.head()

Unnamed: 0,Year,Approach,Cumulative Count
2,2000,Others,1.0
1,2000,Machine Learning (ML),0.0
0,2000,Deep Learning (DL),0.0
5,2001,Others,3.0
4,2001,Machine Learning (ML),1.0


In [6]:
#plot the trend of ML and DL applications in HGT detection

ml_dl_yearly_trend = alt.Chart(ml_dl_trend_new).mark_bar(stroke='gray').encode(
    x=alt.X('Year:O', axis=alt.Axis(labelAngle=-45)),
    y="Cumulative Count",
    color=alt.Color('Approach', 
                    scale=alt.Scale(domain=["Deep Learning (DL)","Machine Learning (ML)","Others"],
                                    range=['#A2A2A2','#555555','#FFFFFF']),
                   legend=alt.Legend(orient='none',
                                    legendX=130, legendY=350,
                                    direction='horizontal',
                                    title='',
                                    labelPadding=10.0,
                                    labelOffset=1.0,
                                    columnPadding=25.0)),
    order=alt.Order("Approach").sort("ascending")
).properties(width=600)

ml_dl_yearly_trend

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [7]:
#process the table to calculate the cumulative sum of each computational group per year
computational_group_trend = df[["Year","Computational Group"]].groupby(["Year","Computational Group"])["Computational Group"].count()\
                                                                                                                            .unstack()\
                                                                                                                            .reset_index()\
                                                                                                                            .sort_values(by=['Year'], ascending=True)\
                                                                                                                            .set_index('Year')\
                                                                                                                            .fillna(0)\
                                                                                                                            .cumsum()

#collapse the column into rows for creating a stacked bar chart
computational_group_trend_stack = computational_group_trend.stack().reset_index().rename(columns={0:"Cumulative Count"})

In [8]:
#color schema of tableau10
computational_group_trend_chart = alt.Chart(computational_group_trend_stack).mark_bar(stroke='gray').encode(
    x=alt.X('Year:O', axis=alt.Axis(labelAngle=-45)),
    y="Cumulative Count",
    color=alt.Color('Computational Group', 
                                        scale=alt.Scale(domain=["artificial intelligence (AI)","comparative genomics","hybrid","sequence composition"],
                                                        range=['#000000','#a6cee3','#1f78b4','#b2df8a']),
                   legend=alt.Legend(orient='none',
                                                                            legendX=80, legendY=420,
                                                                            direction='horizontal',
                                                                            title='',
                                                                            labelPadding=10.0,
                                                                            labelOffset=1.0,
                                                                            columnPadding=25.0)),
    order=alt.Order("Computational Group").sort("ascending")
).properties(width=600)
computational_group_trend_chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [9]:
#take the latest year (2023) to show the current state of the computational groups for HGT detection
latest_state = computational_group_trend_stack[computational_group_trend_stack['Year']==2023]

In [10]:
#calculate the proportion of each computational group in the latest year
total_computational_approaches = latest_state.loc[:,'Cumulative Count'].values.sum()
latest_state.loc[:,'proportion'] = latest_state['Cumulative Count']/total_computational_approaches

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latest_state.loc[:,'proportion'] = latest_state['Cumulative Count']/total_computational_approaches


In [11]:
latest_state

Unnamed: 0,Year,Computational Group,Cumulative Count,proportion
92,2023,artificial intelligence (AI),27.0,0.201493
93,2023,comparative genomics,59.0,0.440299
94,2023,hybrid,29.0,0.216418
95,2023,sequence composition,19.0,0.141791


In [12]:
#reference: https://github.com/AAnzel/TVSDS/blob/master/Source/UI.py
def calc_midpoints(y):
    x = []
    for i in range(len(y)):
        prev = y[: i]
        x.append(y[i]/2 + sum(prev))

    return x

In [13]:
latest_state.loc[:,'text_pos'] = calc_midpoints(latest_state.loc[:,'proportion'].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latest_state.loc[:,'text_pos'] = calc_midpoints(latest_state.loc[:,'proportion'].values)


In [14]:
#split the label color into white and black color for better visibility
white_font = latest_state[latest_state['Computational Group'].str.contains("AI|hybrid")]
black_font = latest_state[latest_state['Computational Group'].str.contains("sequence|comparative")]

In [15]:
computational_trend_proportion_bar = alt.Chart(latest_state).mark_bar(size=20, stroke='gray').encode(
    x=alt.X('sum(proportion)',axis=alt.Axis(labels=False, tickSize=0)).title('Proportion').stack('normalize'),
    color=alt.Color('Computational Group', 
                    scale=alt.Scale(domain=["artificial intelligence (AI)","comparative genomics","hybrid","sequence composition"],
                                    range=['#000000','#a6cee3','#1f78b4','#b2df8a']))
)

white_text = alt.Chart(white_font).mark_text(size=12, color='#FAF9F6').encode(
    x=alt.X('text_pos:Q'),
    detail='Computational Group',
    text=alt.Text('proportion:Q', format=".2%")
)

black_text = alt.Chart(black_font).mark_text(size=12, color='#363636').encode(
    x=alt.X('text_pos:Q'),
    detail='Computational Group',
    text=alt.Text('proportion:Q', format=".2%")
)

In [16]:
computational_trend_proportion_bar_w_text = (computational_trend_proportion_bar+white_text+black_text).properties(width=600)
computational_trend_proportion_bar_w_text

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [18]:
#final plot of the trend of computational approaches per group between 2000 and 2023

computational_group_trend_final = alt.vconcat(computational_group_trend_chart, computational_trend_proportion_bar_w_text)
computational_group_trend_final

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
