# Analyse the literature

In [None]:
# Notebook-wide setup: imports, matplotlib/pandas display configuration and
# the constants shared by every plotting cell.
import pandas as pd
import matplotlib as mpl
# No touchie. Make the font the same as the LaTeX paper
mpl.rc('font', family='serif', serif='cmr10', size=18)
# Ask the axis formatters to use mathtext (goes with the cmr10 font above)
mpl.rc('axes.formatter', use_mathtext=True)
import matplotlib.pyplot as plt
import seaborn as sns
import re
# Allow larger DataFrames to be displayed before pandas truncates them
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 30)
import numpy as np

# Figure width (inches) passed to `figsize` by every plotting cell
A4_WIDTH_INCHES = 8.25

# While False, watermark() stamps 'Visualisation not final' on each plot
FINAL = True

def watermark(ax):
    """Stamp a large, semi-transparent 'Visualisation not final' banner on `ax`.

    Nothing is drawn when the module-level `FINAL` flag is truthy.
    """
    if not FINAL:
        banner_style = dict(
            fontsize=40,
            color='gray',
            alpha=0.5,
            ha='center',
            va='center',
            rotation=30,
        )
        # transAxes: (0.5, 0.5) is the middle of the axes, whatever the data range
        ax.text(0.5, 0.5, 'Visualisation not final', transform=ax.transAxes, **banner_style)
    
def prettify_text(text):
    """Turn a raw tag such as ``'cyberglove-ii'`` into a display label.

    Hyphens become spaces and the text is title-cased; afterwards whole-word
    matches of the keys in `replacements` are swapped for their hand-written
    spelling (acronyms, brand names, expanded abbreviations).

    Args:
        text: The raw label text.

    Returns:
        The prettified label as a string.
    """
    replacements = {
        "5Dt": "5DT",
        "App": "Application",
        "Ar": "Augmented Reality",
        "Based On": "Based-on",
        "Cnn": "CNN",
        "Cyberglove": "CyberGlove",
        "Cyberglove Ii": "CyberGlove II",
        "Dataglove": "DataGlove",
        "Deep Q Network": "Deep Q-Network",
        "Emg": "EMG",
        "Ffnn": "FFNN",
        "Gmm": "GMM",
        "Gru": "GRU",
        "Hmm": "HMM",
        "Imu": "IMU",
        "K Means": "k-Means",
        "Knn": "KNN",
        "Lstm": "LSTM",
        "Msc": "Master's Thesis",
        "Myomonitor Iv": "Myomonitor IV",
        "Nn": "NN",
        "Pca": "PCA",
        "Powerglove": "PowerGlove",
        "Rgb": "RGB",
        "Rgbd": "RGBD",
        "Rnn": "RNN",
        "Sl": "Sign Language",
        "Som": "SOM",
        "Svd": "SVD",
        "Svm": "SVM",
        "Tdnn": "TDNN",
        "Tech": "Technology",
        "Tub": "TUB",
        "Uwave": "uWave",
        "Vpl": "VPL",
        "Vr": "Virtual Reality",
        "Wifi": "WiFi",
#         "Naive Bayes": "Naïve-Bayes", # Matplotlib doesn't understand 'ï'
    }

    text = text.replace('-', ' ').title()

    # Replace everything in ONE pass with the longest keys tried first.
    # Applying the rules one after another let "Cyberglove" rewrite the text
    # before "Cyberglove Ii" was tried, so the longer rule could never match.
    keys = sorted(replacements, key=len, reverse=True)
    pattern = r'\b(?:' + '|'.join(re.escape(key) for key in keys) + r')\b'
    return re.sub(pattern, lambda match: replacements[match.group(0)], text)
    
def prettify_ax(ax):
    """Tidy up `ax` for publication and return it.

    Calls `ax.grid()`, then runs the axis labels, the y tick labels and (when
    a legend exists) the legend entries and title through `prettify_text`.
    The x tick labels are left as they are.
    """
    ax.grid()

    label_accessors = (
        (ax.get_xlabel, ax.set_xlabel),
        (ax.get_ylabel, ax.set_ylabel),
    )
    for read_label, write_label in label_accessors:
        write_label(prettify_text(read_label()))

    # Pin the tick positions before replacing their labels.
    ax.set_yticks(ax.get_yticks())
    pretty_ticks = [prettify_text(tick.get_text()) for tick in ax.get_yticklabels()]
    ax.set_yticklabels(pretty_ticks)

    legend = ax.get_legend()
    if not legend:
        return ax

    for entry in legend.get_texts():
        entry.set_text(prettify_text(entry.get_text()))
    legend_title = legend.get_title().get_text()
    legend.set_title(prettify_text(legend_title))

    return ax

In [None]:
# Load the Zotero CSV export, asking pandas to parse the date-like columns.
date_cols = ['Date', 'Publication Year', 'Date Added', 'Date Modified', 'Access Date']
df = pd.read_csv(
    './src/zotero.csv', 
    parse_dates=date_cols
)
# Convert 'Date' explicitly as ISO 8601 so the column is datetime-typed
# (later cells rely on `.dt.year` and plot it on a time axis).
df['Date'] = pd.to_datetime(df['Date'], format='ISO8601')

def get_citations(x):
    """Return the integer N from the first 'N citations' found in `x`.

    The search is case-insensitive. Returns None when `x` is None/NaN or
    when no such phrase is present.
    """
    if x is None or pd.isna(x):
        return None
    found = re.search(r'(\d+) citations', x, flags=re.IGNORECASE)
    if found is None:
        return None
    return int(found.group(1))

# Citation count parsed out of the free-text 'Extra' column
df['citations'] = df['Extra'].apply(get_citations)


def _tags_or_empty(value):
    """Keep a list of tags (or any non-missing value); map missing to []."""
    if type(value) is list or pd.notna(value):
        return value
    return []


# 'Manual Tags' is a '; '-separated string; store it as a list per row
df['tags'] = df['Manual Tags'].str.split('; ').apply(_tags_or_empty)

# Sorted list of every distinct tag used anywhere in the library
uniq_tags = sorted({tag for row_tags in df['tags'] for tag in row_tags})

def has_tag(tag, series):
    """Check, row by row, whether `series` contains the given tag(s).

    Args:
        tag: A single tag, or a list/tuple/set of tags. For a collection a
            row matches when it contains ANY of the tags.
        series: A pandas Series whose elements are containers of tags
            (e.g. the lists stored in ``df['tags']``).

    Returns:
        A Series with one boolean per element of `series`.
    """
    # isinstance (rather than `type(tag) is list`) so tuples and sets of tags
    # are also treated as "any of these" instead of as one unmatchable tag.
    if isinstance(tag, (list, tuple, set, frozenset)):
        wanted = list(tag)
        return series.apply(lambda row_tags: any(t in row_tags for t in wanted))
    return series.apply(lambda row_tags: tag in row_tags)

def extract_references(extra):
    """Extract the referenced DOIs stored in a Zotero 'Extra' field.

    Looks for a marker of the form
    ``--- DOIs_of_references: ['doi1','doi2'] ---`` and returns the DOIs.

    Args:
        extra: The 'Extra' text of one library entry, or NaN/None.

    Returns:
        A sorted list of DOI strings; empty when `extra` is missing, has no
        marker, or the marker holds an empty list.
    """
    if pd.isna(extra):
        return []
    match = re.search(r'--- DOIs_of_references: (\[.*\]) ---', extra)
    if match is None:
        return []
    # Drop the surrounding brackets, then clean each comma-separated item.
    inner = match.group(1)[1:-1]
    # Strip whitespace and quotes per item so both `'a','b'` and `'a', 'b'`
    # parse cleanly, and skip blanks so `[]` yields [] rather than [''].
    dois = (item.strip().strip('\'"') for item in inner.split(','))
    return sorted(doi for doi in dois if doi)

# List of referenced DOIs parsed from each entry's 'Extra' field
df['references'] = df['Extra'].apply(extract_references)

# Get a shortcut column for the year
df['year'] = df['Date'].dt.year

In [None]:
# Tags look like `prefix:value`. Collect the distinct prefixes; each one
# becomes its own DataFrame column further down.
prefixes = sorted({tag.split(':')[0] for tag in uniq_tags if ':' in tag})

def make_prefix_value(tags, prefix, sep=':'):
    """Join the values of all `prefix<sep>value` tags into one string.

    Args:
        tags: Iterable of tag strings for a single entry.
        prefix: The tag prefix to look for (e.g. ``'model'``).
        sep: Separator between prefix and value.

    Returns:
        The matching values, sorted and joined with '+' (e.g. ``'cnn+svm'``),
        or NaN when no tag carries this prefix.
    """
    key = prefix + sep
    # Match at the START of the tag only: a substring test would let the
    # prefix 'app' also pick up a tag such as 'webapp:x'.
    values = sorted(tag[len(key):] for tag in tags if tag.startswith(key))
    joined = '+'.join(values)
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    return joined if joined else np.nan

# One new column per tag prefix, holding that entry's '+'-joined values
# (or NaN when the entry has no tag with the prefix).
for prefix in prefixes:
    df[prefix] = df['tags'].apply(lambda tags: make_prefix_value(tags, prefix))

# Attempt to infer better dtypes for object columns.
df = df.infer_objects()

In [None]:
def dedup_prefix(df, prefix):
    """Give every '+'-joined value in column `prefix` a row of its own.

    A row whose `prefix` cell is ``'abc+123+efg'`` becomes three rows with
    the cells ``'abc'``, ``'123'`` and ``'efg'``; all other columns are
    repeated. Rows without a '+' (including missing values) pass through
    unchanged.

    Args:
        df: The input DataFrame; it is not modified.
        prefix: Name of the column to split.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Column names and the
        dtypes of the untouched columns are kept, also for empty input.

    Raises:
        AssertionError: If `prefix` is not a column of `df`.
    """
    assert prefix in df.columns, f'prefix `{prefix}` is not in df.columns:\n{df.columns.to_list()}'

    def split_joined(value):
        # Only strings containing '+' are split; NaN and other scalars stay.
        if isinstance(value, str) and '+' in value:
            return value.split('+')
        return value

    expanded = df.copy()
    expanded[prefix] = expanded[prefix].map(split_joined)
    # explode() turns each list cell into one row per element and leaves
    # scalar cells alone, replacing the slow row-by-row rebuild.
    return expanded.explode(prefix).reset_index(drop=True)

In [None]:
# Swarm plot: type of each publication against its date, coloured by the
# kind of system it is based on.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES, 3))

# One row per (type, based-on) combination
data = dedup_prefix(df, 'type')
data = dedup_prefix(data, 'based-on')

# Keep only the publication types of interest
is_known_type = data['type'].isin(['paper', 'survey', 'msc', 'dataset'])
data = data[data['type'].notna() & is_known_type]

sns.swarmplot(data=data, x="Date", y="type", hue='based-on', size=4, ax=ax)
ax.set_title('Types of Research over Time')
watermark(ax)
ax = prettify_ax(ax)


In [None]:
# Scatter plot: citation count of every survey against its publication date.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES, 5))

deduped = dedup_prefix(df, 'type')
deduped = dedup_prefix(deduped, 'based-on')
deduped = deduped[deduped['type'] == 'survey']

sns.scatterplot(data=deduped, x="Date", y="citations", style='based-on', hue='based-on', ax=ax)
ax.set_title('Surveys vs Citations over Time')
watermark(ax)
ax = prettify_ax(ax)

plt.show()

# Table of the surveys with more than 500 citations, oldest first
highly_cited = deduped['citations'] > 500
deduped.loc[
    highly_cited,
    ['Author', 'Date', 'citations', 'Title', 'based-on', 'type'],
].sort_values('Date')

In [None]:
# Papers vs citations over time
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES, 7))
deduped = dedup_prefix(dedup_prefix(df, 'type'), 'based-on')

deduped = deduped[deduped.type == 'paper']
sns.scatterplot(
    data=deduped, 
    x="Date", 
    y="citations",
    style='based-on',
    hue='based-on',
    ax=ax,
)
plt.title('Papers and Their Citations Over Time')
# Remember the limits chosen for the scatter so the guide lines below
# cannot stretch the y axis.
ylim = ax.get_ylim()
# Faint guide lines: the citation count a paper would have by 2023 had it
# gained a constant 10 / 50 / 100 citations per year since publication.
c_per_y = [10, 50, 100]
for c in c_per_y:
    ax.plot(
        [pd.to_datetime('2023-01-01'), pd.to_datetime('1980-01-01')], 
        [0, c * (2023-1980)], 
        alpha=0.1
    )
ax.set_ylim(ylim)
watermark(ax)
ax = prettify_ax(ax)

plt.show()

# Table of the papers with more than 500 citations. Note that this reads the
# original `df`, not the de-duplicated frame that was plotted.
(df.loc[
    (df.citations > 500) & (df.type == 'paper'),
    ['Author', 'Title', 'Date', 'citations', 'type', 'based-on']
]
.sort_values('citations'))

In [None]:
# Swarm plot: which hardware glove-based papers used, over time.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES, 7))

deduped = dedup_prefix(df, 'hardware')
deduped = dedup_prefix(deduped, 'based-on')

is_glove_paper = (deduped['type'] == 'paper') & (deduped['based-on'] == 'gloves')
deduped = deduped[is_glove_paper & deduped['hardware'].notna()]

sns.swarmplot(data=deduped.sort_values('Date'), x="Date", y="hardware", size=4, ax=ax)
watermark(ax)
ax = prettify_ax(ax)

ax.set_title('Hardware Used for Glove-based Systems')
plt.savefig('./src/imgs/graphs/03_hardware_for_gloves.pdf', bbox_inches='tight')

In [None]:
# Swarm plot: sensing technology used by glove-based papers, over time.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES, 7))

deduped = dedup_prefix(df, 'tech')
deduped = dedup_prefix(deduped, 'based-on')

is_glove_paper = (deduped['type'] == 'paper') & (deduped['based-on'] == 'gloves')
# Drop the 'rgb' technology and entries with no technology tag at all
has_wanted_tech = (~deduped['tech'].isin(['rgb'])) & deduped['tech'].notna()
deduped = deduped[is_glove_paper & has_wanted_tech]

sns.swarmplot(data=deduped, x="Date", y="tech", size=4, ax=ax)
watermark(ax)
ax = prettify_ax(ax)

ax.set_title('Technologies Used for Glove-based Systems')
plt.savefig('./src/imgs/graphs/03_tech_for_gloves.pdf', bbox_inches='tight')

In [None]:
# Swarm plot: application area of every tagged entry, over time.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES, 16))

data = dedup_prefix(df, 'app')
data = dedup_prefix(data, 'based-on').sort_values('Date')
data = data[data['app'].notna()]

sns.swarmplot(data=data, x="Date", y="app", hue='based-on', size=4, ax=ax)
watermark(ax)
ax = prettify_ax(ax)

ax.set_title('Applications of Gesture Recognition')
plt.savefig('./src/imgs/graphs/03_applications.pdf', bbox_inches='tight')

In [None]:
# Swarm plot: applications of vision- and WiFi-based systems, with the
# '-sl' application tags left out.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES, 7))

data = dedup_prefix(df, 'app')
data = dedup_prefix(data, 'based-on').sort_values('app')

is_wifi_or_vision = data['based-on'].isin(['wifi', 'vision'])
has_app = data['app'].notna()
not_sign_language = data['app'].apply(lambda app: '-sl' not in str(app))
data = data[is_wifi_or_vision & has_app & not_sign_language]

sns.swarmplot(data=data, x="Date", y="app", hue='based-on', size=4, ax=ax)
watermark(ax)
ax = prettify_ax(ax)

ax.set_title('Applications of Vision/WiFi-based Gesture Recognition')
plt.savefig('./src/imgs/graphs/03_wifi_vision_applications.pdf', bbox_inches='tight')

In [None]:
# Swarm plot: only the applications whose tag contains '-sl', over time.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES, 5))

data = dedup_prefix(df, 'app')
data = dedup_prefix(data, 'based-on').sort_values('app')
is_sign_language = data['app'].apply(lambda app: '-sl' in str(app))
data = data[is_sign_language]

sns.swarmplot(data=data, x="Date", y="app", hue='based-on', size=4, ax=ax)
watermark(ax)
ax = prettify_ax(ax)

# The axis label already says 'Sign Language', so strip that suffix from the
# prettified tick labels.
ax.set_ylabel('Sign Language')
ax.set_yticks(ax.get_yticks())
short_labels = [tick.get_text().replace(' Sign Language', '') for tick in ax.get_yticklabels()]
ax.set_yticklabels(short_labels)

ax.set_title('Sign Language Applications of Gesture Recognition')
plt.savefig('./src/imgs/graphs/03_sl_applications.pdf', bbox_inches='tight')

In [None]:
# Swarm plot: models used by glove-based systems, over time.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES, 8))

data = dedup_prefix(df, 'model')
data = dedup_prefix(data, 'based-on').sort_values('Date')

# Keep models used more than once; the count is taken before the glove filter
value_counts = data.value_counts('model')
gt_1_use = value_counts[value_counts > 1].index
uses_common_model = data['model'].isin(gt_1_use)
data = data[uses_common_model & (data['based-on'] == 'gloves')]

# Most frequently used model first
order = data.groupby('model').size().sort_values(ascending=False).index
sns.swarmplot(data=data, x="Date", y="model", order=order, size=4, ax=ax)
watermark(ax)
ax = prettify_ax(ax)

ax.set_title('Models Used for Glove-based Gesture Recognition')
plt.savefig('./src/imgs/graphs/03_models_glove_based.pdf', bbox_inches='tight')

In [None]:
# Swarm plot: models used by systems that are NOT glove-based, over time.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES, 10))

data = dedup_prefix(df, 'model')
data = dedup_prefix(data, 'based-on').sort_values('Date')

# Keep models used more than once; the count is taken before the filter
value_counts = data.value_counts('model')
gt_1_use = value_counts[value_counts > 1].index
uses_common_model = data['model'].isin(gt_1_use)
data = data[uses_common_model & (data['based-on'] != 'gloves')]

# Most frequently used model first
order = data.groupby('model').size().sort_values(ascending=False).index
sns.swarmplot(data=data, x="Date", y="model", hue='based-on', order=order, size=4, ax=ax)
watermark(ax)
ax = prettify_ax(ax)

ax.set_title('Models Used for Non-glove-based Gesture Recognition')
plt.savefig('./src/imgs/graphs/03_models_no_gloves.pdf', bbox_inches='tight')

In [None]:
# Swarm plot: every model used more than once, across all system types.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES, 15))

data = dedup_prefix(df, 'model')
data = dedup_prefix(data, 'based-on').sort_values('Date')

value_counts = data.value_counts('model')
gt_1_use = value_counts[value_counts > 1].index
data = data[data['model'].isin(gt_1_use)]

# Most frequently used model first
order = data.groupby('model').size().sort_values(ascending=False).index
sns.swarmplot(data=data, x="Date", y="model", hue='based-on', order=order, size=4, ax=ax)
watermark(ax)
ax = prettify_ax(ax)

ax.set_title('Models Used for Gesture Recognition')
plt.savefig('./src/imgs/graphs/03_models.pdf', bbox_inches='tight')

In [None]:
# Swarm plot: use of the 'nn', 'cnn' and 'hmm' models in vision- and
# WiFi-based systems, over time.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES, 4))

data = dedup_prefix(df, 'model')
data = dedup_prefix(data, 'based-on').sort_values('Date')

is_compared_model = data['model'].isin(['nn', 'cnn', 'hmm'])
is_wifi_or_vision = data['based-on'].isin(['vision', 'wifi'])
data = data[is_compared_model & is_wifi_or_vision]

sns.swarmplot(data=data, x="Date", y="model", hue='based-on', size=4, ax=ax)
watermark(ax)
ax = prettify_ax(ax)

ax.set_title('HMMs and CNNs for Vision/WiFi-based Gesture Recognition')
plt.savefig('./src/imgs/graphs/03_models_hmm_vs_cnn.pdf', bbox_inches='tight')

In [None]:
# Exploratory scatter plot: number of participants against publication date.
# The styling, title and savefig steps at the bottom are currently disabled.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES,5))
data = dedup_prefix(dedup_prefix(dedup_prefix(df, 'model'), 'based-on'), 'participants').sort_values('model')

# The tag values are strings; convert them so they plot on a numeric axis
data['participants'] = (pd.to_numeric(data['participants']))

# Only entries that have a model tag
data = data[
    data['model'].notna()
]

sns.scatterplot(
    data=data,
    y="participants",
    x="Date",
    hue='based-on',
    ax=ax,
)

# watermark(ax)
# ax = prettify_ax(ax)

# ax.set_ylabel('Number of Participants')

# ax.set_xticks(ax.get_xticks())
# ax.set_xticklabels(np.round(np.power(10, ax.get_xticks()), 0))


# plt.title('Model Used vs Number of Participants')
# plt.savefig('./src/imgs/graphs/03_models_vs_nclasses.pdf', bbox_inches='tight')

In [None]:
# Swarm plot: number of classes (log10) recognised per model.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES,14))
data = dedup_prefix(dedup_prefix(dedup_prefix(df, 'model'), 'based-on'), 'classes').sort_values('model')

# Plot log10(classes); the tick labels are converted back further down
data['classes'] = np.log10(pd.to_numeric(data['classes']))

# Only entries that have a model tag
data = data[
    data['model'].notna()
]

sns.swarmplot(
    data=data,
    y="model",
    x="classes",
    hue='based-on',
    size=4,
    ax=ax,
)

watermark(ax)
ax = prettify_ax(ax)

# Raw string: '\l' in a normal string literal is an invalid escape sequence
# (a SyntaxWarning on Python 3.12+). The resulting text is unchanged.
ax.set_xlabel(r'Number of Classes ($\log_{10}$ scale)')

# Relabel the log10 tick positions with the class counts they stand for
ax.set_xticks(ax.get_xticks())
ax.set_xticklabels((np.round(np.power(10, ax.get_xticks()), 0)).astype(int))


plt.title('Model Used vs Number of Classes')
plt.savefig('./src/imgs/graphs/03_models_vs_nclasses.pdf', bbox_inches='tight')

In [None]:
# Swarm plot: what each publication's system is based on, over time,
# coloured by publication type.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES, 6))

deduped = dedup_prefix(df, 'based-on')
deduped = dedup_prefix(deduped, 'type')
deduped = deduped[deduped['type'].isin(['paper', 'survey', 'msc', 'dataset'])]

# Most common system kind first
system_order = deduped['based-on'].value_counts().index
sns.swarmplot(data=deduped, x="Date", y="based-on", order=system_order, hue='type', size=4, ax=ax)
watermark(ax)
ax = prettify_ax(ax)

ax.set_title('Trend of Different Systems over Time')
plt.savefig('./src/imgs/graphs/03_based_on_over_time.pdf', bbox_inches='tight')

In [None]:
# Swarm plot: the fidelity tag of each entry over time, coloured by the kind
# of system it is based on.
fig, ax = plt.subplots(figsize=(A4_WIDTH_INCHES,4))

data = dedup_prefix(dedup_prefix(df, 'fidelity'), 'based-on')

# Only entries that have a fidelity tag
data = data[
    data['fidelity'].notna()
]
sns.swarmplot(
    data=data,
    x="Date",
    y="fidelity",
    hue='based-on',
    size=8,
    ax=ax
)
watermark(ax)
ax = prettify_ax(ax)


# Title previously read 'Fidelity of over Time' (stray 'of')
plt.title('Fidelity over Time')

In [None]:
# Export the entries tagged as datasets to a generated LaTeX longtable.
# TODO: `caption` and `label` below are still placeholders.
latex = df.loc[
    (df.type == 'dataset'),
    ['Title', 'Author', 'Date', 'dataset', 'participants', 'observations', 'classes', 'tech', 'hardware']
]


def _format_date(d):
    """Render a timestamp as YYYY-MM-DD, or '-' when the date is missing."""
    # Slicing str(d) turned NaT into '' and only worked for naive midnight
    # timestamps; strftime is independent of the time/zone part.
    return '-' if pd.isna(d) else d.strftime('%Y-%m-%d')


# Explicit encoding so non-ASCII titles/authors are written the same way on
# every platform.
with open('./src/tables/03_datasets.generated.tex', 'w', encoding='utf-8') as f:
    latex.to_latex(
        f,
        caption='caption',
        label='tab:',
        index=False,
        bold_rows=True,
        longtable=True,
        na_rep='-',
        formatters={
            "Date": _format_date,
        }
    )