# Visualizations

This notebook contains the code for all visualizations in one place.

## Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Select the matplotlib style sheet used for the charts in this notebook.
style = 'seaborn-v0_8'  # 'ggplot' and 'seaborn-v0_8-colorblind' are also good
# Applied through pyplot, so it affects figures created after this cell runs.
plt.style.use(style)

## Loading Data

In [None]:
# Load every CSV snapshot used by the cells below. The bracketed numbers are
# the [rows, columns] shapes noted by the author when the files were written.

# raw data after appending all entries and removing duplicates
df_raw = pd.read_csv('data/initial/data.csv')  # [4654, 25]

# filter out no main language, no description and non-programming main language
# df_filter1 = pd.read_csv('data/initial/data_filtered_initial.csv')  # [1576, 31]
df_filter1 = pd.read_csv('data/test/data_filtered_appended_all.csv')  # [1576, 31], fixed error in some data

# filter out archived and non-updated in 5 years [1179, 32] (index columns)
df_filter2 = pd.read_csv('data/initial/data_filtered_pre_readme_fetch.csv')

# appended all data and removed empty readmes and non-english descriptions
df_filter3 = pd.read_csv('data/appended/final.csv')  # [1028, 30]

# results of manual filtering by description
df_description = pd.read_csv('data/manual/filter_by_description.csv')

# results of manual filtering by readme
# skiprows=1 drops the first line of the file before the header is parsed
df_readme = pd.read_csv('data/manual/filter_by_readme.csv', skiprows=1)

# marked as framework
# NOTE(review): the name looks like a typo of "df_frameworks"; it is not
# referenced by any other cell visible in this notebook.
df_frameworkds = pd.read_csv('data/frams.csv')

In [None]:
from gh_search import *
from langdetect import detect
import validators

def fill_na_values(df: pd.DataFrame) -> pd.DataFrame:
  """Replace missing values in four text columns with placeholder values.

  The frame is modified in place and also returned, so both
  ``fill_na_values(df)`` and ``df = fill_na_values(df)`` work.

  Args:
    df: frame with the columns "description", "homepage_url", "license"
      and "main_language".

  Returns:
    The same frame object, with missing values in those columns filled.

  Raises:
    KeyError: if one of the four columns is missing.
  """
  base_values = {
    'description': '',
    'homepage_url': '',
    'license': 'UNLICENSED',
    'main_language': 'NONE',
  }
  for column, base_value in base_values.items():
    # Assign the result back instead of `df[column].fillna(..., inplace=True)`:
    # the chained in-place form operates on a temporary column object, is
    # deprecated, and leaves `df` untouched under pandas copy-on-write.
    df[column] = df[column].fillna(base_value)
  print('Filled N/A values for "description", "homepage_url", '
               '"license" and "main_language" with base values')
  return df


def filter_out_nocode(df: pd.DataFrame) -> pd.DataFrame:
  """Drop repos without a main language, then repos smaller than 50 kB.

  Rows whose "main_language" equals the placeholder 'NONE' are removed
  first; of the remaining rows only those with "repo_size_kb" >= 50 are
  kept. Both removal counts are printed.
  """
  total = len(df)

  with_language = df.loc[df['main_language'].ne('NONE')]
  remaining_after_lang = len(with_language)
  num_removed_lang = total - remaining_after_lang

  big_enough = with_language.loc[with_language['repo_size_kb'].ge(50)]
  num_removed_size = remaining_after_lang - len(big_enough)

  print(f'Removed {num_removed_lang}/{total} repos due not having a '
              'recognized programming language as main language on GitHub')
  print(f'Removed {num_removed_size}/{remaining_after_lang} repos due not '
              'having >= 50 kB of content')
  return big_enough


def filter_out_nodesc(df: pd.DataFrame) -> pd.DataFrame:
  """Keep only rows whose "description" is not the empty string.

  Prints how many rows were removed and returns the filtered frame.
  """
  total = len(df)
  has_description = df['description'].ne('')
  described = df.loc[has_description]
  print(f'Removed {total - len(described)}/{total} repos due to not having a '
              'description')
  return described


def filter_out_nonprog(df: pd.DataFrame) -> pd.DataFrame:
  """Keep rows whose main language is a programming language.

  A row is kept when its "main_language" is either on the explicit accept
  list below or is reported as a programming language by
  ``LinguistData.is_programming_language``. The number of removed rows is
  printed.

  Args:
    df: frame with a "main_language" column.

  Returns:
    The filtered frame.
  """
  n = df.shape[0]
  lang: LinguistData = LinguistData()
  # Languages accepted even if LinguistData does not report them as
  # programming languages.
  accepted_non_prog_langs = {'CSS', 'Mermaid', 'Prisma',
                             'Riot', 'Svelte', 'Vue'}

  def lang_filter(language: str) -> bool:
    # Test the row's language name. The previous code tested the
    # LinguistData object (`lang`) against the list, which never matched,
    # so the accept list had no effect.
    return (language in accepted_non_prog_langs
            or lang.is_programming_language(language))
  df = df[df['main_language'].apply(lang_filter)]
  num_removed = n - df.shape[0]
  print(f'Removed {num_removed}/{n} repos due to not being written in a '
              f'programming language recognized by GitHub')
  return df


def filter_out_archived(df: pd.DataFrame) -> pd.DataFrame:
  """Keep only rows whose "is_archived" value equals False.

  Prints how many rows were removed and returns the filtered frame.
  """
  total = len(df)
  not_archived = df['is_archived'].eq(False)
  active = df.loc[not_archived]
  removed = total - len(active)
  print(f'Removed {removed}/{total} repos due to being archived')
  return active


def filter_out_notupdated(df: pd.DataFrame) -> pd.DataFrame:
  """Keep only repos updated within the last five years.

  The cutoff is "now (UTC) minus 365 * 5 days". Rows whose "updated_at" is
  older than the cutoff are removed and the removal count is printed.

  Args:
    df: frame with an "updated_at" column of parseable timestamps.

  Returns:
    The filtered frame.
  """
  n = df.shape[0]
  diff: pd.Timestamp = pd.Timestamp.today(tz='UTC') - pd.Timedelta(days=365 * 5)
  # utc=True yields tz-aware values even for timestamps without an offset
  # (they are interpreted as UTC), so the comparison with the tz-aware
  # cutoff cannot fail with a naive-vs-aware TypeError.
  updated_at = pd.to_datetime(df['updated_at'], utc=True)
  df = df[updated_at >= diff]
  num_removed = n - df.shape[0]
  # The removed repos are the ones NOT updated recently; the message used to
  # state the opposite.
  print(f'Removed {num_removed}/{n} repos due to not being updated in the '
              'last 5 years')
  return df


def filter_out_noreadme(df: pd.DataFrame) -> pd.DataFrame:
  """Drop rows whose "readme" value is missing.

  Prints how many rows were removed and returns the filtered frame.
  """
  total = len(df)
  has_readme = df['readme'].notna()
  with_readme = df.loc[has_readme]
  removed = total - len(with_readme)
  print(f'Removed {removed}/{total} repos due having no Readme')
  return with_readme


def filter_out_emptyreadme(df: pd.DataFrame) -> pd.DataFrame:
  """Drop rows whose README is only the auto-generated title stub.

  A README counts as generated when it starts with "# <name>" (the row's
  "name" value) and has at most two newline-separated lines.
  """
  total = len(df)

  def has_real_content(row) -> bool:
    text = row['readme']
    title_stub = f"# {row['name']}"
    if not text.startswith(title_stub):
      return True
    # Starts with the stub heading: keep only if more than two lines follow.
    return len(text.split('\n')) > 2

  kept = df[df.apply(has_real_content, axis=1)]
  removed = total - len(kept)
  print(f'Removed {removed}/{total} repos due to having the generated '
              f'README format')
  return kept


def filter_out_nonenglish(df: pd.DataFrame) -> pd.DataFrame:
  """Keep rows whose description is detected as English.

  Each "description" is passed to ``langdetect.detect``. If detection
  raises, the description is still accepted when ``validators.url``
  considers it a URL, otherwise the row is rejected. The number of removed
  rows is printed.

  Args:
    df: frame with a string "description" column.

  Returns:
    The filtered frame.
  """
  n = df.shape[0]

  def desc_lang_filter(text: str) -> bool:
    stripped = text.strip()
    try:
      # A leftover debug check (`if df.owner == 'funf-core'`) used to sit
      # here. Comparing a whole column in an `if` raises ValueError, which
      # the bare `except` swallowed, so every description was treated as
      # "language not detectable".
      return detect(stripped) == 'en'
    except Exception:
      # Detection failed (e.g. no usable text). `Exception` rather than a
      # bare `except` so KeyboardInterrupt/SystemExit still propagate.
      print(f'Unable to detect language of "{text}"')
      if validators.url(stripped):
        print(f'"{text}" is an url, accepted as valid')
        return True
      return False
  df = df[df['description'].apply(desc_lang_filter)]
  # Removed = before - after (previously this reported the number KEPT).
  num_removed = n - df.shape[0]
  print(f'Removed {num_removed}/{n} repos due not having an English '
              'description text')
  return df

In [None]:
# Re-run the initial filtering steps on the raw data.
# Work on a copy: fill_na_values() writes into the frame it is given, and a
# plain `df = df_raw` alias would silently modify df_raw as well.
df = df_raw.copy()

df = fill_na_values(df)
df = filter_out_nocode(df)
df = filter_out_nodesc(df)
df = filter_out_nonprog(df)

# Add placeholder columns. assign() builds a new frame, so nothing is written
# into a filtered slice (no SettingWithCopyWarning).
df = df.assign(
  num_issues=0,
  num_subscribers=0,
  num_contributors=0,
  # One fresh list per row; `[[]] * len(df)` would store the SAME list object
  # in every row.
  languages=[[] for _ in range(len(df))],
  readme='',
)

df.info()

## Initial Search

In [None]:
# popularity analysis of the initially filtered search results
df = df_filter1
pop_df = df.loc[:, ['num_stars', 'num_subscribers', 'num_forks',
                    'has_issues', 'num_issues', 'is_archived']]
print(pop_df.describe())
# describe() already lists the 50% quantile; the medians are printed again
# on their own lines for easier reading.
for key in ('num_stars', 'num_subscribers', 'num_forks', 'num_issues'):
  median_value = np.median(pop_df[key].to_numpy())
  print(f'Median of "{key}":', median_value)

In [None]:
# archived ratio
# Pie chart of archived vs. active repositories in `df` (set to df_filter1 by
# the popularity-analysis cell above).
fig, ax = plt.subplots()
archived_labels = ['archived', 'active']
# Sizes are in the same order as the labels: archived first, then active.
archived_sizes = [len(df[df['is_archived'] == True]),
                  len(df[df['is_archived'] == False])]
ax.pie(archived_sizes, labels=archived_labels, autopct='%1.1f%%')
fig.tight_layout()

In [None]:
# time difference analysis
# .copy() gives an independent frame, so the column writes below do not
# target a slice of `df` (SettingWithCopyWarning / lost writes).
time_df = df[['created_at', 'updated_at']].copy()
time_df['created_at'] = pd.to_datetime(time_df['created_at'])
time_df['updated_at'] = pd.to_datetime(time_df['updated_at'])
time_df['difference'] = time_df['updated_at'] - time_df['created_at']

print(time_df.describe())

# earliest, latest and mean creation date
creation_dates = time_df['created_at'].to_list()
print(min(creation_dates))
print(max(creation_dates))
creation_seconds = [ts.timestamp() for ts in creation_dates]
print(pd.Timestamp.fromtimestamp(sum(creation_seconds) / len(creation_seconds)))

# number of repositories created per calendar year
tmp_df = pd.DataFrame({'timestamps': creation_dates})
tmp_df['year'] = tmp_df['timestamps'].dt.year
repos_per_year = tmp_df.groupby('year').size()
print(repos_per_year)

# Plot the counts computed above instead of a hand-copied list, so the chart
# cannot go stale when the input file changes. Years without any new
# repository are filled with 0 to keep the x axis continuous.
all_years = range(int(repos_per_year.index.min()),
                  int(repos_per_year.index.max()) + 1)
repos_per_year = repos_per_year.reindex(all_years, fill_value=0)
xs = list(repos_per_year.index)
ys = repos_per_year.to_list()

fig, axes = plt.subplots(nrows=1, ncols=2)
axes[0].plot(xs, ys)
axes[0].xaxis.set_major_locator(plt.MaxNLocator(integer=True))
axes[0].set_title('Amount of new EMA Repositories per Year')
axes[0].set_ylabel('Count')
# Pass the target axes explicitly rather than relying on pyplot's
# "current axes" happening to be the right-hand panel.
time_df['difference'].dt.days.hist(ax=axes[1])
axes[1].set_title('Difference between Creation and latest Update')
axes[1].set_xlabel('Days after Creation')
axes[1].set_ylabel('Count')
fig.tight_layout()

In [None]:
# GitHub users and new repositories per year (values in millions, entered
# by hand)
gh_developers_dict = {
  '2016': 5.8, '2017': 24, '2018': 31, '2019': 40,
  '2020': 56, '2021': 73.5, '2022': 94,
}
gh_repos_dict = {
  '2016': 19.4, '2017': 67, '2018': 96, '2019': 44,
  '2020': 60, '2021': 115, '2022': 200,
}

# One bar chart per data set: (data, y label, x label, title)
panels = [
  (gh_developers_dict, 'Number of Users [Million]', 'Year',
   'GitHub Users per Year'),
  (gh_repos_dict, 'Number of new Repositories [Million]', 'Creation Year',
   'New GitHub Repositories per Year'),
]

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
axes = axes.flatten()
for ax, (yearly_values, y_label, x_label, title) in zip(axes, panels):
  xs = list(yearly_values.keys())
  ys = list(yearly_values.values())
  ax.bar(xs, ys)
  ax.set_ylabel(y_label)
  ax.set_xlabel(x_label)
  ax.set_title(title)
fig.tight_layout()

In [None]:
# Share of EMA repositories among all new GitHub repositories, per year.
# `tmp_dict` was never defined in this notebook (NameError on a fresh kernel);
# build it from the per-year creation counts in `tmp_df`.
tmp_dict = tmp_df.groupby('year').size().to_dict()

# Look the EMA counts up by year instead of slicing positions [7:14], so both
# series stay aligned even if a year is missing from the EMA data.
xs = list(gh_repos_dict.keys())
ema_repos_per_year = np.array([tmp_dict.get(int(year), 0) for year in xs])
gh_repos_per_year = np.array(list(gh_repos_dict.values()))
# gh_repos_dict is given in millions; convert to absolute counts.
ys = ema_repos_per_year / (gh_repos_per_year * 1_000_000)

fig, ax = plt.subplots()
ax.bar(xs, ys)
ax.set_ylabel('"Ema Repositories" to "All Repositories" Ratio')
ax.set_xlabel('Creation Year')
fig.suptitle('Proportion of EMA Repositories per Year')
fig.tight_layout()

## Manual Screening

In [None]:
# Split the manual description-screening results by classification.
potentials = df_description[df_description.classification == 'Potential']
unrelateds = df_description[df_description.classification == 'Unrelated']

# Rows flagged readme_check / missing_info during the manual screening.
checked_readmes = df_description[df_description.readme_check == True]
missing_infos = df_description[df_description.missing_info == True]

# Rows that are both 'Potential' and readme_check == True, matched on 'url'.
# NOTE(review): an inner merge multiplies rows if a url occurs more than once
# in df_description; presumably urls are unique — confirm.
checked_potentials = potentials.merge(checked_readmes, on='url', how='inner')

In [None]:
# information pie charts for the description screening
fig, axes = plt.subplots(nrows=1, ncols=3)

# 1) how the descriptions were classified
x = potentials.shape[0]
y = unrelateds.shape[0]
labels = [f'potential\n   ({x})', f'unrelated\n({y})    ']
sizes = [x, y]
axes[0].pie(sizes, labels=labels, autopct='%1.1f%%')
axes[0].set_title('Classification Results')

# 2) for how many entries the README had to be consulted
x = checked_readmes.shape[0]
y = df_description.shape[0] - x
labels = [f'needed\n  ({x})', f'not needed\n({y})     ']
sizes = [x, y]
axes[1].pie(sizes, labels=labels, autopct='%1.1f%%')
axes[1].set_title('Readme Checking')

# 3) classification of the entries whose README was checked.
# The 'unrelated' slice must count CHECKED entries classified as unrelated.
# It was previously `potentials.shape[0] - x`, i.e. the potentials that were
# NOT checked, which does not match the label or the title.
x = checked_potentials.shape[0]
y = int((checked_readmes.classification == 'Unrelated').sum())
labels = [f'potential\n    ({x})', f'unrelated\n({y})    ']
sizes = [x, y]
axes[2].pie(sizes, labels=labels, autopct='%1.1f%%')
axes[2].set_title('Classification of Checked Entries')
fig.tight_layout()

In [None]:
# Split the manual README-screening results by classification.
ema_apps = df_readme[df_readme.classification == 'EMA App']
ema_frameworks = df_readme[df_readme.classification == 'EMA Framework']
ema_tools = df_readme[df_readme.classification == 'EMA Tool']
# NOTE: rebinds `unrelateds` / `missing_infos` from the description-screening
# cell above; from here on they refer to df_readme.
unrelateds = df_readme[df_readme.classification == 'Unrelated']

# Rows flagged external_check / missing_info during the README screening.
checked_external = df_readme[df_readme.external_check == True]
missing_infos = df_readme[df_readme.missing_info == True]

# Per class: the rows that also have external_check == True, matched on 'url'.
checked_apps = ema_apps.merge(checked_external, on='url', how='inner')
checked_frameworks = ema_frameworks.merge(checked_external, on='url', how='inner')
checked_tools = ema_tools.merge(checked_external, on='url', how='inner')
checked_unrelateds = unrelateds.merge(checked_external, on='url', how='inner')

In [None]:
# information pie charts
# Three pies for the README screening: class sizes, share of entries with
# external_check set, and class sizes among those externally checked entries.
fig, axes = plt.subplots(nrows=1, ncols=3)
# 1) number of entries per classification
a = ema_apps.shape[0]
b = ema_frameworks.shape[0]
c = ema_tools.shape[0]
d = unrelateds.shape[0]
labels = [f'App ({a})', f'Framework ({b})',
          f'Tool ({c})', f'Unrelated ({d})']
sizes = [a, b, c, d]
axes[0].pie(sizes, labels=labels, autopct='%1.1f%%')
axes[0].set_title('Classification Results')
# 2) entries with external_check == True vs. all remaining entries
x = checked_external.shape[0]
y = df_readme.shape[0] - x
labels = [f'Needed ({x})', f'Not needed ({y})     ']
sizes = [x, y]
axes[1].pie(sizes, labels=labels, autopct='%1.1f%%')
axes[1].set_title('External Checking')
# 3) classification of the externally checked entries only
a = checked_apps.shape[0]
b = checked_frameworks.shape[0]
c = checked_tools.shape[0]
d = checked_unrelateds.shape[0]
labels = [f'App ({a})', f'Framework ({b})',
          f'Tool ({c})', f'Unrelated ({d})']
sizes = [a, b, c, d]
axes[2].pie(sizes, labels=labels, autopct='%1.1f%%')
axes[2].set_title('Classification of Checked Entries')
fig.tight_layout()

## Findings

In [None]:
# df_man: manual screening rows classified as 'EMA App'.
df_man = ema_apps
# df_app: the rows of df_filter3 whose url appears in df_man.
df_app = df_filter3[df_filter3.url.isin(df_man.url)]

In [None]:
# popularity analysis of the EMA apps
pop_df = df_app.loc[:, ['num_stars', 'num_subscribers', 'num_forks',
                        'has_issues', 'num_issues', 'is_archived']]
print(pop_df.describe())
# describe() already lists the 50% quantile; the medians are printed again
# on their own lines for easier reading.
for key in ('num_stars', 'num_subscribers', 'num_forks', 'num_issues'):
  median_value = np.median(pop_df[key].to_numpy())
  print(f'Median of "{key}":', median_value)

In [None]:
# Subscribers: number of apps per subscriber count
subscriber_counts = df_app.num_subscribers.value_counts().sort_index()
fig, ax = plt.subplots()
subscriber_counts.plot(kind='bar', ax=ax, label='Data')
ax.set_xlabel('Number of Subscribers')
ax.set_ylabel('Frequency')
ax.grid(True)
# write each bar's height above it
for p in ax.patches:
  ax.annotate(str(int(p.get_height())),
              (p.get_x() + p.get_width() / 2.0, p.get_height() - 0.25),
              ha='center', va='center',
              xytext=(0, 10), textcoords='offset points')
# A pandas bar plot is categorical: bar i is drawn at x == i, whatever value
# it stands for. Convert mean/median from value space to bar positions
# (linear interpolation between neighbouring bars) before drawing the lines;
# using the raw value as x misplaces them whenever the values have gaps.
bar_positions = np.arange(len(subscriber_counts))
mean_x = np.interp(df_app.num_subscribers.mean(),
                   subscriber_counts.index, bar_positions)
median_x = np.interp(df_app.num_subscribers.median(),
                     subscriber_counts.index, bar_positions)
ax.axvline(x=mean_x, color='green', linestyle='--', label='Mean')
ax.axvline(x=median_x, color='orange', linestyle='--', label='Median')
ax.legend()
fig.tight_layout()

In [None]:
# Stargazers: number of apps per star count
star_counts = df_app.num_stars.value_counts().sort_index()
fig, ax = plt.subplots()
star_counts.plot(kind='bar', ax=ax, label='Data')
ax.set_xlabel('Number of Stargazers')
ax.set_ylabel('Frequency')
ax.grid(True)
# write each bar's height above it
for p in ax.patches:
  ax.annotate(str(int(p.get_height())),
              (p.get_x() + p.get_width() / 2.0, p.get_height() - 0.25),
              ha='center', va='center',
              xytext=(0, 10), textcoords='offset points')
# A pandas bar plot is categorical: bar i is drawn at x == i, whatever value
# it stands for. Convert mean/median from value space to bar positions
# (linear interpolation between neighbouring bars) before drawing the lines;
# using the raw value as x misplaces them whenever the values have gaps.
bar_positions = np.arange(len(star_counts))
mean_x = np.interp(df_app.num_stars.mean(), star_counts.index, bar_positions)
median_x = np.interp(df_app.num_stars.median(), star_counts.index, bar_positions)
ax.axvline(x=mean_x, color='green', linestyle='--', label='Mean')
ax.axvline(x=median_x, color='orange', linestyle='--', label='Median')
ax.legend()
fig.tight_layout()

In [None]:
# Forks: number of apps per fork count
fork_counts = df_app.num_forks.value_counts().sort_index()
fig, ax = plt.subplots()
fork_counts.plot(kind='bar', ax=ax, label='Data')
ax.set_xlabel('Number of Forks')
ax.set_ylabel('Frequency')
ax.grid(True)
# write each bar's height above it
for p in ax.patches:
  ax.annotate(str(int(p.get_height())),
              (p.get_x() + p.get_width() / 2.0, p.get_height() - 0.25),
              ha='center', va='center',
              xytext=(0, 10), textcoords='offset points')
# A pandas bar plot is categorical: bar i is drawn at x == i, whatever value
# it stands for. Convert mean/median from value space to bar positions
# (linear interpolation between neighbouring bars) before drawing the lines;
# using the raw value as x misplaces them whenever the values have gaps.
bar_positions = np.arange(len(fork_counts))
mean_x = np.interp(df_app.num_forks.mean(), fork_counts.index, bar_positions)
median_x = np.interp(df_app.num_forks.median(), fork_counts.index, bar_positions)
# Keep the original 0.2 shift of the median line (presumably there so the
# line does not run through the count label centred on the bar).
median_nudge = 0.2
ax.axvline(x=mean_x, color='green', linestyle='--', label='Mean')
ax.axvline(x=median_x + median_nudge, color='orange', linestyle='--', label='Median')
ax.legend()
fig.tight_layout()

In [None]:
# Two pies on the EMA apps: share flagged missing_info, and share flagged
# external_check.
fig, axes = plt.subplots(ncols=2)
# n: number of app rows found in df_filter3; m: sum of the missing_info flags
# in the manual sheet.
# NOTE(review): n comes from df_app but m and l from df_man; the slices only
# add up if every df_man url is present in df_filter3 — confirm.
n = df_app.shape[0]
m = df_man.missing_info.values.sum()
axes[0].pie([m, n-m], labels=[f'Missing info ({m})',
                              f'Full info ({n-m})'], autopct='%1.1f%%')
axes[0].set_title('Description & README')
# l: sum of the external_check flags.
# NOTE(review): the labels read "found info", while the column name suggests
# "an external check was made" — confirm the intended meaning.
l = df_man.external_check.values.sum()
axes[1].pie([l, n-l], labels=[f'Found info ({l})',
                              f'Did not find info ({n-l})'], autopct='%1.1f%%')
axes[1].set_title('Other Sources')
fig.tight_layout()

In [None]:
# Owners that appear on more than one app row.
owner_counts = df_app.owner.value_counts()
multiple_owners = owner_counts[owner_counts > 1].index.tolist()
print("Owners with multiple rows:", multiple_owners)

In [None]:
# Number of app rows per owner_type value.
df_app.owner_type.value_counts()

In [None]:
# Distinct owners whose owner_type is 'Organization'.
list(df_app[df_app.owner_type == 'Organization'].owner.unique())

In [None]:
# Number of manually screened apps per has_publication value.
df_man.has_publication.value_counts()

In [None]:
# Two pies built from hard-coded counts (17/30 and 8/9).
# NOTE(review): these numbers are typed in, not computed; presumably they were
# read off the value_counts() outputs above — re-check them whenever the
# input data changes.
fig, axes = plt.subplots(ncols=2)
axes[0].pie([17, 30], labels=[f'≥1 publications ({17})',
                              f'no publication ({30})'], autopct='%1.1f%%')
axes[0].set_title('Linked Publications')

# NOTE(review): 8 + 9 = 17, so this presumably breaks the 17 apps with a
# publication down by owner type — confirm.
axes[1].pie([8, 9], labels=[f'Users ({8})',
                              f'Organizations ({9})'], autopct='%1.1f%%')
axes[1].set_title('User Types')

# TODO: Write
# Master Student,
# 2 Unknown but scientific repos
# 2 No scientific
# 1 PHD Student
# 1 Professor

fig.tight_layout()

In [None]:
# README-screening rows classified as 'EMA App' (same filter as `ema_apps`),
# split into thematic column groups; 'url' is kept in each as the key.
df_emaapps = df_readme[df_readme.classification == 'EMA App']
# sampling-strategy flags
df_samplingstrategy = df_emaapps[['url', 'event-contingent',
                                  'signal-contingent', 'continuous', 'none']]
# data-input-method flags
df_input = df_emaapps[['url', 'uses_diaries', 'uses_interviews',
                       'uses_questionnaires', 'uses_mic', 'uses_cam',
                       'uses_sensing', 'uses_other']]
# supported devices and operating systems
df_platforms = df_emaapps[['url', 'on_smartphone', 'on_smartwatch',
                           'in_browser', 'on_other', 'OS']]
# notification support
df_interventions = df_emaapps[['url', 'has_app_notifications',
                               'has_device_notifications']]

In [None]:
# stacked bar chart: which input methods are being used
def get_use_nums(diaries=False, interviews=False, questionnaires=False,
                 mic=False, cam=False, sensing=False, other=False):
    """Count apps in ``df_input`` that use one input method, exclusively or not.

    Call with exactly one keyword set to True, e.g. ``get_use_nums(mic=True)``.
    If several are True, the first one in parameter order is the method
    counted, while the "only" count requires exactly the given combination.

    Returns:
        (n_only, n_partly): rows whose flags match the given combination
        exactly, and the remaining rows that have the selected method's flag
        set.

    Raises:
        ValueError: if no method flag is True (this used to surface as an
            UnboundLocalError).
    """
    flags = {
        'uses_diaries': diaries,
        'uses_interviews': interviews,
        'uses_questionnaires': questionnaires,
        'uses_mic': mic,
        'uses_cam': cam,
        'uses_sensing': sensing,
        'uses_other': other,
    }
    selected = [column for column, used in flags.items() if used]
    if not selected:
        raise ValueError('get_use_nums() needs one input method set to True')
    k = selected[0]  # dict order == parameter order == old if/elif precedence
    n = df_input[df_input[k] == True].shape[0]
    # rows where every flag equals the requested value
    filter_condition = pd.Series(True, index=df_input.index)
    for column, used in flags.items():
        filter_condition &= (df_input[column] == used)
    n_only = df_input[filter_condition].shape[0]
    return n_only, n - n_only


# Per input method: (apps using only this method, apps using it together
# with at least one other method).
n_diaries_only, n_diaries_partly = get_use_nums(diaries=True)
n_interviews_only, n_interviews_partly = get_use_nums(interviews=True)
n_questionnaires_only, n_questionnaires_partly = get_use_nums(questionnaires=True)
n_mic_only, n_mic_partly = get_use_nums(mic=True)
n_cam_only, n_cam_partly = get_use_nums(cam=True)
n_sensing_only, n_sensing_partly = get_use_nums(sensing=True)
n_other_only, n_other_partly = get_use_nums(other=True)

# x labels and the two stacked series, all in the same method order
xs = ('Diary', 'Interview', 'Questionnaire',
      'Microphone', 'Camera', 'Sensing', 'Other')
ys_only = [n_diaries_only, n_interviews_only, n_questionnaires_only,
           n_mic_only, n_cam_only, n_sensing_only, n_other_only]
ys_partly = [n_diaries_partly, n_interviews_partly, n_questionnaires_partly,
             n_mic_partly, n_cam_partly, n_sensing_partly, n_other_partly]
# NOTE: `ys` (and p1/p2 below) are not read again in this cell.
ys = {
   'Only this method': ys_only,
   'At least one other method': ys_partly
}
width = 0.5
fig, ax = plt.subplots()
r = range(len(xs))
# "partly" bars are stacked on top of the "only" bars via bottom=
p1 = ax.bar(r, ys_only, width=width, label='Only this method')
p2 = ax.bar(r, ys_partly, bottom=ys_only, width=width, label='At least one other method')
for i in range(len(r)):
    # The segment labels are drawn only when the "only" segment is non-empty;
    # otherwise the "partly" value equals the total printed on top of the bar.
    do_partly = False
    if ys_only[i] > 0:
      do_partly = True
      ax.text(r[i], ys_only[i] / 2, str(ys_only[i]), ha='center', va='center', color='white')
    if ys_partly[i] > 0 and do_partly:
      ax.text(r[i], ys_only[i] + ys_partly[i] / 2, str(ys_partly[i]), ha='center', va='center', color='white')
    # total on top of the stacked bar
    ax.text(r[i], ys_only[i] + ys_partly[i], str(ys_only[i] + ys_partly[i]), ha='center', va='bottom')
ax.set_xlabel('Method')
ax.set_ylabel('Count')
ax.set_title('Data Input Methods')
ax.set_xticks(r)
ax.set_xticklabels(xs)
ax.legend()

In [None]:
# stacked bar chart: which sampling strategies are employed
# NOTE: this rebinds the name `get_use_nums`, which the input-method cell
# above also defines; after this cell runs the name refers to the
# sampling-strategy version.
def get_use_nums(event=False, signal=False, continuous=False):
    """Count apps in ``df_samplingstrategy`` using one strategy, exclusively or not.

    Call with exactly one keyword set to True, e.g. ``get_use_nums(signal=True)``.
    If several are True, the first one in parameter order is the strategy
    counted, while the "only" count requires exactly the given combination.

    Returns:
        (n_only, n_partly): rows whose flags match the given combination
        exactly, and the remaining rows that have the selected strategy's
        flag set.

    Raises:
        ValueError: if no strategy flag is True (this used to surface as an
            UnboundLocalError).
    """
    flags = {
        'event-contingent': event,
        'signal-contingent': signal,
        'continuous': continuous,
    }
    selected = [column for column, used in flags.items() if used]
    if not selected:
        raise ValueError('get_use_nums() needs one sampling strategy set to True')
    k = selected[0]  # dict order == parameter order == old if/elif precedence
    n = df_samplingstrategy[df_samplingstrategy[k] == True].shape[0]
    # rows where every flag equals the requested value
    filter_condition = pd.Series(True, index=df_samplingstrategy.index)
    for column, used in flags.items():
        filter_condition &= (df_samplingstrategy[column] == used)
    n_only = df_samplingstrategy[filter_condition].shape[0]
    return n_only, n - n_only


# Per sampling strategy: (apps using only this strategy, apps combining it
# with at least one other strategy).
n_event_only, n_event_partly = get_use_nums(event=True)
n_signal_only, n_signal_partly = get_use_nums(signal=True)
n_continuous_only, n_continuous_partly = get_use_nums(continuous=True)

xs = ('Event-Contingent', 'Signal-Contingent', 'Continuous')
ys_only = [n_event_only, n_signal_only, n_continuous_only]
ys_partly = [n_event_partly, n_signal_partly, n_continuous_partly]
width = 0.5
fig, ax = plt.subplots()
r = range(len(xs))
# The legend entries said "method" (copied from the input-method chart);
# this chart is about strategies.
ax.bar(r, ys_only, width=width, label='Only this strategy')
# "partly" bars are stacked on top of the "only" bars via bottom=
ax.bar(r, ys_partly, bottom=ys_only, width=width,
       label='At least one other strategy')
for i in range(len(r)):
    total = ys_only[i] + ys_partly[i]
    # Segment labels are drawn only when the "only" segment is non-empty;
    # otherwise the "partly" value equals the total printed on top of the bar.
    if ys_only[i] > 0:
        ax.text(r[i], ys_only[i] / 2, str(ys_only[i]),
                ha='center', va='center', color='white')
        if ys_partly[i] > 0:
            ax.text(r[i], ys_only[i] + ys_partly[i] / 2, str(ys_partly[i]),
                    ha='center', va='center', color='white')
    # total on top of the stacked bar
    ax.text(r[i], total, str(total), ha='center', va='bottom')
ax.set_xlabel('Strategy')
ax.set_ylabel('Count')
ax.set_title('Sampling Strategies')
ax.set_xticks(r)
ax.set_xticklabels(xs)
ax.legend()

In [None]:
# Three pies on notification support among the EMA apps.
fig, axes = plt.subplots(ncols=3, figsize=((9, 4)))
m = df_interventions.shape[0] # total num of rows
# num of apps with has_app_notifications set
n = df_interventions[df_interventions.has_app_notifications == True].shape[0]
# num of apps with has_device_notifications set
l = df_interventions[df_interventions.has_device_notifications == True].shape[0]
cond = (df_interventions.has_app_notifications == True) | \
       (df_interventions.has_device_notifications == True)
k = df_interventions[cond].shape[0]  # num apps that support any notification
# pie 1: any notification support, relative to all m apps
axes[0].pie([k, m-k], labels=[f'supported ({k})',
                              f'not supported ({m-k})'], autopct='%1.1f%%')
axes[0].set_title('Notifications')
# pies 2 and 3 are relative to the k apps that support any notification,
# not to all m apps
axes[1].pie([n, k-n], labels=[f'supported ({n})',
                              f'\n\nnot supported ({k-n})'], autopct='%1.1f%%')
axes[1].set_title('In-App Notifications')
axes[2].pie([l, k-l], labels=[f'supported ({l})',
                              f'not supported ({k-l})'], autopct='%1.1f%%')
axes[2].set_title('Device Notifications')
fig.tight_layout()

In [None]:
# getting all tech info for the repos classified as EMA apps
urls = df_emaapps.url.to_list()
# .copy(): later cells write columns to df_tech (datetime conversion); an
# independent frame keeps those writes off a slice of df_filter3.
df_tech = df_filter3[df_filter3.url.isin(urls)].copy()

In [None]:
# licensing
# Pie chart with one slice per distinct `license` value in df_tech.
license_dict = df_tech.license.value_counts().to_dict()
fig, ax = plt.subplots()
ax.pie(list(license_dict.values()),
       labels=list(license_dict.keys()), autopct='%1.1f%%')
fig.tight_layout()

In [None]:
# languages in use
# Bar chart: number of apps per main language, most frequent first.
fig, ax = plt.subplots()
# No ax= is passed, so pandas draws on pyplot's current axes (the one created
# on the line above) and returns it.
ax = df_tech.groupby('main_language').size().sort_values(ascending=False).plot(
  kind='bar'
)
ax.set_xlabel('Main Language')
ax.set_ylabel('Frequency')
# write each bar's height above it
for p in ax.patches:
  ax.annotate(str(int(p.get_height())),
              (p.get_x() + p.get_width() / 2.0, p.get_height() - 0.2),
              ha='center', va='center',
              xytext=(0, 10), textcoords='offset points')
fig.tight_layout()

In [None]:
from ast import literal_eval
from collections import Counter

# Number of repositories using each language (main language or any entry of
# the `languages` column, which holds a Python list literal as text).
lang_counter = Counter()
for main_language, languages in zip(df_tech.main_language, df_tech.languages):
  # dict.fromkeys() de-duplicates while keeping order, so a repository counts
  # at most once per language even if its main language is also in the list.
  repo_langs = list(dict.fromkeys([main_language] + literal_eval(languages)))
  lang_counter.update(repo_langs)
# most frequent first; ties keep first-seen order
lang_dict = dict(sorted(lang_counter.items(), key=lambda e: e[1], reverse=True))

top_langs = list(lang_dict.keys())[:10]
top_counts = list(lang_dict.values())[:10]

fig, ax = plt.subplots()
ax.bar(top_langs, top_counts)
ax.set_xlabel('Used Language')
ax.set_ylabel('Frequency')
# Rotate the existing tick labels; set_xticklabels() without fixed tick
# positions triggers a matplotlib warning.
ax.tick_params(axis='x', labelrotation=90)
# write each bar's height above it
for p in ax.patches:
  ax.annotate(str(int(p.get_height())),
              (p.get_x() + p.get_width() / 2.0, p.get_height() - 0.4),
              ha='center', va='center',
              xytext=(0, 10), textcoords='offset points')
fig.tight_layout()

In [None]:
# getting times to be a datetime format
# assign() builds a new frame instead of setting attributes on what may be a
# slice of df_filter3 (SettingWithCopyWarning / lost writes under
# copy-on-write). Re-running the cell is harmless.
df_tech = df_tech.assign(
  created_at=pd.to_datetime(df_tech.created_at),
  updated_at=pd.to_datetime(df_tech.updated_at),
)

In [None]:
# Main languages of the EMA apps over time.
# Top row: repositories created in each year; bottom row: repositories
# created up to and including each year. Columns: platform groups, Android
# languages, iOS languages.
years = list(range(2014, 2025))
android_langs = ['Java', 'Kotlin']
ios_langs = ['Swift', 'Objective-C']


def _count_main_languages(df_subset):
  """Count rows per tracked language, per platform group, and as 'Other'."""
  counts = {lang: int((df_subset.main_language == lang).sum())
            for lang in android_langs + ios_langs}
  counts['Android'] = sum(counts[lang] for lang in android_langs)
  counts['iOS'] = sum(counts[lang] for lang in ios_langs)
  # everything that is neither an Android nor an iOS language
  counts['Other'] = len(df_subset) - counts['Android'] - counts['iOS']
  return counts


def _plot_language_row(axes_row, yearly_counts, ylabel):
  """Draw the three line charts of one row from a list of per-year counts."""
  panels = [['Android', 'iOS', 'Other'], android_langs, ios_langs]
  for panel_ax, series_labels in zip(axes_row, panels):
    for series_label in series_labels:
      panel_ax.plot(years, [counts[series_label] for counts in yearly_counts],
                    label=series_label)
    panel_ax.legend()
  axes_row[0].set_ylabel(ylabel)


creation_year = df_tech.created_at.dt.year
fig, ax = plt.subplots(ncols=3, nrows=2, figsize=(10, 6))

per_year = [_count_main_languages(df_tech[creation_year == year])
            for year in years]
_plot_language_row(ax[0], per_year, 'By Year')

# "<= year" also counts repositories created before the first plotted year
cumulative = [_count_main_languages(df_tech[creation_year <= year])
              for year in years]
_plot_language_row(ax[1], cumulative, 'Cumulative')

fig.tight_layout()

In [None]:
# platforms
# Classify each app by the text in its OS column. The branch order matters:
# single-OS Android / iOS first, then anything naming a desktop OS, then
# "Android, iOS" or "Any" as cross-platform. Values matching no branch
# (e.g. a missing OS, which becomes the string 'nan') are not counted.
# (A stray `'Android' in df_platforms.iloc[0].OS` expression was removed: it
# had no effect and raises TypeError when the first OS value is missing.)
os_dict = {'Android': 0, 'iOS': 0, 'Cross-Platform': 0, 'PC': 0}
for os_name in df_platforms.OS.astype(str):
  if 'Android' in os_name and ',' not in os_name:
    os_dict['Android'] += 1
  elif 'iOS' in os_name and ',' not in os_name:
    os_dict['iOS'] += 1
  elif 'Linux' in os_name or 'MacOS' in os_name or 'Windows' in os_name:
    os_dict['PC'] += 1
  elif 'Android, iOS' in os_name or 'Any' in os_name:
    os_dict['Cross-Platform'] += 1

fig, ax = plt.subplots(ncols=2)
ax[0].pie(list(os_dict.values()),
          labels=list(os_dict.keys()), autopct='%1.1f%%')
ax[0].set_title('Supported OS')

# number of apps per supported device type; "Other" = any non-missing value
ys = [df_platforms[df_platforms.on_smartphone == True].shape[0],
      df_platforms[df_platforms.on_smartwatch == True].shape[0],
      df_platforms[df_platforms.in_browser == True].shape[0],
      df_platforms[df_platforms.on_other.notna()].shape[0]]
# Use the short tick labels directly instead of relabelling afterwards with
# set_xticklabels().
ax[1].bar(['Phone', 'Watch', 'Browser', 'Other'], ys)
ax[1].set_title('Supported Device')
# write each bar's height above it
for p in ax[1].patches:
  ax[1].annotate(str(int(p.get_height())),
                 (p.get_x() + p.get_width() / 2.0, p.get_height() - 0.6),
                 ha='center', va='center',
                 xytext=(0, 10), textcoords='offset points')
fig.tight_layout()

In [None]:
# Recode on_other for the correlation / set plots below: 'PC' -> 1,
# missing -> 0.
# assign() builds a new frame; the chained in-place calls
# (`df_platforms.on_other.replace(..., inplace=True)`) operate on a temporary
# column object, are deprecated, and do nothing under pandas copy-on-write.
df_platforms = df_platforms.assign(
  on_other=df_platforms.on_other.replace(to_replace='PC', value=1).fillna(0.0)
)

In [None]:
# Create a correlation matrix
# Pairwise correlation of the four device columns, via DataFrame.corr() with
# its default settings.
corr_matrix = df_platforms[["on_smartphone", "on_smartwatch",
                            "in_browser", "on_other"]].corr()

# Configure the heatmap plot
fig, ax = plt.subplots()
im = ax.imshow(corr_matrix, cmap="coolwarm")  # Choose a colormap

# Add colorbar
fig.colorbar(im, label="Correlation Coefficient")

# Set ticks and labels for x and y axes
ax.set_xticks(range(len(corr_matrix.columns)))
ax.set_yticks(range(len(corr_matrix.columns)))
ax.set_xticklabels(corr_matrix.columns, rotation=45, ha="right")  # Rotate x-axis labels
ax.set_yticklabels(corr_matrix.columns)

# Add labels for each cell (optional)
for i in range(len(corr_matrix.columns)):
  for j in range(len(corr_matrix.columns)):
    ax.text(j, i, f"{corr_matrix.iloc[i, j]:.2f}", ha="center", va="center")  # Format to 2 decimal places

# NOTE(review): the calls below go through pyplot's "current axes" rather
# than `ax`; whether that is still `ax` after fig.colorbar() depends on
# matplotlib's behaviour — confirm, or switch to ax.set_* calls.
plt.xlabel("Device")
plt.ylabel("Device")
plt.title("Correlation Between Device Usage (Heatmap)")
plt.grid(False)
plt.tight_layout()  # Adjust layout for better readability

In [None]:
from upsetplot import from_contents, plot

# Independent copy of the platform table with a fresh 0..n-1 index, so the
# index labels can serve as app identifiers in the set plot.
df = df_platforms.reset_index(drop=True)

# set name -> device flag column
device_columns = {
  'smartphone': 'on_smartphone',
  'smartwatch': 'on_smartwatch',
  'browser': 'in_browser',
  'PC': 'on_other',
}

# For each device, the row numbers of the apps whose flag equals True.
contents = {
  set_name: df.index[(df[column] == True).to_numpy()].tolist()
  for set_name, column in device_columns.items()
}

plot(from_contents(contents))
plt.show()

In [None]:
# Same call as at the end of the previous cell; without plt.show() the cell
# also displays the call's return value.
plot(from_contents(contents))

In [None]:
# Number of apps with smartwatch, browser and smartphone all set.
# NOTE(review): `on_other >= 0` holds for every row once on_other has been
# recoded to 0/1, so it does not restrict the result — presumably a leftover
# from checking other combinations by hand.
filter_condition = ((df_platforms['on_smartwatch'] == True) & \
                    (df_platforms['on_other'] >= 0) & \
                    (df_platforms['in_browser'] == True) & \
                    (df_platforms['on_smartphone'] == True))
df_platforms[filter_condition].shape[0]

# 34 + 6 + 3 + 1 + 2 + 1

In [None]:
# Default branch names: all apps, apps created before 2020, and apps created
# in 2020 or later.
created_year = df_tech.created_at.dt.year
branch_panels = [
  ('All Repositories', df_tech),
  ('Before 2020', df_tech[created_year < 2020]),
  # This subset includes 2020 itself, so "After 2020" was inaccurate.
  ('Since 2020', df_tech[created_year >= 2020]),
]

fig, axes = plt.subplots(ncols=3, figsize=(12, 3))
for panel_ax, (title, subset) in zip(axes, branch_panels):
  branches_dict = subset.default_branch.value_counts().to_dict()
  panel_ax.pie(list(branches_dict.values()), labels=list(branches_dict.keys()),
               autopct='%1.1f%%')
  panel_ax.set_title(title)

fig.tight_layout()