# Quick code analysis

To get going with this notebook I suggest using VS Code with Python extension.

Create a virtual environment in this folder:

```
python -m venv .venv
```

Select the Python Interpreter from the venv in the quick command menu (Ctrl/Cmd + Shift + P) -> `> Python: Select Interpreter` (refresh the interpreter list or reload the window if it does not show up).

Open a terminal, load the venv if that's not done automatically, and install pandas:

```
pip install pandas
```

Ensure `git` can be be found.

Change the variables below to point to your repo of interest, and give this notebook a spin (VS Code may ask if it needs to install a Jupyter kernel).

In [23]:
# path = "/home/arjan/Projects/DeutscheBahn/py-capella-mbse"
path = "/home/arjan/Development/gtk"
after = "one year ago"
top = 10

# Regex patterns to exclude from analisys. E.g. generated files.
excludes = [".*\\.svg$"]


In [24]:
import os
import re
import pandas

os.chdir(path)

lines = []

compiled_excludes = [re.compile(ex) for ex in excludes]

for line in os.popen(f"git log --format=format:'::: %H,%ci,%an' --numstat --no-renames --after='{after}'").readlines():
    line = line.strip()
    if not line:
        continue
    elif line.startswith("::: "):
        meta = line.split(" ", 1)[1].split(",")
    else:
        try:
            added, removed, filename = line.split("\t", 2)
        except ValueError as e:
            print(f"'{line}': {e}")
            continue
        for ex in compiled_excludes:
            if re.search(ex, filename):
                break
        else:
            rec = [*meta, added, removed, filename]
            assert len(rec) == 6, rec
            lines.append(rec)

df = pandas.DataFrame(lines, columns=["date", "commit", "author", "added", "removed", "filename"])
df['dirname'] = df.apply(lambda row: os.path.dirname(row["filename"]), axis=1)
df['dirname2'] = df.apply(lambda row: os.path.dirname(row["dirname"]), axis=1)

del lines
df[:10]

Unnamed: 0,date,commit,author,added,removed,filename,dirname,dirname2
0,4a684a637112dd8eae14e250f6c48c0f6932368b,2021-09-29 05:57:36 +0000,Kukuh Syafaat,66,43,po/id.po,po,
1,e77eaa0acedd86371d9d81de3982d8657864915d,2021-09-28 17:36:23 -0700,Christian Hergert,2,2,gdk/macos/gdkmacosglcontext.c,gdk/macos,gdk
2,5bc3923bae211780715714e559b24398f6a385b8,2021-09-28 17:48:50 -0400,Matthias Clasen,1,3,testsuite/tools/simplify-data-3to4/toolbar.exp...,testsuite/tools/simplify-data-3to4,testsuite/tools
3,5bc3923bae211780715714e559b24398f6a385b8,2021-09-28 17:48:50 -0400,Matthias Clasen,6,7,tools/gtk-builder-tool-simplify.c,tools,
4,e99ac8f6d831a3fccaffd255bf229a5efe633870,2021-09-28 17:48:50 -0400,Matthias Clasen,11,13,gtk/gtkbuilder.c,gtk,
5,f5db5018790d66661bc5c03af9b6da38fe73e043,2021-09-28 20:28:18 +0000,Yaron Shahrabani,1718,1994,po/he.po,po,
6,83b434d6a506326f4a2948cca9396af9e94bd13d,2021-09-28 15:47:06 -0400,Matthias Clasen,12,0,docs/reference/gtk/migrating-3to4.md,docs/reference/gtk,docs/reference
7,2c9a2e94c834b9e93dc67cf1c14d738e7984a1de,2021-09-28 15:40:41 -0400,Matthias Clasen,1,1,gtk/gtkentry.c,gtk,
8,32191bc18ea85201f6b1a003d4cb465634363f11,2021-09-28 15:19:17 -0400,Matthias Clasen,2,0,gtk/gtkcsstransformvalue.c,gtk,
9,c4069fdcee22cdd481c5f71955447a151c8158e0,2021-09-28 15:09:49 -0400,Matthias Clasen,1,4,gtk/gtkbuilder.c,gtk,


## Churn

Churn is simplyhow many times a file has changed in the history of a project. The more often it changed, the higher the "churn".

In [25]:
churn = df.groupby(['filename']).size().reset_index(name='counts').sort_values("counts", ascending=False)
churn[:top]

Unnamed: 0,filename,counts
1687,gtk/gtkwidget.c,94
1012,gtk/a11y/gtkatspicontext.c,91
1692,gtk/gtkwindow.c,84
2349,meson.build,78
1843,gtk/theme/Adwaita/_common.scss,71
646,gsk/gl/gskglrenderer.c,62
728,gsk/ngl/gsknglrenderjob.c,62
541,gdk/wayland/gdksurface-wayland.c,58
459,gdk/gdksurface.c,49
1650,gtk/gtktextview.c,49


# Top authors

The authors that contributed most to the repository.

In [26]:
top_authors = df.groupby(['author']).size().reset_index(name='commits').sort_values("commits", ascending=False)
top_authors[:top]

Unnamed: 0,author,commits
99,Matthias Clasen,5350
51,Emmanuele Bassi,946
22,Benjamin Otte,721
33,Christian Hergert,318
68,Jakub Steiner,314
135,Timm Bäder,281
78,Jonas Ådahl,252
5,Alexander Mikhaylenko,129
35,Chun-wei Fan,108
11,Arnaud Bonatti,91


In [27]:
top_authors_per_package = df.groupby(['dirname', 'author']).size().reset_index(name='commits').sort_values("commits", ascending=False)
top_authors_per_package[:top]

Unnamed: 0,dirname,author,commits
305,gtk,Matthias Clasen,2073
283,gtk,Emmanuele Bassi,319
136,gdk,Matthias Clasen,270
49,demos/gtk-demo,Matthias Clasen,194
332,gtk/a11y,Matthias Clasen,164
320,gtk,Timm Bäder,164
246,gsk/ngl,Matthias Clasen,156
201,gdk/x11,Benjamin Otte,150
572,testsuite/gtk,Matthias Clasen,148
155,gdk/macos,Christian Hergert,133


# Change Coupling

Change coupling tells us which files have a tendency to change together.

In [28]:
from IPython.core.display import HTML

combinations = {}
commits = {}

for _, group in df.groupby(['commit']):
    import itertools
    
    for name in group["filename"]:
        commits[name] = 1 + commits.get(name, 0)

    for pair in set(itertools.combinations(sorted(group['filename']), 2)):
        combinations[pair] = 1 + combinations.get(pair, 0)


change_coupling = sorted(((n / commits[a] + n / commits[b], n, a, b) for (a, b), n in combinations.items()), reverse=1)[:top]

rows = (
    f"""
    <tr><td>{n}</td><td>{file_a}</td><td>{commits[file_a]}</td><td>{int(n / commits[file_a] * 100)}</td></tr>
    <tr><td></td><td>{file_b}</td><td>{commits[file_b]}</td><td>{int(n / commits[file_b] * 100)}</td></tr>
    """ for _, n, file_a, file_b in change_coupling)

HTML(f"<table><th><td>Coupled Entities</td><td>Commits</td><td>% coupling</td></th>{''.join(rows)}</table>")

0,1,2,3
8.0,po-properties/POTFILES.in,8,100
,po/POTFILES.in,8,100
5.0,build-aux/flatpak/org.gtk.IconBrowser4.json,5,100
,build-aux/flatpak/org.gtk.WidgetFactory4.json,5,100
4.0,testsuite/reftests/window-show-contents-on-map.ref.ui,4,100
,testsuite/reftests/window-show-contents-on-map.ui,4,100
4.0,testsuite/gtk/focus-chain/widget-factory2.ui,4,100
,testsuite/gtk/focus-chain/widget-factory3.ui,4,100
4.0,testsuite/gtk/focus-chain/widget-factory.ui,4,100
,testsuite/gtk/focus-chain/widget-factory3.ui,4,100
