In [None]:
import pandas as pd

Before re-classification, we loaded the Mauczka data into a MySQL database.
From that database we created a DataFrame as shown below (perfective = internal_quality, corrective = external_quality):

```python
import MySQLdb
import MySQLdb.cursors
# DictCursor returns every row as a dict keyed by column name; the loop below relies on that.
# NOTE(review): the database credentials are hard-coded in this snippet; do not reuse it as-is.
db = MySQLdb.connect(user='root', passwd='balla', db='msrsurvey', cursorclass=MySQLdb.cursors.DictCursor)

# one row per survey result, joined with its commit (c.id = sr.id) and the commit's project (c.project = p.id)
c=db.cursor()
c.execute("""SELECT sr.sw_adaptive, sr.sw_corrective, sr.sw_perfective, c.title, p.home, c.identifier from SurveyResults as sr, Commits as c, Projects as p WHERE c.id = sr.id AND c.project = p.id""")

# rename the survey columns and turn each flag into a boolean (flag == 1):
# sw_perfective -> internal_quality, sw_corrective -> external_quality
dat = []
for row in c:
    dat.append({'project_url': row['home'], 'revision_hash': row['identifier'], 'message': row['title'], 'internal_quality': row['sw_perfective'] == 1, 'external_quality': row['sw_corrective'] == 1, 'sw_adaptive': row['sw_adaptive'] == 1})


df = pd.DataFrame(dat)

# map urls to names
# (Series.map with a dict yields NaN for any url that is not listed here)
map_urls = {'http://valadoc.org': 'vala-doc',
            'https://wiki.gnome.org/Projects/Vala': 'vala',
            'https://www.drupal.org/project/search_api': 'drupal-search-api',
            'https://code.google.com/a/eclipselabs.org/p/tapiji/': 'tapiji',
            'https://projects.eclipse.org/projects/mylyn.reviews': 'mylyn-reviews',
            'https://deltaspike.apache.org/': 'deltaspike'}
df['project'] = df['project_url'].map(map_urls)

# retain only java projects
# i.e. drop the rows of vala-doc, vala and drupal-search-api
df = df[~df['project'].isin(['vala-doc', 'vala', 'drupal-search-api'])].copy()

# write the export without the index column
df.to_csv('../data/mauczka_label.csv', index=False)
```

In [None]:
# load data
df = pd.read_csv('../data/mauczka_label.csv')

# The labeling UI selects rows via df['has_label']. Add the column when the
# CSV does not carry it yet, so every commit starts out unlabeled instead of
# the UI failing with a KeyError.
if 'has_label' not in df.columns:
    df['has_label'] = False

In [None]:
# create re-label UI
# this needs ipywidgets which may require additional steps for jupyter lab

from IPython.display import display, Markdown, clear_output
from ipywidgets import Layout
import ipywidgets as widgets

# create ui

# fields that show the commit currently being labeled
project = widgets.Text(value='', description='Project', layout=Layout(width='auto'))
revision = widgets.Text(value='', description='Revision hash', layout=Layout(width='auto'))
text = widgets.Textarea(
    value='',
    description='Commit message',
    layout=Layout(width='auto', height='250px'),
)

# one checkbox per label, plus the button that stores the choice
label_internal_quality = widgets.Checkbox(description='Perfective?')
label_external_quality = widgets.Checkbox(description='Corrective?')
button = widgets.Button(description='Set label')

# HTML widgets: labeled/total counter and a clickable link to the commit
progress = widgets.HTML(value='', description='Progress')
link = widgets.HTML(value='', description='Revision')

def load_next():
    """Show a randomly chosen unlabeled commit in the UI, or finish when none is left.

    Reads the notebook-level ``df`` and updates the notebook-level widgets:
    the progress counter, the project / revision / message fields, the
    GitHub link, and both label checkboxes (always reset to unchecked).

    When no row with ``has_label == False`` remains, the fields are cleared
    and the button is disabled instead of sampling from an empty frame.
    """
    progress.value = '{}/{}'.format(int((df['has_label'] == True).sum()), len(df))

    # every commit starts with both labels unchecked
    label_internal_quality.value = False
    label_external_quality.value = False

    unlabeled = df[df['has_label'] == False]
    if unlabeled.empty:
        # DataFrame.sample(n=1) raises ValueError on an empty frame, which
        # would otherwise surface on the click that labels the last commit.
        project.value = ''
        revision.value = ''
        text.value = ''
        link.value = ''
        button.disabled = True
        return

    row = unlabeled.sample(n=1).iloc[0]

    project.value = row['project']
    revision.value = row['revision_hash']
    # an empty commit message is read back from CSV as NaN, which the
    # Textarea's string trait does not accept
    message = row['message']
    text.value = '' if pd.isna(message) else message

    # (owner, repo) on GitHub; any project not listed here is looked up as apache/<project>
    github_repos = {
        'tapiji': ('tapiji', 'tapiji'),
        'mylyn-reviews': ('eclipse', 'mylyn.reviews'),
    }
    owner, repo = github_repos.get(project.value, ('apache', project.value))
    link.value = '<a href="https://github.com/{}/{}/commit/{}" target="_blank">{}</a>'.format(owner, repo, revision.value, revision.value)

def set_label(button):
    """Store the checkbox state for the displayed commit and move on to the next one.

    Used as the button's click callback. ``button`` is the argument handed
    over by ``on_click`` and is not used. All rows of the notebook-level
    ``df`` whose ``revision_hash`` equals the revision field are updated.
    """
    current = df['revision_hash'] == revision.value
    updates = (
        ('label_internal_quality', label_internal_quality.value),
        ('label_external_quality', label_external_quality.value),
        ('has_label', True),
    )
    for column, value in updates:
        df.loc[current, column] = value
    load_next()

# fill the widgets with a first commit before the UI is shown
load_next()
# each click stores the current labels and loads another commit
button.on_click(set_label)

# checkboxes side by side, everything else stacked in one flex column
label_box = widgets.HBox([label_internal_quality, label_external_quality])
box = widgets.VBox(
    [progress, project, link, text, label_box, button],
    layout={'display': 'flex', 'flex_flow': 'column', 'align_items': 'stretch'},
)

# show it
display(box)

In [None]:
# save data
# pandas DataFrames have no write_csv method; to_csv is the CSV writer
df.to_csv('../data/mauczka_label_finished.csv', index=False)