**This notebook does the following**

* Go through all issues that have the label `samples`
* For each issue, go through all the comments, get all the image urls
* Get the reactions on each comment and use them as the target label (`weed` if any reaction is `+1`, otherwise `not_weed`)
* Save all (url, target_label) to a csv file

In [8]:
import pandas as pd
from github import Github
import re
import os


In [10]:
# Read the GitHub access token from the TOKEN environment variable;
# os.environ[...] raises KeyError if the variable is not set.
github_token = os.environ['TOKEN']

In [11]:
# Create the PyGithub client, authenticated with the access token
g = Github(github_token)


In [12]:
# Repository (as 'owner/name') whose issues are scanned for sample images
repo_name = 'alext234/weeds-id'
repo = g.get_repo(repo_name)

In [15]:
def extract_img_urls(content):
    '''
    Extract all image urls written in Markdown image syntax, e.g.
    ![](https://abc.com/img.jpg)

    Parameters
    ----------
    content : str or None
        Markdown text to scan (e.g. the body of an issue comment).

    Returns
    -------
    list of str
        The text between the parentheses of each image tag, in order of
        appearance. Empty list when content is empty/None or has no image tag.
    '''
    if not content:
        return []
    # Raw string: '\!' is not a valid string escape.
    # Both groups are non-greedy so that several images (or an image followed
    # by an ordinary [text](link)) on one line give one match per image,
    # instead of a single match stretching to the last ')' on that line.
    # Consequence: a url that itself contains ')' is cut at that character.
    regex = re.compile(r'!\[.*?\]\((.+?)\)')
    return regex.findall(content)

In [66]:
def handle_comment(comment):
    '''
    Build a DataFrame of (url, label) rows for a single comment.

    There is one row per image url found in the comment body, and every row
    carries the same label: 'weed' when at least one reaction on the comment
    is '+1', otherwise 'not_weed'.
    '''
    urls = extract_img_urls(comment.body)
    # each reaction also exposes reaction.user.login (the reacting username)
    reaction_kinds = [reaction.content for reaction in comment.get_reactions()]
    if '+1' in reaction_kinds:
        label = 'weed'
    else:
        label = 'not_weed'
    return pd.DataFrame({'url': urls, 'label': label})

In [67]:
# extract image urls and reaction from all issue's comments
# API ref 
# https://pygithub.readthedocs.io/en/latest/github_objects/Issue.html
def handle_issue(issue):
    '''
    Collect the (url, label) rows of every comment on an issue.

    Parameters
    ----------
    issue : object
        Issue exposing `.title` and `.get_comments()` (see the PyGithub Issue
        reference linked above).

    Returns
    -------
    pandas.DataFrame
        The frames returned by handle_comment for each comment, stacked in
        comment order (each comment's rows keep their own 0-based index).
        An empty DataFrame when the issue has no comments.
    '''
    print("handling issue '{title}'".format(title=issue.title))
    frames = [handle_comment(comment) for comment in issue.get_comments()]
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated rows once per comment.
    return pd.concat(frames)

In [68]:
# # return list of (url,label)
def get_sample_urls():
    '''
    Go through all issues with label 'samples' and extract image urls and
    their labels.

    Uses the module-level `repo` to look up the label and list the issues.

    Returns
    -------
    pandas.DataFrame
        Rows of (url, label) from every matching issue, re-indexed 0..n-1.
        An empty DataFrame when no issue carries the label.
    '''
    label_sample = repo.get_label("samples")
    # get issues with label 'samples'
    sample_issues = repo.get_issues(labels=[label_sample])
    frames = [handle_issue(issue) for issue in sample_issues]
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(frames).reset_index(drop=True)

In [69]:
# NOTE(review): the first two TODOs look already covered -- handle_comment maps
# a '+1' reaction to 'weed' (anything else to 'not_weed'), and handle_issue
# returns a DataFrame of (url, label). Confirm and remove them.
# TODO mapping of '+1' --> 'weed', no reaction --> 'not_weed'
# TODO handle issue should return list of (url, label)
# TODO download images into folders, zip them and upload
# TODO images should be resized and center-cropped (e.g. 225 x 225)
df = get_sample_urls()

handling issue 'Samples'


In [70]:
# Preview the first 10 (url, label) rows
df.head(10)

Unnamed: 0,url,label
0,https://user-images.githubusercontent.com/1624...,weed
1,https://user-images.githubusercontent.com/1624...,not_weed
2,https://user-images.githubusercontent.com/1624...,weed
3,https://user-images.githubusercontent.com/1624...,not_weed
4,https://user-images.githubusercontent.com/1624...,not_weed
5,https://user-images.githubusercontent.com/1624...,weed
6,https://user-images.githubusercontent.com/1624...,weed
7,https://user-images.githubusercontent.com/1624...,not_weed
8,https://user-images.githubusercontent.com/1624...,not_weed
9,https://user-images.githubusercontent.com/1624...,not_weed


In [76]:
# Number of urls collected for each label
df.groupby('label').count()


Unnamed: 0_level_0,url
label,Unnamed: 1_level_1
not_weed,11
weed,13


In [77]:
# Write the (url, label) table to CSV; index=False leaves the row index out of the file
csv_file =  'urls_labels.csv'
df.to_csv(csv_file, index=False)