In [1]:
import os
#we will process the data (which is a csv file)
import pandas as pd

#API client
from kili.client import Kili
#Why not use pretty progress bars?
from tqdm import tqdm

from dotenv import load_dotenv

# Load settings from a local .env file into the process environment.
# Presumably this is where KILI_API_KEY (read with os.getenv in the next cell) comes from — TODO confirm.
load_dotenv()

True

To access the platform, we first need to authenticate the client we just imported.

In [2]:
# Look up the API key in the environment (the value is None when the variable is unset)
API_KEY = os.environ.get('KILI_API_KEY')
# Create the authenticated Kili client that the following cells call
kili = Kili(api_key=API_KEY)

Now we can start preparing our interface, which is just a Python dictionary. We will define our jobs and then fill in their labels. Since every label can also have child labels, each label is itself passed as a dictionary!

In [3]:
# Topics offered by the classification job, in display order
labels = ['User experience', 'Subscription', 'Content', 'Other', 'Multi label']
# Entity types offered by the named-entity job, mapped to their highlight colour
entity_dict = {
    'User experience': '#cc4125',
    'Subscription': '#4543e6',
    'Content': '#3edeb6',
}
project_name = 'User review dataset for topic classification'
project_description = "Medium's app reviews fetched from google play store for topic classification"


def _category_key(display_name):
    """Turn a display name into its interface key, e.g. 'User experience' -> 'USER_EXPERIENCE'."""
    return display_name.strip().upper().replace(' ', '_')


# One entry per topic; every entry gets its own (empty) list of child categories
classification_categories = {
    _category_key(topic): {'name': topic, 'children': []}
    for topic in labels
}

# Same shape as above, plus the colour used to highlight the entity
entity_categories = {
    _category_key(entity): {'name': entity, 'children': [], 'color': hex_color}
    for entity, hex_color in entity_dict.items()
}

# The interface is a plain dictionary: one classification job and one
# named-entity-recognition job, each carrying the categories built above
interface = {
    'jobs': {
        'JOB_0': {
            'mlTask': 'CLASSIFICATION',
            'instruction': 'Labels',
            'required': 1,
            'content': {
                'categories': classification_categories,
                'input': 'radio',
            },
        },
        'JOB_1': {
            'mlTask': 'NAMED_ENTITIES_RECOGNITION',
            'instruction': 'Entities',
            'required': 1,
            'content': {
                'categories': entity_categories,
                'input': 'radio',
            },
        },
    }
}

# Create the project on Kili from the interface built above.
# The response is indexed with 'id' so that only the new project's id is kept.
created_project = kili.create_project(
    input_type='TEXT',
    json_interface=interface,
    title=project_name,
    description=project_description,
)
project_id = created_project['id']


In [4]:
# Fetch the project we just created so its settings can be inspected.
# Note: as the output below shows, the result is a list of project dicts, not a single dict.
project = kili.projects(project_id)
project

[{'roles': [{'user': {'email': 'itsc0508@gmail.com',
     'id': 'ckx7rorth01f20kvj24912j7c',
     'name': 'Alper Balbay'},
    'id': 'ckyoycx060atq0lt4c2n4elzl',
    'role': 'ADMIN'}],
  'consensusTotCoverage': 0,
  'id': 'ckyoycwvq0atn0lt474jlamna',
  'inputType': 'TEXT',
  'interfaceCategory': 'IV2',
  'jsonInterface': {'jobs': {'JOB_0': {'mlTask': 'CLASSIFICATION',
     'instruction': 'Labels',
     'required': 1,
     'content': {'categories': {'USER_EXPERIENCE': {'name': 'User experience',
        'children': []},
       'SUBSCRIPTION': {'name': 'Subscription', 'children': []},
       'CONTENT': {'name': 'Content', 'children': []},
       'OTHER': {'name': 'Other', 'children': []},
       'MULTI_LABEL': {'name': 'Multi label', 'children': []}},
      'input': 'radio'}},
    'JOB_1': {'mlTask': 'NAMED_ENTITIES_RECOGNITION',
     'instruction': 'Entities',
     'required': 1,
     'content': {'categories': {'USER_EXPERIENCE': {'name': 'User experience',
        'children': [],
     

In [5]:
# Location of the cleaned, lower-cased reviews CSV
dataset_path = '../data/processed/lowercase_cleaned_dataset.csv'
df = pd.read_csv(dataset_path)
# reset_index() exposes the row position as an 'index' column (passed later as the external id column)
df = df.reset_index()
# Show ten random rows as a quick sanity check
df.sample(10)

Unnamed: 0,index,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt
36060,36060,gp:AOqpTOE9egdxZT2PZlaD_-cOscQKvnsYHvKooKbJeGq...,Omar Shabab,https://play-lh.googleusercontent.com/a-/AOh14...,i simply love it.,5,0,3.2.2340,2017-01-31 14:01:47,,
7033,7033,gp:AOqpTOF2_P3vMlSKSW_fusbdCZvNh-FPkQdf0wsFfSn...,Prithvi Raj,https://play-lh.googleusercontent.com/a-/AOh14...,brilliant portal,5,0,3.19.1035918,2020-09-13 08:48:22,,
28717,28717,gp:AOqpTOHqdTEMEiLrjh3i9urcBTQFhmQavz0BjPcdw5T...,Ray Spencer,https://play-lh.googleusercontent.com/a-/AOh14...,medium helps me up the quality of my social me...,5,0,3.6.6735,2018-11-06 03:09:52,,
6085,6085,gp:AOqpTOGa_WJ1V7MLr70S5m53c2cd4-vTqDE7D86Tpxt...,Keya Arati,https://play-lh.googleusercontent.com/a-/AOh14...,i can't search anymore. can anyone tell me the...,3,0,4.0.1045135,2020-10-27 22:25:19,Hi there! This issue should be resolved in the...,2020-12-15 13:19:08
18465,18465,gp:AOqpTOH6wRn3yEJx8IUn-urkIpwY2la8OVjZqFf6K0A...,Rush Wars,https://play-lh.googleusercontent.com/a/AATXAJ...,good,5,0,,2019-09-10 11:49:40,,
4109,4109,gp:AOqpTOEBaJUh8mjxJvhBxvzX_0DY-wUU3j6wsscraHf...,John Adjei,https://play-lh.googleusercontent.com/a/AATXAJ...,excellent app very useful for marketing,5,0,3.14.1001360,2021-02-05 18:47:29,,
24292,24292,gp:AOqpTOEvb3lkDQe-WEsyKX5_hIERssy2Gei_P_OOr0O...,Syed Ahmad,https://play-lh.googleusercontent.com/a/AATXAJ...,time well spent. can be addictive though,4,0,,2019-03-25 10:28:26,"Hi Syed, thanks for reviewing our app! :)",2019-03-25 21:46:25
37736,37736,gp:AOqpTOHXzo5zdfKG4x7rfZkZI1InIhj1f_XH0FpSCOe...,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,bestest,5,0,2.1.1695,2016-07-07 04:43:26,,
27699,27699,gp:AOqpTOHkaN4CND6bPnzAMClxGEUIb--ARO59RX8V_CR...,Jay Ahr Medez,https://play-lh.googleusercontent.com/a-/AOh14...,good apps,5,0,3.6.6916,2018-12-11 06:03:34,,
17788,17788,gp:AOqpTOFIwqGwgx_6IUqBC95MnTACvHRge_78VhQi9_i...,kalyan pvs,https://play-lh.googleusercontent.com/a-/AOh14...,uanble to read storys.pricing too high content...,1,0,3.8.1000493,2019-10-05 21:10:30,,


We are ready to upload our data to the project. I will use the `append_many_to_dataset` method to import the data into the platform. Currently, the Python API lets us import at most 100 samples per call, so I have prepared a simple function that uploads the data in batches.

In [6]:
def import_dataframe(project_id:str, dataset:pd.DataFrame, text_data_column:str, external_id_column:str, subset_size:int=100, verbose:bool=False, max_samples:int=25000) -> bool:
    """
    Import the rows of the `dataset` DataFrame as text assets into the project `project_id`.

    Rows are sent in consecutive batches of at most `subset_size` rows through the
    notebook-level `kili` client (`kili.append_many_to_dataset`). Every row up to the
    upload limit is sent, including a final batch smaller than `subset_size`.

    Arguments:
    Inputs
        - project_id (str): specifies the project to load the data, this is also returned when we create our project
        - dataset (pandas DataFrame): Dataset that has proper columns for id and text inputs
        - text_data_column (str): specifies which column has the text input data
        - external_id_column (str): specifies which column has the ids
        - subset_size (int): specifies the number of samples to import at a time. Must be between 1 and 100
        - verbose (bool): when True, show a tqdm progress bar over the batches
        - max_samples (int): upload limit; only the first `max_samples` rows are imported

    Outputs:
        None

    Returns:
        True once every batch has been handed to the client (client errors propagate as exceptions)

    Raises:
        AssertionError: if `subset_size` is greater than 100
        ValueError: if `subset_size` is smaller than 1

    """

    assert subset_size <= 100, "Kili only allows to upload 100 assets at most at a time onto the app"
    # a step of 0 (or a negative one) could never walk through the dataset
    if subset_size < 1:
        raise ValueError("subset_size must be at least 1")

    total = len(dataset)

    # cap the upload at max_samples rows (25000 by default)
    if total > max_samples:
        print(f'Kili Projects currently supports maximum {max_samples} samples as default. Importing first {max_samples} samples...')
        total = max_samples

    # start offset of every batch; the last batch may be shorter than subset_size
    batch_starts = range(0, total, subset_size)
    if verbose:
        batch_starts = tqdm(batch_starts, desc='Importing assets', unit='batch')

    for start in batch_starts:
        # clamp the end so that the upload limit is respected by the final batch
        stop = min(start + subset_size, total)
        subset = dataset.iloc[start:stop]

        # both the ids and the text are sent to the client as plain strings
        externalIds = subset[external_id_column].astype(str).to_list()
        contents = subset[text_data_column].astype(str).to_list()

        kili.append_many_to_dataset(project_id=project_id,
                                    content_array=contents,
                                    external_id_array=externalIds)

    return True

The docstring lists the arguments: we just need to pass our dataset along with the corresponding column names. I’ll use the row indices we obtained when loading the data as external ids. And voilà, the data is uploaded!

In [7]:
# Upload the reviews: 'content' holds the review text and 'index' is used as the external id.
# NOTE(review): this call requires import_dataframe to accept a `verbose` keyword — confirm the definition matches.
import_dataframe(project_id, df, 'content', 'index', verbose=True)

Kili Projects currently supports maximum 25000 samples as default. Importing first 25000 samples...


True