# RQ4: What are the naming smells of tags?
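In this notebook, we investigate naming smells in Docker image tags: images that carry only a `latest` tag, overly long tag names, SHA-like tag names, and `latest` tags that do not point to the most recently updated image.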

## Load data and import libraries

In [1]:
%load_ext autoreload

# Auto reloading causes the kernel to reload the libraries we have
%autoreload 2

# usual imports for visualization, etc.
import datetime
import os
import re

import matplotlib.dates as mdate
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
from pandas import Timestamp

# make it reproducible
np.random.seed(0)

# show plots inline
%matplotlib inline

In [2]:
# Location of the input data, relative to this notebook.
data_folder = '../data/'
dataset_path = data_folder + 'docker_image_dataset.csv'

# Malformed rows are skipped silently. `error_bad_lines` / `warn_bad_lines`
# were deprecated in pandas 1.3 and removed in pandas 2.0 in favour of
# `on_bad_lines`, so the current keyword is tried first and the legacy pair
# is used only on pandas versions that do not know `on_bad_lines`.
try:
    dockerfiles = pd.read_csv(dataset_path, on_bad_lines='skip', low_memory=False)
except TypeError:
    dockerfiles = pd.read_csv(dataset_path, error_bad_lines=False, warn_bad_lines=False, low_memory=False)

## Define helper functions and defaults

In [3]:
# Helper Functions
# Output directory for figures. It is created with os.makedirs instead of
# `!mkdir` so that re-running the cell when the directory already exists is
# silent, and so that the cell does not depend on a shell being available.
figs_dir = 'figs/'
os.makedirs(figs_dir, exist_ok=True)

In [4]:
from cycler import cycler
def set_plt_rc():
    """Apply the notebook-wide matplotlib rc defaults.

    Sets a serif font family, the text sizes used for titles, labels, ticks
    and legends, and a combined colour / line-style property cycle for axes.
    """
    small, medium, large = 8, 10, 12

    # Select the serif family. The size passed here is superseded by the
    # generic 'font' size applied first in the table below.
    plt.rc('font', family='serif', size=large)

    # (rc group, keyword settings), applied in order.
    rc_table = [
        ('font', {'size': small}),         # default text size
        ('axes', {'titlesize': small}),    # axes title
        ('axes', {'labelsize': medium}),   # x and y labels
        ('xtick', {'labelsize': small}),   # x tick labels
        ('ytick', {'labelsize': small}),   # y tick labels
        ('legend', {'fontsize': small}),   # legend text
        ('figure', {'titlesize': large}),  # figure title
    ]
    for group, settings in rc_table:
        plt.rc(group, **settings)

    # Ten colours paired one-to-one with ten line styles.
    palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
               '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
    line_styles = ['-', '--', ':', '-.', '-', '--', ':', '-.', '-', '--']
    plt.rc('axes', prop_cycle=(cycler(color=palette) + cycler(linestyle=line_styles)))


# Apply the defaults once for the whole notebook.
set_plt_rc()

## Docker Image Tags

In [5]:
# Function for converting stringied list to list
def strlist2list(s):
    try:
        return(eval(s))
    except:
        return np.nan
# Parse the stringified tag lists in place. Only raw strings are parsed, so
# re-running this cell leaves already-parsed values untouched; evaluating an
# already-parsed list would raise and be mapped to NaN, wiping the column.
dockerfiles['tags_name'] = dockerfiles['tags_name'].apply(
    lambda x: strlist2list(x) if isinstance(x, str) else x)

# Images with a usable tag list, renumbered 0..n-1.
tags_name = dockerfiles['tags_name'].dropna()
tags_name.index = range(0, len(tags_name))

# Parsed per-image update times; this keeps the original dockerfiles index.
image_update_time = dockerfiles['image_updated_at'].apply(lambda x: strlist2list(x))

# Ignore Docker images with only one latest tag
parsed_tags = dockerfiles['tags_name'].dropna()
# True for every image to keep, i.e. whose tag list is not exactly ['latest'].
keep_mask = [not (len(tags) == 1 and 'latest' == tags[0]) for tags in parsed_tags]
non_latest_tags_name = parsed_tags[keep_mask]

In [6]:
# Number of images whose tag list consists of the single tag 'latest'.
latest_only_count = sum(
    1
    for tags in tags_name
    if len(tags) == 1 and 'latest' == tags[0]
)
print('The proportion of Docker images which only have one latest tag: ', latest_only_count/len(tags_name))

The proportion of Docker images which only have one latest tag:  0.48312396826257603


In [7]:
# Images remaining after discarding those whose only tag is 'latest'.
self_defined_tag_image_count = len(non_latest_tags_name)
print('The number of Docker images with self-defined tags: ', self_defined_tag_image_count)

The number of Docker images with self-defined tags:  886415


In [8]:
# Number of images (among those with self-defined tags) that carry no
# 'latest' tag at all.
without_latest_count = sum(
    1 for tags in non_latest_tags_name if 'latest' not in tags
)
print('The proportion of Docker images which do not have latest tag: ', without_latest_count/len(non_latest_tags_name))

The proportion of Docker images which do not have latest tag:  0.6977747443353283


In [9]:
# Index labels of images that have at least one tag of 20 or more characters.
# Each image is recorded once, however many long tags it has.
long_tag_name_index = [
    idx
    for idx, tags in non_latest_tags_name.items()
    if any(len(tag) >= 20 for tag in tags)
]
count = len(long_tag_name_index)
print('The proportion of Docker images which have overly long tag name: ', count/len(non_latest_tags_name))

The proportion of Docker images which have overly long tag name:  0.06530462593706109


In [10]:
# Index labels of images that have at least one tag which is purely
# alphanumeric and 40 or more characters long. Each image is recorded once.
# NOTE(review): this is a length + isalnum() heuristic, not a hexadecimal
# check -- confirm it matches the intended definition of "SHA as tag name".
sha_tag_name_index = [
    idx
    for idx, tags in non_latest_tags_name.items()
    if any(len(tag) >= 40 and tag.isalnum() for tag in tags)
]
count = len(sha_tag_name_index)
print('The proportion of Docker images which use the image SHA as the tag name: ', count/len(non_latest_tags_name))

The proportion of Docker images which use the image SHA as the tag name:  0.019212220009814814


In [11]:
# Index labels of images that have a 'latest' tag which is not the first tag
# and whose update time differs from the update time of the first tag.
unmatched_latest_tag_index = []
for idx, tags in non_latest_tags_name.items():
    # Skip images without 'latest', or with 'latest' already in first place.
    if 'latest' not in tags or tags[0] == 'latest':
        continue
    latest_pos = tags.index('latest')
    update_times = image_update_time[idx]
    first_time = pd.Timestamp(update_times[0])
    latest_time = pd.Timestamp(update_times[latest_pos])
    if first_time != latest_time:
        unmatched_latest_tag_index.append(idx)
count = len(unmatched_latest_tag_index)
print('The proportion of Docker images which have the latest tag, but the lastest tag does not point to the latest version of the image: ', count/len(non_latest_tags_name))

The proportion of Docker images which have the latest tag, but the lastest tag does not point to the latest version of the image:  0.16109158802592466


In [14]:
# Number of images whose 'latest' tag sits at position 3 or later in the tag
# list, i.e. at least three other tags come before it.
# NOTE(review): this reads the tag list as ordered newest-first -- confirm.
# Only the position of 'latest' matters here, so no update timestamps are
# parsed; the time difference computed here before was never used.
count = sum(
    1
    for tags in non_latest_tags_name
    if 'latest' in tags and tags.index('latest') >= 3
)
print('The proportion of Docker images which have the latest tag, but there are at least 3 versions between the lastest image and the image the latest tag points to: ', count/len(non_latest_tags_name))

The proportion of Docker images which have the latest tag, but there are at least 3 versions between the lastest image and the image the latest tag points to:  0.03715077023741701


In [15]:
# Number of images whose 'latest' tag was updated at least 90 days before the
# first entry of the image's update-time list.
count = 0
for idx, tags in non_latest_tags_name.items():
    if 'latest' not in tags:
        continue
    latest_pos = tags.index('latest')
    update_times = image_update_time[idx]
    lag = pd.Timestamp(update_times[0]) - pd.Timestamp(update_times[latest_pos])
    if lag.days >= 90:
        count += 1
print('The proportion of Docker images which have the latest tag, but the lastest tag points to an image which was updated more than 3 months ago comparing to the latest version of the image:', count/len(non_latest_tags_name))

The proportion of Docker images which have the latest tag, but the lastest tag points to an image which was updated more than 3 months ago comparing to the latest version of the image: 0.022824523501971426
