In [1]:
import ast
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Read the cleaned dataset into the frame every later cell works on.
CLEANED_DATA_PATH = "cleaned_data.csv"
df = pd.read_csv(CLEANED_DATA_PATH)

In [3]:
# Report banner: the title framed by two 60-character rules.
banner_rule = "=" * 60
for banner_line in (banner_rule, "DOCK-GUARDIAN: Dataset Analysis Report", banner_rule):
    print(banner_line)

DOCK-GUARDIAN: Dataset Analysis Report


In [4]:
# =========================================================
# 1. BASIC STATISTICS
# =========================================================
# Headline size of the dataset plus the time span covered by `created_at`.
print("\n[1] DATASET OVERVIEW\n")
print(f"Total Images: {len(df)}")
print(f"Total Publishers: {df['publisher'].nunique()}")
# utc=True yields one tz-aware datetime column even if the raw strings carry
# different offsets, and matches how `updated_at` is parsed in the risk section.
# errors='coerce' turns unparseable values into NaT, which min()/max() skip.
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True)
print(f"Date Range: {df['created_at'].min()} to {df['created_at'].max()}")


[1] DATASET OVERVIEW

Total Images: 5665
Total Publishers: 2286
Date Range: 2014-07-19 07:01:44.542430+00:00 to 2022-04-12 19:20:46.789393+00:00


In [5]:
# =========================================================
# 2. CERTIFICATION STATUS
# =========================================================
# Number of rows per distinct `certification_status` value, most frequent first.
print("\n[2] CERTIFICATION STATUS DISTRIBUTION\n")
certification_column = df['certification_status']
cert_dist = certification_column.value_counts()
print(cert_dist)


[2] CERTIFICATION STATUS DISTRIBUTION

certification_status
not_certified    5598
certified          66
invalid             1
Name: count, dtype: int64


In [6]:
# =========================================================
# 3. IMAGE TYPES
# =========================================================
# Number of rows per distinct `type` value, most frequent first.
print("\n[3] IMAGE TYPE DISTRIBUTION\n")
type_column = df['type']
type_dist = type_column.value_counts()
print(type_dist)


[3] IMAGE TYPE DISTRIBUTION

type
image      5633
plugin       24
edition       8
Name: count, dtype: int64


In [7]:
# =========================================================
# 4. POPULARITY METRICS
# =========================================================
# Mean stars/pulls, and the name and pull count of the most-pulled entry.
print("\n[4] POPULARITY METRICS\n")
star_counts = df['star_count']
print(f"Average Stars: {star_counts.mean():.2f}")
pull_counts = df['pull_count']
print(f"Average Pulls: {pull_counts.mean():.2f}")
# idxmax() gives the index label of the first row holding the maximum pull count.
most_pulled_label = pull_counts.idxmax()
print(f"Most Popular: {df.loc[most_pulled_label, 'name']}")
print(f"  - Pulls: {pull_counts.max():,}")


[4] POPULARITY METRICS

Average Stars: 5.62
Average Pulls: 66.98
Most Popular: bzocker/sdkman
  - Pulls: 992


In [8]:
# =========================================================
# 5. TOP CATEGORIES
# =========================================================
# Count how often each category label occurs across all rows and list the ten
# most frequent ones.
print("\n[5] TOP 10 CATEGORIES\n")
# The column holds string representations of lists. ast.literal_eval only
# accepts Python literals, so a crafted CSV cell cannot execute code the way
# eval() would. The isinstance guard leaves already-parsed values untouched,
# which keeps this cell safe to re-run.
df['categories'] = df['categories'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)
# Flatten the per-row lists into one list of labels, then tally them.
all_categories = [cat for cats in df['categories'] for cat in cats]
cat_counts = Counter(all_categories)
for cat, count in cat_counts.most_common(10):
    print(f"  {cat}: {count}")


[5] TOP 10 CATEGORIES

  Base Images: 297
  Application Frameworks: 72
  Application Infrastructure: 70
  Application Services: 62
  Analytics: 61
  Monitoring: 52
  Databases: 52
  DevOps Tools: 51
  Programming Languages: 43
  Storage: 21


In [9]:
# =========================================================
# 7. ARCHITECTURES
# =========================================================
# Count how often each architecture label occurs across all rows.
# NOTE(review): this section is numbered 7 but runs before section 6
# (operating systems); the numbering is kept so the printed header is unchanged.
print("\n[7] SUPPORTED ARCHITECTURES\n")
# The column holds string representations of lists. ast.literal_eval only
# accepts Python literals, so a crafted CSV cell cannot execute code the way
# eval() would. The isinstance guard leaves already-parsed values untouched,
# which keeps this cell safe to re-run.
df['architectures'] = df['architectures'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)
# NOTE(review): the recorded output shows spellings such as 'arm64' / 'ARM 64'
# and an empty label counted separately — presumably the same architectures
# under different names; labels are tallied as-is, not normalized.
all_arch = [arch for archs in df['architectures'] for arch in archs]
arch_counts = Counter(all_arch)
for arch, count in arch_counts.most_common():
    print(f"  {arch}: {count}")


[7] SUPPORTED ARCHITECTURES

  x86-64: 5271
  arm64: 430
  arm: 323
  IBM Z: 295
  ppc64le: 253
  ARM 64: 133
  386: 122
  ARM: 97
  PowerPC 64 LE: 89
  : 76
  mips64le: 31
  riscv64: 11
  IBM POWER: 7


In [10]:
# =========================================================
# 6. OPERATING SYSTEMS
# =========================================================
# Count how often each operating-system label occurs and list the ten most
# frequent ones.
print("\n[6] OPERATING SYSTEMS DISTRIBUTION\n")
# The column holds string representations of lists. ast.literal_eval only
# accepts Python literals, so a crafted CSV cell cannot execute code the way
# eval() would. The isinstance guard leaves already-parsed values untouched,
# which keeps this cell safe to re-run.
df['operating_systems'] = df['operating_systems'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)
# Loop variables are named `os_name` rather than `os` so the notebook namespace
# never ends up with a string bound to the stdlib module's name.
# NOTE(review): the recorded output counts 'darwin' and 'macOS' (and an empty
# label) separately — presumably overlapping; labels are tallied as-is.
all_os = [os_name for os_list in df['operating_systems'] for os_name in os_list]
os_counts = Counter(all_os)
for os_name, count in os_counts.most_common(10):
    print(f"  {os_name}: {count}")


[6] OPERATING SYSTEMS DISTRIBUTION

  Linux: 5049
  Windows: 408
  : 364
  darwin: 2
  macOS: 1


In [11]:
# =========================================================
# 8. RISK INDICATORS (Initial Assessment)
# =========================================================
# Coarse first-pass signals. Every percentage is a share of all rows in `df`.
print("\n[8] INITIAL RISK INDICATORS\n")

OUTDATED_AFTER_DAYS = 365  # age cut-off behind "Not Updated in 1 Year"
LOW_STAR_THRESHOLD = 10    # fewer stars than this counts as low community trust

total_images = len(df)


def _share(count, total=total_images):
    """Return `count` as a percentage of `total`; 0.0 when the dataset is empty."""
    return count / total * 100 if total else 0.0


# Anything whose status is not exactly 'certified' (including other values such
# as 'invalid', and missing values) is reported as uncertified.
# Boolean masks are summed instead of building filtered copies of the frame.
certified = int((df['certification_status'] == 'certified').sum())
uncertified = int((df['certification_status'] != 'certified').sum())
print(f"Certified Images: {certified} ({_share(certified):.1f}%)")
print(f"Uncertified Images: {uncertified} ({_share(uncertified):.1f}%)")

# The threshold is relative to the current time, so this figure changes
# depending on when the notebook is run.
outdated_threshold = pd.Timestamp.now(tz='UTC') - pd.Timedelta(days=OUTDATED_AFTER_DAYS)
# errors='coerce' mirrors the `created_at` parsing: a malformed timestamp becomes
# NaT instead of aborting the cell. NaT compares False below, so rows without a
# usable `updated_at` are NOT counted as outdated.
df['updated_at'] = pd.to_datetime(df['updated_at'], utc=True, errors='coerce')
outdated = int((df['updated_at'] < outdated_threshold).sum())
print(f"Not Updated in 1 Year: {outdated} ({_share(outdated):.1f}%)")

low_popularity = int((df['star_count'] < LOW_STAR_THRESHOLD).sum())
print(f"Low Community Trust (<10 stars): {low_popularity} ({_share(low_popularity):.1f}%)")

print("\n" + "=" * 60)
print("[✓] Analysis Complete")
print("=" * 60)


[8] INITIAL RISK INDICATORS

Certified Images: 66 (1.2%)
Uncertified Images: 5599 (98.8%)
Not Updated in 1 Year: 5664 (100.0%)
Low Community Trust (<10 stars): 5381 (95.0%)

[✓] Analysis Complete
