In [None]:
import csv
import matplotlib.pyplot as plt
import pandas as pd

from datetime import datetime
from os import path

# Number of most frequent tags kept for the "top tags" tables and plots.
TOP = 16

# Directory (relative to the working directory) that holds the input table
# and receives the CSV files written further down.
CSV_PATH = path.join("metadata", "csvs")

# Input table loaded into tag_df.
TAG_DB_FILENAME = "tags.csv"
TAG_DB_PATH = path.join(CSV_PATH, TAG_DB_FILENAME)

In [None]:
def get_datetime(val):
  """Parse a ``YYYY-MM-DD HH:MM:SS`` string into a timezone-aware datetime.

  The input string carries no UTC offset, so a fixed ``-0300`` offset is
  appended before parsing.

  Raises:
    ValueError: if the string does not match the expected format.
  """
  fmt = "%Y-%m-%d %H:%M:%S%z"
  with_offset = val + "-0300"
  return datetime.strptime(with_offset, fmt)

def get_year(val):
  """Return the calendar year of the timestamp string ``val`` (parsed by get_datetime)."""
  parsed = get_datetime(val)
  return parsed.year

def get_month(val):
  """Return the month number (1-12) of the timestamp string ``val`` (parsed by get_datetime)."""
  parsed = get_datetime(val)
  return parsed.month

def get_year_month(val):
  """Return ``(year, month)`` for the timestamp string ``val`` (parsed by get_datetime)."""
  parsed = get_datetime(val)
  year, month = parsed.year, parsed.month
  return year, month

In [None]:
# Load the raw tag table. The tag text is read from the "name.1" column
# (presumably the second of two "name" columns in the CSV -- TODO confirm).
tag_df = pd.read_csv(TAG_DB_PATH).rename(columns={"name.1": "tag"})

# Each derived column depends on a single source column, so map that column
# directly instead of going through DataFrame.apply(axis=1), which builds a
# Series for every row just to read one field from it.
tag_df = tag_df.assign(
  year=tag_df["dataUpload"].map(get_year),
  month=tag_df["dataUpload"].map(get_month),
  # Normalize tags: cast to str, lowercase, trim surrounding whitespace.
  tag=tag_df["tag"].map(lambda raw_tag: str(raw_tag).lower().strip()),
).drop(columns=["name", "id", "type", "dataUpload"])

tag_df

In [None]:
# Range of upload years present in the table.
min_year, max_year = tag_df["year"].min(), tag_df["year"].max()

# Table size: rows, distinct tags, distinct photos.
total_tags = len(tag_df)
unique_tags = len(tag_df["tag"].unique())
unique_ids = len(tag_df["photo_id"].unique())

# Occurrences per tag, and the TOP first entries of that ranking.
tag_counts = tag_df["tag"].value_counts()
tag_top_counts = tag_counts.iloc[:TOP]
tag_top_labels = tag_top_counts.index

min_year, max_year, total_tags, unique_tags, unique_ids

### Tag by Photo

In [None]:
# Keep only tags whose usage count exceeds USED_IN_PCT percent of the number
# of distinct photos (original author's note: ~140 uses at 1%).
USED_IN_PCT = 1

min_uses = int(USED_IN_PCT * unique_ids // 100)
frequent_mask = tag_counts > min_uses
tags_top_pct = tag_counts[frequent_mask].index.tolist()

# Rows of tag_df restricted to those tags, without the date columns.
is_frequent = tag_df["tag"].isin(tags_top_pct)
tag_top_pct_df = tag_df.loc[is_frequent].drop(columns=["year", "month"])

tag_top_pct_total_tags = len(tag_top_pct_df)
tag_top_pct_unique_tags = len(tag_top_pct_df["tag"].unique())
tag_top_pct_unique_ids = len(tag_top_pct_df["photo_id"].unique())

print(tag_top_pct_total_tags, tag_top_pct_unique_tags, tag_top_pct_unique_ids)

tag_top_pct_df

### Tag Count

In [None]:
# Number of (frequent-tag) rows per photo, largest first. The result is
# sorted on its "count" column.
image_by_tag_count = (
  tag_top_pct_df
  .groupby(["photo_id"], as_index=False)["photo_id"]
  .value_counts()
  .sort_values("count", ascending=False)
)

image_by_tag_count

### Tag Correlation

In [None]:
# Co-occurrence tables for the top tags, both indexed as [t0][t1]:
#   abs: number of photos carrying both t0 and t1; the diagonal [t0][t0] is
#        the number of photos carrying t0.
#   pct: abs[t0][t1] / abs[t0][t0], the share of t0's photos that also have t1.
tag_top_correlation_abs = {k0: {k1: 0 for k1 in tag_top_labels} for k0 in tag_top_labels}
tag_top_correlation_pct = {k0: {k1: 0 for k1 in tag_top_labels} for k0 in tag_top_labels}

tag_top_df = tag_df[tag_df["tag"].isin(tag_top_labels)]
# No longer needed by the loop below; kept so any cell reading id_idx still works.
id_idx = tag_top_df.groupby(["photo_id"]).count().index

# One groupby pass over the photos instead of re-filtering the whole frame
# once per photo id.
for photo_id, photo_tags in tag_top_df.groupby("photo_id")["tag"]:
  # unique(): a tag repeated on the same photo must count once; otherwise the
  # pair counts can exceed the diagonal and push pct above 1.
  img_tags = photo_tags.unique()
  for t0 in img_tags:
    for t1 in img_tags:
      tag_top_correlation_abs[t0][t1] += 1

for t0, abs_row in tag_top_correlation_abs.items():
  t0_total = abs_row[t0]
  for t1, pair_count in abs_row.items():
    # A tag with no counted photo gets 0 instead of raising ZeroDivisionError.
    tag_top_correlation_pct[t0][t1] = round(pair_count / t0_total, 5) if t0_total else 0

# Preview one row of each table: "concreto" when it is among the top tags,
# otherwise the first top tag (empty dicts when there are no tags at all).
sample_tag = "concreto" if "concreto" in tag_top_correlation_abs else next(iter(tag_top_correlation_abs), None)
sample_abs = tag_top_correlation_abs.get(sample_tag, {})

sample_abs.get(sample_tag), sample_abs, tag_top_correlation_pct.get(sample_tag, {})

### Tag x Time

In [None]:
tag_top_by_tagyearmonth = tag_top_df.groupby(["tag", "year", "month"])
tag_top_by_yearmonth = tag_top_df.groupby(["year", "month"])

# Row count of every group, computed once. get_count is called for every
# (tag, year, month) combination, and get_group() would build a DataFrame on
# each call just to read its length.
_count_by_tagyearmonth = tag_top_by_tagyearmonth.size().to_dict()
_count_by_yearmonth = tag_top_by_yearmonth.size().to_dict()

def get_count(year, month, tag=None):
  """Return the number of tag_top_df rows for a given year and month.

  Args:
    year: upload year.
    month: upload month (1-12).
    tag: when given, count only rows with this tag; when None, count rows
      of every tag in tag_top_df.

  Returns:
    The row count as an int, or 0 when no row matches.
  """
  if tag is not None:
    return int(_count_by_tagyearmonth.get((tag, year, month), 0))
  return int(_count_by_yearmonth.get((year, month), 0))

In [None]:
# Per-tag time series, one list per top tag:
#   *_abs: row counts per month / per year
#   *_pct: the same counts divided by the count over all top tags in that
#          period (left as the raw count when the period has no rows)
tag_by_month_abs, tag_by_year_abs = {}, {}
tag_by_month_pct, tag_by_year_pct = {}, {}

for t in tag_top_labels:
  monthly_abs, monthly_pct = [], []
  yearly_abs, yearly_pct = [], []

  for y in range(min_year, max_year+1):
    # (count for this tag, count over all top tags) for each month of year y
    pairs = [(get_count(y, m, t), get_count(y, m)) for m in range(1, 13)]

    monthly_abs.extend(cnt for cnt, _total in pairs)
    monthly_pct.extend((cnt / total if total >= 1 else cnt) for cnt, total in pairs)

    year_cnt = sum(cnt for cnt, _total in pairs)
    year_total = sum(total for _cnt, total in pairs)
    yearly_abs.append(year_cnt)
    yearly_pct.append(year_cnt / year_total if year_total >= 1 else year_cnt)

  tag_by_month_abs[t] = monthly_abs
  tag_by_year_abs[t] = yearly_abs
  tag_by_month_pct[t] = monthly_pct
  tag_by_year_pct[t] = yearly_pct

### CSV

In [None]:
TAG_COUNT_PATH = path.join(CSV_PATH, "tag_counts.csv")

# Write one "tag,count" row per distinct tag.
# The encoding is pinned to UTF-8 so the file does not depend on the
# machine's locale and matches the pandas default used by to_csv/read_csv.
with open(TAG_COUNT_PATH, 'w', newline='', encoding='utf-8') as csvfile:
  fieldnames = ["tag", "count"]
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  writer.writeheader()
  writer.writerows({"tag": k, "count": v} for k, v in tag_counts.items())

In [None]:
TAG_BY_PHOTO_PATH = path.join(CSV_PATH, "tag_by_photo.csv")

# Joins the tags of one photo into a single ", "-separated string.
tags_to_str = ", ".join

# One row per photo_id with its frequent tags joined into one cell.
tags_grouped_by_photo = tag_top_pct_df.groupby(["photo_id"], as_index=False)
tag_tag_by_photo_df = tags_grouped_by_photo["tag"].apply(tags_to_str)

tag_tag_by_photo_df.to_csv(TAG_BY_PHOTO_PATH, index=False)

In [None]:
TAG_MONTH_COUNT_PATH = path.join(CSV_PATH, "tag_monthly_counts.csv")

# Column labels "YYYY-MM" for every month of every year in the data range.
month_labels = [f"{m:02d}" for m in range(1, 13)]
year_month_labels = [f"{y}-{m}" for y in range(min_year, max_year+1) for m in month_labels]

# One row per top tag: the tag followed by its count for each month.
# The encoding is pinned to UTF-8 so the file does not depend on the
# machine's locale and matches the pandas default used by to_csv/read_csv.
with open(TAG_MONTH_COUNT_PATH, 'w', newline='', encoding='utf-8') as csvfile:
  fieldnames = ["tag"] + year_month_labels
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  writer.writeheader()
  for t in tag_top_labels:
    vals = [t] + tag_by_month_abs[t]
    writer.writerow(dict(zip(fieldnames, vals)))

In [None]:
TAG_YEAR_COUNT_PATH = path.join(CSV_PATH, "tag_yearly_counts.csv")

# Column labels: every year in the data range, as strings.
year_labels = [f"{y}" for y in range(min_year, max_year+1)]

# One row per top tag: the tag followed by its count for each year.
# The encoding is pinned to UTF-8 so the file does not depend on the
# machine's locale and matches the pandas default used by to_csv/read_csv.
with open(TAG_YEAR_COUNT_PATH, 'w', newline='', encoding='utf-8') as csvfile:
  fieldnames = ["tag"] + year_labels
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  writer.writeheader()
  for t in tag_top_labels:
    vals = [t] + tag_by_year_abs[t]
    writer.writerow(dict(zip(fieldnames, vals)))

In [None]:
TAG_CORRELATION_ABS_PATH = path.join(CSV_PATH, "tag_correlation_abs.csv")

correlation_labels = list(tag_top_correlation_abs)

# Square table: one row and one column per top tag, cells hold the absolute
# co-occurrence counts.
# The encoding is pinned to UTF-8 so the file does not depend on the
# machine's locale and matches the pandas default used by to_csv/read_csv.
with open(TAG_CORRELATION_ABS_PATH, 'w', newline='', encoding='utf-8') as csvfile:
  fieldnames = ["tag"] + correlation_labels
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  writer.writeheader()
  for t in correlation_labels:
    vals = [t] + list(tag_top_correlation_abs[t].values())
    writer.writerow(dict(zip(fieldnames, vals)))

In [None]:
TAG_CORRELATION_PCT_PATH = path.join(CSV_PATH, "tag_correlation_pct.csv")

correlation_labels = list(tag_top_correlation_pct)

# Square table: one row and one column per top tag, cells hold the relative
# co-occurrence values from tag_top_correlation_pct.
# The encoding is pinned to UTF-8 so the file does not depend on the
# machine's locale and matches the pandas default used by to_csv/read_csv.
with open(TAG_CORRELATION_PCT_PATH, 'w', newline='', encoding='utf-8') as csvfile:
  fieldnames = ["tag"] + correlation_labels
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  writer.writeheader()
  for t in correlation_labels:
    vals = [t] + list(tag_top_correlation_pct[t].values())
    writer.writerow(dict(zip(fieldnames, vals)))

### Plots

In [None]:
def plot_tags(x, y, title, x_skip=None):
  """Draw and show a bar chart of ``y`` over the categories ``x``.

  Args:
    x: bar labels, drawn rotated by 90 degrees.
    y: bar heights, one per label.
    title: figure title.
    x_skip: when not None, only every ``x_skip``-th tick label stays visible.

  The value axis switches to a log scale when there are more than 900 bars.
  """
  use_log_scale = len(x) > 900

  plt.figure(figsize=(18, 12))
  plt.bar(x, y, log=use_log_scale)
  plt.xticks(fontsize=10, rotation=90)

  if x_skip is not None:
    tick_labels = plt.gca().get_xticklabels()
    for position, tick_label in enumerate(tick_labels):
      tick_label.set_visible(position % x_skip == 0)

  plt.title(title)
  plt.show()

### Tags mais usadas

In [None]:
# Bar chart of the TOP most frequent tags.
plot_tags(
  x=tag_top_labels,
  y=tag_top_counts.values,
  title=f"Tags (top {TOP})",
)

### Tags por imagens

In [None]:
# Line plot of the per-photo tag counts, in the order stored in
# image_by_tag_count; the x axis is just the row position.
y = image_by_tag_count["count"].values
x = range(len(y))
title = "tags por imagem"

plt.figure(figsize=(18, 12))
plt.plot(x, y)
plt.title(title)

plt.xlabel("imagens")
plt.ylabel("número de tags")

# Hide every x tick label.
for tick_label in plt.gca().get_xticklabels():
  tick_label.set_visible(False)

plt.show()

### Tags por mês (absoluto)

In [None]:
# "YYYY-MM" label for every month of every year in the data range.
month_labels = [str(m).zfill(2) for m in range(1, 13)]
year_month_labels = [f"{y}-{m}" for y in range(min_year, max_year+1) for m in month_labels]

# One bar chart per top tag; every third tick label is shown.
for k, monthly_counts in tag_by_month_abs.items():
  plot_tags(year_month_labels, monthly_counts, f"{k} por mês (absoluto)", 3)

### Tags por ano (absoluto)

In [None]:
# One label per year in the data range.
year_labels = [f"{y}" for y in range(min_year, max_year+1)]

# One bar chart per top tag; every third tick label is shown.
for k, yearly_counts in tag_by_year_abs.items():
  plot_tags(year_labels, yearly_counts, f"{k} por ano (absoluto)", 3)

### Tags por mês (relativo ao número total de tags)

In [None]:
# "YYYY-MM" label for every month of every year in the data range.
month_labels = [str(m).zfill(2) for m in range(1, 13)]
year_month_labels = [f"{y}-{m}" for y in range(min_year, max_year+1) for m in month_labels]

# One bar chart per top tag, using the monthly values from tag_by_month_pct;
# every third tick label is shown.
for k, monthly_shares in tag_by_month_pct.items():
  plot_tags(year_month_labels, monthly_shares, f"{k} por mês (relativo ao número total de tags)", 3)

### Tags por ano (relativo ao número total de tags)

In [None]:
# One label per year in the data range.
year_labels = [f"{y}" for y in range(min_year, max_year+1)]

# One bar chart per top tag, using the yearly values from tag_by_year_pct;
# every third tick label is shown.
for k, yearly_shares in tag_by_year_pct.items():
  plot_tags(year_labels, yearly_shares, f"{k} por ano (relativo ao número total de tags)", 3)