In [None]:
import ast
import defcon
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

from data_utils import parse_list, glyph_to_img,\
    glyph_stats, glyph_to_svg_path, normalize_glyph,\
    transform_svg, svg_format

# Load the font index CSV. The "subsets" column is decoded with the project's
# parse_list helper and the "variants" column with ast.literal_eval; the
# extraction loop further down calls row['variants'].items(), so each parsed
# "variants" value is consumed as a mapping.
df = pd.read_csv(
    "ufo_data.csv",
    converters={
        "subsets": parse_list,
        "variants": ast.literal_eval,
    },
)


In [None]:
import findspark
# NOTE(review): findspark.init() is called before the pyspark import below —
# presumably so that the local Spark installation becomes importable. Keep
# this ordering unless that assumption is confirmed to be unnecessary.
findspark.init()
from pyspark.sql import SparkSession
# Create the SparkSession (or reuse an already-running one via getOrCreate)
# under the application name "GenerativeFontsDataset". Every Spark DataFrame
# built later in this notebook is created through this `spark` object.
spark = SparkSession.builder.appName("GenerativeFontsDataset").getOrCreate()

In [None]:
# Glyph records for every processed variant, keyed by (family, variant).
# Each value is a list with one dict per glyph of that variant's font.
glyph_data = {}

# One dict of font-level metrics per processed (family, variant)
variant_data = []

# Maximum number of rows of df (one row per font family) to process
M = 3

# df may hold fewer than M rows; size the progress bar from the slice that is
# actually iterated so the bar can reach 100% in that case too.
families = df[:M]

# Iterate through each font family and its variants in the df DataFrame
for _, row in tqdm(families.iterrows(), total=len(families), desc="Processing Variants..."):
    family = row['family']
    variants = row['variants']

    # 'variants' is iterated as variant name -> path of the UFO file
    for variant, ufo_file_path in variants.items():
        # Open the UFO file for the font
        font = defcon.Font(ufo_file_path)

        # Font-level metrics read from font.info
        variant_info = {
            'ascender' : font.info.ascender,
            'capHeight' : font.info.capHeight,
            'descender' : font.info.descender,
            'italicAngle' : font.info.italicAngle,
            'xHeight' : font.info.xHeight,
            'unitsPerEm' : font.info.unitsPerEm,
        }

        # Add the variant data to the list
        variant_data.append({'family': family, 'variant': variant, **variant_info})

        # Start a fresh list of glyph records for this variant
        glyph_data[(family, variant)] = []

        # Iterate through each glyph in the font
        for glyph_name in font.keys():
            # Get the glyph object for the glyph
            glyph = font[glyph_name]

            # glyph_stats (project helper) returns four values, stored below
            # under the meanX / meanY / stddevX / stddevY keys
            meanX, meanY, stddevX, stddevY = glyph_stats(glyph)

            # Read the bounds once, then split them into separate fields.
            # A falsy value (presumably None for a glyph without outlines —
            # TODO confirm) is recorded as four None fields.
            bounds = glyph.bounds
            bl_x, bl_y, tr_x, tr_y = bounds if bounds else [None]*4

            # Create a dictionary to store the glyph data
            glyph_dict = {
                'glyph_name': glyph_name,
                'svg': glyph_to_svg_path(glyph),
                'advance': glyph.width,
                'unicode' : glyph.unicode,
                'meanX' : meanX,
                'meanY' : meanY,
                'stddevX' : stddevX,
                'stddevY' : stddevY,
                'area' : glyph.area,
                'bottomLeftX' : bl_x,
                'bottomLeftY' : bl_y,
                'topRightX' : tr_x,
                'topRightY' : tr_y,
            }
            # Add the glyph data to the list
            glyph_data[(family, variant)].append(glyph_dict)

In [None]:
# Variant-level table: one row per entry of variant_data
variants_df = spark.createDataFrame(variant_data)

# Glyph-level table: one row per (family, variant) key of glyph_data, carrying
# that variant's whole list of glyph dicts in the 'glyphs' column.
# NOTE(review): the glyph dicts mix strings, numbers and None, and no explicit
# schema is passed — confirm the type Spark infers for 'glyphs' is the one the
# later cells expect.
variant_glyph_rows = [
    {'family': fam, 'variant': var, 'glyphs': glyph_list}
    for (fam, var), glyph_list in glyph_data.items()
]
glyphs_df_spark = spark.createDataFrame(variant_glyph_rows)

# Join the two tables on the composite (family, variant) key
join_keys = ['family', 'variant']
joined_df = variants_df.join(glyphs_df_spark, on=join_keys)

# Display up to 1000 rows of the joined table
joined_df.show(1000)

In [None]:
def _explode_glyphs(variant_row):
    """Return one flat dict per glyph of a glyphs_df_spark row.

    Each dict holds the glyph's own entries plus the row's 'family' and
    'variant' values (which win if a glyph entry uses the same key).
    """
    return [
        {**glyph, 'family': variant_row['family'], 'variant': variant_row['variant']}
        for glyph in variant_row['glyphs']
    ]


# Flatten glyphs_df_spark into a list with one dict per glyph. collect()
# brings every flattened row back to the driver as a Python list.
glyph_rows = glyphs_df_spark.rdd.flatMap(_explode_glyphs).collect()

# Build the per-glyph Spark DataFrame from the collected rows
glyphs_df = spark.createDataFrame(glyph_rows)

# Display the first rows of the per-glyph table
glyphs_df.show()

In [None]:
import pyarrow as pa
# Request the 'avg' aggregate for each numeric glyph column
aggf = {
    metric: 'avg'
    for metric in (
        'advance',
        'meanX',
        'meanY',
        'stddevX',
        'stddevY',
        'area',
        'bottomLeftX',
        'bottomLeftY',
        'topRightX',
        'topRightY',
    )
}

# Average every requested column within each (family, variant) group
mean_df = glyphs_df.groupBy('family', 'variant').agg(aggf)

# Pull the aggregated result into pandas and write it to means_df.csv
# without the index column
pandasMeans_DF = mean_df.toPandas()
pandasMeans_DF.to_csv('means_df.csv', index=False)