**Disclaimer**: Directories don't match the repo!

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

spark: SparkSession = (SparkSession
    .builder
    .appName("foo")
    .master("local[*]")
    .config("spark.driver.memory", "12g")
    .config("spark.driver.maxResultSize", "2g")
    .config("spark.sql.shuffle.partitions", "50")  # shuffle overhead
    .config("spark.memory.fraction", "0.4")  # memory cache
    .config("spark.memory.storageFraction", "0.3")
    .getOrCreate()
)

sc: SparkContext = spark.sparkContext

25/04/27 13:57:57 WARN Utils: Your hostname, aleferu-PC resolves to a loopback address: 127.0.1.1; using 192.168.1.46 instead (on interface eno1)
25/04/27 13:57:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/27 13:57:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
data_dir: str = "opt_data"

!du -h $data_dir/*
!du -hs $data_dir/

1,6M	opt_data/artist_tags_clean.csv
297M	opt_data/mlp2019110.5x_test.pt
730M	opt_data/mlp2019110.5x_train.pt
3,2M	opt_data/mlp2019110.5y_test.pt
7,8M	opt_data/mlp2019110.5y_train.pt
199M	opt_data/mlp2019110.75x_test.pt
462M	opt_data/mlp2019110.75x_train.pt
2,2M	opt_data/mlp2019110.75y_test.pt
5,0M	opt_data/mlp2019110.75y_train.pt
102M	opt_data/mlp2019110.9x_test.pt
227M	opt_data/mlp2019110.9x_train.pt
1,1M	opt_data/mlp2019110.9y_test.pt
2,5M	opt_data/mlp2019110.9y_train.pt
501M	opt_data/mlp2019110x_test.pt
1,3G	opt_data/mlp2019110x_train.pt
5,4M	opt_data/mlp2019110y_test.pt
14M	opt_data/mlp2019110y_train.pt
168M	opt_data/mlp2021110.5x_test.pt
859M	opt_data/mlp2021110.5x_train.pt
1,8M	opt_data/mlp2021110.5y_test.pt
9,2M	opt_data/mlp2021110.5y_train.pt
113M	opt_data/mlp2021110.75x_test.pt
548M	opt_data/mlp2021110.75x_train.pt
1,2M	opt_data/mlp2021110.75y_test.pt
5,9M	opt_data/mlp2021110.75y_train.pt
58M	opt_data/mlp2021110.9x_test.pt
270M	opt_data/mlp2021110.9x_train.pt
632K	opt_data/mlp

In [3]:
!wc -l $data_dir/*.csv

    142936 opt_data/artist_tags_clean.csv
  24324101 opt_data/tracks_no_va_merged_id_clean.csv
  24467037 total


In [4]:
!head $data_dir/*.csv

==> opt_data/artist_tags_clean.csv <==
artist,tags
4,"2, 1, 16, 3"
6,"3, 16, 2, 9"
21685,16
9,"3, 2, 9"
10,16
11,"16, 9"
12,"2, 16, 8, 5"
15,"17, 6"
16,16

==> opt_data/tracks_no_va_merged_id_clean.csv <==
name,date,year,month,artist_count,a0_id,a0_name,tags,a1_id,a1_name,a2_id,a2_name,a3_id,a3_name,a4_id,a4_name,id
 *~ƒint_vœr!~* ,201612,2016,12,1,2808021,Julius Androide,,,,,,,,,,0
roots rock reggae,200006,2000,6,1,637799,Baba Dread,,,,,,,,,,1
roots rock reggae,201304,2013,4,1,625247,Jah Sun,,,,,,,,,,2
roots rock reggae,199713,1997,13,1,442806,The Wailers Band,,,,,,,,,,3
roots rock reggae,201009,2010,9,1,426971,Dean Fraser,,,,,,,,,,4
roots rock reggae,200013,2000,13,1,248826,Bob Marley,9,,,,,,,,,5
roots rock reggae,200813,2008,13,1,248660,Tony Roots,,,,,,,,,,6
roots rock reggae,197310,1973,10,1,232732,Bob Marley & The Wailers,,,,,,,,,,7
roots rock reggae,201001,2010,1,1,693259,Solo Banton,,,,,,,,,,8


Update artist_tags_clean

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import DataFrame

at_schema: StructType = StructType([
    StructField("artist", IntegerType(), False),  # important: int
    StructField("tags", StringType(), False)
])

artist_tags: DataFrame = spark.read.csv(
    f"{data_dir}/artist_tags_clean.csv",
    header=True,
    schema=at_schema
).filter(
    F.col("tags").isNotNull()
)
print("Se han cargado " + str(artist_tags.count()) + " entradas del fichero 'artist_tags_clean.csv'.")

artist_tags.show(5)

Se han cargado 142935 entradas del fichero 'artist_tags_clean.csv'.
+------+-----------+
|artist|       tags|
+------+-----------+
|     4|2, 1, 16, 3|
|     6|3, 16, 2, 9|
| 21685|         16|
|     9|    3, 2, 9|
|    10|         16|
+------+-----------+
only showing top 5 rows



list of tags

In [None]:
def tags_comma_to_list(df: DataFrame) -> DataFrame:
    return (df
        .withColumn("tags", F.split("tags", ", "))
        .groupBy("artist")
        .agg(F.collect_list("tags").alias("tags"))
        .withColumn("tags", F.flatten("tags"))
    )

artist_tags = tags_comma_to_list(artist_tags)

artist_tags.show(5)

[Stage 4:>                                                          (0 + 1) / 1]

+------+----------------+
|artist|            tags|
+------+----------------+
|     4|   [2, 1, 16, 3]|
|    11|         [16, 9]|
|    20|      [3, 0, 16]|
|    23|[0, 2, 16, 9, 7]|
|    28|          [2, 0]|
+------+----------------+
only showing top 5 rows



                                                                                

Other CSV

In [None]:
track_tags: DataFrame = spark.read.csv(
    f"{data_dir}/tracks_no_va_merged_id_clean.csv",
    header=True,
).select(
    # all integers for easy joins
    *[F.col(f"a{i}_id").cast("integer") for i in range(5)],  # important: int
    F.col("tags").cast("string")
).filter(
    # ease up computations
    F.col("tags").isNotNull()
)
print("Se han cargado " + str(track_tags.count()) + " entradas del fichero 'tracks_no_va_merged_id_clean.csv'.")

track_tags.show(5)



Se han cargado 3534032 entradas del fichero 'tracks_no_va_merged_id_clean.csv'.
+-------+-----+-----+-----+-----+----+
|  a0_id|a1_id|a2_id|a3_id|a4_id|tags|
+-------+-----+-----+-----+-----+----+
| 248826| NULL| NULL| NULL| NULL|   9|
|  96651| NULL| NULL| NULL| NULL|   9|
| 763291| NULL| NULL| NULL| NULL|   9|
|2158618| NULL| NULL| NULL| NULL|   9|
| 654641| NULL| NULL| NULL| NULL|   9|
+-------+-----+-----+-----+-----+----+
only showing top 5 rows



                                                                                

map *artist* - *tags*

In [None]:
track_tags = (track_tags
    .withColumn("artist", F.explode(F.array([f"a{i}_id" for i in range(5)])))
    .select("artist", "tags")
    .filter(F.col("artist").isNotNull())  # DO NOT REMOVE
)

track_tags.show(5)

+-------+----+
| artist|tags|
+-------+----+
| 248826|   9|
|  96651|   9|
| 763291|   9|
|2158618|   9|
| 654641|   9|
+-------+----+
only showing top 5 rows



In [9]:
# tags_comma_to_list transforms `3, 3, 9` to a list and merge results from different rows with the same artist
track_tags = tags_comma_to_list(track_tags)

track_tags.show(5)

25/04/27 13:58:13 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors

+------+--------------------+
|artist|                tags|
+------+--------------------+
|     4|[3, 3, 9, 3, 3, 1...|
|    20|[2, 6, 6, 3, 0, 6...|
|    31|[2, 2, 2, 2, 2, 7...|
|    32|[10, 10, 10, 7, 1...|
|    40|[2, 0, 2, 0, 2, 0...|
+------+--------------------+
only showing top 5 rows



                                                                                

In [10]:
# alias (withColumnRenamed) to avoid ambiguity
# simple join + concat with coalesce
complete_df = track_tags.withColumnRenamed("tags", "t0").join(
    artist_tags.withColumnRenamed("tags", "t1"),
    on="artist",
    how="outer"
).withColumn(
    "tags",
    F.concat(
        F.coalesce("t0", F.array()),
        F.coalesce("t1", F.array())
    )
).select("artist", "tags")

complete_df.show(5)



+------+--------------------+
|artist|                tags|
+------+--------------------+
|     4|[3, 3, 9, 3, 3, 1...|
|    20|[2, 6, 6, 3, 0, 6...|
|    31|[2, 2, 2, 2, 2, 7...|
|    32|[10, 10, 10, 7, 1...|
|    40|[2, 0, 2, 0, 2, 0...|
+------+--------------------+
only showing top 5 rows



                                                                                

In [None]:
from pyspark import StorageLevel


# yield tag_* cols
complete_df = complete_df.withColumn(
    "tags_length", F.size("tags")  # for div
).withColumn(
    "tag", F.explode("tags")
).groupBy(
    "artist", "tag", "tags_length"  # tags_length does nothing here, repetition no longer exists
).count().withColumn(
    "frequency", F.col("count") / F.col("tags_length")  # eg: artist, tag, tags_length, freq -> 1234, 2, 5, 0.4
).groupBy("artist").pivot("tag").agg(
    F.first("frequency")  # first so agg is faster (should only be one)
)

# If a tag is missing, fill with 0.0
for i in range(23):
    if str(i) not in complete_df.columns:
        complete_df = complete_df.withColumn(str(i), F.lit(0.0))

# Order tag columns and rename to tag_i
complete_df = complete_df.select(
    "artist",
    *[F.col(str(i)).alias(f"tag_{i}") for i in range(23)]
)

# Cache on disk
complete_df.persist(StorageLevel.DISK_ONLY)
print(f"Found MB tags for {complete_df.count()} artists")

complete_df.show(5)

25/04/27 13:58:26 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

Found MB tags for 413749 artists
+-------+------------------+-----+-------------------+-------------------+------------------+-----+-------------------+-----+-----+-----+------+------+------+------+------+--------------------+--------------------+------------------+------+------+------+--------------------+-------------------+
| artist|             tag_0|tag_1|              tag_2|              tag_3|             tag_4|tag_5|              tag_6|tag_7|tag_8|tag_9|tag_10|tag_11|tag_12|tag_13|tag_14|              tag_15|              tag_16|            tag_17|tag_18|tag_19|tag_20|              tag_21|             tag_22|
+-------+------------------+-----+-------------------+-------------------+------------------+-----+-------------------+-----+-----+-----+------+------+------+------+------+--------------------+--------------------+------------------+------+------+------+--------------------+-------------------+
|  53001|              NULL| NULL|                0.2|                0.2|     

                                                                                

Schemas:

In [None]:
from pyspark.sql.types import FloatType, IntegerType

# Schema of tensors, no _row_index
pt_schema: StructType = StructType([
    StructField("id0", FloatType(), False),
    StructField("hbd0", FloatType(), False),
    StructField("bd0", FloatType(), False),
    StructField("hed0", FloatType(), False),
    StructField("ed0", FloatType(), False),
    StructField("e0", FloatType(), False),
    StructField("g10", FloatType(), False),
    StructField("g20", FloatType(), False),
    StructField("g30", FloatType(), False),
    StructField("g40", FloatType(), False),
    StructField("g50", FloatType(), False),
    StructField("ps0", FloatType(), False),
    StructField("t10", FloatType(), False),
    StructField("t20", FloatType(), False),
    StructField("t30", FloatType(), False),
    StructField("t40", FloatType(), False),
    StructField("t50", FloatType(), False),
    StructField("t60", FloatType(), False),
    *[StructField(f"tag0_{i}", FloatType(), False) for i in range(23)],
    StructField("cc0", FloatType(), False),
    StructField("cp0", FloatType(), False),
    StructField("sc0", FloatType(), False),
    StructField("sp0", FloatType(), False),

    StructField("id1", FloatType(), False),
    StructField("hbd1", FloatType(), False),
    StructField("bd1", FloatType(), False),
    StructField("hed1", FloatType(), False),
    StructField("ed1", FloatType(), False),
    StructField("e1", FloatType(), False),
    StructField("g11", FloatType(), False),
    StructField("g21", FloatType(), False),
    StructField("g31", FloatType(), False),
    StructField("g41", FloatType(), False),
    StructField("g51", FloatType(), False),
    StructField("ps1", FloatType(), False),
    StructField("t11", FloatType(), False),
    StructField("t21", FloatType(), False),
    StructField("t31", FloatType(), False),
    StructField("t41", FloatType(), False),
    StructField("t51", FloatType(), False),
    StructField("t61", FloatType(), False),
    *[StructField(f"tag1_{i}", FloatType(), False) for i in range(23)],
    StructField("cc1", FloatType(), False),
    StructField("cp1", FloatType(), False),
    StructField("sc1", FloatType(), False),
    StructField("sp1", FloatType(), False),

    StructField("lfm", FloatType(), False),
    StructField("mrt", FloatType(), False),
    StructField("prt", FloatType(), False),
    StructField("lt", FloatType(), False),

    StructField("_row_index", FloatType(), False),
])

In [None]:
import torch
import numpy as np
import pandas as pd
from pyspark.sql import Window
from pathlib import Path

def get_df_from_pt(spark: SparkSession, pt_path: str, pt_schema: StructType) -> DataFrame:
    """
    Reads a tensor and returns a Spark DataFrame.
    """
    print("Loading", pt_path)
    np_arr: np.ndarray = torch.load(pt_path, weights_only=False).numpy()

    num_rows, num_cols = np_arr.shape
    print(f"Loaded array of {num_rows} rows and {num_cols} columns.")

    # keep row order
    _row_index = np.arange(num_rows)
    np_arr = np.column_stack((np_arr, _row_index))

    # Serialization for lower memory consumption

    pdf = pd.DataFrame(np_arr, columns=[f.name for f in pt_schema.fields], dtype=np.float32)

    temp_path = f"temp_{Path(pt_path).stem}.parquet"
    pdf.to_parquet(temp_path)

    df = spark.read.schema(pt_schema).parquet(temp_path)

    # too expensive!
    # could have been done, but spark complains a lot
    # it sends all the DF into only one executor because window is specified without partitioning
    # it can only be done this way, as executors are independent
    # do it in the numpy array was the easy solution
    # I let this here for possible extra credits :P

    # df = df.withColumn("_row_index", F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))

    df = df.withColumn("_row_index", F.col("_row_index").cast("integer"))
    df = df.withColumn("id0", F.col("id0").cast("integer"))
    df = df.withColumn("id1", F.col("id1").cast("integer"))

    return df

In [None]:
def add_z_col(train_df: DataFrame, test_df: DataFrame, col_to_norm: str | F.Column) -> DataFrame:
    # hack because idk how I'll pass it atm
    if isinstance(col_to_norm, str):
        col_name = col_to_norm
        col_obj = F.col(col_name)
    else:
        col_obj = col_to_norm
        col_name = str(col_obj)

    mean_std = train_df.select(
        F.mean(col_to_norm).alias("mean"),
        F.stddev(col_to_norm).alias("std"),
    ).first()
    mean, std = mean_std.mean, mean_std.std

    normalized_col = ((col_obj - F.lit(mean)) / F.lit(std)).alias(f"z{col_name}")

    # Error checking (not necessary)
    original_columns = train_df.columns
    try:
        original_index = original_columns.index(col_name)
    except ValueError:
        raise ValueError(f"Column '{col_name}' not found in DataFrame.")

    # Easier computation this way
    # Also, ordered columns
    new_cols = []
    for i, col in enumerate(original_columns):
        new_cols.append(F.col(col))
        if i == original_index:
            new_cols.append(normalized_col)

    return train_df.select(*new_cols), test_df.select(*new_cols)

In [15]:
def add_mb_tags(arr_df: DataFrame, complete_df: DataFrame) -> DataFrame:
    tag_cols = [f"tag_{i}" for i in range(23)]

    # Update tag0_* columns based on id0
    final_df: DataFrame = arr_df.join(complete_df, arr_df["id0"] == complete_df["artist"], "left")
    for i in range(23):
        final_df = final_df.withColumn(f"mbtag0_{i}", F.coalesce(F.col(f"tag_{i}"), F.lit(0.0)))
    final_df = final_df.drop("artist", *tag_cols)

    # Update tag1_* columns based on id0
    final_df = final_df.join(complete_df, final_df["id1"] == complete_df["artist"], "left")
    for i in range(23):
        final_df = final_df.withColumn(f"mbtag1_{i}", F.coalesce(F.col(f"tag_{i}"), F.lit(0.0)))
    return final_df.drop("artist", *tag_cols)

In [16]:
def save_to_pt(df: DataFrame, cols: list[str], outpath: str):
    print(f"Saving to {outpath}...")
    torch.save(
        torch.from_numpy(
            df.select(cols).toPandas().to_numpy(dtype="float32")
        ),
        outpath
    )
    print("Done!")

In [17]:
# Vars and dir
out_dir: str = "opt_result"
!mkdir -p $out_dir

# Columns used for the original tensor
lfm_og: list[str] = [
    "hbd0",
    "bd0",
    "hed0",
    "ed0",
    "e0",
    "g10",
    "g20",
    "g30",
    "g40",
    "g50",
    "ps0",
    "t10",
    "t20",
    "t30",
    "t40",
    "t50",
    "t60",
    *[f"tag0_{i}" for i in range(23)],
    "cc0",
    "cp0",
    "sc0",
    "sp0",

    "hbd1",
    "bd1",
    "hed1",
    "ed1",
    "e1",
    "g11",
    "g21",
    "g31",
    "g41",
    "g51",
    "ps1",
    "t11",
    "t21",
    "t31",
    "t41",
    "t51",
    "t61",
    *[f"tag1_{i}" for i in range(23)],
    "cc1",
    "cp1",
    "sc1",
    "sp1",

    "lfm",
    "mrt",
    "prt",
    "lt",
]

# Columns used for the original tensor with normalized popularity
lfm_z: list[str] = [
    "hbd0",
    "bd0",
    "hed0",
    "ed0",
    "e0",
    "g10",
    "g20",
    "g30",
    "g40",
    "g50",
    "zps0",
    "t10",
    "t20",
    "t30",
    "t40",
    "t50",
    "t60",
    *[f"tag0_{i}" for i in range(23)],
    "cc0",
    "zcp0",
    "sc0",
    "zsp0",

    "hbd1",
    "bd1",
    "hed1",
    "ed1",
    "e1",
    "g11",
    "g21",
    "g31",
    "g41",
    "g51",
    "zps1",
    "t11",
    "t21",
    "t31",
    "t41",
    "t51",
    "t61",
    *[f"tag1_{i}" for i in range(23)],
    "cc1",
    "zcp1",
    "sc1",
    "zsp1",

    "lfm",
    "mrt",
    "prt",
    "lt",
]



# Columns used for the tensor that doesn't have LFM info
mb_og: list[str] = [
    "hbd0",
    "bd0",
    "hed0",
    "ed0",
    "e0",
    "g10",
    "g20",
    "g30",
    "g40",
    "g50",
    "t10",
    "t20",
    "t30",
    "t40",
    "t50",
    "t60",
    *[f"mbtag0_{i}" for i in range(23)],
    "cc0",
    "sc0",

    "hbd1",
    "bd1",
    "hed1",
    "ed1",
    "e1",
    "g11",
    "g21",
    "g31",
    "g41",
    "g51",
    "t11",
    "t21",
    "t31",
    "t41",
    "t51",
    "t61",
    *[f"mbtag1_{i}" for i in range(23)],
    "cc1",
    "sc1",

    "mrt",
    "prt",
    "lt",
]

**MAIN LOOP**

In [None]:
# loop
for year in [2023, 2021, 2019]:
    for perc in [0, 0.5, 0.75, 0.9]:
        # filename
        train_name: str = f"mlp{year}11{perc}x_train.pt"
        test_name: str = f"mlp{year}11{perc}x_test.pt"

        # data read
        train_df = get_df_from_pt(spark, f"{data_dir}/{train_name}", pt_schema)
        test_df = get_df_from_pt(spark, f"{data_dir}/{test_name}", pt_schema)

        # add norm cols
        for c_name in [f"{c}{i}" for c in ["ps", "cp", "sp"] for i in [0, 1]]:
            train_df, test_df = add_z_col(train_df, test_df, c_name)

        # persist intermediate result once, after all the shared z_col work
        train_df = train_df.orderBy("_row_index").persist(StorageLevel.DISK_ONLY)
        test_df = test_df.orderBy("_row_index").persist(StorageLevel.DISK_ONLY)

        # now compute the MB-tags versions + persist
        train_df_tagged = add_mb_tags(train_df, complete_df).orderBy("_row_index").persist(StorageLevel.DISK_ONLY)
        test_df_tagged = add_mb_tags(test_df, complete_df).orderBy("_row_index").persist(StorageLevel.DISK_ONLY)

        # save original
        save_to_pt(train_df, lfm_og, f"{out_dir}/{train_name}")
        save_to_pt(test_df, lfm_og, f"{out_dir}/{test_name}")

        # save normalized
        save_to_pt(train_df, lfm_z, f"{out_dir}/mlp{year}11{perc}x_train_norm.pt")
        save_to_pt(test_df, lfm_z, f"{out_dir}/mlp{year}11{perc}x_test_norm.pt")

        # save with mb tags
        save_to_pt(train_df_tagged, mb_og, f"{out_dir}/mlp{year}11{perc}x_train_mb.pt")
        save_to_pt(test_df_tagged, mb_og, f"{out_dir}/mlp{year}11{perc}x_test_mb.pt")

        # clean up everything
        train_df.unpersist()
        test_df.unpersist()
        train_df_tagged.unpersist()
        test_df_tagged.unpersist()

Loading opt_data/mlp2023110x_train.pt
Loaded array of 4662908 rows and 94 columns.
Loading opt_data/mlp2023110x_test.pt
Loaded array of 263196 rows and 94 columns.
Saving to opt_result/mlp2023110x_train.pt...


                                                                                

Done!
Saving to opt_result/mlp2023110x_test.pt...


                                                                                

Done!
Saving to opt_result/mlp2023110x_train_norm.pt...


                                                                                

Done!
Saving to opt_result/mlp2023110x_test_norm.pt...


                                                                                

Done!
Saving to opt_result/mlp2023110x_train_mb.pt...


                                                                                

Done!
Saving to opt_result/mlp2023110x_test_mb.pt...
Done!
Loading opt_data/mlp2023110.5x_train.pt
Loaded array of 2713188 rows and 94 columns.
Loading opt_data/mlp2023110.5x_test.pt
Loaded array of 146576 rows and 94 columns.
Saving to opt_result/mlp2023110.5x_train.pt...


                                                                                

Done!
Saving to opt_result/mlp2023110.5x_test.pt...
Done!
Saving to opt_result/mlp2023110.5x_train_norm.pt...


                                                                                

Done!
Saving to opt_result/mlp2023110.5x_test_norm.pt...
Done!
Saving to opt_result/mlp2023110.5x_train_mb.pt...


                                                                                

Done!
Saving to opt_result/mlp2023110.5x_test_mb.pt...
Done!
Loading opt_data/mlp2023110.75x_train.pt
Loaded array of 1742016 rows and 94 columns.
Loading opt_data/mlp2023110.75x_test.pt
Loaded array of 97864 rows and 94 columns.
Saving to opt_result/mlp2023110.75x_train.pt...


                                                                                

Done!
Saving to opt_result/mlp2023110.75x_test.pt...
Done!
Saving to opt_result/mlp2023110.75x_train_norm.pt...


                                                                                

Done!
Saving to opt_result/mlp2023110.75x_test_norm.pt...
Done!
Saving to opt_result/mlp2023110.75x_train_mb.pt...


                                                                                

Done!
Saving to opt_result/mlp2023110.75x_test_mb.pt...
Done!
Loading opt_data/mlp2023110.9x_train.pt
Loaded array of 863004 rows and 94 columns.
Loading opt_data/mlp2023110.9x_test.pt
Loaded array of 50736 rows and 94 columns.
Saving to opt_result/mlp2023110.9x_train.pt...


                                                                                

Done!
Saving to opt_result/mlp2023110.9x_test.pt...
Done!
Saving to opt_result/mlp2023110.9x_train_norm.pt...
Done!
Saving to opt_result/mlp2023110.9x_test_norm.pt...
Done!
Saving to opt_result/mlp2023110.9x_train_mb.pt...


                                                                                

Done!
Saving to opt_result/mlp2023110.9x_test_mb.pt...
Done!
Loading opt_data/mlp2021110x_train.pt
Loaded array of 4149268 rows and 94 columns.
Loading opt_data/mlp2021110x_test.pt
Loaded array of 776836 rows and 94 columns.
Saving to opt_result/mlp2021110x_train.pt...


                                                                                

Done!
Saving to opt_result/mlp2021110x_test.pt...


                                                                                

Done!
Saving to opt_result/mlp2021110x_train_norm.pt...


                                                                                

Done!
Saving to opt_result/mlp2021110x_test_norm.pt...
Done!
Saving to opt_result/mlp2021110x_train_mb.pt...


                                                                                

Done!
Saving to opt_result/mlp2021110x_test_mb.pt...


                                                                                

Done!
Loading opt_data/mlp2021110.5x_train.pt
Loaded array of 2393720 rows and 94 columns.
Loading opt_data/mlp2021110.5x_test.pt
Loaded array of 466044 rows and 94 columns.
Saving to opt_result/mlp2021110.5x_train.pt...


                                                                                

Done!
Saving to opt_result/mlp2021110.5x_test.pt...


                                                                                

Done!
Saving to opt_result/mlp2021110.5x_train_norm.pt...


                                                                                

Done!
Saving to opt_result/mlp2021110.5x_test_norm.pt...
Done!
Saving to opt_result/mlp2021110.5x_train_mb.pt...


                                                                                

Done!
Saving to opt_result/mlp2021110.5x_test_mb.pt...
Done!
Loading opt_data/mlp2021110.75x_train.pt
Loaded array of 1526336 rows and 94 columns.
Loading opt_data/mlp2021110.75x_test.pt
Loaded array of 313544 rows and 94 columns.
Saving to opt_result/mlp2021110.75x_train.pt...


                                                                                

Done!
Saving to opt_result/mlp2021110.75x_test.pt...


                                                                                

Done!
Saving to opt_result/mlp2021110.75x_train_norm.pt...
Done!
Saving to opt_result/mlp2021110.75x_test_norm.pt...
Done!
Saving to opt_result/mlp2021110.75x_train_mb.pt...


                                                                                

Done!
Saving to opt_result/mlp2021110.75x_test_mb.pt...
Done!
Loading opt_data/mlp2021110.9x_train.pt
Loaded array of 752744 rows and 94 columns.
Loading opt_data/mlp2021110.9x_test.pt
Loaded array of 160996 rows and 94 columns.
Saving to opt_result/mlp2021110.9x_train.pt...


                                                                                

Done!
Saving to opt_result/mlp2021110.9x_test.pt...
Done!
Saving to opt_result/mlp2021110.9x_train_norm.pt...
Done!
Saving to opt_result/mlp2021110.9x_test_norm.pt...
Done!
Saving to opt_result/mlp2021110.9x_train_mb.pt...


                                                                                

Done!
Saving to opt_result/mlp2021110.9x_test_mb.pt...
Done!
Loading opt_data/mlp2019110x_train.pt
Loaded array of 3530604 rows and 94 columns.
Loading opt_data/mlp2019110x_test.pt
Loaded array of 1395500 rows and 94 columns.
Saving to opt_result/mlp2019110x_train.pt...


                                                                                

Done!
Saving to opt_result/mlp2019110x_test.pt...


                                                                                

Done!
Saving to opt_result/mlp2019110x_train_norm.pt...


                                                                                

Done!
Saving to opt_result/mlp2019110x_test_norm.pt...
Done!
Saving to opt_result/mlp2019110x_train_mb.pt...


                                                                                

Done!
Saving to opt_result/mlp2019110x_test_mb.pt...


                                                                                

Done!
Loading opt_data/mlp2019110.5x_train.pt
Loaded array of 2033564 rows and 94 columns.
Loading opt_data/mlp2019110.5x_test.pt
Loaded array of 826200 rows and 94 columns.
Saving to opt_result/mlp2019110.5x_train.pt...


                                                                                

Done!
Saving to opt_result/mlp2019110.5x_test.pt...


                                                                                

Done!
Saving to opt_result/mlp2019110.5x_train_norm.pt...
Done!
Saving to opt_result/mlp2019110.5x_test_norm.pt...
Done!
Saving to opt_result/mlp2019110.5x_train_mb.pt...


                                                                                

Done!
Saving to opt_result/mlp2019110.5x_test_mb.pt...


                                                                                

Done!
Loading opt_data/mlp2019110.75x_train.pt
Loaded array of 1286384 rows and 94 columns.
Loading opt_data/mlp2019110.75x_test.pt
Loaded array of 553496 rows and 94 columns.
Saving to opt_result/mlp2019110.75x_train.pt...


                                                                                

Done!
Saving to opt_result/mlp2019110.75x_test.pt...


                                                                                

Done!
Saving to opt_result/mlp2019110.75x_train_norm.pt...
Done!
Saving to opt_result/mlp2019110.75x_test_norm.pt...
Done!
Saving to opt_result/mlp2019110.75x_train_mb.pt...


                                                                                

Done!
Saving to opt_result/mlp2019110.75x_test_mb.pt...


                                                                                

Done!
Loading opt_data/mlp2019110.9x_train.pt
Loaded array of 631620 rows and 94 columns.
Loading opt_data/mlp2019110.9x_test.pt
Loaded array of 282120 rows and 94 columns.
Saving to opt_result/mlp2019110.9x_train.pt...


                                                                                

Done!
Saving to opt_result/mlp2019110.9x_test.pt...


                                                                                

Done!
Saving to opt_result/mlp2019110.9x_train_norm.pt...
Done!
Saving to opt_result/mlp2019110.9x_test_norm.pt...
Done!
Saving to opt_result/mlp2019110.9x_train_mb.pt...


                                                                                

Done!
Saving to opt_result/mlp2019110.9x_test_mb.pt...
Done!


In [19]:
!ls opt_result/

mlp2019110.5x_test_mb.pt      mlp2021110.9x_test_mb.pt
mlp2019110.5x_test_norm.pt    mlp2021110.9x_test_norm.pt
mlp2019110.5x_test.pt	      mlp2021110.9x_test.pt
mlp2019110.5x_train_mb.pt     mlp2021110.9x_train_mb.pt
mlp2019110.5x_train_norm.pt   mlp2021110.9x_train_norm.pt
mlp2019110.5x_train.pt	      mlp2021110.9x_train.pt
mlp2019110.75x_test_mb.pt     mlp2021110x_test_mb.pt
mlp2019110.75x_test_norm.pt   mlp2021110x_test_norm.pt
mlp2019110.75x_test.pt	      mlp2021110x_test.pt
mlp2019110.75x_train_mb.pt    mlp2021110x_train_mb.pt
mlp2019110.75x_train_norm.pt  mlp2021110x_train_norm.pt
mlp2019110.75x_train.pt       mlp2021110x_train.pt
mlp2019110.9x_test_mb.pt      mlp2023110.5x_test_mb.pt
mlp2019110.9x_test_norm.pt    mlp2023110.5x_test_norm.pt
mlp2019110.9x_test.pt	      mlp2023110.5x_test.pt
mlp2019110.9x_train_mb.pt     mlp2023110.5x_train_mb.pt
mlp2019110.9x_train_norm.pt   mlp2023110.5x_train_norm.pt
mlp2019110.9x_train.pt	      mlp2023110.5x_train.pt
mlp2019110x_test_mb.pt	   

## cleanup

In [20]:
!ls temp_mlp*.parquet

temp_mlp2019110.5x_test.parquet    temp_mlp2021110.9x_test.parquet
temp_mlp2019110.5x_train.parquet   temp_mlp2021110.9x_train.parquet
temp_mlp2019110.75x_test.parquet   temp_mlp2021110x_test.parquet
temp_mlp2019110.75x_train.parquet  temp_mlp2021110x_train.parquet
temp_mlp2019110.9x_test.parquet    temp_mlp2023110.5x_test.parquet
temp_mlp2019110.9x_train.parquet   temp_mlp2023110.5x_train.parquet
temp_mlp2019110x_test.parquet	   temp_mlp2023110.75x_test.parquet
temp_mlp2019110x_train.parquet	   temp_mlp2023110.75x_train.parquet
temp_mlp2021110.5x_test.parquet    temp_mlp2023110.9x_test.parquet
temp_mlp2021110.5x_train.parquet   temp_mlp2023110.9x_train.parquet
temp_mlp2021110.75x_test.parquet   temp_mlp2023110x_test.parquet
temp_mlp2021110.75x_train.parquet  temp_mlp2023110x_train.parquet


In [21]:
!rm temp_mlp*.parquet

In [22]:
spark.stop()