## This notebook splits each movie's genres into one row per genre
- Spark version: 3.1.2
- Databricks Runtime: 9.1 LTS

In [0]:
from pyspark.sql.functions import array, col, explode, struct, lit

In [0]:
# DBFS root folders holding the raw movies and tags data read by this notebook.
movies_filepath = "dbfs:/FileStore/tables/asos_data/movies/"
tag_filepath = "dbfs:/FileStore/tables/asos_data/tags/"

### Input raw folder structure

In [0]:
# List the entries directly under the raw movies folder to inspect its layout.
dbutils.fs.ls(movies_filepath)

Out[6]: [FileInfo(path='dbfs:/FileStore/tables/asos_data/movies/20220306/', name='20220306/', size=0)]

In [0]:
# Read every sub-folder of the raw movies and tags locations as CSV.
# Both readers share one set of options so they cannot drift apart:
#   header      -> first line of each file holds the column names
#   inferSchema -> let Spark derive column types from the data
csv_options = {"header": "true", "inferSchema": "true"}

# The configured folder paths may already end with "/"; strip it before appending the
# wildcard so the glob is ".../movies/*" rather than ".../movies//*".
df_movies = spark.read.format("csv").options(**csv_options).load(movies_filepath.rstrip("/") + "/*")
df_tag = spark.read.format("csv").options(**csv_options).load(tag_filepath.rstrip("/") + "/*")

### Selecting columns necessary for processing

In [0]:
# Columns kept for processing: the movie title plus the per-genre columns.
# NOTE(review): the list stops at 'mystery' - confirm that no further genre
# columns of the source data were meant to be included.
cols = [
    'movie_title',
    'unknown', 'Action', 'Adventure', 'animation', 'childrens', 'comedy', 'crime',
    'documentary', 'drama', 'fantasy', 'filmnoir', 'horror', 'musical', 'mystery',
]
df_movies_selected_cols = df_movies.select(cols)

### Function used to transpose the columns into rows

In [0]:
def to_long(df, by):
    """Unpivot a wide DataFrame into long ``key`` / ``val`` rows.

    Every column of ``df`` that is not listed in ``by`` is turned into rows:
    each input row yields one output row per such column, carrying the column
    name in ``key`` and the cell value in ``val``.

    Args:
        df: DataFrame to reshape; only ``df.dtypes`` and ``df.select`` are used.
        by: Name, or list/tuple of names, of the identifier column(s) that are
            kept unchanged on every output row.

    Returns:
        The result of selecting the ``by`` columns followed by ``key`` and ``val``.

    Raises:
        ValueError: If no column is left to transpose once ``by`` is excluded.
        AssertionError: If the columns to transpose do not all share one type.
    """
    # Accept a bare column name as well as a sequence. Without this, a string `by`
    # would turn `c not in by` into a substring test and break the list
    # concatenations further down.
    by = [by] if isinstance(by, str) else list(by)

    # Split the non-identifier columns into their names and type descriptions.
    value_cols = [(c, t) for (c, t) in df.dtypes if c not in by]
    if not value_cols:
        raise ValueError(
            "No columns left to transpose: every column of the DataFrame is listed in `by`"
        )
    cols, dtypes = zip(*value_cols)

    # Spark SQL supports only homogeneous columns
    assert len(set(dtypes)) == 1, "All columns have to be of the same type"

    # Create and explode an array of (column_name, column_value) structs
    kvs = explode(array([
        struct(lit(c).alias("key"), col(c).alias("val")) for c in cols
    ])).alias("kvs")

    return df.select(by + [kvs]).select(by + ["kvs.key", "kvs.val"])

In [0]:
# Unpivot the selected columns: one (movie_title, key, val) row per movie and column.
df_transposed = to_long(df_movies_selected_cols, ['movie_title'])

# Keep only the rows whose value is 1 and expose the column name as `genre`.
# Filter BEFORE projecting: `val` is not part of the final projection, so filtering
# afterwards would only work through Spark's implicit resolution of missing columns.
df_final = (
    df_transposed
    .where(col("val") == 1)
    .select("movie_title", col("key").alias("genre"))
)

In [0]:
# Render the resulting (movie_title, genre) rows.
display(df_final)

movie_title,genre
Toy Story (1995),animation
Toy Story (1995),childrens
Toy Story (1995),comedy
GoldenEye (1995),Action
GoldenEye (1995),Adventure
Get Shorty (1995),Action
Get Shorty (1995),comedy
Get Shorty (1995),drama
Copycat (1995),crime
Copycat (1995),drama
