In [0]:
#Import Spark Session - Encompasses SparkContext & SQLContext 
from pyspark.sql import SparkSession
#SQL functions
from pyspark.sql.functions import split, explode , count
# Raw data: the title and name TSV files, stored in the Databricks FileStore.
sourcepath = "dbfs:/FileStore/tables/tsvfile/title_basics.tsv"
sourcepath_Basics = "dbfs:/FileStore/tables/name_basics.tsv"

# Actors whose genre distribution is computed below.
ACTOR_NAMES = ['Omar Sy', 'Saoirse Ronan', 'Frances McDormand']

# Read both TSV files into dataframes.
# - quote="" switches off CSV quote handling: these files are plain tab-separated
#   text, so a double quote inside a title must be kept as ordinary text rather
#   than being parsed as the start of a quoted value.
# - nullValue=r"\N" loads the "\N" missing-value marker as a real null. Without
#   it the marker reaches the genre split below as a literal string and would be
#   counted as if it were a genre.
df_title = spark.read.csv(
    sourcepath, sep="\t", header=True, inferSchema=True, quote="", nullValue=r"\N"
)
df_name = spark.read.csv(
    sourcepath_Basics, sep="\t", header=True, inferSchema=True, quote="", nullValue=r"\N"
)

# Keep only the rows for the selected actors and the columns needed downstream.
# NOTE(review): primaryName may not be unique per person; if namesakes exist
# they are merged by the groupBy below. Consider keying on the person id instead.
df_filtered_name = (
    df_name
    .filter(df_name["primaryName"].isin(ACTOR_NAMES))
    .select('primaryName', 'knownForTitles')
)
df_filtered_title = df_title.select('tconst', 'titleType', 'genres')

# knownForTitles is a comma-separated string of title ids: split it, explode it
# into one row per title id, and rename the column to tconst so it can be joined.
splitTitles = (
    df_filtered_name
    .withColumn("knownForTitles", explode(split("knownForTitles", ",")))
    .withColumnRenamed("knownForTitles", "tconst")
)

# Join the title columns with the per-actor title ids on tconst (inner join).
join_both = df_filtered_title.join(splitTitles, ["tconst"])

# genres is also a comma-separated string: explode it into one row per genre.
# Rows whose genres value is null produce no output rows here.
genes = join_both.withColumn("genres", explode(split("genres", ",")))

# Count, per actor, how many of their joined titles fall into each genre.
finalResulr = genes.groupBy("genres", "primaryName").agg(count("genres").alias("NumberofGenres"))

# Display the final result.
display(finalResulr)

genres,primaryName,NumberofGenres
Drama,Frances McDormand,4
Comedy,Frances McDormand,3
Romance,Frances McDormand,1
Fantasy,Saoirse Ronan,1
Comedy,Omar Sy,1
Drama,Saoirse Ronan,4
Thriller,Saoirse Ronan,1
Biography,Omar Sy,1
Drama,Omar Sy,1
Action,Omar Sy,3


In [0]:
# Show the filtered name rows (primaryName, knownForTitles) as they are before
# the comma-separated title ids are split and exploded.
display(df_filtered_name)

primaryName,knownForTitles
Frances McDormand,"tt0116282,tt5027774,tt0181875,tt1748122"
Omar Sy,"tt1675434,tt1877832,tt0369610,tt3371366"
Saoirse Ronan,"tt0380510,tt0993842,tt2381111,tt0783233"


In [0]:
# Show the result of joining the selected title columns with the exploded
# per-actor title ids on tconst.
display(join_both)

tconst,titleType,genres,primaryName
tt1748122,movie,"Comedy,Drama,Romance",Frances McDormand
tt0380510,movie,"Drama,Fantasy,Thriller",Saoirse Ronan
tt1675434,movie,"Biography,Comedy,Drama",Omar Sy
tt0369610,movie,"Action,Adventure,Sci-Fi",Omar Sy
tt0783233,movie,"Drama,Mystery,Romance",Saoirse Ronan
tt0993842,movie,"Action,Adventure,Drama",Saoirse Ronan
tt1877832,movie,"Action,Adventure,Sci-Fi",Omar Sy
tt0116282,movie,"Crime,Drama,Thriller",Frances McDormand
tt0181875,movie,"Adventure,Comedy,Drama",Frances McDormand
tt3371366,movie,"Action,Adventure,Sci-Fi",Omar Sy


In [0]:
# Show the joined rows after the comma-separated genres column has been
# exploded into one row per genre.
display(genes)

tconst,titleType,genres,primaryName
tt1748122,movie,Comedy,Frances McDormand
tt1748122,movie,Drama,Frances McDormand
tt1748122,movie,Romance,Frances McDormand
tt0380510,movie,Drama,Saoirse Ronan
tt0380510,movie,Fantasy,Saoirse Ronan
tt0380510,movie,Thriller,Saoirse Ronan
tt1675434,movie,Biography,Omar Sy
tt1675434,movie,Comedy,Omar Sy
tt1675434,movie,Drama,Omar Sy
tt0369610,movie,Action,Omar Sy


In [0]:
# Expose the exploded per-genre rows to Spark SQL under the view name 'genes'.
# createOrReplaceTempView is the replacement for registerTempTable, which has
# been deprecated since Spark 2.0; it also makes this cell safe to re-run.
genes.createOrReplaceTempView('genes')

# List the (genre, actor) rows whose genre is 'Action'.
spark.sql("select genres,primaryName from genes where genres = 'Action'").show()