# Part 2

In [None]:
!pip install pyspark
!pip install findspark

### Reading the Data

In [None]:
import os
import findspark
findspark.init()
from pyspark.sql.types import *
from pyspark.sql.functions import *

from pyspark import SparkContext
from pyspark.sql import SparkSession

def init_spark(app_name: str):
    """Return a ``(SparkSession, SparkContext)`` pair for ``app_name``.

    The session is obtained with ``getOrCreate()``; the context is the
    session's own ``sparkContext`` attribute.
    """
    session = SparkSession.builder.appName(app_name).getOrCreate()
    return session, session.sparkContext

# Create (or attach to) the notebook-wide Spark session through the helper
# defined above instead of repeating the builder chain here.
# `spark` and `sc` are module-level names used by the cells below.
spark, sc = init_spark("my project 1")

# Read a CSV into a dataframe
def load_PD_file(filename_or_dir, schema, base_path="/mnt/ddscoursedatastorage/fwm-stb-data/"):
    """Read pipe-delimited, header-less CSV data into a Spark DataFrame.

    Args:
        filename_or_dir: File or directory name, appended verbatim to
            ``base_path``.
        schema: Schema handed to the CSV reader (callers in this notebook
            pass a ``StructType``).
        base_path: Prefix prepended to ``filename_or_dir``. Defaults to the
            mount path that used to be hard-coded here, so existing
            two-argument calls behave exactly as before. It is concatenated
            as-is, so it has to end with ``/``.

    Returns:
        The DataFrame loaded through the notebook-level ``spark`` session.
    """
    data_path = base_path + filename_or_dir
    return (
        spark.read.format("csv")
        .option("header", "false")
        .option("delimiter", "|")
        .schema(schema)
        .load(data_path)
    )

# Reading the Reference Parquet files

# The raw reference columns carry a leading underscore and hyphens; map each
# one to the snake_case name used by the rest of the notebook.
ref_column_renames = {
    "_device-id": "device_id",
    "_dma": "dma",
    "_dma-code": "dma_code",
    "_household-id": "household_id",
    "_household-type": "household_type",
    "_system-type": "system_type",
    "_zipcode": "zipcode",
}

ref_data_df = spark.read.parquet('/ref_data_raw')
for raw_name, clean_name in ref_column_renames.items():
    ref_data_df = ref_data_df.withColumnRenamed(raw_name, clean_name)

# Reading the Daily Programs CSV file

# Column layout of the daily program files, in file order.
daily_prog_schema = (
    StructType()
    .add('prog_code', StringType())
    .add('title', StringType())
    .add('genre', StringType())
    .add('air_date', StringType())
    .add('air_time', StringType())
    .add('Duration', FloatType())
)
program_data_df = load_PD_file("Daily program data/", daily_prog_schema)


# Reading the 2.5% sample of the viewing data from a Parquet file

viewing_data_path = '/sample_viewing_2_5percent'
viewing_data_df = spark.read.parquet(viewing_data_path)


# Reading the Demographic CSV file

# Demographic columns in file order. Every column is read as a string except
# the ones listed in `demographic_int_columns`, which are read as integers.
demographic_columns = [
    'household_id',
    'household_size',
    'num_adults',
    'num_generations',
    'adult_range',
    'marital_status',
    'race_code',
    'presence_children',
    'num_children',
    'age_children',  # format like range - 'bitwise'
    'age_range_children',
    'dwelling_type',
    'home_owner_status',
    'length_residence',
    'home_market_value',
    'num_vehicles',
    'vehicle_make',
    'vehicle_model',
    'vehicle_year',
    'net_worth',
    'income',
    'gender_individual',
    'age_individual',
    'education_highest',
    'occupation_highest',
    'education_1',
    'occupation_1',
    'age_2',
    'education_2',
    'occupation_2',
    'age_3',
    'education_3',
    'occupation_3',
    'age_4',
    'education_4',
    'occupation_4',
    'age_5',
    'education_5',
    'occupation_5',
    'polit_party_regist',
    'polit_party_input',
    'household_clusters',
    'insurance_groups',
    'financial_groups',
    'green_living',
]

demographic_int_columns = {
    'household_size',
    'num_adults',
    'num_generations',
    'num_children',
    'length_residence',
    'num_vehicles',
    'vehicle_year',
    'net_worth',
    'age_individual',
    'age_2',
    'age_3',
    'age_4',
    'age_5',
}

demographic_schema = StructType([
    StructField(name, IntegerType() if name in demographic_int_columns else StringType())
    for name in demographic_columns
])

demographic_data_df = load_PD_file("demographic/", demographic_schema)


## 2.1

In [None]:
# Keep only the columns this question needs.
program_data_df_2 = program_data_df.select('prog_code', 'genre')
viewing_data_df_2 = viewing_data_df.select('device_id', 'prog_code')
ref_data_df_2 = ref_data_df.select('device_id', 'dma').na.drop().dropDuplicates()

# The genre column is a comma-separated list: turn it into one row per
# (program, single genre) and remove repeated pairs.
genre_array = split(col('genre'), ',')
program_data_df_2 = program_data_df_2.withColumn('genre', explode(genre_array)).distinct()

# Attach a genre to every viewing record, then the DMA of the viewing device.
viewing_with_genre_df = viewing_data_df_2.join(program_data_df_2, on='prog_code', how='inner')
merged_df = viewing_with_genre_df.join(ref_data_df_2, on='device_id', how='inner')

# Popularity = number of joined viewing rows per (dma, genre), sorted
# descending on both dma and popularity.
popularity_df = (
    merged_df
    .groupBy('dma', 'genre')
    .agg(count('prog_code').alias('popularity'))
    .drop('prog_code')
    .dropDuplicates()
    .orderBy('dma', 'popularity', ascending=False)
)

# Pandas copy of the popularity table, displayed so it can be saved as CSV.
popularity_pandas_df = popularity_df.toPandas()
display(popularity_pandas_df)

# Every DMA that appears in the popularity table.
distinct_dmas = [dma_row[0] for dma_row in popularity_df.select('dma').distinct().collect()]

# Top-5 genres for the DMAs of interest (only those present in the data).
dmas_of_interest = ['Waco-Temple-Bryan', 'New York', 'Washington, DC (Hagrstwn)']
dma_dataframes = {}

for dma in distinct_dmas:
    if dma not in dmas_of_interest:
        continue
    top_genres_df = popularity_df.filter(col('dma') == dma).orderBy('popularity', ascending=False).limit(5)
    dma_dataframes[dma] = top_genres_df.select('dma', 'genre')

# Show the most popular genres of each selected DMA.
for df in dma_dataframes.values():
    df.show()

## 2.2

#### Preprocessing

In [None]:
# Column subsets used throughout section 2.2.
program_data_df_2 = program_data_df.select('prog_code', 'genre')
viewing_data_df_2 = viewing_data_df.select('device_id', 'prog_code')
ref_data_df_2 = ref_data_df.select('household_id', 'device_id', 'dma').na.drop().dropDuplicates()
demographic_data_df_2 = demographic_data_df.select('household_id', 'net_worth', 'income')

# Missing values become 0. fillna(0) only touches numeric columns, so the
# string-typed income column is handled separately: any row where
# `income != 'null'` is not true (the literal 'null' or a missing value)
# falls through to otherwise(0).
income_col = col('income')
demographic_data_df_2 = demographic_data_df_2.fillna(0)
demographic_data_df_2 = demographic_data_df_2.withColumn(
    'income', when(income_col != 'null', income_col).otherwise(0)
)

# Letter codes in the income column and the numbers they are replaced with.
mapping = {'A': 10, 'B': 11, 'C': 12, 'D': 13}

# Build when('A').when('B')... from the mapping; values that are not a
# letter code are kept unchanged.
income_decoded = None
for letter, code in mapping.items():
    is_letter = col('income') == letter
    income_decoded = when(is_letter, code) if income_decoded is None else income_decoded.when(is_letter, code)

demographic_data_df_2 = demographic_data_df_2.withColumn('income', income_decoded.otherwise(col('income')))

# Both measures are needed as integers for the averages computed later.
demographic_data_df_2 = (
    demographic_data_df_2
    .withColumn('income', col('income').cast('integer'))
    .withColumn('net_worth', col('net_worth').cast('integer'))
)

#### Calculating Wealth-Score per DMA

In [None]:
# Largest net worth and income over all households, fetched in ONE Spark job.
# `max` is pyspark.sql.functions.max here (wildcard import in the first cell),
# not the Python builtin.
max_row = demographic_data_df_2.agg(
    max('net_worth').alias('max_net_worth'),
    max('income').alias('max_income'),
).collect()[0]
max_net_worth = max_row['max_net_worth']
max_income = max_row['max_income']

# Match designated market areas to household demographics.
# NOTE(review): ref_data_df_2 holds one row per (household, device, dma), so a
# household with several devices contributes several rows to the averages
# below - confirm that device-weighted averages are intended.
household_dma_df = (
    ref_data_df_2
    .join(demographic_data_df_2, on='household_id', how='inner')
    .drop('household_id', 'device_id')
)

# wealth_score = avg(net_worth)/max(net_worth) + avg(income)/max(income) per DMA.
# Both averages come from a single aggregation, which already yields one row
# per DMA, so no join back onto the row-level data or de-duplication is needed.
demo_ref_df = (
    household_dma_df
    .groupBy('dma')
    .agg(avg('net_worth').alias('avg_net_worth'), avg('income').alias('avg_income'))
    .withColumn(
        'wealth_score',
        (col('avg_net_worth') / max_net_worth) + (col('avg_income') / max_income),
    )
    .drop('avg_net_worth', 'avg_income')
)

# DMA names from the wealthiest to the least wealthy.
dma_by_order = [
    dma_row['dma']
    for dma_row in demo_ref_df.orderBy('wealth_score', ascending=False).select('dma').collect()
]

#### Finding most popular genres per DMA

In [None]:
# The genre column is a comma-separated list: turn it into one row per
# (program, single genre). Re-running this on already-split data is harmless.
program_data_df_2 = program_data_df_2.withColumn('genre', explode(split(col('genre'), ','))).distinct()

# All genres, as a DataFrame (for the cross join) and as a Python set
# (for the selection step). The set is built from genres_df rather than by
# recomputing the same distinct() a second time.
genres_df = program_data_df_2.select('genre').distinct()
available_genres = {genre_row['genre'] for genre_row in genres_df.collect()}

# Device -> DMA lookup under its own name. ref_data_df_2 is deliberately NOT
# overwritten: the wealth-score cell joins it on household_id and could not
# be re-run if that column were dropped here.
device_dma_df = ref_data_df_2.select('device_id', 'dma')

# Merge viewings with genres and DMAs.
# NOTE(review): distinct() keeps one row per (device, program, genre, dma), so
# repeated viewings of a program by the same device count once here, whereas
# section 2.1 counts every viewing row - confirm the difference is intended.
merged_df = (
    viewing_data_df_2
    .join(program_data_df_2, on='prog_code', how='inner')
    .join(device_dma_df, on='device_id', how='inner')
    .distinct()
)

# Popularity per (dma, genre). groupBy already returns unique keys and no
# prog_code column, so no extra drop / dropDuplicates is required.
genre_counts_df = merged_df.groupBy('dma', 'genre').agg(count('prog_code').alias('popularity'))

# Every (dma, genre) combination for the DMAs that have viewing data.
dma_genre_df = merged_df.select('dma').distinct().crossJoin(genres_df)

# Genres never watched in a DMA get popularity 0; sort descending on dma and
# popularity.
popularity_df = (
    dma_genre_df
    .join(genre_counts_df, on=['dma', 'genre'], how='left')
    .fillna(0)
    .orderBy('dma', 'popularity', ascending=False)
)

#### Creating a dataframe of popular genres for each DMA

In [None]:
# Number of genres assigned to each DMA.
GENRES_PER_DMA = 8

# Per-DMA view of the popularity table, most popular genre first. These are
# lazy definitions kept for ad-hoc inspection; the selection below does not
# trigger a Spark job per DMA.
dma_dataframes = {
    dma_name: popularity_df.filter(col('dma') == dma_name)
                           .orderBy('popularity', ascending=False)
                           .select('dma', 'genre')
    for dma_name in dma_by_order
}

# Bring the (dma, genre, popularity) table to the driver ONCE and group it in
# Python, instead of collecting a filtered DataFrame for every DMA.
# list.sort is stable, so genres with equal popularity keep their collected order.
popularity_rows = popularity_df.select('dma', 'genre', 'popularity').collect()
popularity_rows.sort(key=lambda r: r['popularity'], reverse=True)

genres_by_dma = {}
for popularity_row in popularity_rows:
    genres_by_dma.setdefault(popularity_row['dma'], []).append(popularity_row['genre'])

# Work on a copy so `available_genres` is not drained; re-running this cell
# then yields the same assignment.
remaining_genres = set(available_genres)

# Walk the DMAs from wealthiest to least wealthy; each takes its most popular
# genres that no wealthier DMA has taken yet.
selected_genres = {}
for dma in dma_by_order:
    picked = []
    # A DMA without rows in popularity_df gets an empty list.
    for genre in genres_by_dma.get(dma, []):
        if len(picked) == GENRES_PER_DMA:
            break
        if genre in remaining_genres:
            picked.append(genre)
            remaining_genres.remove(genre)
    selected_genres[dma] = picked

# Convert the dictionary to a list of (dma, genres) tuples.
selected_genres_list = [(dma_name, genres) for dma_name, genres in selected_genres.items()]

# Explicit schema: inference cannot type an empty genre list and fails on an
# empty input list. Assumes DMA names are strings, as in the outputs shown.
selected_genres_schema = StructType([
    StructField('dma', StringType()),
    StructField('genre', ArrayType(StringType())),
])
selected_genres_df = spark.createDataFrame(selected_genres_list, selected_genres_schema)

# Add the wealth score and list the wealthiest DMA first.
selected_genres_df = (
    selected_genres_df
    .join(demo_ref_df, on='dma')
    .select('dma', 'wealth_score', 'genre')
    .orderBy('wealth_score', ascending=False)
)

#### Displaying results

In [None]:
# Pandas copy of the result with each genre list flattened to one
# comma-separated string.
pandas_df = selected_genres_df.toPandas()
pandas_df['genre'] = pandas_df['genre'].map(', '.join)

# Rendered table (can be saved as CSV from the notebook UI).
display(pandas_df)

# Plain-text preview of the first 25 rows of the Spark DataFrame.
selected_genres_df.show(25)

dma,wealth_score,genre
San Antonio,1.623931623931624,
Baltimore,1.3484309314846228,"Reality, Talk, Sitcom, News, Comedy, Documentary, Drama, Children"
San Francisco-Oak-San Jose,1.3357808419815185,"Animated, Adventure, Fantasy, Educational, Crime, Action, Sports non-event, Cooking"
Detroit,1.305826181524095,"Sports event, Crime drama, House/garden, Entertainment, Mystery, Suspense, Newsmagazine, Special"
Austin,1.2722546588818684,"Game show, Football, Consumer, Interview, Travel, Politics, Law, Public affairs"
Sacramnto-Stkton-Modesto,1.2359677521362542,"Shopping, Western, Science fiction, Home improvement, Music, Horror, Animals, Outdoors"
Cleveland-Akron (Canton),1.211882525608016,"Basketball, Science, History, Baseball, Playoff sports, How-to, Golf, Paranormal"
Harrisburg-Lncstr-Leb-York,1.2019055299195078,"Medical, Health, Nature, Romance, Sports talk, Auto, Religious, Soap"
Toledo,1.199510473429358,"Romance-comedy, Comedy-drama, War, Fashion, Docudrama, Auto racing, Weather, Bus./financial"
Philadelphia,1.1948499023773462,"Hunting, Community, Biography, Hockey, Tennis, Fishing, Soccer, Variety"


+--------------------+------------------+--------------------+
|                 dma|      wealth_score|               genre|
+--------------------+------------------+--------------------+
|         San Antonio| 1.623931623931624|                  []|
|           Baltimore|1.3484309314846228|[Reality, Talk, S...|
|San Francisco-Oak...|1.3357808419815185|[Animated, Advent...|
|             Detroit| 1.305826181524095|[Sports event, Cr...|
|              Austin|1.2722546588818684|[Game show, Footb...|
|Sacramnto-Stkton-...|1.2359677521362542|[Shopping, Wester...|
|Cleveland-Akron (...| 1.211882525608016|[Basketball, Scie...|
|Harrisburg-Lncstr...|1.2019055299195078|[Medical, Health,...|
|              Toledo| 1.199510473429358|[Romance-comedy, ...|
|        Philadelphia|1.1948499023773462|[Hunting, Communi...|
|             Houston| 1.193882983854647|[Spanish, Collect...|
|           Lexington| 1.191293806480053|[Horse, Technolog...|
|      Seattle-Tacoma|1.1870636727174375|[Action sports

# Part 3

In [None]:
pandas_df_3 = selected_genres_df.toPandas()
program_data_df_3 = program_data_df.na.drop()

# program_data_df keeps the raw comma-separated genre string, so an equality
# test against a single genre would miss every multi-genre program. Split it
# once into a helper array column and test membership instead.
program_genres_df = program_data_df_3.withColumn('genre_list', split(col('genre'), ','))

# Rows whose DMA was assigned at least one genre.
non_empty_rows = pandas_df_3[pandas_df_3['genre'].apply(lambda x: len(x) > 0)]
non_empty_dmas = non_empty_rows['dma'].unique()

# Root output folder.
path = "/mnt/partitions_211903752_337844252"
dbutils.fs.mkdirs(path)

for dma in non_empty_dmas:
    # One folder per DMA. '/' inside a name would create extra nested
    # folders, so it is replaced.
    dma_path = f"{path}/{dma.replace('/', '_')}"
    dbutils.fs.mkdirs(dma_path)

    # Genre lists assigned to this DMA (one list per matching row).
    genres_per_dma = non_empty_rows[non_empty_rows['dma'] == dma]['genre'].tolist()
    for genres in genres_per_dma:
        for genre in genres:
            # Programs tagged with this genre; the helper column is dropped
            # so the written columns match program_data_df.
            genre_programs_df = (
                program_genres_df
                .filter(array_contains(col('genre_list'), genre))
                .drop('genre_list')
            )

            # Write inside the DMA's folder (genres such as 'House/garden'
            # contain '/', which is replaced). 'overwrite' lets the cell be
            # re-run without failing on existing output.
            genre_dir_path = f"{dma_path}/{genre.replace('/', '_')}"
            genre_programs_df.write.mode('overwrite').csv(f'{genre_dir_path}.csv')