In [116]:
import os.path as path
from pyspark.sql import SparkSession
from pyspark.sql import column
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
from pyspark.ml.feature import StringIndexer, VectorAssembler,OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier

In [152]:
def init_spark():
  return SparkSession \
        .builder \
        .appName("Big data project") \
        .config("spark.executor.memory", "8g") \
        .config("spark.driver.memory", "8g") \
        .getOrCreate()

spark = init_spark()

In [153]:
def get_data_dataframe(dir="final_dataset.csv"):
    # Specify the directory where the CSV files are saved
    csv_directory = "final_dataset.csv"
    
    # Read the CSV files back into a DataFrame
    final_df_read = spark.read.option("header", "true").csv(csv_directory)
    
    # Add a random column to shuffle data randomly
    shuffled_df = final_df_read.withColumn("rand", rand())
    shuffled_df = shuffled_df.orderBy("rand")
    final_df_read = shuffled_df.drop("rand")
    return final_df_read.head(10000)



In [154]:
def print_class_distribution(df):
    # Group the DataFrame by the "genre" column and count the occurrences of each genre
    genre_counts = df.groupBy("genre").count()
    
    # Show the genre counts
    genre_counts.show()

# Replacing Null Values With Zero

In [155]:
def show_null_values_distribution(train_df):
    null_value_list = list()
    for col_ in train_df.columns:
        null_value_list.append(train_df.filter(train_df[col_]=="\\N").count())                     
    plt.rcParams["figure.figsize"] = (10,6)
    columns = [col_ for col_ in train_df.columns]
    
    plt.figure(figsize=(10, 6))
    plt.bar(columns, null_value_list, color='skyblue')
    plt.xlabel('Columns')
    plt.ylabel('Number of Null Values')
    plt.ylim(0, train_df.count())
    plt.title('Distribution of Null Values in Columns')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Drop Duplicates

In [157]:
def drop_duplicates(train_df):
    print(train_df.count())
    train_df=train_df.dropDuplicates()
    print(train_df.count())

# Normalization

In [159]:
def get_stages_of_normalization(categoricalColumns =['title',
     'director',
     'writer',
     'actorPrimaryName'], predicting='genre'):
    stages = []
    for categoricalCol in categoricalColumns:
        stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
        # encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
        stages += [stringIndexer]
    label_stringIdx = StringIndexer(inputCol = predicting, outputCol = 'label')
    stages += [label_stringIdx]
    
    assemblerInputs = [c + "Index" for c in categoricalColumns] 
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    stages += [assembler]
    return stages

In [160]:

def get_normalized_data(df):
    pipeline = Pipeline(stages = stages)
    pipelineModel = pipeline.fit(df)
    final_df_read = pipelineModel.transform(df)
    selectedCols = ['label', 'features','genre']+['title',
     'director',
     'writer',
     'actorPrimaryName']
    final_df_read = final_df_read.select(selectedCols)
    final_df_read.printSchema()
    return final_df_read

# Utils

In [146]:
def print_and_return_mapping_of_index_to_label(df,predicting='genre'):
    # Collect distinct pairs of (label, genre)
    label_genre_mapping = df.select("label", predicting).distinct().collect()
    map={}
    # Print the mapping
    for mapping in label_genre_mapping:
        print("Label %s is mapped to genre '%s'" % (mapping.label, mapping.genre))
        map[mapping.label]= mapping.genre
    return map

In [147]:
print_and_return_mapping_of_index_to_label(final_df_read)

Label 2.0 is mapped to genre 'Action'
Label 5.0 is mapped to genre 'Romance'
Label 1.0 is mapped to genre 'Comedy'
Label 0.0 is mapped to genre 'Drama'
Label 4.0 is mapped to genre 'Thriller'
Label 3.0 is mapped to genre 'Horror'


{2.0: 'Action',
 5.0: 'Romance',
 1.0: 'Comedy',
 0.0: 'Drama',
 4.0: 'Thriller',
 3.0: 'Horror'}

# Train and Test

In [126]:
# Split the data into training and testing sets
train_df, test_df = final_df_read.randomSplit([0.8, 0.2], seed=42)

# Show the number of rows in each set
print("Training set count:", train_df.count())
print("Testing set count:", test_df.count())

24/03/20 16:07:44 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


Training set count: 63052
Testing set count: 15495


24/03/20 16:07:45 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


In [127]:
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label',maxBins=60000)
dtModel = dt.fit(train_df)

24/03/20 16:07:45 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/03/20 16:07:46 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/03/20 16:07:46 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/03/20 16:07:54 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/03/20 16:08:02 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/03/20 16:08:11 WARN DAGScheduler: Broadcasting large task binary with size 4.6 MiB
24/03/20 16:08:28 WARN DAGScheduler: Broadcasting large task binary with size 4.8 MiB
                                                                                

In [128]:
predictions = dtModel.transform(test_df)
predictions.show(10)

+-----+--------------------+-----+-------+--------------------+------------------+--------------------+--------------------+--------------------+----------+
|label|            features|genre|  title|            director|            writer|    actorPrimaryName|       rawPrediction|         probability|prediction|
+-----+--------------------+-----+-------+--------------------+------------------+--------------------+--------------------+--------------------+----------+
|  0.0|[6.0,3157.0,2194....|Drama| Escape|      Victor Bojinov|    Delyana Maneva|       Hristo Petkov|[9406.0,6620.0,21...|[0.42624733765350...|       0.0|
|  0.0|[11.0,189.0,155.0...|Drama|Goliath|  Adilkhan Yerzhanov|Adilkhan Yerzhanov|    Daniyar Alshinov|[390.0,70.0,90.0,...|[0.64462809917355...|       0.0|
|  0.0|[11.0,623.0,5627....|Drama|Goliath|    Frédéric Tellier|     Gaëlle Bellan|     Laurent Stocker|[19.0,120.0,259.0...|[0.04059829059829...|       2.0|
|  0.0|[22.0,2756.0,2530...|Drama|   Vera|    Nedeljko Kov

24/03/20 16:09:01 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB


# Evaluation

In [129]:


def print_statistis(predictions,mapping):

    # Compute raw scores on the test set
    predictionAndLabels = predictions.rdd.map(lambda lp: (lp.prediction, lp.label))
    
    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)
    confusion_matrix = metrics.confusionMatrix().toArray()
    
    # Overall statistics
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1Score = metrics.fMeasure(1.0)
    total_predictions = confusion_matrix.sum(axis=1)
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    
    # Statistics by class
    labels = predictions.rdd.map(lambda lp: lp.label).distinct().collect()
    accuracies = {}
    for label in sorted(labels):
        print("______________________"+mapping[label]+"_____________________")
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
        correct_predictions = confusion_matrix[label, label]
        accuracy = correct_predictions / total_predictions[label]
        print("Accuracy for label %s: %s" % (label, accuracy))
    
    # # Calculate the accuracy for each label
   
    # print(total_predictions)
    # for label in range(len(total_predictions)):
    #     correct_predictions = confusion_matrix[label, label]
    #     accuracy = correct_predictions / total_predictions[label]
    #     accuracies[label] = accuracy
    
    # # Print accuracies for each label
    # for label, accuracy in accuracies.items():
        
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Test Accuracy: {accuracy:.2f}")

24/03/20 16:09:01 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB


Test Accuracy: 0.41


In [150]:
print_statistis(predictions,print_and_return_mapping_of_index_to_label())

24/03/20 16:54:53 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
24/03/20 16:54:54 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
24/03/20 16:56:21 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB


Summary Stats
Precision = 0.41846323935876173
Recall = 0.17592377411108528
F1 Score = 0.2477094240837696




Class 0.0 precision = 0.4130849220103986
Class 0.0 recall = 0.811404255319149
Class 0.0 F1 Measure = 0.5474590869939707
Class 1.0 precision = 0.41846323935876173
Class 1.0 recall = 0.17592377411108528
Class 1.0 F1 Measure = 0.2477094240837696
Class 2.0 precision = 0.35239423523942354
Class 2.0 recall = 0.30626262626262624
Class 2.0 F1 Measure = 0.32771292693471676
Class 3.0 precision = 0.0
Class 3.0 recall = 0.0
Class 3.0 F1 Measure = 0.0
Class 4.0 precision = 0.0
Class 4.0 recall = 0.0
Class 4.0 F1 Measure = 0.0
Class 5.0 precision = 0.0
Class 5.0 recall = 0.0
Class 5.0 F1 Measure = 0.0
[5875. 4303. 2475. 1458. 1090.  299.]
Accuracy for label 0: 0.811404255319149
Accuracy for label 1: 0.17592377411108528
Accuracy for label 2: 0.30626262626262624
Accuracy for label 3: 0.0
Accuracy for label 4: 0.0
Accuracy for label 5: 0.0


                                                                                

Accuracy for label 0: 0.811404255319149
Accuracy for label 1: 0.17592377411108528
Accuracy for label 2: 0.30626262626262624
Accuracy for label 3: 0.0
Accuracy for label 4: 0.0
Accuracy for label 5: 0.0
