In [None]:
Pkg.add("PGFPlots");
Pkg.add("Iterators");
Pkg.add("BayesNets");
Pkg.add("LightGraphs");
Pkg.add("TikzGraphs");
Pkg.add("Discretizers");
Pkg.add("RDatasets");
Pkg.add("Plots");

In [None]:
using Iterators
using LightGraphs
using BayesNets
using PGFPlots
using TikzGraphs
using Discretizers
using RDatasets
using Plots

In [None]:
# Load the movie metadata CSV into `movieData`; the binning and discretization
# cells below read their columns from this table.
movieData = readtable("movie_metadata.csv");

In [None]:
#Variables in the model:
#Gross (query variable)
#Budget (evidence/input variable)
#Number of genres (evidence/input variable)
#IMDB score (evidence/input variable)
#Number of critic reviews (evidence/input variable)
#The remaining variables are part of the network but are not supplied as evidence:
#Number of movie Facebook likes
#Total cast Facebook likes
#Number of faces in movie poster
#Director Facebook likes
#Content rating
#Duration
#Title year


In [None]:
##### MANUAL BINNING #####
# Hand-picked bin edges, one vector per continuous column. Each vector is
# passed to LinearDiscretizer further down.

# Release year and running time
titleYear_edges = [1980, 2000, 2005, 2010, 2020]
duration_edges = [91, 121, 151, 331]

# Money columns
budget_edges = [0, 10_000_001, 35_000_001, 300_000_001]
gross_edges = [0, 10_000_000, 50_000_000, 200_000_000, 500_000_000, 800_000_000]

# Facebook like counts
numMovieFacebookLikes_edges = [1, 10_000, 400_000]
directorFacebookLikes_edges = [1, 10_000, 40_000]
castMovieLikes_edges = [100, 10_000, 40_000, 700_000]

# Poster faces, IMDB score and critic review counts
numFacesInPoster_edges = [1, 2, 6, 20]
imdbScore_edges = [3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1, 10.0]
critic_edges = [101, 201, 301, 401, 820]

In [None]:
##### UNIFORM WIDTH BINNING #####
# Alternative to the manual edges: for every continuous column, ask
# DiscretizeUniformWidth for the edges of a fixed number of bins. Each
# `*_bins` variable is the bin count used for the matching column.
#nbinsLarge = 7
#nbinsSmall = 3

titleYear_bins = 5
titleYear_edges = binedges(DiscretizeUniformWidth(titleYear_bins), movieData[:title_year])

budget_bins = 3
budget_edges = binedges(DiscretizeUniformWidth(budget_bins), movieData[:budget])

gross_bins = 5
gross_edges = binedges(DiscretizeUniformWidth(gross_bins), movieData[:gross])

duration_bins = 4
duration_edges = binedges(DiscretizeUniformWidth(duration_bins), movieData[:duration])

numMovieFacebookLikes_bins = 3
# Fixed: this call used the misspelled name `numMoveFacebookLikes_bins`, which
# is never defined and therefore raised an UndefVarError.
numMovieFacebookLikes_edges = binedges(DiscretizeUniformWidth(numMovieFacebookLikes_bins), movieData[:movie_facebook_likes])

directorFacebookLikes_bins = 3
directorFacebookLikes_edges = binedges(DiscretizeUniformWidth(directorFacebookLikes_bins), movieData[:director_facebook_likes])

castMovieLikes_bins = 4
castMovieLikes_edges = binedges(DiscretizeUniformWidth(castMovieLikes_bins), movieData[:cast_total_facebook_likes])

numFacesInPoster_bins = 4
numFacesInPoster_edges = binedges(DiscretizeUniformWidth(numFacesInPoster_bins), movieData[:facenumber_in_poster])

imdbScore_bins = 8
imdbScore_edges = binedges(DiscretizeUniformWidth(imdbScore_bins), movieData[:imdb_score])

critic_bins = 5
critic_edges = binedges(DiscretizeUniformWidth(critic_bins), movieData[:num_critic_for_reviews])


In [None]:
##### UNIFORM COUNT BINNING #####
# Alternative to the manual edges: for every continuous column, ask
# DiscretizeUniformCount for the edges of a fixed number of bins. Each
# `*_bins` variable is the bin count used for the matching column.
#nbinsLarge = 7
#nbinsSmall = 3

titleYear_bins = 5
titleYear_edges = binedges(DiscretizeUniformCount(titleYear_bins), movieData[:title_year])

budget_bins = 3
budget_edges = binedges(DiscretizeUniformCount(budget_bins), movieData[:budget])

gross_bins = 5
gross_edges = binedges(DiscretizeUniformCount(gross_bins), movieData[:gross])

duration_bins = 4
duration_edges = binedges(DiscretizeUniformCount(duration_bins), movieData[:duration])

numMovieFacebookLikes_bins = 3
# Fixed: this call used the misspelled name `numMoveFacebookLikes_bins`, which
# is never defined and therefore raised an UndefVarError.
numMovieFacebookLikes_edges = binedges(DiscretizeUniformCount(numMovieFacebookLikes_bins), movieData[:movie_facebook_likes])

directorFacebookLikes_bins = 3
directorFacebookLikes_edges = binedges(DiscretizeUniformCount(directorFacebookLikes_bins), movieData[:director_facebook_likes])

castMovieLikes_bins = 4
castMovieLikes_edges = binedges(DiscretizeUniformCount(castMovieLikes_bins), movieData[:cast_total_facebook_likes])

numFacesInPoster_bins = 4
numFacesInPoster_edges = binedges(DiscretizeUniformCount(numFacesInPoster_bins), movieData[:facenumber_in_poster])

imdbScore_bins = 8
imdbScore_edges = binedges(DiscretizeUniformCount(imdbScore_bins), movieData[:imdb_score])

critic_bins = 5
critic_edges = binedges(DiscretizeUniformCount(critic_bins), movieData[:num_critic_for_reviews])

In [None]:
##### BAYESIAN BLOCKS BINNING #####
# Alternative to the manual edges: every continuous column gets its bin edges
# from DiscretizeBayesianBlocks, so no bin count is specified here.

# Money columns
gross_edges = binedges(
    DiscretizeBayesianBlocks(),
    movieData[:gross],
)
budget_edges = binedges(
    DiscretizeBayesianBlocks(),
    movieData[:budget],
)

# Release year and running time
titleYear_edges = binedges(
    DiscretizeBayesianBlocks(),
    movieData[:title_year],
)
duration_edges = binedges(
    DiscretizeBayesianBlocks(),
    movieData[:duration],
)

# Facebook like counts
numMovieFacebookLikes_edges = binedges(
    DiscretizeBayesianBlocks(),
    movieData[:movie_facebook_likes],
)
directorFacebookLikes_edges = binedges(
    DiscretizeBayesianBlocks(),
    movieData[:director_facebook_likes],
)
castMovieLikes_edges = binedges(
    DiscretizeBayesianBlocks(),
    movieData[:cast_total_facebook_likes],
)

# Poster faces, IMDB score and critic review counts
numFacesInPoster_edges = binedges(
    DiscretizeBayesianBlocks(),
    movieData[:facenumber_in_poster],
)
imdbScore_edges = binedges(
    DiscretizeBayesianBlocks(),
    movieData[:imdb_score],
)
critic_edges = binedges(
    DiscretizeBayesianBlocks(),
    movieData[:num_critic_for_reviews],
)

In [None]:
# Data discretization: build one discretizer per column.
# Content rating is categorical, so its discretizer is built from the column
# itself; every other column uses a LinearDiscretizer over whichever
# `*_edges` vector is currently defined.
contentRating_discretizer = CategoricalDiscretizer(movieData[:content_rating])

# Money columns
gross_discretizer  = LinearDiscretizer(gross_edges)
budget_discretizer = LinearDiscretizer(budget_edges)

# Release year and running time
titleYear_discretizer = LinearDiscretizer(titleYear_edges)
duration_discretizer  = LinearDiscretizer(duration_edges)

# Facebook like counts
numMovieFacebookLikes_discretizer = LinearDiscretizer(numMovieFacebookLikes_edges)
directorFacebookLikes_discretizer = LinearDiscretizer(directorFacebookLikes_edges)
castMovieLikes_discretizer        = LinearDiscretizer(castMovieLikes_edges)

# Poster faces, IMDB score and critic review counts
numFacesInPoster_discretizer = LinearDiscretizer(numFacesInPoster_edges)
imdbScore_discretizer        = LinearDiscretizer(imdbScore_edges)
critic_discretizer           = LinearDiscretizer(critic_edges)

In [None]:
# Assemble the discretized data set used for structure learning and evaluation.
# Each column is `encode`d with the discretizer built for it, except
# `numGenres`, which is copied from movieData[:num_genres] without encoding.
# Keep the column order as written: a later cell reads the first column by
# position (dataDiscretized[1]) to obtain the row count.
dataDiscretized = DataFrame(
    gross = encode(gross_discretizer, movieData[:gross]),
    budget = encode(budget_discretizer, movieData[:budget]),
    numGenres = movieData[:num_genres],
    imdbScore = encode(imdbScore_discretizer, movieData[:imdb_score]),
    numCriticReviews = encode(critic_discretizer, movieData[:num_critic_for_reviews]),
    numMovieFacebookLikes = encode(numMovieFacebookLikes_discretizer, movieData[:movie_facebook_likes]),
    castMovieLikes = encode(castMovieLikes_discretizer, movieData[:cast_total_facebook_likes]),
    numFacesInPoster = encode(numFacesInPoster_discretizer, movieData[:facenumber_in_poster]),
    directorFacebookLikes = encode(directorFacebookLikes_discretizer, movieData[:director_facebook_likes]),
    contentRating = encode(contentRating_discretizer, movieData[:content_rating]),
    duration = encode(duration_discretizer, movieData[:duration]),
    titleYear = encode(titleYear_discretizer, movieData[:title_year]),
);

In [None]:
# Size of the train/test split: the first `percentageTrain` fraction of the
# rows (rounded down) is the training partition.
percentageTrain = 0.9
totalSize = size(dataDiscretized, 1)
lastTrainExample = floor(Int, percentageTrain * totalSize)

In [None]:
# Save the full discretized table to disk, then cut it into two consecutive
# row ranges: rows 1..lastTrainExample for training, the rest for testing.
# Rows are taken in table order; nothing is shuffled here.
writetable("dataDiscretized.csv", dataDiscretized)

dataDiscretizedTrain = dataDiscretized[1:lastTrainExample, :];
dataDiscretizedTest = dataDiscretized[(lastTrainExample + 1):totalSize, :];
# To inspect the partitions:
#   display(dataDiscretizedTrain)
#   display(dataDiscretizedTest)

In [None]:
## Structure learning: K2
# The symbol vector is the first argument to K2GraphSearch; keep its order
# exactly as written when editing.
# NOTE(review): :numGenres is a column of the training table but does not
# appear in this vector; confirm that leaving it out is intentional.
params = K2GraphSearch(
    [
        :budget,
        :gross,
        :numFacesInPoster,
        :directorFacebookLikes,
        :titleYear,
        :duration,
        :contentRating,
        :castMovieLikes,
        :imdbScore,
        :numCriticReviews,
        :numMovieFacebookLikes,
    ],
    DiscreteCPD,
    max_n_parents = 4,
);
# CPD type names noted by the author:
#   CategoricalCPD{Categorical{Float64}}
#   DiscreteCPD
bn = fit(DiscreteBayesNet, dataDiscretizedTrain, params)

In [None]:
# Structure learning: greedy hill climbing.
# Fit on the training partition only. bn2 is the network that
# getPredictionError evaluates on dataDiscretizedTest, so learning from the
# full table would leak the test rows into the model and understate the
# prediction error. This also matches the K2 cell, which fits on
# dataDiscretizedTrain.
params2 = GreedyHillClimbing(ScoreComponentCache(dataDiscretizedTrain), max_n_parents=3, prior=UniformPrior())
bn2 = fit(DiscreteBayesNet, dataDiscretizedTrain, params2)

In [None]:
# Score the K2-learned network `bn` against the training partition it was fitted on.
bayesian_score(bn, dataDiscretizedTrain)

In [None]:
"""
    likelihoodWeightedSampling(table, value)

Return the weighted fraction of rows in `table` whose `:gross` entry equals
`value`: the sum of `:p` over the matching rows divided by the sum of `:p`
over all rows.

`table` must support column lookup by symbol (`table[:gross]`, `table[:p]`),
and the two columns must have the same length. If `table` has no rows, or the
weights sum to zero, the result is `NaN` (0/0).
"""
function likelihoodWeightedSampling(table, value)
    # Read the two columns once instead of slicing out a one-row table on
    # every iteration.
    categories = table[:gross]
    weights = table[:p]

    # Start from a zero of the weight type so the accumulators never change type.
    matching = zero(eltype(weights))
    total = zero(eltype(weights))
    for (category, weight) in zip(categories, weights)
        if category == value
            matching += weight
        end
        total += weight
    end
    return matching / total
end

In [None]:
#getMostLikelyClass(table,5)

In [None]:
"""
    getMostLikelyClass(table, numClasses)

Return the `:gross` class with the largest total weight in `table`.

The `:p` weights are summed per `:gross` class and the class whose running
total is the first to reach the final maximum is returned. Returns `-1` when
`table` has no rows or no class ever accumulates a weight above zero.

`table` must support column lookup by symbol (`table[:gross]`, `table[:p]`).
Entries of `:gross` are used as indices into a vector of length `numClasses`,
so they must be integers in `1:numClasses`; a larger value raises a
`BoundsError`.
"""
function getMostLikelyClass(table, numClasses)
    # Read the two columns once instead of slicing out a one-row table on
    # every iteration.
    categories = table[:gross]
    weights = table[:p]

    classProbabilities = zeros(numClasses)
    highestLikelihood = 0.0
    mostLikelyClass = -1
    for (category, weight) in zip(categories, weights)
        classProbabilities[category] += weight
        # Track the maximum while accumulating. The strict `>` means an exact
        # tie keeps the class that reached the value first.
        if classProbabilities[category] > highestLikelihood
            highestLikelihood = classProbabilities[category]
            mostLikelyClass = category
        end
    end
    return mostLikelyClass
end

In [None]:
"""
    getPredictionError(testDataTable; net=bn2, nsamples=1000,
                       numClasses=length(gross_edges) - 1)

Return the fraction of rows in `testDataTable` whose `:gross` class is
predicted incorrectly.

For every row, the `:budget`, `:imdbScore`, `:numCriticReviews` and
`:numGenres` values are used as evidence. `nsamples` weighted samples
consistent with that evidence are drawn from `net`, passed through
`estimate`, and the most likely `:gross` class (via `getMostLikelyClass`) is
compared with the row's actual class.

# Keywords
- `net`: network to sample from; defaults to the global `bn2`.
- `nsamples`: number of weighted samples drawn per row.
- `numClasses`: number of `:gross` classes. Defaults to the number of bins
  implied by the global `gross_edges`. The previous code passed `nbinsSmall`,
  a name that is only ever defined inside comments, so the call failed with
  an UndefVarError.

Prints the index of each row before it is processed. Returns `NaN` when
`testDataTable` has no rows (0/0).
"""
function getPredictionError(testDataTable; net = bn2, nsamples = 1000,
                            numClasses = length(gross_edges) - 1)
    numRows = size(testDataTable, 1)
    numMistakes = 0
    for i in 1:numRows
        # Evidence for this row; cells are read directly rather than through a
        # one-row slice of the table.
        evidence = Assignment(
            :budget => testDataTable[i, :budget],
            :imdbScore => testDataTable[i, :imdbScore],
            :numCriticReviews => testDataTable[i, :numCriticReviews],
            :numGenres => testDataTable[i, :numGenres],
        )
        println(i)  # progress indicator: sampling is slow, one line per row
        samples = rand_table_weighted(net; nsamples = nsamples, consistent_with = evidence)
        estimatedTable = estimate(samples)

        predictedGrossCategory = getMostLikelyClass(estimatedTable, numClasses)
        if predictedGrossCategory != testDataTable[i, :gross]
            numMistakes += 1
        end
    end
    return numMistakes / numRows
end

In [None]:
# Evaluate the misclassification rate of the gross prediction on the held-out test partition.
getPredictionError(dataDiscretizedTest)

In [None]:
## DEBUGGING CODE - IGNORE.
# Scratch cell: show one test row and the critic-review bin edges, then draw
# weighted samples from `bn` for a fixed, hand-written evidence assignment.
row = dataDiscretizedTest[34, :]
display(row)
display(critic_edges)
#table(bn, :numCriticReviews)
#count(bn, :numCriticReviews, dataDiscretized)
table = rand_table_weighted(
    bn;
    nsamples = 1000,
    consistent_with = Assignment(
        :budget => 1,
        :imdbScore => 6,
        :numCriticReviews => 4,
        :numGenres => 3
    )
)
#estimatedTable = estimate(table)