# Data Prep:

In [0]:
import numpy as np
import pandas as pd
import imageio
import skimage
from sklearn.naive_bayes import GaussianNB
import re
from copy import deepcopy
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

## Global Definitions

In [0]:
PATH = "src"  # Directory that contains MovieGenre.csv and the posters/ folder
NEW_SIZE = (44,30,3)  # Output shape passed to skimage.transform.resize in preprocess_image

## Prep Functions

In [0]:
def read_and_clean_data():
    """Load the movie metadata CSV and remove rows that cannot be used.

    Reads ``MovieGenre.csv`` from ``PATH`` (columns imdbId, Title, Genre and
    Poster), indexes the frame by ``imdbId``, drops rows with any missing
    value and drops every row whose poster link appears more than once.
    The frame shape is printed after each step.

    Returns:
        The cleaned DataFrame indexed by ``imdbId``.
    """
    wanted_columns = ["imdbId", "Title", "Genre", "Poster"]
    movies = pd.read_csv(
        PATH + "/MovieGenre.csv",
        encoding="ISO-8859-1",
        usecols=wanted_columns,
    )
    movies.set_index(["imdbId"], inplace=True)
    print(f"Shape of the original dataset: {movies.shape}")

    # Any row lacking one of the selected fields is discarded.
    movies.dropna(inplace=True)
    print(f"Shape after dropping rows with missing values: {movies.shape}")

    # keep=False removes *all* rows sharing a poster link, not just the repeats.
    movies.drop_duplicates(subset="Poster", keep=False, inplace=True)
    print(f"Shape after dropping rows with potentially misleading poster link: {movies.shape}\n")
    return movies

In [0]:
def get_genre_feature(genre):
    """Return the boolean column name used for ``genre``.

    The name is ``"is_"`` followed by the lower-cased genre with every run of
    non-word characters removed, e.g. ``"Sci-Fi"`` -> ``"is_scifi"``.
    """
    return "is_" + re.sub(r'\W+', '', genre.lower())

def create_boolean_genres(df, min_count=2000):
    """Replace the ``Genre`` column of ``df`` with one boolean column per frequent genre.

    ``df`` is modified in place: its ``Genre`` strings are split on ``"|"``,
    one ``is_<genre>`` column is added for every genre that occurs in at least
    ``min_count`` rows, and the ``Genre`` column is finally dropped.

    Args:
        df: DataFrame with a ``Genre`` column of ``"|"``-separated genre names.
        min_count: Minimum number of rows a genre needs in ``df`` to be kept.

    Returns:
        The kept genre names, sorted alphabetically (the boolean columns are
        appended to ``df`` in the same order).
    """
    df["Genre"] = df.Genre.map(lambda x: x.split("|"))
    all_genres = {item for genre_list in df.Genre for item in genre_list}
    filtered_genres = []
    # Sorted so that the column order and the returned list do not depend on
    # set iteration order (which varies between interpreter runs for strings).
    for genre in sorted(all_genres):
        # Membership mask computed once on `df` itself; the previous version
        # counted on the global `movie_data`, ignoring the frame passed in.
        has_genre = df.Genre.map(lambda genres: genre in genres)
        if has_genre.sum() < min_count:
            print(f"{genre} was filtered out!")
            continue
        filtered_genres.append(genre)
        df[get_genre_feature(genre)] = has_genre

    print(f"\nThere are {len(filtered_genres)} genres in the dataset: {filtered_genres}\n")
    df.drop(["Genre"], axis=1, inplace=True)
    return filtered_genres

In [0]:
def add_year_variable(df):
    """Add an integer ``Year`` column to ``df`` in place, parsed from ``Title``.

    The year is the first four-digit number enclosed in parentheses in the
    title (e.g. ``"Toy Story (1995)"`` -> 1995). Titles without such a group
    get the placeholder year 0, so the minimum printed below can be 0.

    Args:
        df: DataFrame with a string ``Title`` column.
    """
    # Raw string: "\(" and "\d" are invalid escapes in a normal string literal.
    re_year = re.compile(r"\((\d{4})\)")

    def extract_year(title):
        # A single search per title; the first match is the one findall()[0] gave.
        match = re_year.search(title)
        return int(match.group(1)) if match else 0

    df["Year"] = df.Title.map(extract_year)
    print(f"There are movies between {int(np.min(df.Year))} and {int(np.max(df.Year))} available in the dataset.\n")

In [0]:
def filter_by_year(df, year):
    """Return the rows of ``df`` whose ``Year`` is greater than or equal to ``year``.

    An independent copy is returned. The notebook adds columns to and sorts
    the result in place, which on a bare slice of ``df`` triggers pandas'
    SettingWithCopyWarning.

    Args:
        df: DataFrame with a numeric ``Year`` column.
        year: Earliest year to keep (inclusive).

    Returns:
        A new DataFrame holding the matching rows.
    """
    return df.loc[df.Year >= year].copy()

In [0]:
# Build the working table: load and clean the metadata, derive the release
# year, keep movies from 2000 onwards, then expand the genre lists into
# boolean columns. GENRES holds the genre names that were kept.
movie_data = read_and_clean_data()
add_year_variable(movie_data)
movie_data = filter_by_year(movie_data, 2000)
GENRES = create_boolean_genres(movie_data)

In [0]:
# Rows and columns left after the year filter and the genre expansion.
movie_data.shape

In [0]:
# Order the rows by their index (imdbId) and preview the first few movies.
movie_data.sort_index(inplace=True)
movie_data.head()

## Get Images

In [0]:
def preprocess_image(img):
    """Convert an RGB image to HSV and resize it to ``NEW_SIZE``.

    Args:
        img: Image array accepted by ``skimage.color.rgb2hsv``.

    Returns:
        The HSV image resized to ``NEW_SIZE`` with anti-aliasing, reflected
        borders and the input value range preserved.
    """
    hsv_img = skimage.color.rgb2hsv(img)
    resized = skimage.transform.resize(
        hsv_img,
        NEW_SIZE,
        anti_aliasing=True,
        preserve_range=True,
        mode='reflect',
    )
    return resized

In [0]:
# Add images via apply:
def get_image(row):
    """Return the preprocessed poster for one ``movie_data`` row, or ``None``.

    The poster is first read from ``PATH/posters/<imdbId>.jpg``, where the id
    is the row's index label.

    * If that file does not exist, ``None`` is returned and nothing is
      downloaded.
    * If the file exists but cannot be read or preprocessed, the error is
      printed and the poster is fetched from the URL in ``row.Poster``,
      written back to the local path and preprocessed.
    * If the fallback fails as well, ``None`` is returned.

    Args:
        row: A DataFrame row (as passed by ``DataFrame.apply(axis=1)``) whose
            ``name`` is the imdbId and which has a ``Poster`` URL field.
    """
    image_path = PATH + "/posters/{}.jpg".format(row.name)
    try:
        return preprocess_image(imageio.imread(image_path))
    except FileNotFoundError:
        # No local copy of the poster: skip this movie without downloading.
        # NOTE(review): if missing posters should be downloaded too, this is
        # the branch to change - confirm the intended behaviour.
        return None
    except Exception as e:
        print("Error in: ",row.name, "\n" , e)

    # The local file is present but unusable: try to replace it from the URL.
    try:
        img = imageio.imread(row.Poster)
        imageio.imwrite(image_path, img)
        img = preprocess_image(img)
        print(" -->  Fetched via Poster attribute")
        return img
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit and made a long-running apply() impossible to interrupt.
        return None

In [0]:
# Replace every poster URL with the preprocessed image array returned by
# get_image (None when the poster could not be loaded).
# NOTE: this overwrites the URLs, so the cell cannot simply be re-run;
# get_image reads row.Poster as a URL in its fallback path.
movie_data.Poster = movie_data.apply(get_image, axis=1)
print("DONE")

## Create explanatory and dependent datasets

In [0]:
# Drop rows with missing values - in particular the movies for which
# get_image returned None instead of an image.
movie_data.dropna(inplace=True)

In [0]:
# Number of remaining movies per kept genre (summing a boolean column counts its True values).
for genre in GENRES:
    print(f"{genre} movies in the dataset: {sum(movie_data[get_genre_feature(genre)])}")

In [0]:
# Preview only: each Poster cell now holds a full image array, so rendering
# the entire frame is slow and bloats the notebook output.
movie_data.head()

In [0]:
# poster_df: one row per movie with the poster array flattened into a 1-D feature vector.
poster_df = pd.DataFrame(movie_data.apply(lambda row: row.Poster.ravel() ,axis=1).tolist(), index=movie_data.index)
# genres_df: what remains of movie_data once Poster, Title and Year are removed,
# i.e. the boolean genre columns added by create_boolean_genres.
genres_df = movie_data.drop(["Poster", "Title","Year"], axis=1)
poster_df.head()

In [0]:
# X: list of poster image arrays (one per movie), as consumed by Estimator.
X = list(movie_data.Poster.values)
# y: 0/1 label matrix with one column per genre, in GENRES order. The columns
# are selected by name rather than with the positional `iloc[:, 3:]`, which
# silently breaks as soon as a column is added to or removed from movie_data.
y = movie_data[[get_genre_feature(genre) for genre in GENRES]].values * 1

# Baseline - Naive Bayes

In [0]:
# 5-fold cross-validation of the baseline: one independent Gaussian Naive
# Bayes classifier per genre, scored with the weighted F1 over all genres.
cv = KFold(n_splits=5)
f1_scores = []
for train_index, test_index in cv.split(poster_df, genres_df):
    train_posters, test_posters = poster_df.iloc[train_index], poster_df.iloc[test_index]
    train_genres, test_genres = genres_df.iloc[train_index], genres_df.iloc[test_index]
    models_dict = {genre:GaussianNB() for genre in GENRES}
    model_pred = {}
    model_scores = {}
    for genre, model in models_dict.items():
        genre_bool = get_genre_feature(genre)
        model.fit(train_posters, train_genres[genre_bool])
        model_pred[genre_bool] = model.predict(test_posters)

    # f1_score compares rows by position and ignores the index, so the
    # predictions must stay in exactly the row order of test_genres. The
    # previous `.sort_index()` on the predictions alone was only harmless
    # because these folds happen to be index-sorted already.
    pred_df = pd.DataFrame(model_pred, index=test_genres.index)
    f1 = f1_score(y_true=test_genres, y_pred=pred_df, average='weighted')
    f1_scores.append(f1)

print(f"Average F1-score is: {np.mean(f1_scores)}")

In [0]:
# Single hold-out evaluation of the per-genre Gaussian Naive Bayes baseline,
# keeping both the hard predictions and the positive-class probabilities.
models_dict = {genre:GaussianNB() for genre in GENRES}
train_posters, test_posters, train_genres, test_genres  = train_test_split(poster_df, genres_df)
model_pred = {}
model_proba = {}
model_scores = {}
for genre, model in models_dict.items():
    genre_bool = get_genre_feature(genre)
    model.fit(train_posters, train_genres[genre_bool])
    model_pred[genre_bool] = model.predict(test_posters)
    model_proba[genre] = model.predict_proba(test_posters)[:,1]

# train_test_split shuffles the rows and f1_score compares rows by position
# (the index is ignored). Sorting only the predictions, as was done before,
# paired every prediction with another movie's labels. All test-set frames
# now keep the row order of test_genres / test_posters.
pred_df = pd.DataFrame(model_pred, index=test_genres.index)
proba_df = pd.DataFrame(model_proba, index=test_genres.index).round(3)
f1 = f1_score(y_true=test_genres, y_pred=pred_df, average='weighted', )
print(f"Weighted F1-score is: {f1}")

# Basic, Advanced and Creative

In [0]:
## Aux functions

In [0]:
def normalize_params(params, r):
    """Rescale a list of parameter arrays so that their joint L2 norm is ``r``.

    The norm is taken over all entries of all arrays together; every array is
    multiplied by the same factor ``r / norm``.

    Args:
        params: Iterable of numpy arrays.
        r: Target norm.

    Returns:
        A new list with the rescaled arrays, in the same order.
    """
    joint_norm = np.sqrt(sum([np.sum(block * block) for block in params]))
    return [block * (r / joint_norm) for block in params]

def get_features(img_block):
    """Return the feature vector of an image block: its values flattened to 1-D."""
    flattened = img_block.ravel()
    return flattened
 
    
def normalize_factor(factor):
    """Scale ``factor`` so that all of its entries sum to one."""
    total = np.sum(factor)
    return factor / total

## Cluster and ClusterGraph classes 

In [0]:
class Cluster:
    """One node of the cluster graph: a potential plus the messages it sends.

    ``type`` selects how the potential ``pot`` is built:

    * ``'y'``   - ``[1 - genre_prior, genre_prior]``
    * ``'hy'``  - ``exp(params[1])``
    * ``'hhy'`` - ``exp(params[2] + params[3][is_ver])``
    * ``'h'``   - ``None`` until ``set_pot_X`` is called with an image block

    ``messages`` maps a neighbour key to the message sent to that neighbour.
    The key is ``'y'`` for a 'y' neighbour and ``(neighbor_type, loc)``
    otherwise (see ``add_neighbor``).
    """
    def __init__(self,cluster_type, num_hidden, params=None, loc=None, genre_prior=0.5, is_ver=None, is_creative=False):
        # `num_hidden` is accepted but not used by this constructor.
        self.loc = loc
        self.params = params
        self.type = cluster_type
        self.is_creative = is_creative
        self.is_ver = is_ver  # index into params[3]; only read for 'hhy' clusters
        if cluster_type == 'y':
            self.pot = np.array([1-genre_prior, genre_prior] )
        elif cluster_type == 'hy':
            self.pot = np.exp(self.params[1])
        elif cluster_type == 'hhy':
            self.pot = np.exp(self.params[2] + self.params[3][self.is_ver])
        elif cluster_type == 'h': 
            self.pot = None
        
        self.messages = {}           # current outgoing messages, keyed by neighbour
        self.prev_messages = {}      # snapshot of `messages` from the previous iteration
        self.original_messages = {}  # initial messages, restored by reset_messages()
        self.neighbors = {}          # neighbour key -> neighbouring Cluster
        self.belief = None           # set by calculate_belief()
    
    
    def set_params(self, params):
        """Store ``params`` and rebuild the potential of 'hy' / 'hhy' clusters from them."""
        self.params = params
        if self.type == 'hy':
            self.pot = np.exp(self.params[1])
        elif self.type == 'hhy':
            self.pot = np.exp(self.params[2] + self.params[3][self.is_ver])
    
    
    def set_pot_X(self, img_block):
        """Set the potential to ``exp(params[0] @ features)`` for the given image block."""
        self.pot = np.exp(self.params[0] @ get_features(img_block))
    
    
    def set_pot_y(self, y):
        """Condition the potential on label ``y`` by keeping only ``pot[y]``."""
        self.pot = self.pot[y]
    
    
    def add_neighbor(self, neighbor, messages, neighbor_type, loc=None):
        """Register ``neighbor`` together with the initial message sent to it.

        ``messages`` is deep-copied into the current, previous and original
        message tables. The key is ``'y'`` when ``neighbor_type`` is ``'y'``
        and ``(neighbor_type, loc)`` otherwise.
        """
        if neighbor_type == 'y':
            self.messages['y'] = deepcopy(messages)
            self.prev_messages['y'] = deepcopy(messages)
            self.original_messages['y'] = deepcopy(messages)
            self.neighbors['y'] = neighbor
        else:
            self.messages[(neighbor_type, loc)] = deepcopy(messages)
            self.prev_messages[(neighbor_type, loc)] = deepcopy(messages)
            self.original_messages[(neighbor_type, loc)] = deepcopy(messages)
            self.neighbors[(neighbor_type, loc)] = neighbor
    
    
    def reset_messages(self):
        """Restore all messages to their initial values and clear the belief."""
        self.prev_messages = deepcopy(self.original_messages)
        self.messages = deepcopy(self.original_messages)
        self.belief = None
    
    
    def calculate_belief(self, is_y_None):
        """Compute ``self.belief``: the potential times incoming messages, normalised.

        Incoming messages are read from each neighbour's ``messages`` table
        under this cluster's own key (``'y'`` or ``(self.type, self.loc)``).

        Args:
            is_y_None: True when inference ran without a label. False when a
                label was supplied; 'hy' / 'hhy' clusters then skip the message
                from the 'y' neighbour, and a 'y' cluster's belief is just its
                normalised potential. Presumably ``pot`` has already been
                reduced with ``set_pot_y`` in that case - the broadcasting
                below relies on it.
        """
        belief = self.pot
        if self.type == 'h':
            # 'h' clusters are handled the same way with or without a label.
            for neighbor in self.neighbors.values():
                belief = belief * neighbor.messages[('h', self.loc)]
        if is_y_None:
            if self.type == 'y':
                for neighbor in self.neighbors.values():
                    belief = belief * neighbor.messages['y']
            elif self.type == 'hy':
                for neighbor in self.neighbors.values():
                    if neighbor.type == 'y':
                        # message from 'y' varies along the first axis of pot
                        belief = belief * neighbor.messages[('hy', self.loc)][:, None]
                    else:
                        # message from the 'h' neighbour varies along the second axis
                        belief = belief * neighbor.messages[('hy', self.loc)][None, :]
            elif self.type == 'hhy':
                for neighbor in self.neighbors.values():
                    if neighbor.type == 'y':
                        belief = belief * neighbor.messages[('hhy', self.loc)][:, None, None]
                    else:
                        # self.loc is a pair of block locations; the first one
                        # maps to axis 1 of pot and the second to axis 2.
                        loc1, loc2 = self.loc
                        if neighbor.loc == loc1:
                            belief = belief * neighbor.messages[('hhy', self.loc)][None, :, None]
                        elif neighbor.loc == loc2:
                            belief = belief * neighbor.messages[('hhy', self.loc)][None, None, :]
        else:
            if self.type == 'hy':
                for neighbor in self.neighbors.values():
                    if neighbor.type != 'y':
                        belief = belief * neighbor.messages[('hy', self.loc)]
            elif self.type == 'hhy':
                for neighbor in self.neighbors.values():
                    if neighbor.type != 'y':
                        loc1, loc2 = self.loc
                        if neighbor.loc == loc1:
                            belief = belief * neighbor.messages[('hhy', self.loc)][:, None]
                        elif neighbor.loc == loc2:
                            belief = belief * neighbor.messages[('hhy', self.loc)][None, :]
        self.belief = normalize_factor(belief)

In [0]:
class ClusterGraph:
    """Cluster graph for one poster, with loopy belief propagation and gradient training.

    The graph holds one 'y' cluster (the label) and, for every image block at
    grid location ``(i, j)``, an 'h' cluster and an 'hy' cluster. Every pair
    of adjacent blocks additionally gets an 'hhy' cluster keyed by
    ``(loc1, loc2)``: ``is_ver=0`` for ``(i, j)-(i, j + 1)`` and ``is_ver=1``
    for ``(i, j)-(i + 1, j)``.

    ``params`` is a list of four arrays:

    0. ``(num_hidden, num_features)`` - block features to hidden state
    1. ``(num_classes, num_hidden)`` - label / hidden state
    2. ``(num_classes, num_hidden, num_hidden)`` - label / pair of hidden states
    3. ``(2, num_classes, num_hidden, num_hidden)`` - per-orientation pair term;
       starts at zero and only receives a gradient when ``est_type`` is
       ``'Creative'``.
    """

    def __init__(self, img_size, num_blocks, num_hidden, est_type, genre_prior, num_classes):
        """Create randomly initialised parameters and wire up all clusters.

        Args:
            img_size: (height, width) of the input images.
            num_blocks: (rows, columns) of the block grid.
            num_hidden: Number of states of every hidden variable.
            est_type: Estimator type; ``'Creative'`` enables ``params[3]``.
            genre_prior: Prior probability of the positive class.
            num_classes: Number of label values.
        """
        self.liklihood = 0
        self.genre_prior = genre_prior
        self.est_type = est_type
        self.is_creative = (self.est_type == 'Creative')
        self.block_size = tuple(int(img_size[i] / num_blocks[i]) for i in range(2))
        # Three channel values per pixel (preprocess_image produces HSV images).
        self.num_features = 3 * self.block_size[0] * self.block_size[1]
        self.img_size = img_size
        self.num_blocks = num_blocks
        self.num_hidden = num_hidden
        self.num_classes = num_classes
        self.params = [
            np.random.normal(0, 10, (self.num_hidden, self.num_features)),
            np.random.normal(0, 10, (self.num_classes, self.num_hidden)),
            np.random.normal(0, 10, (self.num_classes, self.num_hidden, self.num_hidden)),
            np.zeros((2, self.num_classes, self.num_hidden, self.num_hidden)),
        ]

        clusters = {'h':{}, 'hy':{}, 'hhy':{}}
        clusters['y'] = Cluster('y', num_hidden, genre_prior=genre_prior)
        self.all_clusters = set()
        self.all_clusters.add(clusters['y'])
        for i in range(num_blocks[0]):
            for j in range(num_blocks[1]):
                loc1 = (i, j)
                clusters['h'][loc1] = Cluster('h', num_hidden, self.params, loc1)
                clusters['hy'][loc1] = Cluster('hy', num_hidden, self.params, loc1)
                if j != num_blocks[1] - 1:
                    # pair with the block to the right
                    loc2 = (i, j + 1)
                    clusters['hhy'][(loc1, loc2)] = Cluster('hhy', num_hidden, self.params, (loc1, loc2), is_ver=0)
                if i != num_blocks[0] - 1:
                    # pair with the block below
                    loc2 = (i + 1, j)
                    clusters['hhy'][(loc1, loc2)] = Cluster('hhy', num_hidden, self.params, (loc1, loc2), is_ver=1)
        # Edges y - hy - h. All messages start as vectors of ones.
        for loc in clusters['hy'].keys():
            clusters['y'].add_neighbor(clusters['hy'][loc], np.ones(self.num_classes), 'hy', loc)
            clusters['hy'][loc].add_neighbor(clusters['y'], np.ones(self.num_classes), 'y')
            clusters['h'][loc].add_neighbor(clusters['hy'][loc], np.ones(self.num_hidden), 'hy', loc)
            clusters['hy'][loc].add_neighbor(clusters['h'][loc], np.ones(self.num_hidden), 'h', loc)
        # Edges y - hhy and hhy - h for both blocks of every pair.
        for loc1, loc2 in clusters['hhy'].keys():
            clusters['y'].add_neighbor(clusters['hhy'][(loc1, loc2)], np.ones(self.num_classes), 'hhy', (loc1, loc2))
            clusters['hhy'][(loc1, loc2)].add_neighbor(clusters['y'], np.ones(self.num_classes), 'y')
            clusters['h'][loc1].add_neighbor(clusters['hhy'][(loc1, loc2)], np.ones(self.num_hidden), 'hhy', (loc1, loc2))
            clusters['h'][loc2].add_neighbor(clusters['hhy'][(loc1, loc2)], np.ones(self.num_hidden), 'hhy', (loc1, loc2))
            clusters['hhy'][(loc1, loc2)].add_neighbor(clusters['h'][loc1], np.ones(self.num_hidden), 'h', loc1)
            clusters['hhy'][(loc1, loc2)].add_neighbor(clusters['h'][loc2], np.ones(self.num_hidden), 'h', loc2)
        self.clusters = clusters

        for cluster_type in ['h', 'hy', 'hhy']:
            for cluster in self.clusters[cluster_type].values():
                self.all_clusters.add(cluster)


    def reset_messages(self):
        """Reset the messages and beliefs of every cluster to their initial state."""
        for cluster in self.all_clusters:
            cluster.reset_messages()


    def set_params(self, params):
        """Store ``params`` and push them to every cluster (rebuilding 'hy'/'hhy' potentials)."""
        self.params = params
        for cluster in self.all_clusters:
            cluster.set_params(params)


    def update_prev_messages(self):
        """Snapshot the current messages of every cluster as its previous messages."""
        for cluster in self.all_clusters:
            cluster.prev_messages = deepcopy(cluster.messages)


    def convert_to_blocks(self, image):
        """Split ``image`` into a ``num_blocks[0]`` x ``num_blocks[1]`` nested list of blocks.

        ``np.split`` requires both image dimensions to divide evenly.
        """
        row_blocks = np.split(image, self.num_blocks[0], axis=0)
        blocks = [np.split(row_block, self.num_blocks[1], axis=1) for row_block in row_blocks]
        return blocks


    def gradient_ascent(self, X, y, size, num_LBP_iterations, num_GD_iterations):
        """Run projected gradient ascent on the parameters.

        Each iteration takes a step of size ``2 / (t + 1)`` along the gradient
        summed over the first ``size`` samples, then rescales all parameters
        to joint norm 1.
        """
        for t in range(num_GD_iterations):
            step_size = 2 / (t + 1)
            new_params = deepcopy(self.params)
            grad = self.gradient(X, y, size, num_LBP_iterations)

            for j in range(4):
                new_params[j] += step_size * grad[j]
            self.set_params(normalize_params(new_params, 1))


    def _zero_grad(self):
        """Return a list of zero arrays shaped like the four parameter arrays."""
        return [
            np.zeros((self.num_hidden, self.num_features)),
            np.zeros((self.num_classes, self.num_hidden)),
            np.zeros((self.num_classes, self.num_hidden, self.num_hidden)),
            np.zeros((2, self.num_classes, self.num_hidden, self.num_hidden)),
        ]


    def gradient(self, X, y, size, num_LBP_iterations):
        """Sum ``local_gradient`` over the first ``size`` samples of ``X`` / ``y``.

        Also resets ``self.liklihood`` and lets ``local_gradient`` accumulate
        the log-belief of the true labels into it.
        """
        self.liklihood = 0
        grad = self._zero_grad()
        for i in range(size):
            local_grad = self.local_gradient(self.convert_to_blocks(X[i]), y[i], num_LBP_iterations)
            for j in range(4):
                grad[j] += local_grad[j]
        return grad


    def local_gradient(self, blocks, label, num_LBP_iterations):
        """Gradient contribution of one sample.

        Computed as the beliefs with the label clamped (positive phase) minus
        the beliefs with the label free (negative phase).

        Args:
            blocks: Nested list of image blocks from ``convert_to_blocks``.
            label: Class index of the sample.
            num_LBP_iterations: Belief-propagation iterations per phase.

        Returns:
            A list of four arrays shaped like ``self.params``.
        """
        grad = self._zero_grad()

        # Positive phase: label observed.
        self.LBP(num_LBP_iterations, blocks, label)
        for loc, cluster in self.clusters['h'].items():
            grad[0] += np.outer(cluster.belief, get_features(blocks[loc[0]][loc[1]]))
            grad[1][label] += cluster.belief
        for cluster in self.clusters['hhy'].values():
            grad[2][label] += cluster.belief
            if self.is_creative:
                grad[3][cluster.is_ver][label] += cluster.belief

        # Negative phase: label unobserved.
        self.LBP(num_LBP_iterations, blocks)
        self.liklihood += np.log(self.clusters['y'].belief[label])
        for loc, cluster in self.clusters['hy'].items():
            grad[0] -= np.outer(cluster.belief.sum(axis=0), get_features(blocks[loc[0]][loc[1]]))
            grad[1] -= cluster.belief
        for cluster in self.clusters['hhy'].values():
            grad[2] -= cluster.belief
            if self.is_creative:
                # Subtracted like every other negative-phase term. This was
                # `+=`, which pushed params[3] in the wrong direction.
                grad[3][cluster.is_ver] -= cluster.belief
        return grad


    def _update_h_messages(self):
        """Recompute the outgoing messages of every 'h' cluster.

        The message to a neighbour is the cluster potential times the messages
        received from all *other* neighbours, normalised.
        """
        for loc, cluster in self.clusters['h'].items():
            neighbors = set(cluster.prev_messages.keys())
            for neighbor in neighbors:
                messages = cluster.pot
                # 'h' neighbour keys are always (neighbor_type, neighbor_loc) pairs
                for neighbor_type, neighbor_loc in neighbors - set([neighbor]):
                    messages = messages * self.clusters[neighbor_type][neighbor_loc].messages[('h', loc)]
                cluster.messages[neighbor] = normalize_factor(messages)


    def LBP(self, num_iterations, blocks, label=None):
        """Run loopy belief propagation on one image and compute all beliefs.

        Args:
            num_iterations: Number of message-passing sweeps.
            blocks: Nested list of image blocks from ``convert_to_blocks``.
            label: When given, 'hy' and 'hhy' potentials are conditioned on
                this class and no messages are exchanged with the 'y'
                cluster; when ``None`` the label is inferred as well.
        """
        self.reset_messages()
        # Rebuild the 'hy'/'hhy' potentials: a previous labelled run sliced them.
        self.set_params(self.params)
        for i in range(self.num_blocks[0]):
            for j in range(self.num_blocks[1]):
                self.clusters['h'][(i, j)].set_pot_X(blocks[i][j])
        if label is None:
            for t in range(num_iterations):
                # y -> hy / hhy
                y_cluster = self.clusters['y']
                neighbors = set(y_cluster.prev_messages.keys())
                for neighbor in neighbors:
                    messages = y_cluster.pot
                    other_neighbors = neighbors - set([neighbor])
                    for other_neighbor in other_neighbors:
                        neighbor_type = other_neighbor[0]
                        neighbor_loc = other_neighbor[1]
                        messages = messages * self.clusters[neighbor_type][neighbor_loc].messages['y']
                    y_cluster.messages[neighbor] = normalize_factor(messages)
                # hy -> y (sum out h) and hy -> h (sum out y)
                for loc, cluster in self.clusters['hy'].items():
                    cluster.messages['y'] = normalize_factor(np.apply_along_axis(lambda x: x@self.clusters['h'][loc].messages[('hy', loc)], axis=1, arr=cluster.pot))
                    cluster.messages[('h', loc)] = normalize_factor(np.apply_along_axis(lambda x: x@self.clusters['y'].messages[('hy', loc)], axis=0, arr=cluster.pot))
                # h -> hy / hhy
                self._update_h_messages()
                # hhy -> y and hhy -> both h clusters
                for loc, cluster in self.clusters['hhy'].items():
                    # potential with y summed out against the message from 'y': (hidden, hidden)
                    from_y = sum([self.clusters['y'].messages[('hhy', loc)][i] * cluster.pot[i,:,:] for i in range(self.num_classes)])
                    # potential with the first hidden variable summed out: (classes, hidden)
                    to_y = sum([self.clusters['h'][loc[0]].messages[('hhy', loc)][i] * cluster.pot[:,i,:] for i in range(self.num_hidden)])
                    cluster.messages['y'] = normalize_factor(sum([self.clusters['h'][loc[1]].messages[('hhy', loc)][i] * to_y[:,i] for i in range(self.num_hidden)]))
                    for i_loc in [0, 1]:
                        # message to one block sums out the other block's variable
                        neighbor_recieve = self.clusters['h'][loc[1 - i_loc]].messages[('hhy', loc)]
                        cluster.messages[('h', loc[i_loc])] = normalize_factor(np.apply_along_axis(lambda x: x@neighbor_recieve, axis=1 - i_loc, arr=from_y))
                self.update_prev_messages()
        else:
            for loc, cluster in self.clusters['hy'].items():
                cluster.set_pot_y(label)
                # With y fixed, the only message of an 'hy' cluster is its sliced potential.
                cluster.prev_messages = cluster.messages = {('h', loc): cluster.pot}
            for cluster in self.clusters['hhy'].values():
                cluster.set_pot_y(label)
                cluster.messages.pop('y')
            for t in range(num_iterations):
                self._update_h_messages()
                for loc, cluster in self.clusters['hhy'].items():
                    for i_loc in [0, 1]:
                        neighbor_recieve = self.clusters['h'][loc[1 - i_loc]].messages[('hhy', loc)]
                        messages = np.apply_along_axis(lambda x: x@neighbor_recieve, axis=1 - i_loc, arr=cluster.pot)
                        neighbor = ('h', loc[i_loc])
                        cluster.messages[neighbor] = normalize_factor(messages)
                self.update_prev_messages()
        is_y_None = label is None
        self.calculate_belief(is_y_None)


    def calculate_belief(self, is_y_None):
        """Compute the belief of every cluster (see ``Cluster.calculate_belief``)."""
        for cluster in self.all_clusters:
            cluster.calculate_belief(is_y_None)

## Estimator Class

In [0]:
class Estimator:
    """Binary genre classifier with a scikit-learn style fit / predict interface.

    Wraps a ``ClusterGraph`` with two classes. ``est_type`` selects the block
    grid the 44x30 poster is split into:

    * ``'Horizontal'`` - 4 x 1
    * ``'Vertical'``   - 1 x 3
    * ``'Advanced'`` and ``'Creative'`` - 4 x 3
    """

    # (rows, columns) of the block grid for every supported estimator type.
    _NUM_BLOCKS = {
        'Horizontal': (4, 1),
        'Vertical': (1, 3),
        'Advanced': (4, 3),
        'Creative': (4, 3),
    }

    def __init__(self, est_type, genre_prior):
        """
        Args:
            est_type: One of the keys of ``_NUM_BLOCKS``.
            genre_prior: Prior probability of the positive class.

        Raises:
            ValueError: If ``est_type`` is not a supported type. Previously
                this surfaced later as an AttributeError on ``num_blocks``.
        """
        if est_type not in self._NUM_BLOCKS:
            raise ValueError(
                f"Unknown est_type {est_type!r}; expected one of {sorted(self._NUM_BLOCKS)}"
            )
        self.est_type = est_type
        self.genre_prior = genre_prior
        self.img_size = (44, 30)  # height and width of the preprocessed posters
        self.num_blocks = self._NUM_BLOCKS[est_type]
        self.num_hidden = 3
        self.num_LBP_iterations = 8
        self.num_GD_iterations = 50
        self.cluster_graph = ClusterGraph(self.img_size, self.num_blocks, self.num_hidden, self.est_type, genre_prior, 2)

    def predict(self, X):
        """Return an array with 1 where class 1 is strictly more probable than class 0, else 0."""
        proba = self.predict_proba(X)
        return np.array([1 if p[1] > p[0] else 0 for p in proba])

    def fit(self, X, y):
        """Train on images ``X`` with 0/1 labels ``y`` by gradient ascent; returns ``self``."""
        size = len(X)
        self.cluster_graph.gradient_ascent(X, y, size, self.num_LBP_iterations, self.num_GD_iterations)
        return self

    def get_params(self, deep=True):
        """Return the constructor arguments, as scikit-learn expects.

        ``sklearn.base.clone`` and the cross-validation helpers rebuild an
        estimator from this dict. The former empty dict made that fail,
        because both constructor arguments are required.
        """
        return {'est_type': self.est_type, 'genre_prior': self.genre_prior}

    def predict_proba(self, X):
        """Return, for every image in ``X``, the belief of the label cluster.

        The result is a list with one length-2 array per image, class 0
        first; inference runs without a label.
        """
        results = []
        for image in X:
            self.cluster_graph.LBP(self.num_LBP_iterations, self.cluster_graph.convert_to_blocks(image))
            results.append(self.cluster_graph.clusters['y'].belief)
        return results

## Experiments

In [0]:
# train_test_split and f1_score are already imported in the notebook's first
# cell, so they are not imported again here.
# Train one binary Estimator per genre for each estimator type and report the
# weighted F1 over all genres on a held-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y)
estimators = {'Horizontal': {}, 'Vertical': {}, 'Advanced': {}, 'Creative': {}}
for est_type in estimators.keys():
    pred = []
    estimators[est_type]['estimators'] = []
    for genre_index in range(y.shape[1]):
        # Share of positive training labels for this genre.
        genre_prior = y_train[:, genre_index].mean()
        est = Estimator(est_type, genre_prior)
        est.fit(X_train, y_train[:, genre_index])
        result = est.predict(X_test)
        pred.append(result)
        estimators[est_type]['estimators'].append((est, result))
    # One row per test movie, one column per genre - the same layout as y_test.
    pred = np.array(pred).T
    estimators[est_type]['pred'] = pred
    f1 = f1_score(y_true=y_test, y_pred=pred,average='weighted')
    estimators[est_type]['f1'] = f1
    # The previous print also referenced `ap`, which is never defined, and
    # raised NameError once the first estimator type had finished training.
    print(est_type, f1)