In [1]:
import numpy as np
import numpy.linalg as la
import pandas as pd

In [2]:
def _gaussian_likelihood(values, mean, var, fallback=0.02):
    """Return the normal density of *values*, or *fallback* for every row.

    The fallback is used when the class statistics cannot define a density:
    a zero mean (the original heuristic for "no statistics"), a zero
    variance, or a non-finite value such as the NaN sample variance of a
    class that has a single training row.
    """
    usable = np.isfinite(mean) and np.isfinite(var) and mean != 0 and var > 0
    if not usable:
        return np.full(len(values), fallback)
    return np.exp(-((values - mean) ** 2) / (2 * var)) / np.sqrt(2 * np.pi * var)


def GNB(train, target, test):
    """Predict a 1-5 rating for every row of *test* with a naive Bayes model.

    For each rating the score of a test row is the product of

    * a Gaussian likelihood for ``Age`` and for ``Year`` (0.02 when the
      class statistics are unusable),
    * the fraction of the class sharing the row's ``Gender`` (``0`` is
      counted as male, anything else as female; 0.5 when that fraction is 0),
    * the fraction of the class sharing the row's ``Occupation``
      (``1 / number of distinct training occupations`` when that fraction
      is 0, e.g. for an occupation never seen with this rating),
    * the mean, over the genres flagged ``1`` on the row, of the fraction of
      the class flagged with that genre (0.02 when it is 0 or the row has
      no genre flagged),
    * the class prior ``count(rating) / len(train)``.

    A rating with no training rows gets a score of 0.  The predicted rating
    is the one with the highest score; ties go to the lowest rating.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features.  Must contain ``Age``, ``Year``, ``Gender``,
        ``Occupation`` and one column per genre listed below.  It is not
        modified.
    target : pandas.DataFrame
        Frame with a ``rating`` column, aligned to *train* by index.
    test : pandas.DataFrame
        Rows to classify, with the same feature columns as *train*.  Rows
        are read by position, so any index is accepted.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(test)`` holding the predicted ratings.

    Raises
    ------
    KeyError
        If a required column is missing from one of the frames.
    """
    genres = [
        'Drama', 'Comedy', 'Thriller', 'Action', 'Romance', 'Horror',
        'Adventure', 'Sci-Fi', 'Children\'s', 'Crime', 'War', 'Documentary',
        'Musical', 'Animation', 'Mystery', 'Fantasy', 'Western', 'Film-Noir',
    ]

    # Attach the labels to a copy so the caller's frame is left untouched.
    labelled = train.assign(rating=target['rating'])
    n_samp = len(labelled)
    n_test = len(test)

    labels = labelled['rating'].to_numpy()
    train_is_male = labelled['Gender'].to_numpy() == 0
    train_genres = labelled[genres].to_numpy() == 1
    n_occupations = len(labelled['Occupation'].unique())

    print ('Classifier Made, Starting Predictive Task')

    # Positional views of the test features (independent of test's index).
    test_age = test['Age'].to_numpy(dtype=float)
    test_year = test['Year'].to_numpy(dtype=float)
    test_is_male = test['Gender'].to_numpy() == 0
    test_genres = (test[genres].to_numpy() == 1).astype(float)
    genres_per_row = test_genres.sum(axis=1)
    has_genre = genres_per_row > 0

    # scores[i, r - 1] is the unnormalised posterior of rating r for row i.
    scores = np.zeros((n_test, 5))
    for rating in range(1, 6):
        in_class = labels == rating
        n_class = int(in_class.sum())
        if n_class == 0:
            # Prior is zero; skipping also avoids dividing by an empty class.
            continue

        ages = labelled.loc[in_class, 'Age']
        years = labelled.loc[in_class, 'Year']
        prob_age = _gaussian_likelihood(test_age, ages.mean(), ages.var())
        prob_year = _gaussian_likelihood(test_year, years.mean(), years.var())

        n_male = int((in_class & train_is_male).sum())
        prob_gender = np.where(
            test_is_male, n_male / n_class, (n_class - n_male) / n_class
        )
        prob_gender[prob_gender == 0] = 0.5

        occ_counts = labelled.loc[in_class, 'Occupation'].value_counts()
        prob_occ = (
            test['Occupation'].map(occ_counts).fillna(0).to_numpy(dtype=float)
            / n_class
        )
        prob_occ[prob_occ == 0] = 1 / n_occupations

        genre_counts = train_genres[in_class].sum(axis=0).astype(float)
        genre_hits = test_genres @ genre_counts
        prob_genre = np.zeros(n_test)
        # Rows without any genre keep 0 here and take the fallback below.
        prob_genre[has_genre] = genre_hits[has_genre] / (
            genres_per_row[has_genre] * n_class
        )
        prob_genre[prob_genre == 0] = 0.02

        scores[:, rating - 1] = (
            prob_age * prob_gender * prob_occ * prob_year * prob_genre
            * (n_class / n_samp)
        )

    return (np.argmax(scores, axis=1) + 1).astype(float)