In [64]:
import numpy as np
import numpy.linalg as la
import pandas as pd
import sklearn
from sklearn.preprocessing import Imputer
import scipy.stats as sp
from random import randint
#from sklearn.svm import SVC
#from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
#from sklearn import neighbors
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [2]:
#Load the training ratings (columns Id, user-Id, movie-Id, rating) with pandas
training_data = pd.read_csv('./Data/train.txt')
#Debug aid: uncomment the next line to work on the first 100 rows only
#training_data = training_data.head(100)
#Preview the first ten rows
training_data.head(10)

Unnamed: 0,Id,user-Id,movie-Id,rating
0,0,4557,2740,5
1,1,4557,3053,4
2,2,4557,2343,4
3,3,4557,595,5
4,4,4557,471,4
5,5,4557,2941,3
6,6,4557,2550,5
7,7,4557,1487,5
8,8,4557,3235,4
9,9,4557,2664,3


In [3]:
#Extract the 'rating' column (a pandas Series) as the prediction target.
#NOTE: `target` is re-derived from the merged frame in a later cell, which supersedes this value.
target = training_data["rating"]
target.head()

0    5
1    4
2    4
3    5
4    4
Name: rating, dtype: int64

In [4]:
#Load the movie metadata (Id, Year, Genre) and the user metadata (ID, Gender, Age, Occupation)
data_movie = pd.read_csv('./Data/movie.txt')
data_user = pd.read_csv('./Data/user.txt')
#Print a preview of both tables from the same cell
print (data_movie.head())
print (data_user.head())

   Id  Year           Genre
0   8   NaN  Comedy|Musical
1  20   NaN          Horror
2  67   NaN          Comedy
3  76   NaN           Drama
4  81   NaN           Drama
   ID Gender   Age  Occupation
0   0      M  50.0         7.0
1   1      M  25.0        17.0
2   2      M  25.0         6.0
3   3      M  25.0         7.0
4   4      M   1.0        19.0


In [5]:
#Attach each movie's metadata to its rating rows by joining movie-Id against the movie table's Id
training_data = training_data.merge(data_movie, left_on='movie-Id', right_on='Id')
training_data.head()

Unnamed: 0,Id_x,user-Id,movie-Id,rating,Id_y,Year,Genre
0,0,4557,2740,5,2740,1995.0,
1,390,4929,2740,4,2740,1995.0,
2,526,3776,2740,4,2740,1995.0,
3,602,3826,2740,5,2740,1995.0,
4,894,3086,2740,5,2740,1995.0,


In [6]:
#Attach each user's metadata to the rating rows by joining user-Id against the user table's ID
training_data = training_data.merge(data_user, left_on='user-Id', right_on='ID')
training_data.head()

Unnamed: 0,Id_x,user-Id,movie-Id,rating,Id_y,Year,Genre,ID,Gender,Age,Occupation
0,0,4557,2740,5,2740,1995.0,,4557,F,1.0,10.0
1,1,4557,3053,4,3053,1999.0,Drama,4557,F,1.0,10.0
2,2,4557,2343,4,2343,1989.0,Drama,4557,F,1.0,10.0
3,3,4557,595,5,595,1998.0,,4557,F,1.0,10.0
4,4,4557,471,4,471,2000.0,,4557,F,1.0,10.0


In [7]:
#Drop the identifier / join-key columns, which are not used as features.
#`axis=1` is passed by keyword: the positional form drop(labels, 1) is deprecated and rejected by pandas 2.x.
training_data = training_data.drop(['user-Id', 'movie-Id', 'Id_x', 'Id_y', 'ID'], axis=1)
training_data.head()

Unnamed: 0,rating,Year,Genre,Gender,Age,Occupation
0,5,1995.0,,F,1.0,10.0
1,4,1999.0,Drama,F,1.0,10.0
2,4,1989.0,Drama,F,1.0,10.0
3,5,1998.0,,F,1.0,10.0
4,4,2000.0,,F,1.0,10.0


In [8]:
#Rearrange columns: user features first, then movie features, with the rating last
training_data = training_data[['Gender', 'Age', 'Occupation', 'Genre', 'Year', 'rating']]
training_data.head()

Unnamed: 0,Gender,Age,Occupation,Genre,Year,rating
0,F,1.0,10.0,,1995.0,5
1,F,1.0,10.0,Drama,1999.0,4
2,F,1.0,10.0,Drama,1989.0,4
3,F,1.0,10.0,,1998.0,5
4,F,1.0,10.0,,2000.0,4


In [9]:
#Re-extract the target from the merged, re-arranged frame so it is row-aligned with the features
target = training_data["rating"]
target.head()

0    5
1    4
2    4
3    5
4    4
Name: rating, dtype: int64

In [10]:
#Map a row's Gender to a number: 'F' -> 1, any other string -> 0, non-string (e.g. NaN) -> np.nan
def assignGender(c):
    """Return the numeric gender code for the row-like object ``c`` (read via ``c['Gender']``)."""
    gender = c['Gender']
    if isinstance(gender, str):
        return 1 if gender == 'F' else 0
    # Anything that is not a string (NaN, None, ...) is treated as missing.
    return np.nan

In [11]:
#Apply assignGender row by row (axis=1): 'F' becomes 1, any other string 0, and missing values np.nan
training_data['Gender'] = training_data.apply(assignGender, axis=1)
training_data.head(10)

Unnamed: 0,Gender,Age,Occupation,Genre,Year,rating
0,1.0,1.0,10.0,,1995.0,5
1,1.0,1.0,10.0,Drama,1999.0,4
2,1.0,1.0,10.0,Drama,1989.0,4
3,1.0,1.0,10.0,,1998.0,5
4,1.0,1.0,10.0,,2000.0,4
5,1.0,1.0,10.0,Action|Adventure|Comedy|Romance,,3
6,1.0,1.0,10.0,Animation|Children's|Musical,1941.0,5
7,1.0,1.0,10.0,Drama,,5
8,1.0,1.0,10.0,Thriller,,4
9,1.0,1.0,10.0,Animation|Children's|Musical,1996.0,3


In [12]:
#List of genres, in the order used for the one-column-per-genre encoding.
#Written as a list literal so the order never depends on dict iteration order
#(the numeric values of the former dict were never read).
#NOTE: the name `genre_dict` is kept, although it holds a list, because other cells reference it.
genre_dict = ['Drama', 'Comedy', 'Thriller', 'Action', 'Romance', 'Horror', 'Adventure', 'Sci-Fi',
              "Children's", 'Crime', 'War', 'Documentary', 'Musical', 'Animation', 'Mystery',
              'Fantasy', 'Western', 'Film-Noir']
#Column for each genre. 1 means the movie is of the genre, 0 means it isn't.
#Every column starts at the placeholder -1 until assignGenres fills it in.
for j in genre_dict:
    training_data[j] = -1
training_data.head()

Unnamed: 0,Gender,Age,Occupation,Genre,Year,rating,Drama,Comedy,Thriller,Action,...,Children's,Crime,War,Documentary,Musical,Animation,Mystery,Fantasy,Western,Film-Noir
0,1.0,1.0,10.0,,1995.0,5,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,1.0,1.0,10.0,Drama,1999.0,4,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,1.0,1.0,10.0,Drama,1989.0,4,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,1.0,1.0,10.0,,1998.0,5,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,1.0,1.0,10.0,,2000.0,4,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [13]:
#Fill the per-genre indicator entries of one row from its pipe-separated 'Genre' string.
def assignGenres(row):
    """Set ``row[genre]`` for every genre in ``genre_dict`` and return the same ``row``.

    A string 'Genre' is split on '|': listed genres get 1, all others 0.
    A non-string 'Genre' (e.g. NaN) marks every genre entry as np.nan.
    """
    raw = row['Genre']
    listed = raw.split('|') if isinstance(raw, str) else None
    for name in genre_dict:
        column = str(name)
        if listed is None:
            row[column] = np.nan
        elif name in listed:
            row[column] = 1
        else:
            row[column] = 0
    return row

In [14]:
#Fill the genre indicator columns of every row (axis=1) by running assignGenres on it
training_data = training_data.apply(assignGenres, axis=1)
training_data.head(20)

Unnamed: 0,Gender,Age,Occupation,Genre,Year,rating,Drama,Comedy,Thriller,Action,...,Children's,Crime,War,Documentary,Musical,Animation,Mystery,Fantasy,Western,Film-Noir
0,1.0,1.0,10.0,,1995.0,5,,,,,...,,,,,,,,,,
1,1.0,1.0,10.0,Drama,1999.0,4,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,10.0,Drama,1989.0,4,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,10.0,,1998.0,5,,,,,...,,,,,,,,,,
4,1.0,1.0,10.0,,2000.0,4,,,,,...,,,,,,,,,,
5,1.0,1.0,10.0,Action|Adventure|Comedy|Romance,,3,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,1.0,10.0,Animation|Children's|Musical,1941.0,5,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
7,1.0,1.0,10.0,Drama,,5,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,1.0,10.0,Thriller,,4,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,1.0,10.0,Animation|Children's|Musical,1996.0,3,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [15]:
#Dropping the original genre column, since that information is covered by the new genre specific columns.
#`axis=1` is passed by keyword: the positional form drop(labels, 1) is rejected by pandas 2.x.
training_data = training_data.drop('Genre', axis=1)
#Re-derive the target from the transformed frame so it stays row-aligned with the features
target = training_data['rating']
training_data.head(10)

Unnamed: 0,Gender,Age,Occupation,Year,rating,Drama,Comedy,Thriller,Action,Romance,...,Children's,Crime,War,Documentary,Musical,Animation,Mystery,Fantasy,Western,Film-Noir
0,1.0,1.0,10.0,1995.0,5,,,,,,...,,,,,,,,,,
1,1.0,1.0,10.0,1999.0,4,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,10.0,1989.0,4,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,10.0,1998.0,5,,,,,,...,,,,,,,,,,
4,1.0,1.0,10.0,2000.0,4,,,,,,...,,,,,,,,,,
5,1.0,1.0,10.0,,3,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,1.0,10.0,1941.0,5,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
7,1.0,1.0,10.0,,5,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,1.0,10.0,,4,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,1.0,10.0,1996.0,3,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [16]:
#Mean and standard deviation of the Age column, truncated to ints (bounds for the random age imputation)
meanAge = int(training_data['Age'].mean())
stdAge = int(training_data['Age'].std())
def fillNanAge(c):
    """Return the row's Age, or a random integer age when it is missing.

    c: row-like object with an 'Age' entry.
    A missing age (NaN/None, or the literal string 'NaN') is replaced by a random
    integer drawn uniformly from [meanAge - stdAge, meanAge + stdAge].
    """
    age = c['Age']
    # NaN never compares equal to anything (not even itself), so `age == np.nan`
    # is always False; pd.isnull is the reliable missing-value test.
    if pd.isnull(age) or age == 'NaN':
        return randint(int(meanAge-stdAge),int(meanAge+stdAge))
    return age
print (meanAge)
print (stdAge)

29
11


In [17]:
#Fill missing ages with a random integer in [meanAge - stdAge, meanAge + stdAge].
#The lambda returns one scalar per group, so transform broadcasts ONE random draw to every row
#that shares a rating (fillNanAge is not used here).
#NOTE(review): no random seed is set, so the imputed ages differ from run to run.
random_age_by_rating = training_data.groupby('rating')['Age'].transform(lambda x: randint(meanAge-stdAge, meanAge+stdAge))
#Assign the result back instead of calling fillna(inplace=True) on a column selection, which is
#chained assignment and is not guaranteed to write through to training_data.
training_data['Age'] = training_data['Age'].fillna(random_age_by_rating)
#Drop the rating field (axis given by keyword; the positional form is rejected by pandas 2.x)
training_data = training_data.drop('rating', axis=1)
training_data.isnull().sum()

Gender         79280
Age                0
Occupation     80913
Year           82595
Drama          90520
Comedy         90520
Thriller       90520
Action         90520
Romance        90520
Horror         90520
Adventure      90520
Sci-Fi         90520
Children's     90520
Crime          90520
War            90520
Documentary    90520
Musical        90520
Animation      90520
Mystery        90520
Fantasy        90520
Western        90520
Film-Noir      90520
dtype: int64

In [18]:

#Filling in other missing values using heuristics based on previous data analysis.
#Each fill is assigned back to the frame: fillna(inplace=True) on a column selection is chained
#assignment and is not guaranteed to modify training_data.

#Occupation: most frequent non-missing occupation among rows with the same Age (NaN if the group
#has none). Series.mode() ignores NaN and yields the value itself, whereas scipy.stats.mode
#returns a (mode, count) result and can propagate NaN.
occupation_mode_by_age = training_data.groupby('Age')['Occupation'].transform(
    lambda x: x.mode().iloc[0] if x.notnull().any() else np.nan)
training_data['Occupation'] = training_data['Occupation'].fillna(occupation_mode_by_age)
#Year: overall median year
training_data['Year'] = training_data['Year'].fillna(training_data['Year'].median())
#Genre flags: 1 when more than 5.5% of the known rows with the same Age carry the genre, else 0
for i in genre_dict:
    genre_flag_by_age = training_data.groupby('Age')[i].transform(lambda x: 1 if x.mean() > 0.055 else 0)
    training_data[i] = training_data[i].fillna(genre_flag_by_age)
#Gender: majority gender (1 = female, ties count as female) among rows with the same Occupation
gender_by_occupation = training_data.groupby('Occupation')['Gender'].transform(lambda x: 1 if x.mean() >= 0.5 else 0)
training_data['Gender'] = training_data['Gender'].fillna(gender_by_occupation)

#Rearranging the columns in a fixed order
training_data = training_data[['Gender', 'Age', 'Occupation', 'Year', 'War', 'Mystery', 'Fantasy', 'Musical', 'Crime', 'Adventure', 'Sci-Fi',
       'Drama', 'Action', 'Documentary', 'Romance', 'Comedy', "Children's",
       'Thriller', 'Western', 'Film-Noir', 'Horror', 'Animation']]


print ("Training Data Tasks Done.")
training_data.head(20)

Training Data Tasks Done.


Unnamed: 0,Gender,Age,Occupation,Year,War,Mystery,Fantasy,Musical,Crime,Adventure,...,Action,Documentary,Romance,Comedy,Children's,Thriller,Western,Film-Noir,Horror,Animation
0,1.0,1.0,10.0,1995.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
1,1.0,1.0,10.0,1999.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,10.0,1989.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,10.0,1998.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
4,1.0,1.0,10.0,2000.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
5,1.0,1.0,10.0,1992.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,1.0,10.0,1941.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
7,1.0,1.0,10.0,1992.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,1.0,10.0,1992.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,1.0,1.0,10.0,1996.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [19]:
#Sanity check: count the remaining missing values per column
training_data.isnull().sum()

Gender         0
Age            0
Occupation     0
Year           0
War            0
Mystery        0
Fantasy        0
Musical        0
Crime          0
Adventure      0
Sci-Fi         0
Drama          0
Action         0
Documentary    0
Romance        0
Comedy         0
Children's     0
Thriller       0
Western        0
Film-Noir      0
Horror         0
Animation      0
dtype: int64

In [75]:
#Peek at the target ratings (the 'rating' Series captured before that column was dropped)
target.head()

0    5
1    4
2    4
3    5
4    4
Name: rating, dtype: int64

In [20]:
#Load the test set, then attach movie metadata (via movie-Id) and user metadata (via user-Id)
#To experiment on a subset, apply .head(50) to the frame right after read_csv.
test_data = (
    pd.read_csv('./Data/test.txt')
    .merge(data_movie, left_on='movie-Id', right_on='Id')
    .merge(data_user, left_on='user-Id', right_on='ID')
)

print ("Test Data imported and merged")
test_data.head(20)

Test Data imported and merged


Unnamed: 0,Id_x,user-Id,movie-Id,Id_y,Year,Genre,ID,Gender,Age,Occupation
0,802553,4557,3067,3067,,Animation|Children's|Musical,4557,F,1.0,10.0
1,802554,4557,3867,3867,1986.0,Comedy,4557,F,1.0,10.0
2,802555,4557,1180,1180,,Adventure|Children's|Drama|Musical,4557,F,1.0,10.0
3,802556,4557,1672,1672,1996.0,Animation,4557,F,1.0,10.0
4,802557,4557,290,290,1958.0,Musical,4557,F,1.0,10.0
5,802558,4557,575,575,1962.0,,4557,F,1.0,10.0
6,802559,4557,162,162,1975.0,Drama,4557,F,1.0,10.0
7,802560,4557,231,231,1937.0,Animation|Children's|Musical,4557,F,1.0,10.0
8,802561,4557,3461,3461,1991.0,Animation|Children's|Musical,4557,F,1.0,10.0
9,802562,4557,3143,3143,1989.0,Drama,4557,F,1.0,10.0


In [21]:
#Create one indicator column per genre (placeholder -1), then fill them row by row, just like the training data
for j in genre_dict:
    test_data[j] = -1
test_data = test_data.apply(assignGenres, axis=1)
#Drop the raw Genre text and the join keys; Id_x is kept.
#`axis=1` is passed by keyword: the positional form drop(labels, 1) is rejected by pandas 2.x.
test_data = test_data.drop(['Genre', 'ID', 'Id_y', 'user-Id', 'movie-Id'], axis=1)
test_data.head(20)

Unnamed: 0,Id_x,Year,Gender,Age,Occupation,Drama,Comedy,Thriller,Action,Romance,...,Children's,Crime,War,Documentary,Musical,Animation,Mystery,Fantasy,Western,Film-Noir
0,802553,,F,1.0,10.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,802554,1986.0,F,1.0,10.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,802555,,F,1.0,10.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,802556,1996.0,F,1.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,802557,1958.0,F,1.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,802558,1962.0,F,1.0,10.0,,,,,,...,,,,,,,,,,
6,802559,1975.0,F,1.0,10.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,802560,1937.0,F,1.0,10.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
8,802561,1991.0,F,1.0,10.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
9,802562,1989.0,F,1.0,10.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
#Encode Gender exactly as for the training data: 'F' -> 1, any other string -> 0, missing -> np.nan
test_data['Gender'] = test_data.apply(assignGender, axis=1)

In [23]:
#Preview the test frame after the Gender encoding
test_data.head(20)

Unnamed: 0,Id_x,Year,Gender,Age,Occupation,Drama,Comedy,Thriller,Action,Romance,...,Children's,Crime,War,Documentary,Musical,Animation,Mystery,Fantasy,Western,Film-Noir
0,802553,,1.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,802554,1986.0,1.0,1.0,10.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,802555,,1.0,1.0,10.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,802556,1996.0,1.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,802557,1958.0,1.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,802558,1962.0,1.0,1.0,10.0,,,,,,...,,,,,,,,,,
6,802559,1975.0,1.0,1.0,10.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,802560,1937.0,1.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
8,802561,1991.0,1.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
9,802562,1989.0,1.0,1.0,10.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
#Fill missing ages with a random integer in [meanAge - stdAge, meanAge + stdAge], drawn once per Id_x group.
#NOTE(review): the training data shares one draw among all rows with the same rating instead, so the
#two heuristics are not identical; presumably Id_x is unique per row - confirm. No random seed is set.
random_age_by_row_id = test_data.groupby('Id_x')['Age'].transform(lambda x: randint(meanAge-stdAge, meanAge+stdAge))
#Assign back rather than fillna(inplace=True) on a column selection (chained assignment)
test_data['Age'] = test_data['Age'].fillna(random_age_by_row_id)
test_data.isnull().sum()

Id_x               0
Year           20285
Gender         19492
Age                0
Occupation     19938
Drama          22162
Comedy         22162
Thriller       22162
Action         22162
Romance        22162
Horror         22162
Adventure      22162
Sci-Fi         22162
Children's     22162
Crime          22162
War            22162
Documentary    22162
Musical        22162
Animation      22162
Mystery        22162
Fantasy        22162
Western        22162
Film-Noir      22162
dtype: int64

In [25]:

#Filling other missing entries using the same heuristics as used for training data.
#NOTE(review): the statistics are computed from the test rows themselves, not reused from the training set.
#Each fill is assigned back to the frame: fillna(inplace=True) on a column selection is chained
#assignment and is not guaranteed to modify test_data.

#Occupation: most frequent non-missing occupation among rows with the same Age (NaN if the group
#has none). Series.mode() ignores NaN and yields the value itself, whereas scipy.stats.mode
#returns a (mode, count) result and can propagate NaN.
occupation_mode_by_age = test_data.groupby('Age')['Occupation'].transform(
    lambda x: x.mode().iloc[0] if x.notnull().any() else np.nan)
test_data['Occupation'] = test_data['Occupation'].fillna(occupation_mode_by_age)
#Year: overall median year
test_data['Year'] = test_data['Year'].fillna(test_data['Year'].median())
#Genre flags: 1 when more than 5.5% of the known rows with the same Age carry the genre, else 0
for i in genre_dict:
    genre_flag_by_age = test_data.groupby('Age')[i].transform(lambda x: 1 if x.mean() > 0.055 else 0)
    test_data[i] = test_data[i].fillna(genre_flag_by_age)
#Gender: majority gender (1 = female, ties count as female) among rows with the same Occupation
gender_by_occupation = test_data.groupby('Occupation')['Gender'].transform(lambda x: 1 if x.mean() >= 0.5 else 0)
test_data['Gender'] = test_data['Gender'].fillna(gender_by_occupation)
#Keep the row ids before the id column is dropped by the column selection below
test_ids = test_data['Id_x']
#Rearranging columns to a fixed order
test_data = test_data[['Gender', 'Age', 'Occupation', 'Year', 'War', 'Mystery', 'Fantasy', 'Musical', 'Crime', 'Adventure', 'Sci-Fi',
       'Drama', 'Action', 'Documentary', 'Romance', 'Comedy', "Children's",
       'Thriller', 'Western', 'Film-Noir', 'Horror', 'Animation']]


print ("Test Data tasks done")
test_data.head(20)

Test Data tasks done


Unnamed: 0,Gender,Age,Occupation,Year,War,Mystery,Fantasy,Musical,Crime,Adventure,...,Action,Documentary,Romance,Comedy,Children's,Thriller,Western,Film-Noir,Horror,Animation
0,1.0,1.0,10.0,1992.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1.0,10.0,1986.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,10.0,1992.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,10.0,1996.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,1.0,10.0,1958.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,1.0,10.0,1962.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
6,1.0,1.0,10.0,1975.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,1.0,10.0,1937.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
8,1.0,1.0,10.0,1991.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
9,1.0,1.0,10.0,1989.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
#Sanity check: count the remaining missing values per column of the test frame
test_data.isnull().sum()

Gender         0
Age            0
Occupation     0
Year           0
War            0
Mystery        0
Fantasy        0
Musical        0
Crime          0
Adventure      0
Sci-Fi         0
Drama          0
Action         0
Documentary    0
Romance        0
Comedy         0
Children's     0
Thriller       0
Western        0
Film-Noir      0
Horror         0
Animation      0
dtype: int64

In [33]:
#model = LogisticRegression(solver = 'newton-cg', multi_class = 'multinomial')
#model.fit(training_data, target)
#dt = DecisionTreeClassifier(criterion = 'entropy', max_depth = 10)
#dt.fit(training_data, target)
#gnb = GaussianNB()
#gnb.fit(training_data, target)
#print ("Classifier Made")

Classifier Made


In [55]:
#n = 5
#preds = np.zeros((n, test_data.shape[0]))
#train_len = training_data.shape[0]
#for i in range(n):
    #dt = DecisionTreeClassifier(criterion = 'gini', max_depth = 10)
    #dt.fit(training_data[int(i*(train_len/(n))):int((i+1)*(train_len/(n)))], target[int(i*(train_len/(n))):int((i+1)*(train_len/(n)))])
    #pred = dt.predict(test_data)
    #    gnb = GaussianNB()
    #gnb.fit(training_data[int(i*(train_len/(n))):int((i+1)*(train_len/(n)))], target[int(i*(train_len/(n))):int((i+1)*(train_len/(n)))])
    #pred = gnb.predict(test_data)
    #preds[i] = pred

In [99]:
#training_data[(training_data['Horror'] == 1) & (training_data['Gender'] == 0)].count()[0]

111808

In [103]:
#len(training_data[training_data['Gender'] == 0])

625988

In [120]:
#Gaussian Naive Bayesian Classifier
def GNB(train, target, test):
    """Fit a hand-rolled naive Bayes rating classifier on ``train`` and predict ``test``.

    Age and Year are modelled with one Gaussian per rating; Gender and Occupation
    with per-rating frequencies; genres with the average per-rating frequency of
    the genres flagged on the test row.

    Parameters
    ----------
    train : DataFrame with 'Gender', 'Age', 'Occupation', 'Year' and one 0/1 column per genre.
    target : Series named 'rating' (values 1..5), index-aligned with ``train``.
    test : DataFrame with the same feature columns as ``train``; its index may be arbitrary.

    Returns
    -------
    numpy.ndarray of float: one predicted rating (1..5) per row of ``test``, in row order.
    """
    train = pd.concat([train, target], axis=1) #concatenate training data with corresponding ratings
    n_samp = train.shape[0]
    n_ratings = 5
    genres = ['Drama', 'Comedy', 'Thriller', 'Action', 'Romance', 'Horror', 'Adventure', 'Sci-Fi',
              "Children's", 'Crime', 'War', 'Documentary', 'Musical', 'Animation', 'Mystery',
              'Fantasy', 'Western', 'Film-Noir']
    occupations = list(train.Occupation.unique())

    #Per-rating statistics estimated from the training data
    count_rating = np.zeros(n_ratings)
    count_male_r = np.zeros(n_ratings)
    count_genres = np.zeros((n_ratings, len(genres)))
    count_occup_r = np.zeros((n_ratings, len(occupations)))
    year_mean_r = np.zeros(n_ratings)
    year_var_r = np.zeros(n_ratings)
    age_mean_r = np.zeros(n_ratings)
    age_var_r = np.zeros(n_ratings)

    #Boolean masks are computed once and combined per rating (rows are counted directly)
    train_is_male = (train['Gender'] == 0).values
    train_genres = (train[genres] == 1).values
    train_occ_masks = [(train['Occupation'] == occ).values for occ in occupations]

    for k in range(n_ratings):
        in_class = (train['rating'] == k + 1).values
        count_rating[k] = in_class.sum()
        if count_rating[k] == 0:
            #Rating absent from the training data: leave its statistics at zero (never predicted)
            continue
        count_male_r[k] = (in_class & train_is_male).sum()
        count_genres[k] = train_genres[in_class].sum(axis=0)
        for j, occ_mask in enumerate(train_occ_masks):
            count_occup_r[k][j] = (in_class & occ_mask).sum()
        members = train[in_class]
        year_mean_r[k] = members['Year'].mean()
        year_var_r[k] = members['Year'].var()
        age_mean_r[k] = members['Age'].mean()
        age_var_r[k] = members['Age'].var()
    #Every row that is not counted as male (Gender == 0) is counted as female
    count_female_r = count_rating - count_male_r

    print ('Classifier Made, Starting Predictive Task')

    #Prediction using the classifier made above. Standard Naive Bayesian.
    #Test columns are read as positional numpy arrays, so the index labels of `test` do not matter.
    n_test = len(test)
    test_age = np.asarray(test['Age'], dtype=float)
    test_year = np.asarray(test['Year'], dtype=float)
    test_is_male = (test['Gender'] == 0).values
    test_genres = (test[genres] == 1).values
    num_genres = test_genres.sum(axis=1)
    has_genre = num_genres > 0
    test_occ_masks = [(test['Occupation'] == occ).values for occ in occupations]

    rating_probs = np.zeros((n_test, n_ratings))
    for k in range(n_ratings):
        n_k = count_rating[k]
        if n_k == 0:
            continue
        prob_age = _gaussian_likelihood(test_age, age_mean_r[k], age_var_r[k])
        prob_year = _gaussian_likelihood(test_year, year_mean_r[k], year_var_r[k])

        prob_gender = np.where(test_is_male, count_male_r[k], count_female_r[k]) / n_k
        prob_gender[prob_gender == 0] = 0.5

        prob_occ = np.zeros(n_test)
        for j, occ_mask in enumerate(test_occ_masks):
            prob_occ[occ_mask] = count_occup_r[k][j] / n_k
        #Occupations unseen for this rating fall back to a uniform guess
        prob_occ[prob_occ == 0] = 1.0 / len(occupations)

        #Heuristical way of calculating probability of genre match given rating:
        #average frequency, within this rating, of the genres flagged on the test row.
        #Rows without any flagged genre keep the 0.02 fallback instead of dividing by zero.
        prob_genre = np.full(n_test, 0.02)
        prob_genre[has_genre] = test_genres[has_genre].dot(count_genres[k]) / (num_genres[has_genre] * n_k)
        prob_genre[prob_genre == 0] = 0.02

        rating_probs[:, k] = prob_age * prob_gender * prob_occ * prob_year * prob_genre * (n_k / n_samp)

    #Heuristic to increase result accuracy: first decide between the low block (ratings 1-3) and
    #the high block (ratings 4-5) by total probability mass, then take the best rating in that block.
    low_mass = rating_probs[:, 0] + rating_probs[:, 1] + rating_probs[:, 2]
    high_mass = rating_probs[:, 3] + rating_probs[:, 4]
    pred = np.where(low_mass > high_mass,
                    np.argmax(rating_probs[:, :3], axis=1) + 1,
                    np.argmax(rating_probs[:, 3:], axis=1) + 4).astype(float)

    return pred


def _gaussian_likelihood(values, mean, var, fallback=0.02):
    """Gaussian density N(mean, var) at ``values``; constant ``fallback`` when mean or var is 0 or var is not finite."""
    if mean == 0 or var == 0 or not np.isfinite(var):
        return np.full(len(values), fallback)
    return 1/(np.sqrt(2*np.pi*var)) * np.exp((-(values-mean)**2)/(2*var))

In [125]:
#Call the GNB function: fit on the training features and ratings, then predict a rating for every test row
pred = GNB(training_data, target, test_data)

Classifier Made, Starting Predictive Task


In [126]:
#Number of predictions produced
len(pred)

197656

In [127]:
#Spot-check a slice of the predictions
pred[45:100]

array([ 4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,
        4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,
        3.,  5.,  3.,  3.,  3.,  4.,  4.,  3.,  4.,  3.,  3.,  3.,  3.,
        4.,  4.,  4.,  3.,  4.,  3.,  3.,  3.,  4.,  5.,  3.,  5.,  4.,
        4.,  3.,  3.])

In [56]:
#Inspect a slice of row 3 of `preds`.
#NOTE(review): `preds` is only assigned inside a commented-out cell, so this cell raises NameError on a fresh kernel (Restart & Run All).
preds[3][123:150]

array([ 4.,  5.,  4.,  4.,  4.,  3.,  4.,  4.,  4.,  4.,  4.,  4.,  5.,
        3.,  4.,  4.,  4.,  5.,  4.,  4.,  5.,  4.,  4.,  4.,  4.,  4.,  4.])

In [57]:
#Combine the rows of `preds` into one prediction per column: rounded average over the first axis.
#NOTE(review): depends on `preds`, which is only assigned inside a commented-out cell.
pred = np.zeros(preds.shape[1])
for i in range(preds.shape[1]):
    pred[i] = round(np.average(preds[:, i]))

In [34]:
#Predict the test set with `dt`.
#NOTE(review): `dt` is only created in commented-out code, so this cell raises NameError on a fresh kernel.
pred = dt.predict(test_data)

In [108]:
#Show the first 20 predictions
pred[:20]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.])

In [39]:
#Check the container type of the predictions
type(pred)

numpy.ndarray

In [72]:
#Number of predictions currently held in `pred`
len(pred)

197656

In [129]:
#Writing results to a file as 'Id,rating' rows.
#The with-block guarantees the file is flushed and closed, even if a row fails to format.
with open('nb_self_predictions.txt', 'w') as predictions:
    predictions.write('Id,rating\n')
    #Pair ids and predictions by position, so the index labels of test_ids do not matter
    for test_id, rating in zip(test_ids, pred):
        predictions.write(str(test_id) + ',' + str(int(round(rating))) + '\n')

print ("Done!")

Done!


In [68]:
def LSTSQ(train, target, test):
    """Predict ratings for ``test`` with an ordinary least-squares fit on ``train``.

    train : 2-D array-like of features (samples x features).
    target : 1-D array-like with one rating per row of ``train``.
    test : matrix with the same feature columns as ``train``; must support the ``@`` operator.

    Returns ``test @ coefficients`` rounded to the nearest integer value.
    """
    # lstsq returns (coefficients, residuals, rank, singular values); only the coefficients are used.
    coefficients = la.lstsq(train, target)[0]
    # Predict for the `test` argument rather than the notebook-global `test_data`,
    # so the function works on whatever data it is given.
    pred = test @ coefficients
    return pred.round()

In [69]:
#Least-squares baseline: fit on the training data and predict the test set
x = LSTSQ(training_data, target, test_data)
x

array([ 4.,  3.,  4., ...,  4.,  4.,  3.])

In [70]:
#Number of least-squares predictions
len(x)

197656

In [71]:
#Use the least-squares predictions as `pred`
pred = x