In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import Imputer
import scipy.stats as sp

In [2]:
training_data = pd.read_csv('new_train.csv', sep='\s*,\s*', header=0, encoding='ascii', engine='python')
training_data = training_data.head(50)
target = training_data["Rating_Given"]
training_data['Genre'] = training_data['Genre'].astype('category')

In [3]:
def assignGender(c):
    if not isinstance(c['Gender'], str):
        return np.nan
    else:
        if c['Gender'] == 'F':
            return 1
        else: 
            return 0

In [4]:
training_data['Gender'] = training_data.apply(assignGender, axis=1)
training_data.head(10)

Unnamed: 0,Gender,Age,Occupation,Year_Movie_Was_Released,Genre,Rating_Given
0,1.0,1.0,10.0,1995.0,,5
1,1.0,1.0,10.0,1999.0,Drama,4
2,1.0,1.0,10.0,1989.0,Drama,4
3,1.0,1.0,10.0,1998.0,,5
4,1.0,1.0,10.0,2000.0,,4
5,1.0,1.0,10.0,,Action/ Adventure/ Comedy/ Romance,3
6,1.0,1.0,10.0,1941.0,Animation/ Children's/ Musical,5
7,1.0,1.0,10.0,,Drama,5
8,1.0,1.0,10.0,,Thriller,4
9,1.0,1.0,10.0,1996.0,Animation/ Children's/ Musical,3


In [5]:
genre_dict = {'Drama': 1, 'Comedy': 2, 'Thriller' : 3, 'Action' : 4, 'Romance': 5, 'Horror': 6, 'Adventure': 7, 'Sci-Fi': 8, 'Children\'s' : 9, 'Crime': 10, 'War' : 11, 'Documentary' : 12, 'Musical': 13, 'Animation': 14, 'Mystery': 15, 'Fantasy': 16, 'Western': 17, 'Film-Noir': 18}
genre_dict = list(genre_dict.keys())
for j in genre_dict:
    training_data[str(j)] = -1

In [6]:
def assignGenres(row):
    if not isinstance(row['Genre'], str):
        for j in genre_dict:
            row[str(j)] = np.nan
    else:
        y = row['Genre'].split('/ ')
        for j in genre_dict:
            if j in y:
                row[str(j)] = 1
            else: 
                row[str(j)] = 0
    return row

In [7]:
training_data = training_data.apply(lambda row: assignGenres(row), axis = 1)
training_data.head(20)                

Unnamed: 0,Gender,Age,Occupation,Year_Movie_Was_Released,Genre,Rating_Given,Drama,Comedy,Thriller,Action,...,Children's,Crime,War,Documentary,Musical,Animation,Mystery,Fantasy,Western,Film-Noir
0,1.0,1.0,10.0,1995.0,,5,,,,,...,,,,,,,,,,
1,1.0,1.0,10.0,1999.0,Drama,4,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,10.0,1989.0,Drama,4,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,10.0,1998.0,,5,,,,,...,,,,,,,,,,
4,1.0,1.0,10.0,2000.0,,4,,,,,...,,,,,,,,,,
5,1.0,1.0,10.0,,Action/ Adventure/ Comedy/ Romance,3,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,1.0,10.0,1941.0,Animation/ Children's/ Musical,5,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
7,1.0,1.0,10.0,,Drama,5,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,1.0,10.0,,Thriller,4,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,1.0,10.0,1996.0,Animation/ Children's/ Musical,3,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [8]:
training_data = training_data.drop('Genre', 1)
training_data = training_data.drop('Rating_Given', 1)
training_data.head(10)

Unnamed: 0,Gender,Age,Occupation,Year_Movie_Was_Released,Drama,Comedy,Thriller,Action,Romance,Horror,...,Children's,Crime,War,Documentary,Musical,Animation,Mystery,Fantasy,Western,Film-Noir
0,1.0,1.0,10.0,1995.0,,,,,,,...,,,,,,,,,,
1,1.0,1.0,10.0,1999.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,10.0,1989.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,10.0,1998.0,,,,,,,...,,,,,,,,,,
4,1.0,1.0,10.0,2000.0,,,,,,,...,,,,,,,,,,
5,1.0,1.0,10.0,,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,1.0,10.0,1941.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
7,1.0,1.0,10.0,,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,1.0,10.0,,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,1.0,10.0,1996.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [10]:
from sklearn.svm import SVC
svc = SVC(kernel='linear')

In [11]:
training_data['Age'].fillna(training_data['Age'].mean(), inplace = True)
training_data['Occupation'].fillna(training_data.groupby('Age')['Occupation'].transform(lambda x: sp.mode(x)), inplace=True)
training_data['Year_Movie_Was_Released'].fillna(training_data['Year_Movie_Was_Released'].median(), inplace = True)
for i in genre_dict:
    training_data[i].fillna(training_data.groupby('Age')[i].transform(lambda x: 1 if x.mean() > 0.055 else 0), inplace=True)
training_data['Gender'].fillna(training_data.groupby('Occupation')['Gender'].transform(lambda x: 1 if x.mean() >= 0.5 else 0), inplace=True)
training_data.head(20)

Unnamed: 0,Gender,Age,Occupation,Year_Movie_Was_Released,Drama,Comedy,Thriller,Action,Romance,Horror,...,Children's,Crime,War,Documentary,Musical,Animation,Mystery,Fantasy,Western,Film-Noir
0,1.0,1.0,10.0,1995.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,1.0,1.0,10.0,1999.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,10.0,1989.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,10.0,1998.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,1.0,1.0,10.0,2000.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,1.0,1.0,10.0,1995.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,1.0,10.0,1941.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
7,1.0,1.0,10.0,1995.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,1.0,10.0,1995.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,1.0,10.0,1996.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [13]:
training_data.isnull().sum()

Gender                     0
Age                        0
Occupation                 0
Year_Movie_Was_Released    0
Drama                      0
Comedy                     0
Thriller                   0
Action                     0
Romance                    0
Horror                     0
Adventure                  0
Sci-Fi                     0
Children's                 0
Crime                      0
War                        0
Documentary                0
Musical                    0
Animation                  0
Mystery                    0
Fantasy                    0
Western                    0
Film-Noir                  0
dtype: int64

In [14]:
svc.fit(training_data, target)
test_data = pd.read_csv('test.csv')
test_data = test_data.head(50)

Unnamed: 0,Gender,Age,Occupation,Year_Movie_Was_Released,Drama,Comedy,Thriller,Action,Romance,Horror,...,Children's,Crime,War,Documentary,Musical,Animation,Mystery,Fantasy,Western,Film-Noir
0,1.0,1.0,10.0,1995.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,1.0,1.0,10.0,1999.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,10.0,1989.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,10.0,1998.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,1.0,1.0,10.0,2000.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,1.0,1.0,10.0,1995.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,1.0,10.0,1941.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
7,1.0,1.0,10.0,1995.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,1.0,10.0,1995.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,1.0,10.0,1996.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [15]:
data_movie = pd.read_csv('movie.txt')
data_user = pd.read_csv('user.txt')

test_data = pd.merge(test_data, data_movie, left_on = 'movie-Id', right_on = 'Id')
test_data = pd.merge(test_data, data_user, left_on='user-Id', right_on='ID')
test_data.head(20)

Unnamed: 0,Id_x,user-Id,movie-Id,Id_y,Year,Genre,ID,Gender,Age,Occupation
0,802553,4557,3067,3067,,Animation|Children's|Musical,4557,F,1.0,10.0
1,802554,4557,3867,3867,1986.0,Comedy,4557,F,1.0,10.0
2,802555,4557,1180,1180,,Adventure|Children's|Drama|Musical,4557,F,1.0,10.0
3,802556,4557,1672,1672,1996.0,Animation,4557,F,1.0,10.0
4,802557,4557,290,290,1958.0,Musical,4557,F,1.0,10.0
5,802558,4557,575,575,1962.0,,4557,F,1.0,10.0
6,802559,4557,162,162,1975.0,Drama,4557,F,1.0,10.0
7,802560,4557,231,231,1937.0,Animation|Children's|Musical,4557,F,1.0,10.0
8,802561,4557,3461,3461,1991.0,Animation|Children's|Musical,4557,F,1.0,10.0
9,802562,4557,3143,3143,1989.0,Drama,4557,F,1.0,10.0


In [16]:
def assignGenresTest(row):
    if not isinstance(row['Genre'], str):
        for j in genre_dict:
            row[str(j)] = np.nan
    else:
        y = row['Genre'].split('|')
        for j in genre_dict:
            if j in y:
                row[str(j)] = 1
            else: 
                row[str(j)] = 0
    return row

In [17]:
for j in genre_dict:
    test_data[str(j)] = -1
test_data = test_data.apply(lambda row: assignGenresTest(row), axis = 1)


Unnamed: 0,Id_x,user-Id,movie-Id,Year,Gender,Age,Occupation,Drama,Comedy,Thriller,...,Children's,Crime,War,Documentary,Musical,Animation,Mystery,Fantasy,Western,Film-Noir
0,802553,4557,3067,,F,1.0,10.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,802554,4557,3867,1986.0,F,1.0,10.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,802555,4557,1180,,F,1.0,10.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,802556,4557,1672,1996.0,F,1.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,802557,4557,290,1958.0,F,1.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,802558,4557,575,1962.0,F,1.0,10.0,,,,...,,,,,,,,,,
6,802559,4557,162,1975.0,F,1.0,10.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,802560,4557,231,1937.0,F,1.0,10.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
8,802561,4557,3461,1991.0,F,1.0,10.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
9,802562,4557,3143,1989.0,F,1.0,10.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
test_data = test_data.drop(['Genre', 'ID', 'Id_y'], 1)
test_data.head(10)

In [18]:
test_data['Gender'] = test_data.apply(assignGender, axis=1)
test_data.head(20)

Unnamed: 0,Id_x,user-Id,movie-Id,Year,Gender,Age,Occupation,Drama,Comedy,Thriller,...,Children's,Crime,War,Documentary,Musical,Animation,Mystery,Fantasy,Western,Film-Noir
0,802553,4557,3067,,1.0,1.0,10.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,802554,4557,3867,1986.0,1.0,1.0,10.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,802555,4557,1180,,1.0,1.0,10.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,802556,4557,1672,1996.0,1.0,1.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,802557,4557,290,1958.0,1.0,1.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,802558,4557,575,1962.0,1.0,1.0,10.0,,,,...,,,,,,,,,,
6,802559,4557,162,1975.0,1.0,1.0,10.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,802560,4557,231,1937.0,1.0,1.0,10.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
8,802561,4557,3461,1991.0,1.0,1.0,10.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
9,802562,4557,3143,1989.0,1.0,1.0,10.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
test_data['Age'].fillna(test_data['Age'].mean(), inplace = True)
test_data['Occupation'].fillna(test_data.groupby('Age')['Occupation'].transform(lambda x: sp.mode(x)), inplace=True)
test_data['Year'].fillna(test_data['Year'].median(), inplace = True)
for i in genre_dict:
    test_data[i].fillna(test_data.groupby('Age')[i].transform(lambda x: 1 if x.mean() > 0.055 else 0), inplace=True)
test_data['Gender'].fillna(test_data.groupby('Occupation')['Gender'].transform(lambda x: 1 if x.mean() >= 0.5 else 0), inplace=True)
test_data = test_data.drop(['user-Id', 'movie-Id'], 1)
test_data.rename(columns={'Year': 'Year_Movie_Was_Released'}, inplace=True)
test_ids = test_data['Id_x']
test_data = test_data[['Gender', 'Age', 'Occupation', 'Year_Movie_Was_Released', 'War', 'Mystery', 'Fantasy', 'Musical', 'Crime', 'Adventure', 'Sci-Fi',
       'Drama', 'Action', 'Documentary', 'Romance', 'Comedy', "Children's",
       'Thriller', 'Western', 'Film-Noir', 'Horror', 'Animation']]
test_data.head(20)

Unnamed: 0,Gender,Age,Occupation,Year_Movie_Was_Released,War,Mystery,Fantasy,Musical,Crime,Adventure,...,Action,Documentary,Romance,Comedy,Children's,Thriller,Western,Film-Noir,Horror,Animation
0,1.0,1.0,10.0,1987.5,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1.0,10.0,1986.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,10.0,1987.5,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,10.0,1996.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,1.0,10.0,1958.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,1.0,10.0,1962.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
6,1.0,1.0,10.0,1975.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,1.0,10.0,1937.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
8,1.0,1.0,10.0,1991.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
9,1.0,1.0,10.0,1989.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
test_data.isnull().sum()

Gender                     0
Age                        0
Occupation                 0
Year_Movie_Was_Released    0
War                        0
Mystery                    0
Fantasy                    0
Musical                    0
Crime                      0
Adventure                  0
Sci-Fi                     0
Drama                      0
Action                     0
Documentary                0
Romance                    0
Comedy                     0
Children's                 0
Thriller                   0
Western                    0
Film-Noir                  0
Horror                     0
Animation                  0
dtype: int64

In [21]:
pred = svc.predict(test_data)
print (pred)

predictions = open('predictions1.txt', 'w')
predictions.write('Id,rating\n')
for i in range(len(pred)):
    prediction = str(test_ids[i]) + ',' + str(pred[i]) + '\n'
    predictions.write(prediction)

[3 4 5 4 5 5 5 5 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4
 3 3 3 3 4 3 4 4 3 3 3 3 3]
