# Project: Kaggle Challenge Movie Rating Prediction 
In this project, we write a K-Nearest Neighbor algorithm to predict IMDB rating of movies. We use a dataset Kaggle that contains 28 variables for 5043 movies, spanning across 100 years in 66 countries. 
We write our K-Nearest Neighbor algorithm from scratch without using built-in libraries.
We’ll implement the three steps of the K-Nearest Neighbor Algorithm:
- Normalize the data
- Find the k nearest neighbors
- Classify the new point based on those neighbors

In [1]:
import pandas as pd

## Part 1: Load and clean the data

In [2]:
# https://www.kaggle.com/carolzhangdc/predict-imdb-score-with-data-mining-algorithms
# The dataset is from Kaggle website. It contains 28 variables for 5043 movies, spanning across 100 years in 66 countries. 
df = pd.read_csv('08 ML_KNearestNeighbors_KaggleChallengeMovieRatingPredictionDataset.csv')

# sorting by movie name 
df.sort_values("movie_title", inplace = True) 

# dropping ALL duplicte values 
df.drop_duplicates(subset ="movie_title", keep = False, inplace = True) 

df.head(3)


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
4447,Color,Tara Subkoff,35.0,101.0,37.0,56.0,Balthazar Getty,501.0,,Drama|Horror|Mystery|Thriller,...,42.0,English,USA,Not Rated,1500000.0,2015.0,418.0,3.3,,750
3698,Color,Dan Trachtenberg,411.0,104.0,16.0,82.0,John Gallagher Jr.,14000.0,71897215.0,Drama|Horror|Mystery|Sci-Fi|Thriller,...,440.0,English,USA,PG-13,15000000.0,2016.0,338.0,7.3,2.35,33000
3015,Color,Timothy Hines,1.0,111.0,0.0,247.0,Kelly LeBrock,1000.0,14616.0,Drama,...,10.0,English,USA,R,12000000.0,2015.0,445.0,7.5,1.85,26000


## Part 2: Restructure and clean the data

In [3]:
# Make a movie dataset as a subset of the dataframe with relevant features to use for our K-Nearest Neighbor 
md = df[['movie_title','duration', 'actor_1_facebook_likes', 'num_critic_for_reviews','budget','gross','title_year','movie_facebook_likes']]

# check NaN values and replace them with 0 (becuase we can't make average on NaN)
md.isna().any()
md.fillna({'duration':0,'actor_1_facebook_likes':0, 'num_critic_for_reviews':0, 'budget':0, 'gross':0,'title_year':0,'movie_facebook_likes':0}, inplace=True)

# replace all zero values with the average of the column
md.loc[md.duration == 0] = md['duration'].mean()
md.loc[md.actor_1_facebook_likes == 0] = md['actor_1_facebook_likes'].mean()
md.loc[md.num_critic_for_reviews == 0] = md['num_critic_for_reviews'].mean()
md.loc[md.budget == 0] = md['budget'].mean()
md.loc[md.gross == 0] = md['gross'].mean()
md.loc[md.title_year == 0] = md['title_year'].mean()
md.loc[md.movie_facebook_likes == 0] = md['movie_facebook_likes'].mean()
# check any 0 value 
md.isin([0]).any().any()
# check any NaN value
md.isna().any()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


movie_title               False
duration                  False
actor_1_facebook_likes    False
num_critic_for_reviews    False
budget                    False
gross                     False
title_year                False
movie_facebook_likes      False
dtype: bool

In [4]:
# Make a subset of the dataframe with IMDB score 
mls = df[['movie_title','imdb_score']]
mls.head(3)

Unnamed: 0,movie_title,imdb_score
4447,#Horror,3.3
3698,10 Cloverfield Lane,7.3
3015,10 Days in a Madhouse,7.5


In [5]:
# Exploration 
%matplotlib inline
import matplotlib.pyplot as plt
scores = mls[['imdb_score']]
#plt.hist(scores, bins=20)
#plt.title("Distribution of the IMDB ratings")
#plt.show()

In [6]:
# Adding a new column diviging movies into bad movie (IMDB rating < 7) and good (IMDB rating >= 7)
mls['l'] = mls.apply(lambda row: 1 if row['imdb_score'] >= 7 else 0, axis=1)

# check any 0 value 
mls.isin([0]).any().any()
# check any NaN value
mls.isna().any()

mls.head(10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,movie_title,imdb_score,l
4447,#Horror,3.3,0
3698,10 Cloverfield Lane,7.3,1
3015,10 Days in a Madhouse,7.5,1
2845,10 Things I Hate About You,7.2,1
279,"10,000 B.C.",7.2,1
406,102 Dalmatians,4.8,0
3420,10th & Wolf,6.4,0
3644,11:14,7.2,1
4822,12 Angry Men,8.9,1
1736,12 Monkeys,7.6,1


## Part 2: Normalize the data

In [7]:
from sklearn import preprocessing

md_t = df[['movie_title']]
md_s = md[['duration', 'actor_1_facebook_likes', 'num_critic_for_reviews','budget','gross','title_year','movie_facebook_likes']]
x = md_s.values #returns a numpy array
# formula: (l-min(lst))/(max(lst)-min(lst)) for l in lst
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
md_s = pd.DataFrame(x_scaled)
md_n = md_t.join(md_s)
md_n.head(10)

Unnamed: 0,movie_title,0,1,2,3,4,5,6
4447,#Horror,0.1950938,0.1950945,0.1950945,0.001906,0.010524,0.195092,0.195094
3698,10 Cloverfield Lane,0.1950938,0.1950945,0.1950945,0.001906,0.010524,0.195092,0.195094
3015,10 Days in a Madhouse,0.1950938,0.1950945,0.1950945,0.001906,0.010524,0.195092,0.195094
2845,10 Things I Hate About You,0.1950938,0.1950945,0.1950945,0.001906,0.010524,0.195092,0.195094
279,"10,000 B.C.",2.193823e-06,8.775286e-07,8.28777e-07,0.000405,0.000241,4.6e-05,1.4e-05
406,102 Dalmatians,1.998817e-06,2.432704e-05,4.802031e-06,0.004524,0.276934,4.6e-05,0.00095
3420,10th & Wolf,9.506567e-07,2.432704e-05,1.048159e-06,0.004048,0.015298,4.6e-05,8e-06
3644,11:14,0.1950938,0.1950945,0.1950945,0.001906,0.010524,0.195092,0.195094
4822,12 Angry Men,,,,,,,
1736,12 Monkeys,0.1950938,0.1950945,0.1950945,0.001906,0.010524,0.195092,0.195094


## Part 3: Define Distance function 

In [8]:
# Distance Between Points using Euclidean Distance
def distance(movie1, movie2):
  length_difference = 0
  for i in range(len(movie1)):
    length_difference += (movie1[i] - movie2[i]) ** 2
  return length_difference ** 0.5

## Part 4: Define K-neighbor Classifier

In [149]:
def classify(unknown, dataset, IMDB, k):
  distances = []
  num_bad = 0
  num_good = 0

  for i in range(len(dataset.index)):
    distance_to_point = distance(unknown, dataset.iloc[i])
    distances.append([distance_to_point, dataset.iloc[i]['movie_title']])
  distances.sort()
  neighbors = distances[0:k]

  for m in neighbors:
        title = m[1]
        for i in range(len(dataset.index)):
            if IMDB.iloc[i]['movie_title'] == title:
                if IMDB.iloc[i]['l'] == 0:
                    num_bad += 1
                else:
                    num_good += 1
            else:
                continue
  if num_good > num_bad:
    return 1
  else:
    return 0

In [150]:
# example classification 
classify([.950938e-01,1.950945e-01,1.950945e-01, 0.001906, 0.010524,0.195092, 0.195094], md_n, mls, 10)

1

## Part 5: Data Segregation 

In [151]:
# 80% of md_n and mls for traing as dataset
# 20% of md_n and mls for validation
print(len(md_n.index))
# round(0.80 * len(md_n.index)) > 3838
# round(0.20 * len(md_n.index)) > 951
training_set = md_n[0:3838]
training_labels = mls[0:3838]
validation_set = md_n[3838:4798]
validation_labels = mls[3838:4798]

4798


## Part 6: Prediction Accuracy

In [161]:
# Feed each validation point one by one to function Classify and claculate accuracy 

def find_validation_accuracy(training_set, training_labels, validation_set, validation_labels, k):
  num_correct = 0
  for i in range(len(validation_set.index)):
        guess = classify(validation_set.iloc[i][1:8], training_set, training_labels, k)
        if guess == validation_labels.iloc[i]['l']:
            num_correct += 1
  return num_correct

In [None]:
print(find_validation_accuracy(training_set, training_labels, validation_set, validation_labels, 10))