# Import Packages

In [168]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier # Import K Nearest Neighbors Classifier
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

# Read Data

In [13]:
unames=['MovieID','Title', 'Genres']
movie = pd.read_table('movies.txt', sep='::', names=unames, engine='python')

In [14]:
movie.head()

Unnamed: 0,MovieID,Title,Genres
0,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2.0,Jumanji (1995),Adventure|Children|Fantasy
2,3.0,Grumpier Old Men (1995),Comedy|Romance
3,4.0,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5.0,Father of the Bride Part II (1995),Comedy


In [15]:
unames=['UserID','MovieID', 'Rating', 'Timestamp']
rating = pd.read_table('ratings.txt', sep='::', names=unames, engine='python')

In [16]:
rating.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1.0,122.0,5.0,838985046
1,1.0,185.0,5.0,838983525
2,1.0,231.0,5.0,838983392
3,1.0,292.0,5.0,838983421
4,1.0,316.0,5.0,838983392


# Data Processing

## Data Formatting

In [48]:
# Convert UTC in the rating file into DateTime
rating['Timestamp2']=pd.to_datetime(rating['Timestamp'], unit='s')

In [49]:
rating.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Timestamp2
0,1.0,122.0,5.0,838985046,1996-08-02 11:24:06
1,1.0,185.0,5.0,838983525,1996-08-02 10:58:45
2,1.0,231.0,5.0,838983392,1996-08-02 10:56:32
3,1.0,292.0,5.0,838983421,1996-08-02 10:57:01
4,1.0,316.0,5.0,838983392,1996-08-02 10:56:32


In [59]:
# Check the column format
print(movie.dtypes)
print(rating.dtypes)

MovieID    float64
Title       object
Genres      object
dtype: object
UserID               float64
MovieID              float64
Rating               float64
Timestamp              int64
Timestamp2    datetime64[ns]
dtype: object


## Missing Processing

In [52]:
# Detect movie data missings
movie_missing=movie.isnull()
for column in movie_missing.columns.values.tolist():
    print(column)
    print (movie_missing[column].value_counts())
    print("")

MovieID
False    10686
True         4
Name: MovieID, dtype: int64

Title
False    10689
True         1
Name: Title, dtype: int64

Genres
False    10690
Name: Genres, dtype: int64



In [53]:
# Detect rating data missings
rating_missing=rating.isnull()
for column in rating_missing.columns.values.tolist():
    print(column)
    print (rating_missing[column].value_counts())
    print("")

UserID
False    10000054
True            1
Name: UserID, dtype: int64

MovieID
False    10000053
True            2
Name: MovieID, dtype: int64

Rating
False    10000051
True            4
Name: Rating, dtype: int64

Timestamp
False    10000055
Name: Timestamp, dtype: int64

Timestamp2
False    10000055
Name: Timestamp2, dtype: int64



In [57]:
# Very few missings, let's just drop them
movie=movie.dropna()
rating=rating.dropna()

## Merge Movie with Ratings

In [60]:
df=pd.merge(movie, rating, on='MovieID')

In [115]:
df[["Rating"]] = df[["Rating"]].astype("int")

In [116]:
df.head()

Unnamed: 0,MovieID,Title,Genres,UserID,Rating,Timestamp,Timestamp2
0,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,1,857911264,1997-03-09 12:41:04
1,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14.0,3,1133572007,2005-12-03 01:06:47
2,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,18.0,3,1111545931,2005-03-23 02:45:31
3,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,23.0,5,849543482,1996-12-02 16:18:02
4,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,24.0,5,868254237,1997-07-07 05:43:57


# Data Description

In [117]:
df.describe()

Unnamed: 0,MovieID,UserID,Rating,Timestamp
count,9996604.0,9996604.0,9996604.0,9996604.0
mean,4115.497,35869.91,-855.7743,1032593000.0
std,8930.424,20585.28,1358418.0,115956500.0
min,1.0,1.0,-2147484000.0,789652000.0
25%,648.0,18123.0,3.0,946766100.0
50%,1834.0,35740.0,4.0,1035426000.0
75%,3624.0,53608.0,4.0,1126651000.0
max,65133.0,71567.0,999999.0,1231132000.0


In [118]:
# The rating has some errors because it should be a number between 1 to 5
# Check the errors
df[['Rating', 'UserID']].groupby(['Rating']).count()

Unnamed: 0_level_0,UserID
Rating,Unnamed: 1_level_1
-2147483648,4
-200,1
-4,4
-3,3
-2,1
0,94980
1,502433
2,1160169
3,3235098
4,3459483


In [120]:
# Drop errors
df = df.drop(df[df.Rating < 0].index)
df = df.drop(df[df.Rating > 5].index)

In [121]:
df[['Rating', 'UserID']].groupby(['Rating']).count()

Unnamed: 0_level_0,UserID
Rating,Unnamed: 1_level_1
0,94980
1,502433
2,1160169
3,3235098
4,3459483
5,1544425


# Data Summary Questions

In [103]:
'''Titles of Top 5 Most Popular Movies'''
group_count=df[['Title', 'UserID']].groupby(['Title']).count().reset_index().sort_values(['UserID'], ascending=False).head(5)
group_count.rename(columns={'UserID': 'UserCount'})

Unnamed: 0,Title,UserCount
7659,Pulp Fiction (1994),34864
3506,Forrest Gump (1994),34457
8593,"Silence of the Lambs, The (1991)",33668
5143,Jurassic Park (1993),32631
8497,"Shawshank Redemption, The (1994)",31126


In [122]:
''' What are the top 5 ranked movie genres on average in the whole dataset?'''
df[['Genres', 'Rating']].groupby(['Genres']).agg('mean').reset_index().sort_values(['Rating'], ascending=False).head(5)

Unnamed: 0,Genres,Rating
445,Animation|IMAX|Sci-Fi,4.625
684,Drama|Film-Noir|Romance,4.240468
402,Animation|Children|Comedy|Crime,4.180156
622,Crime|Film-Noir|Mystery,4.167115
752,Film-Noir|Mystery,4.162975


In [132]:
'''How many movies have been ranked the most days?'''
df['date']=df['Timestamp2'].dt.date
df.groupby('Title').date.nunique().sort_values(ascending=False)
# Movie ranked the most days is Star Wars: Episode IV - A New Hope (a.k.a. Star Wars) (1977)

Title
Star Wars: Episode IV - A New Hope (a.k.a. Star Wars) (1977)    4297
Pulp Fiction (1994)                                             4257
Silence of the Lambs, The (1991)                                4251
Forrest Gump (1994)                                             4238
Braveheart (1995)                                               4216
                                                                ... 
Chapayev (1934)                                                    1
Ladrones (2007)                                                    1
Last Time, The (2006)                                              1
Living 'til the End (2005)                                         1
Malaya (1949)                                                      1
Name: date, Length: 10671, dtype: int64

# Data Modelling

In [133]:
# Let's transpose the data and to make the movie as feature
df_t=df.pivot_table(index='UserID', columns='Title', values='Rating')

In [138]:
# Fill NA as a negative number -1
df_t=df_t.fillna(-1)

In [150]:
# Would find the highest score movie for each user as the target variable
# then build a KNN model to use the favourite movie of closes observation as a recommendation to a new user 
df_t['favourite']=df_t.idxmax(axis=1)

In [153]:
df_t['favourite'].head()

UserID
1.0                                    Aladdin (1992)
2.0                                 Braveheart (1995)
3.0    Burnt by the Sun (Utomlyonnye solntsem) (1994)
4.0                       Addams Family Values (1993)
5.0                12 Monkeys (Twelve Monkeys) (1995)
Name: favourite, dtype: object

In [158]:
# Get feature X dataset
X=df_t.loc[:, df_t.columns != 'UserID']
X=df_t.loc[:, df_t.columns != 'favourite']

In [161]:
# Get target y
y=df_t['favourite']

In [164]:
# encode y
y = y.values

enc = LabelEncoder()
label_encoder = enc.fit(y)
y = label_encoder.transform(y)

In [167]:
# Split train test datasets
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.9)

In [169]:
# Create KNeighbors Classifier object
classifier = KNeighborsClassifier(n_neighbors=1)

In [170]:
# Train the classifier
classifier.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=1)

In [171]:
# Predict the test values
y_pred = classifier.predict(x_test)

In [177]:
'''For the first customer in the test dataset, The prediction is 25th movie after encode'''
y_pred[0]

25

In [180]:
# And we can transform it back
# The recommend movie for this customer is below
enc.inverse_transform([25])

array(['12 Monkeys (Twelve Monkeys) (1995)'], dtype=object)

In [184]:
y_pred_t=enc.inverse_transform(y_pred)
y_pred_df=pd.DataFrame({'Recommend': y_pred_t})

In [189]:
'''In my test data set, below are the most frequently recommend'''
y_pred_df.groupby(['Recommend']).agg('count').reset_index().sort_values(['Recommend'], ascending=False).head(5)

Unnamed: 0,Recommend
566,Young Frankenstein (1974)
565,You've Got Mail (1998)
564,"Wizard of Oz, The (1939)"
563,Willy Wonka & the Chocolate Factory (1971)
562,White Squall (1996)
