In [2]:
#Data (install packages)
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('ratings_small.csv')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [5]:
df = data[['movieId','rating']]
df.head()

Unnamed: 0,movieId,rating
0,31,2.5
1,1029,3.0
2,1061,3.0
3,1129,2.0
4,1172,4.0


In [6]:
#Taking the average of all the ratings provided for each movie and then ranking them based on their average rating
average_rating = df.groupby('movieId')['rating'].mean()
rating_count = df.groupby('movieId')['rating'].count()
df2= pd.DataFrame({'avg_rating': average_rating, 'rating_count': rating_count})
df2.head()

Unnamed: 0_level_0,avg_rating,rating_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.87247,247
2,3.401869,107
3,3.161017,59
4,2.384615,13
5,3.267857,56


In [7]:
#Top 10 movies with at least 100 interactions
recommendations =df2[ df2['rating_count']>100].sort_values(by = 'rating_count')
recommendations.index[:10]

Int64Index([339, 25, 1258, 185, 3147, 7438, 6, 1222, 2916, 36], dtype='int64', name='movieId')

In [8]:
#Read and clean the data as needed. Look for any missing values and develop a strategy
data.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [9]:
#Predict the ratings of a user (y) using the remaining columns
#Is ratings a categorical variable (classification) or a numeric variable (regression)? 
from sklearn.model_selection import train_test_split
X = data[['userId', 'movieId', 'timestamp']]
y = data.rating #Ratings is a continous variable (Regression)
X.shape

(100004, 3)

In [10]:
#Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25)

In [11]:
#Apply the KNNRegressor model on the training data

from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=3) 
knn.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=3)

In [12]:
#Calculate the RMSE
from sklearn.metrics import mean_squared_error
from math import sqrt

y_pred = knn.predict(X_train) 
mse = mean_squared_error(y_train, y_pred) 
rmse = sqrt(mse)
print(rmse) 

0.7132087284725436


In [13]:
#Use the KNN model to predict the rating
new_data_point = np.array([[4, 10, 0]]) #Added the third column with a placeholder value
new_data_point = new_data_point.reshape(1, -1) 
rating = knn.predict(new_data_point)
print("Predicted rating:", rating[0])

Predicted rating: 3.6666666666666665
