<a href="https://colab.research.google.com/github/brekkercodes/DTI-Project/blob/main/MovieRecommenderSys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install surprise



# **Data integration:**


*   Importing Surprise library
*   Reading rating data file




In [None]:
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.accuracy import rmse

ImportError: ignored

In [None]:

# Load the raw ratings; the next cell reads its movieId, userId and rating columns.
ratings = pd.read_csv('ratingsData.csv')
# Preview the first rows.
ratings.head()

In [None]:
# Re-label the rating columns with the itemID / userID / rating names used
# throughout the rest of the notebook.
ratings_dict = {
    'itemID': ratings['movieId'].tolist(),
    'userID': ratings['userId'].tolist(),
    'rating': ratings['rating'].tolist(),
}
df = pd.DataFrame(data=ratings_dict)
df.shape

# Surprise expects the columns in (user, item, rating) order and needs to be
# told the rating scale, declared here as 0.5 to 5.0.
reader = Reader(rating_scale=(0.5, 5.0))
surprise_frame = df[['userID', 'itemID', 'rating']]
data = Dataset.load_from_df(surprise_frame, reader)

# **Data cleaning:**
Remove rows with missing values or invalid ratings, and keep only movies that have at least 100 ratings.



In [None]:
# Bounds of a valid rating; these match the rating_scale given to the Reader.
MIN_RATING, MAX_RATING = 0.5, 5.0
# Movies with fewer ratings than this are dropped from the data set.
MIN_RATINGS_PER_MOVIE = 100

# Drop any rows with missing or NaN values (returns a new frame instead of
# mutating in place, so re-running the cell is safe).
df = df.dropna()

# Remove outliers or incorrect ratings: keep only ratings inside the valid
# scale (both bounds inclusive), not just those above the minimum.
df = df[df['rating'].between(MIN_RATING, MAX_RATING)]

# Convert the 'userID' and 'itemID' columns to integer type. astype() on the
# whole frame returns a new object, which avoids the SettingWithCopyWarning
# raised when assigning columns into a filtered slice.
df = df.astype({'userID': int, 'itemID': int})

# Keep only movies that still have at least MIN_RATINGS_PER_MOVIE ratings.
ratings_count = df['itemID'].value_counts()
movies_to_remove = ratings_count[ratings_count < MIN_RATINGS_PER_MOVIE].index
df = df[~df['itemID'].isin(movies_to_remove)]





## **Clustering & Classification:**
Using cross-validation to compare the performance of different
rating-prediction algorithms:
*   SVD
*   KNN
*   CoClustering
*   Normal predictor





In [None]:
from surprise import SVD, KNNBasic, CoClustering, NormalPredictor
from surprise.model_selection import cross_validate

# Rebuild the Surprise dataset from the cleaned ratings frame.
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

# One pandas Series per algorithm: the mean of every cross-validation metric
# over the 3 folds, plus the algorithm's class name under 'Algorithm'.
benchmark = []

for algorithm in [SVD(), KNNBasic(), CoClustering(), NormalPredictor()]:
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    mean_scores = pd.DataFrame.from_dict(results).mean(axis=0)
    # Series.append() was removed in pandas 2.0; assigning a new label gives the
    # same Series. The class name is read directly instead of parsing str().
    mean_scores['Algorithm'] = type(algorithm).__name__
    benchmark.append(mean_scores)



# **Evaluate the algorithms:**
 Evaluate the accuracy of the predictions using **RMSE**.

In [None]:
# One row per algorithm, ordered from the lowest to the highest mean test RMSE.
benchmark_frame = pd.DataFrame(benchmark)
surprise_results = benchmark_frame.set_index('Algorithm').sort_values(by='test_rmse')
surprise_results


In [None]:
# Fixed seed so the train/test split and the SVD factor initialisation -- and
# therefore the RMSE and every recommendation below -- are reproducible.
SEED = 42

# Hold out 25% of the ratings for testing, fit SVD on the rest, and predict
# a rating for every held-out (user, item) pair.
trainset, testset = train_test_split(data, test_size=0.25, random_state=SEED)
algo = SVD(random_state=SEED)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

# **Recommendation generation:**
Generates recommendations for a specific user.

In [None]:
from collections import defaultdict
def get_all_predictions(predictions):
    """Group estimated ratings by user, highest estimate first.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each element unpacks as (user id, item id, true rating, estimated
        rating, details). Only the user id, item id and estimate are used.

    Returns
    -------
    collections.defaultdict
        Maps each user id to a list of (item id, estimated rating) tuples
        sorted by estimated rating in descending order. No entries are
        dropped: every prediction for a user is kept.
    """
    per_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        per_user[uid].append((iid, est))

    # Rank each user's items by estimate, best first. Only existing keys are
    # reassigned, so the dict does not change size while being iterated.
    for uid in per_user:
        per_user[uid] = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)

    return per_user

In [None]:
# Group the test-set predictions by user, each user's items ranked by estimated rating.
all_pred = get_all_predictions(predictions)

In [None]:
# Keep only the top 5 recommendations for each user.
n = 5

for uid in all_pred:
    ranked = all_pred[uid]
    # Order by estimated rating, best first, then cut to the first n entries.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    all_pred[uid] = ranked[:n]

In [None]:
# One row per user (index = user id); the columns hold that user's ranked
# (item id, estimated rating) tuples taken from all_pred.
tmp = pd.DataFrame.from_dict(all_pred, orient='index')
# Rebuild every row from its first n entries.
# NOTE(review): all_pred was already cut to n items per user, so this looks
# like it reproduces tmp unchanged -- confirm before simplifying.
tmp_transpose = tmp.apply(lambda x: pd.Series(dict(x[:n])), axis=1)


In [None]:
def get_predictions(user_id):
    """Return the row of the global ``tmp_transpose`` table for ``user_id``.

    The lookup is label-based (``.loc``), so a ``user_id`` that is not in the
    table's index raises ``KeyError``.
    """
    return tmp_transpose.loc[user_id]

In [None]:
# Pick the user to generate recommendations for.
user_id= 22
# Look up that user's row of ranked (item id, estimated rating) pairs.
results = get_predictions(user_id)
results


In [None]:
# Collect the item ids from the user's ranked (item id, estimate) pairs.
# A user with fewer than n predictions has missing slots in `results`, which
# made the original positional loop fail with a TypeError; drop those slots
# and take at most n of the remaining pairs.
recommended_movie_ids = [item_id for item_id, _est in results.dropna().iloc[:n]]

recommended_movie_ids

In [None]:
# Look up the catalogue rows (including titles) of the recommended movies.
movies = pd.read_csv('movies.csv')
movies.head()
is_recommended = movies['movieId'].isin(recommended_movie_ids)
recommended_movies = movies.loc[is_recommended]
recommended_movies

In [None]:
# Reload the raw ratings and rebuild `df` from them. This replaces the cleaned
# frame from the data-cleaning step with the unfiltered ratings.
ratings = pd.read_csv('ratingsData.csv')

ratings_dict = {
    'itemID': ratings['movieId'].tolist(),
    'userID': ratings['userId'].tolist(),
    'rating': ratings['rating'].tolist(),
}

df = pd.DataFrame(data=ratings_dict)
df.head()

In [None]:
# All ratings given by the selected user, highest rated first. Uses `user_id`
# rather than a hard-coded 22 so the history always matches the user the
# recommendations were generated for.
temp = df[df['userID'] == user_id].sort_values("rating", ascending = False)
temp.head()

In [None]:
# Item ids the user has rated, in the rating-descending order of `temp`.
history_movie_ids = temp['itemID']
# Join against the movie catalogue instead of filtering it with isin(): an
# inner merge keeps the order of the left keys, so the rows stay ordered from
# the user's highest-rated movie down. isin() returned them in movies.csv
# order, which made a later "first n rows" slice unrelated to the ratings.
# The final selection restores the catalogue's own column order.
user_history = (
    history_movie_ids.to_frame('movieId')
    .merge(movies, on='movieId', how='inner')
    [list(movies.columns)]
)

In [None]:
# Show the first n movies from the user's rating history.
user_history.head(n)

In [None]:
# Show the recommended movies again, for comparison with the user's history above.
recommended_movies

# **Chatbot Development:**