In [2]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import re
import scipy
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
movies_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv")

In [4]:
movies_df = movies_df.rename(columns = {"movieId":"movie_id"})
ratings_df = ratings_df.rename(columns = {"movieId":"movie_id","userId":"user_id"})

In [5]:
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "",title)

In [6]:
movies_df["title"] = movies_df["title"].apply(clean_title)

In [7]:
list_user = ratings_df["user_id"].unique()

In [7]:
list_movie = ratings_df["movie_id"].unique()

In [8]:
def recommend(user):
    #User'ın izlediği tüm filmler ve oyları.
    user_df = ratings_df[(ratings_df["user_id"] == user)][["movie_id","rating"]]
    #User'ın izlediği filmlerin idsini tutan list
    list_m = list(user_df["movie_id"])
    
    #User'ın izlediği filmlere diğer userların da verdiği oylar
    r_df = ratings_df.copy()
    r_df= r_df[r_df["movie_id"].isin(list_m)]
    
    #Ortak film sayısı en az %20 olan kullanıcıların hesaplanması
    counts = r_df['user_id'].value_counts()

    if(len(list_m)>1000):
        r_df = r_df[~r_df['user_id'].isin(counts[counts <= len(list_m)/10].index)]

    elif(len(list_m)>100):
        r_df = r_df[~r_df['user_id'].isin(counts[counts <= len(list_m)/5].index)]

    elif(len(list_m)>50):
        r_df = r_df[~r_df['user_id'].isin(counts[counts <= len(list_m)/2].index)]

    elif(len(list_m)>25):
        r_df = r_df[~r_df['user_id'].isin(counts[counts <= len(list_m)].index)]
    

    list_u = list(r_df["user_id"].unique())
    
    #Seçilen userların izlediği bütün filmleri içeren dataframe
    df = ratings_df[ratings_df["user_id"].isin(list_u)]
    
    #Ortak film dışında film izlemmeiş userların silinmesi
    counts = df['user_id'].value_counts()
    

    if(len(list_m)>1000):
        df = df[~df['user_id'].isin(counts[counts == len(list_m)/10].index)]

    elif(len(list_m)>100):
        df = df[~df['user_id'].isin(counts[counts == len(list_m)/5].index)]

    elif(len(list_m)>50):
        df = df[~df['user_id'].isin(counts[counts == len(list_m)/2].index)]

    elif(len(list_m)>25):
        df = df[~df['user_id'].isin(counts[counts == len(list_m)].index)]
    
    
    df["user_id"] = df["user_id"].astype(int)
    df["movie_id"] = df["movie_id"].astype(int)
    
    
    df_coo = coo_matrix((df["rating"],(df["user_id"],df["movie_id"])))
    df1 = df_coo.tocsr()
    
    similarity = cosine_similarity(df1[user,:],df1).flatten()
    
    #En çok benzeyenleri als
    sim_df = pd.DataFrame(columns= ["user_id","similarity"])
    
    for i in list_u:
        sim_df.loc[i] = [i,similarity[i]]
        
    sim_df["user_id"] = sim_df["user_id"].astype(int)
    
    if(len(list_u)>1000):
        sim_df = sim_df.sort_values(by = "similarity", ascending = False).head(int(len(list_u)/10))

    elif(len(list_u)>100):
        sim_df = sim_df.sort_values(by = "similarity", ascending = False).head(int(len(list_u)/5))

    elif(len(list_u)>50):
        sim_df = sim_df.sort_values(by = "similarity", ascending = False).head(int(len(list_u)/2))

    elif(len(list_u)>25):
        sim_df = sim_df.sort_values(by = "similarity", ascending = False).head(int(len(list_u)))

    
    sim_df = sim_df.drop(user)
    list_sim_u = list(sim_df["user_id"])
    
    nw_movies = df[~df["movie_id"].isin(list_m)].reset_index(drop=True)
    l_m = list(nw_movies["movie_id"].unique())
    
    #Sadece 1 kişi tarafından izlenmiş filmlerin çıkarılması
    counts = nw_movies['movie_id'].value_counts()
    
    if(len(nw_movies)>1000):
        df = nw_movies[~nw_movies['movie_id'].isin(counts[counts <= 10].index)]

    elif(len(nw_movies)>100):
        df = nw_movies[~nw_movies['movie_id'].isin(counts[counts <= 5].index)]

    elif(len(nw_movies)>50):
        df = nw_movies[~nw_movies['movie_id'].isin(counts[counts <= 1].index)]

    elif(len(nw_movies)>25):
        df = nw_movies[~nw_movies['movie_id'].isin(counts[counts <= 0].index)]
    
    nw_movies = nw_movies[nw_movies['user_id'].isin(list_sim_u)].reset_index(drop = True)
    
    topwatcher = nw_movies.groupby(by="user_id",as_index = False).count().sort_values(by = "movie_id", ascending = False).head(int(len(list_sim_u)/5))
    topwatcher = list(topwatcher["user_id"])
    
    recommend_rates = nw_movies.groupby(by = "movie_id").rating.agg(['count'])
    recommend_rates["recommend_rate"] = 0
    recommend_rates["weight"] = 0
    
    for i in range(len(nw_movies)):
        recommend_rates.recommend_rate[nw_movies["movie_id"][i]] += (similarity[nw_movies["user_id"][i]] * nw_movies["rating"][i])
        recommend_rates.weight[nw_movies["movie_id"][i]] += similarity[nw_movies["user_id"][i]]
    
    recommend_rates["recommend_rate"] = recommend_rates["recommend_rate"]/recommend_rates["weight"]
    
    recommend_rates["movie_id"] = recommend_rates.index
    
    result = recommend_rates[recommend_rates["count"] >10]
    result = result.sort_values(by = "recommend_rate", ascending=False)
    top10 = result.head(10)
    
    info = movies_df[movies_df['movie_id'].isin(top10.index)]
    top10 = top10.reset_index(drop=True)
    
    top10 = top10.merge(info, how = "left", on = "movie_id")
    
    top10 = top10.rename(columns={"count": "vote_number"})
    return top10[["movie_id","title", "recommend_rate", "vote_number", "genres"]]

In [9]:
recommend(1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommend_rates.weight[nw_movies["movie_id"][i]] += similarity[nw_movies["user_id"][i]]


Unnamed: 0,movie_id,title,recommend_rate,vote_number,genres
0,6818,Come and See Idi i smotri 1985,4.625207,19,Drama|War
1,1206,Clockwork Orange A 1971,4.581322,41,Crime|Drama|Sci-Fi|Thriller
2,750,Dr Strangelove or How I Learned to Stop Worryi...,4.542307,40,Comedy|War
3,2019,Seven Samurai Shichinin no samurai 1954,4.518275,32,Action|Adventure|Drama
4,1201,Good the Bad and the Ugly The Buono il brutto ...,4.499657,33,Action|Adventure|Western
5,1208,Apocalypse Now 1979,4.495642,38,Action|Drama|War
6,26150,Andrei Rublev Andrey Rublyov 1969,4.461117,23,Drama|War
7,6643,Tokyo Story Tky monogatari 1953,4.454432,23,Drama
8,1419,Walkabout 1971,4.412912,11,Adventure|Drama
9,858,Godfather The 1972,4.404321,39,Crime|Drama
