# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix

# Load the movie catalogue and the ratings table. Both files use "::" as the
# field separator; a multi-character separator needs the python parsing engine.
# NOTE(review): assumes both files carry a header row, since the columns are
# addressed by name (movieId, genres, userId, rating, timestamp) — confirm.
movies_df = pd.read_csv("./movies_1m/movies.dat", sep="::", engine="python")
movies_df = movies_df.drop(columns="genres")

ratings_df = pd.read_csv("./movies_1m/ratings.dat", sep="::", engine="python")
ratings_df = ratings_df.drop(columns="timestamp")



# Rating table: one row per movieId, one column per userId; a cell is NaN when
# that user has not rated that movie.
movie_ratings_df = pd.pivot_table(data=ratings_df, index="movieId", columns="userId", values="rating")

# Replace missing ratings with 0 before building the sparse matrix. NaN is not
# equal to zero, so converting the raw pivot would store every missing cell as
# an explicit NaN entry: the matrix would not be sparse at all, and the NaNs
# would propagate into any arithmetic done on it.
# NOTE(review): 0 is used as the "no rating" marker; this assumes real ratings
# are never 0 (the binning code in this file treats them as lying in 1..5).
mat_movie_ratings = csr_matrix(movie_ratings_df.fillna(0).values)



# Dataset sizes: distinct movies in the catalogue, distinct movies that appear
# in the ratings table, and distinct users that appear in the ratings table.
# dropna=False makes nunique() count exactly what len(unique()) counts.
num_movies = movies_df["movieId"].nunique(dropna=False)
num_rated_movies = ratings_df["movieId"].nunique(dropna=False)
num_users = ratings_df["userId"].nunique(dropna=False)

print("Num movies : ", num_movies)
print("Num rated movies : ", num_rated_movies)
print("Num users : ", num_users)





# Number of rating rows per movie, indexed by movieId.
movie_count = ratings_df.groupby("movieId").size().to_frame("num_ratings")
# movie_count["num_ratings"].hist(bins=100)

# Number of rating rows per user, indexed by userId.
user_count = ratings_df.groupby("userId").size().to_frame("num_ratings")
# plt.figure()
# user_count["num_ratings"].hist(bins=100)

# Count how many movies / users reach a minimum number of ratings.
min_movie_num_ratings = 40
min_user_num_ratings = 40
popular_movies = int((movie_count["num_ratings"] >= min_movie_num_ratings).sum())
frequent_users = int((user_count["num_ratings"] >= min_user_num_ratings).sum())

print("Num movies rated at least ", min_movie_num_ratings, " times : ", popular_movies)
print("Num users who rated a movie at least ", min_user_num_ratings, " times : ", frequent_users)
      
# Attach each movie's rating count to every individual rating row.
ratings_merged = ratings_df.merge(movie_count, on="movieId", how="inner")

# Movie x user rating table (NaN where a user has not rated a movie).
mov_rat_df = ratings_df.pivot_table(index="movieId", columns="userId", values="rating")

# Row-wise mean = mean rating received by each movie;
# column-wise mean = mean rating given by each user. NaN cells are skipped.
mean_ratings_movies = mov_rat_df.mean(axis="columns")
mean_ratings_users = mov_rat_df.mean(axis="index")

# Pair every mean rating with the number of ratings it was computed from.
mean_rating_movies_with_num = mean_ratings_movies.to_frame("mean_rating").merge(
    movie_count, how="inner", on="movieId"
)
mean_rating_users_with_num = mean_ratings_users.to_frame("mean_rating").merge(
    user_count, how="inner", on="userId"
)



# Two scatter plots of rating count (x) against mean rating (y):
#   1. per movie: number of ratings received vs. mean rating received;
#   2. per user:  number of ratings given vs. mean rating given.
# A new figure is opened after each plot, so the next plotting call (including
# the one that follows this section) starts on an empty figure.
plt.figure()
for rating_stats, x_label in (
    (mean_rating_movies_with_num, "num_movie_ratings"),
    (mean_rating_users_with_num, "num_user_ratings"),
):
    plt.scatter(rating_stats["num_ratings"], rating_stats["mean_rating"])
    plt.xlabel(x_label)
    plt.ylabel("avg_rating")
    plt.figure()


# Filter that keeps only the most active users and the most-rated movies (currently disabled)

#popular_movies =  movie_count["num_ratings"] > 70 
#frequent_users = user_count["num_ratings"] > 70 


def _bin_mean_rating(mean_rating):
    """Map mean ratings in [1, 5] to integer bins 0..3.

    Bins: [1, 2) -> 0, [2, 3) -> 1, [3, 4) -> 2, [4, 5] -> 3.

    Args:
        mean_rating: pandas Series of mean ratings.

    Returns:
        A list of ints, one per element of ``mean_rating``, in the same order.

    Raises:
        ValueError: if any value is NaN or lies outside [1, 5]. Such values
            belong to no bin; skipping them would leave the result shorter
            than the input and break the column assignment below.
    """
    in_range = mean_rating.between(1.0, 5.0)
    if not in_range.all():
        raise ValueError(
            f"{int((~in_range).sum())} mean rating(s) are NaN or outside [1, 5]"
        )
    # Integer part of the rating, with 5.0 folded into the top bin, shifted so
    # that the first bin is 0.
    return ((mean_rating // 1).clip(upper=4).astype(int) - 1).tolist()


# Bin of the mean rating received by each movie.
grouped_movie_ratings = _bin_mean_rating(mean_rating_movies_with_num["mean_rating"])
mean_rating_movies_with_num["bin_rating"] = grouped_movie_ratings

# Bin of the mean rating given by each user.
grouped_user_ratings = _bin_mean_rating(mean_rating_users_with_num["mean_rating"])
mean_rating_users_with_num["bin_rating"] = grouped_user_ratings

# Rating counts per user and per movie, used to select rows by activity level.
n_r_users = mean_rating_users_with_num["num_ratings"]
n_r_movies = mean_rating_movies_with_num["num_ratings"]

# Bounds of the rating-count interval under consideration: the selection below
# keeps counts in the half-open range [lower_bound, upper_bound).
lower_bound = 0
upper_bound = 500

users_in_range = (n_r_users >= lower_bound) & (n_r_users < upper_bound)
movies_in_range = (n_r_movies >= lower_bound) & (n_r_movies < upper_bound)
low_users = mean_rating_users_with_num.loc[users_in_range]
low_movies = mean_rating_movies_with_num.loc[movies_in_range]

rating_ticks = range(1, 6)

# Histogram of the mean rating given by the users whose number of ratings
# falls in the selected interval.
low_users["mean_rating"].hist(bins=20)
plt.xticks(rating_ticks)
plt.figure()

# Histogram of the mean rating received by the movies whose number of ratings
# falls in the selected interval.
low_movies["mean_rating"].hist(bins=20)
plt.xticks(rating_ticks)
plt.figure()











