# User Intereset Analysis

## I. Setup


In [1]:
import sys
sys.path.append('..')


In [2]:
import pandas as pd

from lib.types.dataset_type import DatasetType
from lib.types.source_type import SourceType


In [3]:
source_path: SourceType = SourceType.original


## II. Read CSVs


In [4]:
def fetch_movie_df() -> pd.DataFrame:
    movies_df = pd.read_csv(DatasetType.movies_metadata.cleaned_path())
    movies_df.rename(
        columns={'id': 'movie_id'},
        inplace=True
    )
    return movies_df


def fetch_rating_df() -> pd.DataFrame:
    rating_df = pd.read_csv(DatasetType.ratings.original_path())
    rating_df.rename(
        columns={'userId': 'user_id', 'movieId': 'movie_id'},
        inplace=True
    )
    return rating_df


## Exploration


In [5]:
movies_df = fetch_movie_df()
movies_df.shape


  movies_df = pd.read_csv(DatasetType.movies_metadata.cleaned_path())


(45464, 18)

In [6]:
rating_df = fetch_rating_df()
rating_df.shape


(26024289, 4)

## II. Find all genres


In [7]:
def find_all_genres(genres: pd.Series):
    genres = genres.str.split("|")
    genres_cleaned = [item for item in genres.to_list()
                      if isinstance(item, list)]

    all_movie_genres = "|".join([
        "|".join(i)
        for i in genres_cleaned
    ]).split("|")

    all_movie_genres = set(all_movie_genres)
    return list(all_movie_genres)


In [8]:
all_genres = find_all_genres(movies_df['genres'])
" | ".join(all_genres)


'Comedy | Animation | Action | Adventure | Documentary | Foreign | TV Movie | Mardock Scramble Production Committee | Horror | Fantasy | Aniplex | Science Fiction | Romance | Crime | GoHands | Thriller | Western | War | Drama | BROSTA TV | Mystery | Music | History | Family | Sentai Filmworks'

## II. User interest per user


In [9]:
def fetch_rating_per_user_df(user_id: int):

    rating_per_user_df = rating_df[rating_df['user_id'] == user_id]
    rating_per_user_df['movie_id'] = rating_per_user_df['movie_id'].astype(str)

    return rating_per_user_df


def fetch_movies_per_user_df(rating_per_user_df: pd.DataFrame):
    movies_per_user_df = pd.merge(
        left=rating_per_user_df,
        right=movies_df,
        on='movie_id'
    )
    return movies_per_user_df


user_id = 123963
rating_per_user_df = fetch_rating_per_user_df(user_id=user_id)
movies_per_user_df = fetch_movies_per_user_df(rating_per_user_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_per_user_df['movie_id'] = rating_per_user_df['movie_id'].astype(str)


In [10]:
movies_per_user_df.shape


(73, 21)

In [11]:
"HERE IS ALL GENRES THAT USER HAVE RATED:" + " | ".join(find_all_genres(movies_per_user_df['genres']))


'HERE IS ALL GENRES THAT USER HAVE RATED:Comedy | Animation | Action | Adventure | Documentary | Foreign | TV Movie | Horror | Fantasy | Science Fiction | Romance | Crime | Thriller | Western | Drama | Mystery | Music | History | Family'

## III. Average rating for each each genre

In [12]:
def get_avg_rating_dict(input_df: pd.DataFrame):
    rating_dict = {}
    avg_rating_dict = {}

    for row in input_df.values:
        # movie_id = row[1]
        rating = row[2]
        genres = row[12]

        for genre in str(genres).split("|"):
            if(rating_dict.get(genre) == None):
                rating_dict[genre] = []
            rating_dict[genre].append(rating)

    for key, value in rating_dict.items():
        avg_rating_dict[key] = sum(value) / len(value)

    dict_sorted = sorted(
        avg_rating_dict.items(),
        key=lambda element: element[1],
        reverse=True,
    )

    avg_rating_dict = {key: value for key, value in dict_sorted}
    return avg_rating_dict

In [13]:
print("HERE IS AVERAGE RATING FOR GENRE FOR USER: {}".format(user_id))
avg_rating_dict = get_avg_rating_dict(movies_per_user_df)
avg_rating_dict

HERE IS AVERAGE RATING FOR GENRE FOR USER: 123963


{'TV Movie': 4.0,
 'Adventure': 3.875,
 'Foreign': 3.6666666666666665,
 'Western': 3.5,
 'History': 3.5,
 'Romance': 3.4642857142857144,
 'Horror': 3.4444444444444446,
 'Fantasy': 3.375,
 'Science Fiction': 3.35,
 'Drama': 3.3205128205128207,
 'Action': 3.2857142857142856,
 'Comedy': 3.272727272727273,
 'Crime': 3.2666666666666666,
 'Thriller': 3.261904761904762,
 'Mystery': 3.142857142857143,
 'Documentary': 2.5,
 'Family': 2.0,
 'Music': 2.0,
 'Animation': 0.5}