In [1]:
import argparse
import datetime
import pandas as pd
import os
import numpy as np
import random
import re


def load_list(fname):
    list_ = []
    with open(fname, encoding="utf-8") as f:
        for line in f.readlines():
            list_.append(line.strip())
    return list_


class movielens_1m(object):
    def __init__(self,  ):
        self.size = 10
        
        # try:
        #     os.mkdir("./dataset/movielens/train")
        #     os.mkdir("./dataset/movielens/val")
        #     os.mkdir("./dataset/movielens/test")     
        # except:
        #     pass
        
        self.user_data, self.item_data, self.score_data = self.load()


        
    # def make_dataset(self,):
        
        # self.item_data = self.item_encoding(self.item_data)
        # self.user_data = self.user_encoding(self.user_data)
        
        # self.filter_user()
        # self.get_support_vector(SIZE=self.size )
        

    def load(self):
        path = "./movielens/ml-1m"
        profile_data_path = "{}/users.dat".format(path)
        score_data_path = "{}/ratings.dat".format(path)
        item_data_path = "{}/movies_extrainfos.dat".format(path)

        profile_data = pd.read_csv(
            profile_data_path, names=['user_id', 'gender', 'age', 'occupation_code', 'zip'], 
            sep="::", engine='python'
        )
        
        score_data = pd.read_csv(
            score_data_path, names=['user_id', 'item_id', 'rating', 'timestamp'],
            sep="::", engine='python'
        )

        # item_cols = ['item_id', 'item_title', 'item_genres']
        item_cols = ['item_id', 'item_title', 'year', 'rate', 'released', 'item_genres', 'director', 'writer', 'actors', 'plot', 'poster']

        item_data = pd.read_csv(
            item_data_path, names=item_cols, 
            sep="::", engine='python', encoding="utf-8"
        )

        df = pd.merge(item_data, score_data, on="item_id", how="inner")
        df = df.groupby(item_cols).agg({
            "user_id": lambda x: x.tolist(),
            "rating": lambda x: x.tolist(),
            "timestamp": lambda x: x.tolist()
        }).reset_index()
        
        df["item_id"] = [i for i in range(len(df))]
        
        item_data = df[item_cols]
        score_data = df.explode(["user_id", "rating", "timestamp"])[["user_id", "item_id", "rating", "timestamp"]]
        score_data = score_data.sort_values(by="timestamp").reset_index(drop=True)
        return profile_data, item_data[["item_id", "item_genres", "director", "actors"]], score_data
    
    def item_encoding(self, item_data):
        item_fea_hete = []
        item_fea_homo = []
        m_directors = []
        m_actors = []
        genre_feat = []

        input_dir = "./dataset/movielens/ml-1m/extra"
        rate_list = load_list("{}/m_rate.txt".format(input_dir))
        genre_list = load_list("{}/m_genre.txt".format(input_dir))
        actor_list = load_list("{}/m_actor.txt".format(input_dir))
        director_list = load_list("{}/m_director.txt".format(input_dir))

        for idx, row in item_data.iterrows():
            m_info = self.item_extra_converting(row, rate_list, genre_list, director_list, actor_list)
            item_fea_hete.append(m_info[0])
            item_fea_homo.append(m_info[1])
            m_directors.append(m_info[2])
            m_actors.append(m_info[3])
            genre_feat.append(m_info[4])

        item_data["item_fea_hete"] = item_fea_hete
        item_data["item_fea_homo"] = item_fea_homo
        item_data["m_directors"] = m_directors
        item_data["m_actors"] = m_actors

        print(item_data)
        item_data["item_feature"] = genre_feat

        self.item_cols = ["item_id", "item_feature", "item_fea_hete", "item_fea_homo", "m_directors", "m_actors"]

        item_data = item_data[self.item_cols]
        return item_data
    
    def user_encoding(self, profile_data):
        input_dir = "./dataset/movielens/ml-1m/extra"

        gender_list = load_list("{}/m_gender.txt".format(input_dir))
        age_list = load_list("{}/m_age.txt".format(input_dir))
        occupation_list = load_list("{}/m_occupation.txt".format(input_dir))
        zipcode_list = load_list("{}/m_zipcode.txt".format(input_dir))

        gender_np = pd.get_dummies(profile_data["gender"]).astype(int).to_numpy()
        age_np = pd.get_dummies(profile_data["age"]).astype(int).to_numpy()
        occ_np = pd.get_dummies(profile_data["occupation_code"]).astype(int).to_numpy()
        zip_np = pd.get_dummies(profile_data["zip"]).astype(int).to_numpy()
        print("gender_np shape, ", gender_np.shape)
        print("age_np shape, ", age_np.shape)
        print("occ_np shape, ", occ_np.shape)
        print("zip_np shape, ", zip_np.shape)
        
        user_feature = np.concatenate([gender_np, age_np, occ_np, zip_np], axis=1)

        user_extra_feat = []
        for idx, row in profile_data.iterrows():
            u_info = self.user_extra_converting(row, gender_list, age_list, occupation_list, zipcode_list)
            user_extra_feat.append(u_info)
        # user_extra_feat = np.concatenate(user_extra_feat)

        profile_data["user_feature"] = user_feature.tolist()
        profile_data["user_extra_feature"] = user_extra_feat
        self.user_cols = ["user_id", "user_feature", "user_extra_feature"]
        profile_data = profile_data[self.user_cols]
        return profile_data
    
    def item_extra_converting(self, row, rate_list, genre_list, director_list, actor_list):
        rate_idx = torch.tensor([[rate_list.index(str(row['rate']))]]).long()
        genre_idx = torch.zeros(1, 25).long()
        for genre in str(row['item_genres']).split(", "):
            idx = genre_list.index(genre)
            genre_idx[0, idx] = 1  # one-hot vector
        director_idx = torch.zeros(1, 2186).long()
        director_id = []
        for director in str(row['director']).split(", "):
            idx = director_list.index(re.sub(r'\([^()]*\)', '', director))
            director_idx[0, idx] = 1
            director_id.append(idx+1)  # id starts from 1, not index
        actor_idx = torch.zeros(1, 8030).long()
        actor_id = []
        for actor in str(row['actors']).split(", "):
            idx = actor_list.index(actor)
            actor_idx[0, idx] = 1
            actor_id.append(idx+1)
        return torch.cat((rate_idx, genre_idx), 1).squeeze().tolist(), torch.cat((rate_idx, genre_idx, director_idx, actor_idx), 1).squeeze().tolist(), director_id, actor_id, genre_idx.squeeze().tolist()

    def user_extra_converting(self, row, gender_list, age_list, occupation_list, zipcode_list):
        gender_idx = torch.tensor([[gender_list.index(str(row['gender']))]]).long()
        age_idx = torch.tensor([[age_list.index(str(row['age']))]]).long()
        occupation_idx = torch.tensor([[occupation_list.index(str(row['occupation_code']))]]).long()
        zip_idx = torch.tensor([[zipcode_list.index(str(row['zip'])[:5])]]).long()
        return torch.cat((gender_idx, age_idx, occupation_idx, zip_idx), 1).squeeze().tolist()  # (1, 4)

    
    def filter_user(self):
        df = self.score_data
        df = df.groupby("user_id").agg({
            "item_id": lambda x: x.tolist(),
            "rating": lambda x: x.tolist(),
            "timestamp": lambda x: x.tolist()
        }).reset_index()

        df["pos_len"] = df["rating"].apply(lambda x: len([item for item in x if item >= 4]))
        df["rate_len"] = df["rating"].apply(lambda x: len([item for item in x]))

        # selected_user_id = df[(df["rate_len"] >= 20)]["user_id"].tolist()
        if(self.pref):
            print("pref")
            selected_user_id = df[(df["pos_len"] >= self.size) & (df["rate_len"] >= 10)]["user_id"].tolist()
        else:
            print("support")
            selected_user_id = df[(df["rate_len"] >= self.size + 2)]["user_id"].tolist()

        
        tmp_df = pd.merge(self.user_data, df, on="user_id", how="inner")
        tmp_df = tmp_df[tmp_df["user_id"].isin(selected_user_id)].reset_index(drop=True)
        tmp_df = tmp_df.sample(frac=1.0).reset_index(drop=True)
        
        tmp_df["user_id"] = [i for i in range(len(tmp_df))]
        
        self.user_data = tmp_df[self.user_cols].reset_index(drop=True)
        self.score_data = tmp_df.explode(["item_id", "rating", "timestamp"])[["user_id", "item_id", "rating", "timestamp"]].reset_index(drop=True)
        self.score_data = self.score_data.sort_values(by="timestamp")


dataset = movielens_1m()
# dataset.make_dataset()

print("#"*10 + "dataset created" + "#"*10)


##########dataset created##########


In [2]:
user_df, item_df, score_df = dataset.user_data, dataset.item_data, dataset.score_data

In [3]:
user_df

Unnamed: 0,user_id,gender,age,occupation_code,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [4]:
item_df

[""]

['']

In [5]:
score_df

Unnamed: 0,user_id,item_id,rating,timestamp
0,6040,776,4,956703932
1,6040,2125,4,956703954
2,6040,564,5,956703954
3,6040,1779,5,956703977
4,6040,1721,4,956703977
...,...,...,...,...
992512,4958,2140,1,1046454338
992513,4958,1261,5,1046454443
992514,4958,2954,4,1046454548
992515,4958,2362,3,1046454548


In [6]:
import datetime
import pytz

def convert_timestamp_to_kst_year(timestamp):
    # UTC 시간대 설정
    utc_time = datetime.datetime.utcfromtimestamp(timestamp)
    
    # KST 시간대 객체 생성
    kst = pytz.timezone('Asia/Seoul')
    
    # UTC 시간을 KST 시간으로 변환
    kst_time = utc_time.replace(tzinfo=pytz.utc).astimezone(kst)
    
    # 연도 반환
    return kst_time.year

# 예제 사용
timestamp = 1609459200  # 2021년 1월 1일 00:00:00 UTC 타임스탬프
kst_year = convert_timestamp_to_kst_year(1046454590)
kst_year

2003

In [7]:
score_df["year"] = score_df["timestamp"].apply(convert_timestamp_to_kst_year)
score_df

Unnamed: 0,user_id,item_id,rating,timestamp,year
0,6040,776,4,956703932,2000
1,6040,2125,4,956703954,2000
2,6040,564,5,956703954,2000
3,6040,1779,5,956703977,2000
4,6040,1721,4,956703977,2000
...,...,...,...,...,...
992512,4958,2140,1,1046454338,2003
992513,4958,1261,5,1046454443,2003
992514,4958,2954,4,1046454548,2003
992515,4958,2362,3,1046454548,2003


In [8]:
score_df["year"].max(), score_df["year"].min()

(2003, 2000)

In [9]:
item_df

Unnamed: 0,item_id,item_genres,director,actors
0,0,"Animation, Adventure, Comedy",John Lasseter,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney"
1,1,"Adventure, Family, Fantasy",Joe Johnston,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ..."
2,2,"Comedy, Romance",Howard Deutch,"Walter Matthau, Jack Lemmon, Sophia Loren, Ann..."
3,3,"Comedy, Drama, Romance",Forest Whitaker,"Whitney Houston, Angela Bassett, Loretta Devin..."
4,4,"Comedy, Family, Romance",Charles Shyer,"Steve Martin, Diane Keaton, Martin Short, Kimb..."
...,...,...,...,...
3588,3588,"Comedy, Romance",Jay Roach,"Robert De Niro, Ben Stiller, Teri Polo, Blythe..."
3589,3589,Drama,Darren Aronofsky,"Ellen Burstyn, Jared Leto, Jennifer Connelly, ..."
3590,3590,"Drama, War",Joel Schumacher,"Colin Farrell, Matthew Davis, Clifton Collins ..."
3591,3591,"Comedy, Drama, Romance",Raymond De Felitta,"Michael Rispoli, Kelly Macdonald, Kathrine Nar..."


### Bar Chart - Genre, Director, Actor

In [125]:
temp_df = pd.merge(item_df, score_df, on="item_id", how="inner")
temp_df = temp_df[temp_df["year"]==2003][["actors", "rating"]]
temp_df["actors"] = temp_df["actors"].apply(lambda x: x.split(", "))
temp_df = temp_df.explode("actors").reset_index(drop=False)
temp_df = temp_df.groupby("actors").agg({
    "rating": lambda x: x.tolist(),
    # "rating": lambda x: np.mean(x.tolist()),
}).reset_index(drop=False)
temp_df["len_rating"] = temp_df["rating"].apply(lambda x: len(x))
temp_df["rating"] = temp_df["rating"].apply(lambda x: np.mean(x))
temp_df = temp_df[temp_df["len_rating"] >= 20]

temp_df = temp_df.sort_values("rating", ascending=False).iloc[:5, :]
temp_df.rename(columns = {
    "actors": "Country",
    "rating": "Value",
}, inplace=True)

temp_df.to_csv("barchart_actors.csv", index=False)
temp_df

Unnamed: 0,Country,Value,len_rating
2600,Paul Newman,4.5,20
2204,Marlon Brando,4.148148,27
440,Carrie Fisher,4.130435,23
898,Edward Norton,4.047619,21
2850,Robert Redford,4.037037,27


In [126]:
temp_df = pd.merge(item_df, score_df, on="item_id", how="inner")
temp_df = temp_df[temp_df["year"]==2003][["director", "rating"]]
temp_df["director"] = temp_df["director"].apply(lambda x: x.split(", "))
temp_df = temp_df.explode("director").reset_index(drop=False)
temp_df = temp_df.groupby("director").agg({
    "rating": lambda x: x.tolist(),
    # "rating": lambda x: np.mean(x.tolist()),
}).reset_index(drop=False)
temp_df["len_rating"] = temp_df["rating"].apply(lambda x: len(x))
temp_df["rating"] = temp_df["rating"].apply(lambda x: np.mean(x))
temp_df = temp_df[temp_df["len_rating"] >= 20]

temp_df = temp_df.sort_values("rating", ascending=False).iloc[:5, :]
temp_df.rename(columns = {
    "director": "Country",
    "rating": "Value",
}, inplace=True)

temp_df.to_csv("barchart_director.csv", index=False)
temp_df

Unnamed: 0,Country,Value,len_rating
539,Martin Scorsese,4.130435,23
688,Ridley Scott,4.1,20
226,Francis Ford Coppola,4.090909,33
18,Alfred Hitchcock,4.064516,31
340,James Cameron,4.0,22


In [128]:
temp_df = pd.merge(item_df, score_df, on="item_id", how="inner")
temp_df = temp_df[temp_df["year"]==2003][["item_genres", "rating"]]
temp_df["genres"]= temp_df["item_genres"]
temp_df["genres"] = temp_df["genres"].apply(lambda x: x.split(", "))
temp_df = temp_df.explode("genres").reset_index(drop=False)
temp_df = temp_df.groupby("genres").agg({
    "rating": lambda x: x.tolist(),
    # "rating": lambda x: np.mean(x.tolist()),
}).reset_index(drop=False)
temp_df["len_rating"] = temp_df["rating"].apply(lambda x: len(x))
temp_df["rating"] = temp_df["rating"].apply(lambda x: np.mean(x))
temp_df = temp_df[temp_df["len_rating"] >= 20]

temp_df = temp_df.sort_values("rating", ascending=False).iloc[:5, :]
temp_df.rename(columns = {
    "genres": "Country",
    "rating": "Value",
}, inplace=True)

temp_df.to_csv("barchart_genres.csv", index=False)
temp_df

Unnamed: 0,Country,Value,len_rating
10,Film-Noir,4.391304,23
21,War,4.114943,87
6,Documentary,4.04,25
14,Musical,3.711864,59
11,History,3.704545,88


### Occupation - Genre, Director, Actor

In [76]:
from copy import deepcopy
import json

temp_df = pd.merge(user_df, score_df, on="user_id", how="inner")
temp_df = pd.merge(temp_df, item_df, on="item_id", how="inner")
# temp_df = temp_df[temp_df["year"] == 2003]

occup_actor_csv = temp_df[["user_id", "occupation_code", "item_id", "rating", "actors"]]

occup_actor_csv["actors"] = occup_actor_csv["actors"].apply(lambda x: x.split(", "))
occup_actor_csv = occup_actor_csv.explode("actors").reset_index(drop=False)


occup_code2name = ["other","academic/educator", "artist", "clerical/admin", "college/grad student", "customer service", "doctor/health care", "executive/managerial", "farmer", "homemaker", "K-12 student", "lawyer", "programmer", "retired", "sales/marketing", "scientist", "self-employed", "technician/engineer", "tradesman/craftsman", "unemployed", "writer"]
    
occup_actor_csv["occupation"] = occup_actor_csv["occupation_code"].apply(lambda x: occup_code2name[x])

    
occup_actor_csv = occup_actor_csv[["actors", "occupation", "rating"]]
occup_actor_csv = occup_actor_csv.groupby("actors").agg({
    "occupation": lambda x : x.tolist(),
    "rating": lambda x : x.tolist(),
}).reset_index(drop=False)
occup_actor_csv["rating_len"] = occup_actor_csv["rating"].apply(lambda x: len(x))
occup_actor_csv = occup_actor_csv[occup_actor_csv["rating_len"] >= 10]
occup_actor_csv["avg_rate"] = occup_actor_csv["rating"].apply(lambda x: np.mean(x))
occup_actor_csv = occup_actor_csv.sort_values("avg_rate", ascending=False).iloc[:5, :]

occup_actor_csv = occup_actor_csv.explode(["occupation", "rating"])


rating_lst, actor_lst, occup_lst = [], [], []
for actor in occup_actor_csv["actors"].unique():
    for occup in occup_code2name:
        df = occup_actor_csv[(occup_actor_csv["actors"] == actor) & (occup_actor_csv["occupation"] == occup)]
        if(len(df) == 0):
            rating_lst.append(1)
        else:
            rating_lst.append(df["rating"].mean())
        actor_lst.append(actor)
        occup_lst.append(occup)
    
df = pd.DataFrame({
    "variable":occup_lst ,
    "group":actor_lst ,
    "value": rating_lst,
})

df.to_csv("occup_actor.csv", index=False)
print(df["value"].min())

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  occup_actor_csv["actors"] = occup_actor_csv["actors"].apply(lambda x: x.split(", "))


1.0


Unnamed: 0,variable,group,value
0,other,Keiju Kobayashi,4.857143
1,academic/educator,Keiju Kobayashi,4.400000
2,artist,Keiju Kobayashi,4.333333
3,clerical/admin,Keiju Kobayashi,5.000000
4,college/grad student,Keiju Kobayashi,4.857143
...,...,...,...
100,self-employed,Anne Reid,4.375000
101,technician/engineer,Anne Reid,4.571429
102,tradesman/craftsman,Anne Reid,5.000000
103,unemployed,Anne Reid,4.625000


In [70]:
df["x"].unique()

array(['Keiju Kobayashi', 'Yukiko Shimazaki', 'Keiko Tsushima',
       'Richard S. Castellano', 'Anne Reid'], dtype=object)

In [72]:
df["y"].unique()

array(['other', 'academic/educator', 'artist', 'clerical/admin',
       'college/grad student', 'customer service', 'doctor/health care',
       'executive/managerial', 'farmer', 'homemaker', 'K-12 student',
       'lawyer', 'programmer', 'retired', 'sales/marketing', 'scientist',
       'self-employed', 'technician/engineer', 'tradesman/craftsman',
       'unemployed', 'writer'], dtype=object)

In [77]:
from copy import deepcopy
import json

temp_df = pd.merge(user_df, score_df, on="user_id", how="inner")
temp_df = pd.merge(temp_df, item_df, on="item_id", how="inner")
# temp_df = temp_df[temp_df["year"] == 2003]

occup_director_csv = temp_df[["user_id", "occupation_code", "item_id", "rating", "director"]]

occup_director_csv["director"] = occup_director_csv["director"].apply(lambda x: x.split(", "))
occup_director_csv = occup_director_csv.explode("director").reset_index(drop=False)


occup_code2name = ["other","academic/educator", "artist", "clerical/admin", "college/grad student", "customer service", "doctor/health care", "executive/managerial", "farmer", "homemaker", "K-12 student", "lawyer", "programmer", "retired", "sales/marketing", "scientist", "self-employed", "technician/engineer", "tradesman/craftsman", "unemployed", "writer"]
    
occup_director_csv["occupation"] = occup_director_csv["occupation_code"].apply(lambda x: occup_code2name[x])

    
occup_director_csv = occup_director_csv[["director", "occupation", "rating"]]
occup_director_csv = occup_director_csv.groupby("director").agg({
    "occupation": lambda x : x.tolist(),
    "rating": lambda x : x.tolist(),
}).reset_index(drop=False)
occup_director_csv["rating_len"] = occup_director_csv["rating"].apply(lambda x: len(x))
occup_director_csv = occup_director_csv[occup_director_csv["rating_len"] >= 10]
occup_director_csv["avg_rate"] = occup_director_csv["rating"].apply(lambda x: np.mean(x))
occup_director_csv = occup_director_csv.sort_values("avg_rate", ascending=False).iloc[:5, :]

occup_director_csv = occup_director_csv.explode(["occupation", "rating"])


rating_lst, director_lst, occup_lst = [], [], []
for director in occup_director_csv["director"].unique():
    for occup in occup_code2name:
        df = occup_director_csv[(occup_director_csv["director"] == director) & (occup_director_csv["occupation"] == occup)]
        if(len(df) == 0):
            rating_lst.append(1)
        else:
            rating_lst.append(df["rating"].mean())
        director_lst.append(director)
        occup_lst.append(occup)
    
df = pd.DataFrame({
    "variable":occup_lst ,
    "group":director_lst ,
    "value": rating_lst,
})

df.to_csv("occup_director.csv", index=False)
print(df["value"].min())

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  occup_director_csv["director"] = occup_director_csv["director"].apply(lambda x: x.split(", "))


1.0


Unnamed: 0,variable,group,value
0,other,Akira Kurosawa,4.422460
1,academic/educator,Akira Kurosawa,4.485714
2,artist,Akira Kurosawa,4.474747
3,clerical/admin,Akira Kurosawa,4.500000
4,college/grad student,Akira Kurosawa,4.328859
...,...,...,...
100,self-employed,Satyajit Ray,4.833333
101,technician/engineer,Satyajit Ray,1.000000
102,tradesman/craftsman,Satyajit Ray,1.000000
103,unemployed,Satyajit Ray,3.000000


In [78]:
df["group"].unique()

array(['Akira Kurosawa', 'Frank Darabont', 'M. Night Shyamalan',
       'Clyde Bruckman', 'Satyajit Ray'], dtype=object)

In [81]:
from copy import deepcopy
import json

temp_df = pd.merge(user_df, score_df, on="user_id", how="inner")
temp_df = pd.merge(temp_df, item_df, on="item_id", how="inner")
# temp_df = temp_df[temp_df["year"] == 2003]

occup_genre_csv = temp_df[["user_id", "occupation_code", "item_id", "rating", "item_genres"]]
occup_genre_csv["genre"] = occup_genre_csv["item_genres"]
occup_genre_csv["genre"] = occup_genre_csv["genre"].apply(lambda x: x.split(", "))
occup_genre_csv = occup_genre_csv.explode("genre").reset_index(drop=False)


occup_code2name = ["other","academic/educator", "artist", "clerical/admin", "college/grad student", "customer service", "doctor/health care", "executive/managerial", "farmer", "homemaker", "K-12 student", "lawyer", "programmer", "retired", "sales/marketing", "scientist", "self-employed", "technician/engineer", "tradesman/craftsman", "unemployed", "writer"]
    
occup_genre_csv["occupation"] = occup_genre_csv["occupation_code"].apply(lambda x: occup_code2name[x])

    
occup_genre_csv = occup_genre_csv[["genre", "occupation", "rating"]]
occup_genre_csv = occup_genre_csv.groupby("genre").agg({
    "occupation": lambda x : x.tolist(),
    "rating": lambda x : x.tolist(),
}).reset_index(drop=False)
occup_genre_csv["rating_len"] = occup_genre_csv["rating"].apply(lambda x: len(x))
occup_genre_csv = occup_genre_csv[occup_genre_csv["rating_len"] >= 10]
occup_genre_csv["avg_rate"] = occup_genre_csv["rating"].apply(lambda x: np.mean(x))
occup_genre_csv = occup_genre_csv.sort_values("avg_rate", ascending=False).iloc[:5, :]

occup_genre_csv = occup_genre_csv.explode(["occupation", "rating"])


rating_lst, genre_lst, occup_lst = [], [], []
for genre in occup_genre_csv["genre"].unique():
    for occup in occup_code2name:
        df = occup_genre_csv[(occup_genre_csv["genre"] == genre) & (occup_genre_csv["occupation"] == occup)]
        if(len(df) == 0):
            rating_lst.append(1)
        else:
            rating_lst.append(df["rating"].mean())
        genre_lst.append(genre)
        occup_lst.append(occup)
    
df = pd.DataFrame({
    "variable":occup_lst ,
    "group":genre_lst ,
    "value": rating_lst,
})

df.to_csv("occup_genre.csv", index=False)
print(df["value"].min())

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  occup_genre_csv["genre"] = occup_genre_csv["item_genres"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  occup_genre_csv["genre"] = occup_genre_csv["genre"].apply(lambda x: x.split(", "))


3.7142857142857144


Unnamed: 0,variable,group,value
0,other,Short,4.359133
1,academic/educator,Short,4.337607
2,artist,Short,4.443182
3,clerical/admin,Short,4.320000
4,college/grad student,Short,4.346705
...,...,...,...
100,self-employed,War,3.954604
101,technician/engineer,War,3.981692
102,tradesman/craftsman,War,3.919771
103,unemployed,War,3.753165


In [83]:
df["group"].unique()

array(['Short', 'Film-Noir', 'Documentary', 'History', 'War'],
      dtype=object)

In [54]:
occup_actor_csv["rating"].mean()

4.536266349583829

In [86]:
user_df

Unnamed: 0,user_id,gender,age,occupation_code,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


### Age - Genre, Director, Actor

In [103]:
from copy import deepcopy
import json

temp_df = pd.merge(user_df, score_df, on="user_id", how="inner")
temp_df = pd.merge(temp_df, item_df, on="item_id", how="inner")
# temp_df = temp_df[temp_df["year"] == 2003]

age_genre_csv = temp_df[["user_id", "age", "item_id", "rating", "item_genres"]]
age_genre_csv["genre"] = age_genre_csv["item_genres"]
age_genre_csv["genre"] = age_genre_csv["genre"].apply(lambda x: x.split(", "))
age_genre_csv = age_genre_csv.explode("genre").reset_index(drop=False)


    
age_convert_dict = {
	 1:  "Under 18",
	18:  "18-24",
	25:  "25-34",
	35:  "35-44",
	45:  "45-49",
	50:  "50-55",
	56:  "56+"
}

age_genre_csv["age"] = age_genre_csv["age"].apply(lambda x: age_convert_dict[x])

    
age_genre_csv = age_genre_csv[["genre", "age", "rating"]]
age_genre_csv = age_genre_csv.groupby("genre").agg({
    "age": lambda x : x.tolist(),
    "rating": lambda x : x.tolist(),
}).reset_index(drop=False)
age_genre_csv["rating_len"] = age_genre_csv["rating"].apply(lambda x: len(x))
age_genre_csv = age_genre_csv[age_genre_csv["rating_len"] >= 10]
age_genre_csv["avg_rate"] = age_genre_csv["rating"].apply(lambda x: np.mean(x))
age_genre_csv = age_genre_csv.sort_values("avg_rate", ascending=False).iloc[:5, :]

age_genre_csv = age_genre_csv.explode(["age", "rating"])


rating_lst, genre_lst, age_lst = [], [], []
for genre in age_genre_csv["genre"].unique():
    for age in age_convert_dict.values():
        df = age_genre_csv[(age_genre_csv["genre"] == genre) & (age_genre_csv["age"] == age)]
        if(len(df) == 0):
            rating_lst.append(1)
        else:
            rating_lst.append(df["rating"].mean())
        genre_lst.append(genre)
        age_lst.append(age)
    
df = pd.DataFrame({
    "variable":age_lst ,
    "group":genre_lst ,
    "value": rating_lst,
})

df.to_csv("age_genre.csv", index=False)
print(df["variable"].unique().tolist(), df["group"].unique().tolist())

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_genre_csv["genre"] = age_genre_csv["item_genres"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_genre_csv["genre"] = age_genre_csv["genre"].apply(lambda x: x.split(", "))


['Under 18', '18-24', '25-34', '35-44', '45-49', '50-55', '56+'] ['Short', 'Film-Noir', 'Documentary', 'History', 'War']


Unnamed: 0,variable,group,value
0,Under 18,Short,4.150794
1,18-24,Short,4.340681
2,25-34,Short,4.403197
3,35-44,Short,4.32392
4,45-49,Short,4.430233
5,50-55,Short,4.476821
6,56+,Short,4.405797
7,Under 18,Film-Noir,4.241667
8,18-24,Film-Noir,4.108987
9,25-34,Film-Noir,4.265928


In [104]:
from copy import deepcopy
import json

temp_df = pd.merge(user_df, score_df, on="user_id", how="inner")
temp_df = pd.merge(temp_df, item_df, on="item_id", how="inner")
# temp_df = temp_df[temp_df["year"] == 2003]

age_director_csv = temp_df[["user_id", "age", "item_id", "rating", "director"]]
age_director_csv["director"] = age_director_csv["director"]
age_director_csv["director"] = age_director_csv["director"].apply(lambda x: x.split(", "))
age_director_csv = age_director_csv.explode("director").reset_index(drop=False)

    
age_convert_dict = {
	 1:  "Under 18",
	18:  "18-24",
	25:  "25-34",
	35:  "35-44",
	45:  "45-49",
	50:  "50-55",
	56:  "56+"
}

age_director_csv["age"] = age_director_csv["age"].apply(lambda x: age_convert_dict[x])

    
age_director_csv = age_director_csv[["director", "age", "rating"]]
age_director_csv = age_director_csv.groupby("director").agg({
    "age": lambda x : x.tolist(),
    "rating": lambda x : x.tolist(),
}).reset_index(drop=False)
age_director_csv["rating_len"] = age_director_csv["rating"].apply(lambda x: len(x))
age_director_csv = age_director_csv[age_director_csv["rating_len"] >= 10]
age_director_csv["avg_rate"] = age_director_csv["rating"].apply(lambda x: np.mean(x))
age_director_csv = age_director_csv.sort_values("avg_rate", ascending=False).iloc[:5, :]

age_director_csv = age_director_csv.explode(["age", "rating"])


rating_lst, director_lst, age_lst = [], [], []
for director in age_director_csv["director"].unique():
    for age in age_convert_dict.values():
        df = age_director_csv[(age_director_csv["director"] == director) & (age_director_csv["age"] == age)]
        if(len(df) == 0):
            rating_lst.append(1)
        else:
            rating_lst.append(df["rating"].mean())
        director_lst.append(director)
        age_lst.append(age)
    
df = pd.DataFrame({
    "variable":age_lst ,
    "group":director_lst ,
    "value": rating_lst,
})

df.to_csv("age_director.csv", index=False)
print(df["variable"].unique().tolist(), df["group"].unique().tolist())

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_director_csv["director"] = age_director_csv["director"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_director_csv["director"] = age_director_csv["director"].apply(lambda x: x.split(", "))


['Under 18', '18-24', '25-34', '35-44', '45-49', '50-55', '56+'] ['Akira Kurosawa', 'Frank Darabont', 'M. Night Shyamalan', 'Clyde Bruckman', 'Satyajit Ray']


Unnamed: 0,variable,group,value
0,Under 18,Akira Kurosawa,4.36
1,18-24,Akira Kurosawa,4.473118
2,25-34,Akira Kurosawa,4.428279
3,35-44,Akira Kurosawa,4.382586
4,45-49,Akira Kurosawa,4.417647
5,50-55,Akira Kurosawa,4.50289
6,56+,Akira Kurosawa,4.469136
7,Under 18,Frank Darabont,4.411765
8,18-24,Frank Darabont,4.490305
9,25-34,Frank Darabont,4.40284


In [105]:
from copy import deepcopy
import json

temp_df = pd.merge(user_df, score_df, on="user_id", how="inner")
temp_df = pd.merge(temp_df, item_df, on="item_id", how="inner")
# temp_df = temp_df[temp_df["year"] == 2003]

age_actors_csv = temp_df[["user_id", "age", "item_id", "rating", "actors"]]
age_actors_csv["actors"] = age_actors_csv["actors"]
age_actors_csv["actors"] = age_actors_csv["actors"].apply(lambda x: x.split(", "))
age_actors_csv = age_actors_csv.explode("actors").reset_index(drop=False)

    
age_convert_dict = {
	 1:  "Under 18",
	18:  "18-24",
	25:  "25-34",
	35:  "35-44",
	45:  "45-49",
	50:  "50-55",
	56:  "56+"
}

age_actors_csv["age"] = age_actors_csv["age"].apply(lambda x: age_convert_dict[x])

    
age_actors_csv = age_actors_csv[["actors", "age", "rating"]]
age_actors_csv = age_actors_csv.groupby("actors").agg({
    "age": lambda x : x.tolist(),
    "rating": lambda x : x.tolist(),
}).reset_index(drop=False)
age_actors_csv["rating_len"] = age_actors_csv["rating"].apply(lambda x: len(x))
age_actors_csv = age_actors_csv[age_actors_csv["rating_len"] >= 10]
age_actors_csv["avg_rate"] = age_actors_csv["rating"].apply(lambda x: np.mean(x))
age_actors_csv = age_actors_csv.sort_values("avg_rate", ascending=False).iloc[:5, :]

age_actors_csv = age_actors_csv.explode(["age", "rating"])


rating_lst, actors_lst, age_lst = [], [], []
for actors in age_actors_csv["actors"].unique():
    for age in age_convert_dict.values():
        df = age_actors_csv[(age_actors_csv["actors"] == actors) & (age_actors_csv["age"] == age)]
        if(len(df) == 0):
            rating_lst.append(1)
        else:
            rating_lst.append(df["rating"].mean())
        actors_lst.append(actors)
        age_lst.append(age)
    
df = pd.DataFrame({
    "variable":age_lst ,
    "group":actors_lst ,
    "value": rating_lst,
})

df.to_csv("age_actors.csv", index=False)
print(df["variable"].unique().tolist(), df["group"].unique().tolist())

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_actors_csv["actors"] = age_actors_csv["actors"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_actors_csv["actors"] = age_actors_csv["actors"].apply(lambda x: x.split(", "))


['Under 18', '18-24', '25-34', '35-44', '45-49', '50-55', '56+'] ['Keiju Kobayashi', 'Yukiko Shimazaki', 'Keiko Tsushima', 'Richard S. Castellano', 'Anne Reid']


Unnamed: 0,variable,group,value
0,Under 18,Keiju Kobayashi,5.0
1,18-24,Keiju Kobayashi,5.0
2,25-34,Keiju Kobayashi,4.615385
3,35-44,Keiju Kobayashi,4.3125
4,45-49,Keiju Kobayashi,4.333333
5,50-55,Keiju Kobayashi,4.777778
6,56+,Keiju Kobayashi,5.0
7,Under 18,Yukiko Shimazaki,4.5
8,18-24,Yukiko Shimazaki,4.604938
9,25-34,Yukiko Shimazaki,4.549505


### Gender - Genre, Director, Actor

In [106]:
from copy import deepcopy
import json

temp_df = pd.merge(user_df, score_df, on="user_id", how="inner")
temp_df = pd.merge(temp_df, item_df, on="item_id", how="inner")
# temp_df = temp_df[temp_df["year"] == 2003]

gender_genre_csv = temp_df[["user_id", "gender", "item_id", "rating", "item_genres"]]
gender_genre_csv["genre"] = gender_genre_csv["item_genres"]
gender_genre_csv["genre"] = gender_genre_csv["genre"].apply(lambda x: x.split(", "))
gender_genre_csv = gender_genre_csv.explode("genre").reset_index(drop=False)


    
gender_convert_dict = {
    "F": "Female",
    "M": "Male",
}

gender_genre_csv["gender"] = gender_genre_csv["gender"].apply(lambda x: gender_convert_dict[x])

    
gender_genre_csv = gender_genre_csv[["genre", "gender", "rating"]]
gender_genre_csv = gender_genre_csv.groupby("genre").agg({
    "gender": lambda x : x.tolist(),
    "rating": lambda x : x.tolist(),
}).reset_index(drop=False)
gender_genre_csv["rating_len"] = gender_genre_csv["rating"].apply(lambda x: len(x))
gender_genre_csv = gender_genre_csv[gender_genre_csv["rating_len"] >= 10]
gender_genre_csv["avg_rate"] = gender_genre_csv["rating"].apply(lambda x: np.mean(x))
gender_genre_csv = gender_genre_csv.sort_values("avg_rate", ascending=False).iloc[:5, :]

gender_genre_csv = gender_genre_csv.explode(["gender", "rating"])


rating_lst, genre_lst, gender_lst = [], [], []
for genre in gender_genre_csv["genre"].unique():
    for gender in gender_convert_dict.values():
        df = gender_genre_csv[(gender_genre_csv["genre"] == genre) & (gender_genre_csv["gender"] == gender)]
        if(len(df) == 0):
            rating_lst.append(1)
        else:
            rating_lst.append(df["rating"].mean())
        genre_lst.append(genre)
        gender_lst.append(gender)
    
df = pd.DataFrame({
    "variable":gender_lst ,
    "group":genre_lst ,
    "value": rating_lst,
})

df.to_csv("gender_genre.csv", index=False)
print(df["variable"].unique().tolist(), df["group"].unique().tolist())

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gender_genre_csv["genre"] = gender_genre_csv["item_genres"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gender_genre_csv["genre"] = gender_genre_csv["genre"].apply(lambda x: x.split(", "))


['Female', 'Male'] ['Short', 'Film-Noir', 'Documentary', 'History', 'War']


Unnamed: 0,variable,group,value
0,Female,Short,4.461538
1,Male,Short,4.333501
2,Female,Film-Noir,4.202753
3,Male,Film-Noir,4.215889
4,Female,Documentary,4.048972
5,Male,Documentary,3.990557
6,Female,History,4.016496
7,Male,History,3.966391
8,Female,War,3.874227
9,Male,War,3.952407


In [107]:
from copy import deepcopy
import json

temp_df = pd.merge(user_df, score_df, on="user_id", how="inner")
temp_df = pd.merge(temp_df, item_df, on="item_id", how="inner")
# temp_df = temp_df[temp_df["year"] == 2003]

gender_director_csv = temp_df[["user_id", "gender", "item_id", "rating", "director"]]
gender_director_csv["director"] = gender_director_csv["director"]
gender_director_csv["director"] = gender_director_csv["director"].apply(lambda x: x.split(", "))
gender_director_csv = gender_director_csv.explode("director").reset_index(drop=False)

    
gender_convert_dict = {
    "F": "Female",
    "M": "Male",
}

gender_director_csv["gender"] = gender_director_csv["gender"].apply(lambda x: gender_convert_dict[x])

    
gender_director_csv = gender_director_csv[["director", "gender", "rating"]]
gender_director_csv = gender_director_csv.groupby("director").agg({
    "gender": lambda x : x.tolist(),
    "rating": lambda x : x.tolist(),
}).reset_index(drop=False)
gender_director_csv["rating_len"] = gender_director_csv["rating"].apply(lambda x: len(x))
gender_director_csv = gender_director_csv[gender_director_csv["rating_len"] >= 10]
gender_director_csv["avg_rate"] = gender_director_csv["rating"].apply(lambda x: np.mean(x))
gender_director_csv = gender_director_csv.sort_values("avg_rate", ascending=False).iloc[:5, :]

gender_director_csv = gender_director_csv.explode(["gender", "rating"])


rating_lst, director_lst, gender_lst = [], [], []
for director in gender_director_csv["director"].unique():
    for gender in gender_convert_dict.values():
        df = gender_director_csv[(gender_director_csv["director"] == director) & (gender_director_csv["gender"] == gender)]
        if(len(df) == 0):
            rating_lst.append(1)
        else:
            rating_lst.append(df["rating"].mean())
        director_lst.append(director)
        gender_lst.append(gender)
    
df = pd.DataFrame({
    "variable":gender_lst ,
    "group":director_lst ,
    "value": rating_lst,
})

df.to_csv("gender_director.csv", index=False)
print(df["variable"].unique().tolist(), df["group"].unique().tolist())

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gender_director_csv["director"] = gender_director_csv["director"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gender_director_csv["director"] = gender_director_csv["director"].apply(lambda x: x.split(", "))


['Female', 'Male'] ['Akira Kurosawa', 'Frank Darabont', 'M. Night Shyamalan', 'Clyde Bruckman', 'Satyajit Ray']


Unnamed: 0,variable,group,value
0,Female,Akira Kurosawa,4.41841
1,Male,Akira Kurosawa,4.433096
2,Female,Frank Darabont,4.419672
3,Male,Frank Darabont,4.410418
4,Female,M. Night Shyamalan,4.461087
5,Male,M. Night Shyamalan,4.363337
6,Female,Clyde Bruckman,4.575758
7,Male,Clyde Bruckman,4.32948
8,Female,Satyajit Ray,4.714286
9,Male,Satyajit Ray,4.190476


In [108]:
from copy import deepcopy
import json

temp_df = pd.merge(user_df, score_df, on="user_id", how="inner")
temp_df = pd.merge(temp_df, item_df, on="item_id", how="inner")
# temp_df = temp_df[temp_df["year"] == 2003]

gender_actors_csv = temp_df[["user_id", "gender", "item_id", "rating", "actors"]]
gender_actors_csv["actors"] = gender_actors_csv["actors"]
gender_actors_csv["actors"] = gender_actors_csv["actors"].apply(lambda x: x.split(", "))
gender_actors_csv = gender_actors_csv.explode("actors").reset_index(drop=False)

    
gender_convert_dict = {
    "F": "Female",
    "M": "Male",
}

gender_actors_csv["gender"] = gender_actors_csv["gender"].apply(lambda x: gender_convert_dict[x])

    
gender_actors_csv = gender_actors_csv[["actors", "gender", "rating"]]
gender_actors_csv = gender_actors_csv.groupby("actors").agg({
    "gender": lambda x : x.tolist(),
    "rating": lambda x : x.tolist(),
}).reset_index(drop=False)
gender_actors_csv["rating_len"] = gender_actors_csv["rating"].apply(lambda x: len(x))
gender_actors_csv = gender_actors_csv[gender_actors_csv["rating_len"] >= 10]
gender_actors_csv["avg_rate"] = gender_actors_csv["rating"].apply(lambda x: np.mean(x))
gender_actors_csv = gender_actors_csv.sort_values("avg_rate", ascending=False).iloc[:5, :]

gender_actors_csv = gender_actors_csv.explode(["gender", "rating"])


rating_lst, actors_lst, gender_lst = [], [], []
for actors in gender_actors_csv["actors"].unique():
    for gender in gender_convert_dict.values():
        df = gender_actors_csv[(gender_actors_csv["actors"] == actors) & (gender_actors_csv["gender"] == gender)]
        if(len(df) == 0):
            rating_lst.append(1)
        else:
            rating_lst.append(df["rating"].mean())
        actors_lst.append(actors)
        gender_lst.append(gender)
    
df = pd.DataFrame({
    "variable":gender_lst ,
    "group":actors_lst ,
    "value": rating_lst,
})

df.to_csv("gender_actors.csv", index=False)
print(df["variable"].unique().tolist(), df["group"].unique().tolist())

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gender_actors_csv["actors"] = gender_actors_csv["actors"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gender_actors_csv["actors"] = gender_actors_csv["actors"].apply(lambda x: x.split(", "))


['Female', 'Male'] ['Keiju Kobayashi', 'Yukiko Shimazaki', 'Keiko Tsushima', 'Richard S. Castellano', 'Anne Reid']


Unnamed: 0,variable,group,value
0,Female,Keiju Kobayashi,4.375
1,Male,Keiju Kobayashi,4.639344
2,Female,Yukiko Shimazaki,4.481132
3,Male,Yukiko Shimazaki,4.576628
4,Female,Keiko Tsushima,4.481132
5,Male,Keiko Tsushima,4.576628
6,Female,Richard S. Castellano,4.3147
7,Male,Richard S. Castellano,4.583333
8,Female,Anne Reid,4.644444
9,Male,Anne Reid,4.473795


### Time Series

In [25]:
from copy import deepcopy

occup_genre_dict = {}
occup_director_dict = {}
occup_actor_dict = {}

for occup_id in user_df["occupation_code"].unique():
    temp_user_df = deepcopy(user_df[user_df["occupation_code"] == occup_id])
    temp_df = pd.merge(temp_user_df, score_df, on="user_id", how="inner")
    temp_df = pd.merge(temp_df, item_df, on="item_id", how="inner")
    
    occup_genre_csv = temp_df[["user_id", "occupation_code", "item_id", "rating", "item_genres"]]
    occup_director_csv = temp_df[["user_id", "occupation_code", "item_id", "rating", "director"]]
    occup_actor_csv = temp_df[["user_id", "occupation_code", "item_id", "rating", "actors"]]
    
    occup_actor_csv["actors"] = occup_actor_csv["actors"].apply(lambda x: x.split(", "))
    occup_actor_csv = occup_actor_csv.explode("actors")
    # occup_actor_csv = occup_actor_csv.explode("actors").agg({
    #     "user_id": lambda x: x.tolist(),
    #     "occupation_code": lambda x: x.tolist(),
    #     "item_id": lambda x: x.tolist(),
    #     "rating": lambda x: x.tolist(),
    # }).reset_index(drop=False)
    # for idx, row in occup_actor_csv.iterrows():
        
    break
occup_actor_csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  occup_actor_csv["actors"] = occup_actor_csv["actors"].apply(lambda x: x.split(", "))


Unnamed: 0,user_id,occupation_code,item_id,rating,actors
0,1,10,2884,4,Winona Ryder
0,1,10,2884,4,Angelina Jolie
0,1,10,2884,4,Clea DuVall
0,1,10,2884,4,Brittany Murphy
1,99,10,2884,3,Winona Ryder
...,...,...,...,...,...
23162,5973,10,598,2,Tom Gilroy
23163,5973,10,1537,5,Howie Long
23163,5973,10,1537,5,Scott Glenn
23163,5973,10,1537,5,William Forsythe


In [10]:
a = "asdf. asdg. gf."
a.replace(".", " ")

'asdf  asdg  gf '

In [11]:
from copy import deepcopy

actor_dict = {}


for year in score_df["year"].unique().tolist():
    if(not actor_dict.get(year)):
        actor_dict[year] = {}

    tmp_df = deepcopy(score_df)
    tmp_df = tmp_df[tmp_df["year"] == year].reset_index(drop=True)
    tmp_df = pd.merge(tmp_df, item_df, on="item_id", how="inner")
    
    avg_len = {}
    for idx, row in tmp_df.iterrows():
        actors = row["actors"]
        actors = actors.split(", ")
        
        for actor in actors:
            actor = actor.replace(".", " ")
            if(not actor_dict[year].get(actor)):
                actor_dict[year][actor] = 0
                avg_len[actor] = 0
            
            actor_dict[year][actor] += row["rating"]
            avg_len[actor] += 1
            
    for k in avg_len.keys():
        actor_dict[year][k] /= avg_len[k]

for k, v in actor_dict.items():
    v["year"] = k

actor_lst = [actor_dict[2000], actor_dict[2001], actor_dict[2002], actor_dict[2003]]

actor_lst

[{'Marlon Brando': 4.036559139784946,
  'Al Pacino': 3.9719991971095947,
  'James Caan': 3.817672930827669,
  'Richard S  Castellano': 4.514990328820116,
  'Magda Szubanski': 3.288135593220339,
  'James Cromwell': 3.871498172959805,
  'Mary Stein': 3.288135593220339,
  'Mickey Rooney': 3.270087124878993,
  'Jodie Foster': 3.9165890895550177,
  'Lawrence A  Bonney': 4.35679012345679,
  'Kasi Lemmons': 4.217741935483871,
  'Lawrence T  Wrentz': 4.35679012345679,
  'Toshirô Mifune': 4.512309495896835,
  'Takashi Shimura': 4.151162790697675,
  'Keiko Tsushima': 4.572679509632224,
  'Yukiko Shimazaki': 4.572679509632224,
  'Dustin Hoffman': 3.7058119430775367,
  'Tom Cruise': 3.5652911249293386,
  'Valeria Golino': 3.641597510373444,
  'Gerald R  Molen': 4.0505709624796085,
  'Kevin Costner': 3.5345050400620317,
  'Gene Hackman': 3.6152373022481266,
  'Sean Young': 3.8097002357696192,
  'Will Patton': 3.3726256983240224,
  'Sally Field': 3.757822123675782,
  'Lindsay Crouse': 3.228796844181

In [12]:

from copy import deepcopy

genre_dict = {}


for year in score_df["year"].unique().tolist():
    if(not genre_dict.get(year)):
        genre_dict[year] = {}

    tmp_df = deepcopy(score_df)
    tmp_df = tmp_df[tmp_df["year"] == year].reset_index(drop=True)
    tmp_df = pd.merge(tmp_df, item_df, on="item_id", how="inner")
    
    avg_len = {}
    for idx, row in tmp_df.iterrows():
        genres = row["item_genres"]
        genres = genres.split(", ")
        
        for genre in genres:
            
            genre = genre.replace(".", " ")
            if(not genre_dict[year].get(genre)):
                genre_dict[year][genre] = 0
                avg_len[genre] = 0
            
            genre_dict[year][genre] += row["rating"]
            avg_len[genre] += 1
            
    for k in avg_len.keys():
        genre_dict[year][k] /= avg_len[k]

for k, v in genre_dict.items():
    v["year"] = k

genre_lst = [genre_dict[2000], genre_dict[2001], genre_dict[2002], genre_dict[2003]]

genre_lst

[{'Crime': 3.6255160236494177,
  'Drama': 3.7250287996655236,
  'Adventure': 3.5499026149336625,
  'Comedy': 3.494717975812737,
  'Thriller': 3.5786360700470996,
  'Action': 3.399057429343867,
  'War': 3.9473393554494893,
  'Romance': 3.5707533375499017,
  'Family': 3.43328788319826,
  'Fantasy': 3.5523847107903936,
  'Biography': 3.890113050596226,
  'History': 3.986285590871459,
  'Mystery': 3.6239819554559083,
  'Documentary': 4.016116035455278,
  'Sport': 3.483051368809597,
  'Music': 3.543296154024159,
  'Sci-Fi': 3.4305350964338324,
  'Film-Noir': 4.224917913170375,
  'Western': 3.753885074087459,
  'Musical': 3.660860875151688,
  'Animation': 3.695051520162829,
  'Short': 4.382152861144458,
  'Horror': 3.1268472728845573,
  'year': 2000},
 {'Comedy': 3.419481032193231,
  'Action': 3.2775566442131048,
  'Drama': 3.658444653844022,
  'Romance': 3.481997583568264,
  'Thriller': 3.4865962632006497,
  'Crime': 3.5399902264212413,
  'Horror': 3.0394901796907647,
  'Sci-Fi': 3.30595789

In [13]:

from copy import deepcopy

director_dict = {}


for year in score_df["year"].unique().tolist():
    if(not director_dict.get(year)):
        director_dict[year] = {}

    tmp_df = deepcopy(score_df)
    tmp_df = tmp_df[tmp_df["year"] == year].reset_index(drop=True)
    tmp_df = pd.merge(tmp_df, item_df, on="item_id", how="inner")
    
    avg_len = {}
    for idx, row in tmp_df.iterrows():
        directors = row["director"]
        directors = directors.split(", ")
        
        for director in directors:
            director = director.replace(".", " ")
            if(not director_dict[year].get(director)):
                director_dict[year][director] = 0
                avg_len[director] = 0
            
            director_dict[year][director] += row["rating"]
            avg_len[director] += 1
            
    for k in avg_len.keys():
        director_dict[year][k] /= avg_len[k]

for k, v in director_dict.items():
    v["year"] = k

director_lst = [director_dict[2000], director_dict[2001], director_dict[2002], director_dict[2003]]

director_lst

[{'Francis Ford Coppola': 4.057934508816121,
  'George Miller': 3.4418116376724655,
  'Jonathan Demme': 4.122204037097654,
  'Akira Kurosawa': 4.438325991189427,
  'Barry Levinson': 3.5727397888939882,
  'Roger Donaldson': 3.0897338403041825,
  'Robert Benton': 3.629418472063854,
  'Nicolas Roeg': 3.9224137931034484,
  'Lina Wertmüller': 3.760233918128655,
  'Nikita Mikhalkov': 4.107142857142857,
  'Alain Berliner': 3.7170731707317075,
  'Michael Curtiz': 4.232876712328767,
  'Victor Fleming': 4.13993423456339,
  'George Cukor': 4.160966276010524,
  'Mervyn LeRoy': 4.181323662737987,
  'Norman Taurog': 4.125642490005711,
  'King Vidor': 4.216896345116837,
  'Samira Makhmalbaf': 4.666666666666667,
  'Steven Spielberg': 4.0265437695174775,
  'Roman Polanski': 3.9610330765745356,
  'Friðrik Þór Friðriksson': 4.1875,
  'Frank Darabont': 4.424756519007226,
  'Yimou Zhang': 4.09453781512605,
  'Freida Lee Mock': 4.135593220338983,
  'John Madden': 4.1042424242424245,
  'Joel Coen': 4.0309370

In [19]:
import json
with open("year_actor.json", "w") as json_file:
    json.dump(actor_lst, json_file)

In [20]:
import json
with open("year_director.json", "w") as json_file:
    json.dump(director_lst, json_file)

In [21]:
import json
with open("year_genre.json", "w") as json_file:
    json.dump(genre_lst, json_file)

In [22]:
director_keys = {}
for ddick in director_lst:
    for key in ddick.keys():
        if(not director_keys.get(key)):
            director_keys[key] = 1

genre_keys = {}
for ddick in genre_lst:
    for key in ddick.keys():
        if(not genre_keys.get(key)):
            genre_keys[key] = 1

actor_keys = {}
for ddick in actor_lst:
    for key in ddick.keys():
        if(not actor_keys.get(key)):
            actor_keys[key] = 1

len(actor_keys)

7191

In [23]:
import json
with open("year_director_keys.json", "w") as json_file:
    json.dump(list(director_keys.keys()), json_file)
    
import json
with open("year_actor_keys.json", "w") as json_file:
    json.dump(list(actor_keys.keys()), json_file)
    
import json
with open("year_genre_keys.json", "w") as json_file:
    json.dump(list(genre_keys.keys()), json_file)