# BANA 212 - Final Project
### Anime Recommendations using Machine Learning (based on user ratings from MyAnimeList.com)



# DATA PRE-PROCESSING (Part 1)

In [None]:
pip install requests

In [1]:
import requests
import pandas as pd
import time

In [2]:
import warnings
# Ignore every FutureWarning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by libraries such as pandas.
warnings.simplefilter(action='ignore', category=FutureWarning)

# 1.1 Pull list of all Animes on the site (updated 10/25)

*   There were missing values for some of the items we pulled - added a conditional that fills the column with -1 when a value is missing
* changed dataframe "studio" to "studios" for continuity



In [None]:
import os

# MyAnimeList API client id, sent as the X-MAL-CLIENT-ID request header.
# Supply it through the MAL_CLIENT_ID environment variable where possible.
# NOTE(review): the literal fallback is a credential committed to the notebook;
# rotate it and drop the default once the environment variable is in use.
CLIENT_ID = os.environ.get('MAL_CLIENT_ID', '018269e286b49a7ca078f74376d45377')

# Raw JSON payloads returned by the API, one entry per anime
all_anime_list = []
# Flattened table holding the fields kept for each anime
df_AAL = pd.DataFrame(columns=['anime_id', 'title', 'mean', 'genres', 'studios', 'synopsis', 'media_type', 'num_episodes'])

In [None]:
# Bounds of the anime-id batch to request; update this per run (ex. 10400 – 20800).
# The ids are walked with range(start, limit), so the limit itself is excluded.
anime_id_start, anime_id_limit = 0, 10000

In [None]:
# Fetch one anime per id from the MyAnimeList API and flatten the fields we keep.
# Rows are buffered in a list and added to df_AAL once, in the finally clause, so
# everything fetched so far is kept even if a request fails part-way through.
anime_rows = []

try:
  for anime_id in range(anime_id_start,anime_id_limit):
    url = f'https://api.myanimelist.net/v2/anime/{anime_id}?fields=id,title,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,media_type,genres,num_episodes,studios'

    # timeout keeps a stalled connection from hanging the whole run
    response = requests.get(url, headers = {
        'X-MAL-CLIENT-ID': CLIENT_ID
        }, timeout=30)

    if response.status_code == 404: #if anime id doesn't exist, skip
      print("404 anime id not found")
      continue

    # Any other HTTP error aborts the run (collected rows are still saved below)
    response.raise_for_status()
    anime_entry = response.json()
    response.close()

    # append, not extend: extending a list with a dict adds the dict's keys
    # rather than the entry itself
    all_anime_list.append(anime_entry)

    # Fields missing from the response are filled with -1
    anime_rows.append({
        'anime_id': anime_entry.get('id', -1),
        'title': anime_entry.get('title', -1),
        'mean': anime_entry.get('mean', -1),
        'genres': [genre['name'] for genre in anime_entry['genres']] if 'genres' in anime_entry else -1,
        'studios': [studios['name'] for studios in anime_entry['studios']] if 'studios' in anime_entry else -1,
        'synopsis': anime_entry.get('synopsis', -1),
        'media_type': anime_entry.get('media_type', -1),
        'num_episodes': anime_entry.get('num_episodes', -1)
    })

    # One compact progress line instead of dumping the whole JSON payload
    print("Fetched anime id:", anime_entry.get('id', anime_id))

    # Pause between requests to stay gentle on the API
    time.sleep(3)
finally:
  # Single concatenation: DataFrame.append was removed in pandas 2.0 and copied
  # the whole frame on every call
  if anime_rows:
    df_AAL = pd.concat([df_AAL, pd.DataFrame(anime_rows)])

In [None]:
# Plain-text dump of the collected anime table for a quick sanity check
print(df_AAL)

In [None]:
# Export the collected anime table to CSV, leaving the DataFrame index out of the file
df_AAL.to_csv('all_anime_list.csv', index=False)

# 1.2 MyAnimeList Username Web Scraper

Before fetching a user's ratings, we first need a list of users!

To create a list of users we will scrape the MyAnimeList webpage using Python and store the names in a dataframe & CSV.

In [None]:
# Initialize an empty DataFrame with a single "username" column; the scraped
# usernames are added to it after the scraping cell finishes
user_list = pd.DataFrame(columns=["username"])

In [None]:
from bs4 import BeautifulSoup

# The URL to fetch; it is requested repeatedly until enough usernames are gathered
url = "https://myanimelist.net/users.php?lucky=1"

# Minimum number of unique usernames you want to collect
minimum_user_count = 50000

# Initialize a set to store unique usernames
unique_usernames = set()

while len(unique_usernames) < minimum_user_count:
    # Send an HTTP GET request to the URL; the timeout stops a stalled
    # connection from blocking the loop forever
    response = requests.get(url, timeout=30)

    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract every link whose href points at a profile page
        usernames = soup.find_all('a', href=lambda href: href and '/profile/' in href)

        # Keep the link text as the username. Links with no text (presumably
        # image-only profile links - verify against the page) are skipped so
        # that blank names never reach the CSV.
        stripped_names = (name.text.strip() for name in usernames)
        unique_usernames.update(text for text in stripped_names if text)

        print(f"Collected {len(unique_usernames)} unique usernames")

    else:
        print(f"Failed to fetch the page. Status Code: {response.status_code}")

    # Add a 5-second delay before making the next request
    time.sleep(5)

# At this point at least minimum_user_count unique usernames have been collected.
# Show a small sample rather than printing tens of thousands of lines.
print(f"Finished with {len(unique_usernames)} unique usernames. Sample:")
for username in sorted(unique_usernames)[:20]:
    print(username)

In [None]:
# Add the scraped usernames to the table. pd.concat replaces DataFrame.append,
# which was deprecated in pandas 1.4 and removed in 2.0.
user_list = pd.concat([user_list, pd.DataFrame({"username": list(unique_usernames)})])

In [None]:
# Export the username table to CSV, leaving the DataFrame index out of the file
user_list.to_csv('user_list_11423.csv', index=False)

## 1.3 MyAnimeList API Implementation

Now that we have a list of users, we can use the MyAnimeList API to fetch each users' anime ratings.
- Due to the high volume of data, we chunked every 1,000,000 rows to a CSV

In [None]:
import os

# MyAnimeList API client id, sent as the X-MAL-CLIENT-ID request header.
# Supply it through the MAL_CLIENT_ID environment variable where possible.
# NOTE(review): the literal fallback is a credential committed to the notebook;
# rotate it and drop the default once the environment variable is in use.
CLIENT_ID = os.environ.get('MAL_CLIENT_ID', '018269e286b49a7ca078f74376d45377')

# Raw list entries returned by the API for the current chunk
all_user_ratings = []
# Flattened table of one row per (user, anime) list entry
df_user_ratings = pd.DataFrame(columns=['user_id', 'anime_id', 'title', 'user_status', 'user_score', 'user_eps_watched', 'user_rewatch', 'updated_at'])

In [None]:
# Counters used by the collection loop: list_counter numbers the exported chunk
# files, api_counter counts the users whose lists were added
list_counter = 1
api_counter = 0

# Load the sampled usernames whose anime lists will be requested
user_list_cleaned = pd.read_csv('cleaned/user_list_cleaned_random_10000_2.csv')

In [None]:
# Fetch each user's anime list and flatten it to one row per list entry.
# Per-user frames are buffered and concatenated once per chunk (and once more in
# the finally clause) so that df_user_ratings always ends up holding whatever
# was collected, even when a request fails part-way through.
limit = 1000
rating_columns = ['user_id', 'anime_id', 'title', 'user_status', 'user_score', 'user_eps_watched', 'user_rewatch', 'updated_at']
pending_frames = [df_user_ratings]
pending_rows = df_user_ratings.shape[0]

try:
  for user_name in user_list_cleaned['username']:
    # The stray ']' that used to follow the limit value has been removed
    # NOTE(review): only the first `limit` entries are requested; longer lists are truncated
    url = f'https://api.myanimelist.net/v2/users/{user_name}/animelist?fields=id,title,genres,synopsis,list_status&limit={limit}&nsfw=true'

    # timeout keeps a stalled connection from hanging the whole run
    response = requests.get(url, headers = {
        'X-MAL-CLIENT-ID': CLIENT_ID
        }, timeout=30)

    if response.status_code == 403: #check if list is private, skip
      print("403 user skipped")
      continue
    if response.status_code == 404: #if id doesn't exist, skip
      print("404 user skipped")
      continue

    response.raise_for_status()
    user_ratings = response.json()
    response.close()

    all_user_ratings.extend(user_ratings['data'])

    # Build rows from THIS user's entries only. Iterating the cumulative
    # all_user_ratings list re-emitted every earlier user's entries under the
    # current username, duplicating and mis-attributing ratings.
    user_data = [{
        'user_id': user_name,
        'anime_id': entry['node']['id'],
        'title': entry['node']['title'],
        'user_status': entry['list_status'].get('status', '-1'),
        'user_score': entry['list_status'].get('score', '-1'),
        'user_eps_watched': entry['list_status'].get('num_episodes_watched', '-1'),
        'user_rewatch': entry['list_status'].get('is_rewatching', '-1'),
        'updated_at': entry['list_status'].get('updated_at','-1')
    } for entry in user_ratings['data']]

    # THIS IS FOR CHUNKING DATA
    if pending_rows > 1000000: #if number of rows exceed 1,000,000 (for excel) export to csv
      pd.concat(pending_frames).to_csv(f'all_user_ratings_2_{list_counter}.csv', index=False)
      list_counter += 1
      # Start a fresh chunk
      all_user_ratings = []
      pending_frames = [pd.DataFrame(columns=rating_columns)]
      pending_rows = 0

    pending_frames.append(pd.DataFrame(user_data))
    pending_rows += len(user_data)
    print("Appended:",api_counter, "users")
    api_counter += 1

    # Pause between requests to stay gentle on the API
    time.sleep(3)
finally:
  # pd.concat replaces DataFrame.append (removed in pandas 2.0)
  df_user_ratings = pd.concat(pending_frames)

In [None]:
# Export the DataFrame to a CSV
# NOTE(review): the chunk number in this file name is hard-coded rather than taken
# from list_counter - confirm it cannot collide with a chunk written by the loop.
df_user_ratings.to_csv('all_user_ratings_2_124.csv', index=False)

# DATA CLEANING (Part 2)

## 2.1 Cleaning All Pulled Anime List
- Cleaned formatting of columns
- Added '_' for readability
- Created a dummy variable version

In [None]:
import pandas as pd
import os

folder_path = 'appended'

# Read every CSV in the folder (sorted so the row order is reproducible) and
# concatenate once. DataFrame.append was removed in pandas 2.0 and copied the
# accumulated frame on every call.
frames = [
    pd.read_csv(os.path.join(folder_path, filename))
    for filename in sorted(os.listdir(folder_path))
    if filename.endswith('.csv')
]

# An empty folder still yields an empty DataFrame, as before
combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Order the combined rows by anime_id, then display the frame
combined_data = combined_data.sort_values(by='anime_id')
combined_data

In [None]:
test_df = combined_data.copy()

# genres/studios hold the text form of Python lists (e.g. "['Action', 'Comedy']").
# Remove the square brackets and single quotes in one vectorised pass. Unlike the
# per-character apply this replaces, .str.replace leaves missing values as NaN
# instead of raising on them.
for column in ('genres', 'studios'):
    test_df[column] = test_df[column].str.replace(r"[\[\]']", "", regex=True)

In [None]:
# Join multi-word names with underscores: a space that sits directly between two
# word characters becomes '_', so the ", " separators between items are untouched.
for text_column in ('genres', 'studios'):
    test_df[text_column] = test_df[text_column].str.replace(r'(?<=\w) (?=\w)', '_', regex=True)

In [None]:
# Rows whose studios string is empty get the -1 missing-value sentinel
test_df['studios'] = test_df['studios'].replace('', -1)

In [None]:
# One indicator column per genre: split each row on ', ' and prefix the columns with 'genre_'
dummy_variables_genres = test_df['genres'].str.get_dummies(', ').add_prefix('genre_')

In [None]:
# One indicator column per studio: split each row on ', ' and prefix the columns with 'studio_'
dummy_variables_studios = test_df['studios'].str.get_dummies(', ').add_prefix('studio_')

In [None]:
# Attach the genre indicator columns side by side (axis=1 aligns on the row index)
test_df = pd.concat([test_df, dummy_variables_genres], axis=1)

In [None]:
# Attach the studio indicator columns side by side (axis=1 aligns on the row index)
test_df = pd.concat([test_df, dummy_variables_studios], axis=1)

In [None]:
# Save the cleaned anime table, including the genre and studio indicator columns,
# to a new CSV file (index omitted)
test_df.to_csv('all_anime_list_cleaned_with_dummies.csv', index=False)

## 2.2 Cleaning Username List
- Appended all the username CSVs our team created
- Cleaned the lists to remove duplicates and NA values
- Due to the large number of usernames, we decided it was not feasible to include all 70k usernames we scraped
- Randomly selected 10k usernames from our data
- After running the model, it seemed like this sample size was too small
- Thus, we randomly selected another 10k usernames (hence the 2 separate instances of sampled_df) [20k usernames in total]

In [None]:
import pandas as pd
import os

folder_path = 'user lists'

#read folder with all username lists and combine them
# Files are read in sorted order for reproducibility and concatenated once;
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
frames = [
    pd.read_csv(os.path.join(folder_path, filename))
    for filename in sorted(os.listdir(folder_path))
    if filename.endswith('.csv')
]

# An empty folder still yields an empty DataFrame, as before
combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Keep one row per username, discard rows with no username, then order alphabetically
combined_data = (
    combined_data
    .drop_duplicates(subset='username')
    .dropna(subset=['username'])
    .sort_values(by='username')
)

In [None]:
# Save the de-duplicated, sorted username list (index omitted)
combined_data.to_csv('user_list_cleaned.csv', index=False)

In [None]:
# Draw a reproducible random sample of 10,000 usernames (fixed random_state) and
# renumber the sampled rows from 0
sampled_df = (
    combined_data
    .sample(n=10000, random_state=12)
    .reset_index(drop=True)
)
sampled_df

In [None]:
# Save the first 10,000-username sample (index omitted)
sampled_df.to_csv('user_list_cleaned_random_10000.csv', index=False)

In [None]:
#for more usernames
# Reload the full cleaned username list and the first 10,000-username sample
# NOTE(review): both are read from 'cleaned/', while the cells above write to the
# working directory - presumably the files were moved by hand; confirm.
orig_user_list = pd.read_csv('cleaned/user_list_cleaned.csv')
random_10k = pd.read_csv('cleaned/user_list_cleaned_random_10000.csv')

In [None]:
# Drop every username that already appears in the first sample so the second
# sample cannot repeat it
orig_user_list = orig_user_list[~orig_user_list['username'].isin(random_10k['username'])]

In [None]:
# Draw a reproducible random sample of 10,000 of the remaining usernames and
# renumber the sampled rows from 0
sampled_df = (
    orig_user_list
    .sample(n=10000, random_state=12)
    .reset_index(drop=True)
)

In [None]:
# Save the second 10,000-username sample (index omitted)
sampled_df.to_csv('user_list_cleaned_random_10000_2.csv', index=False)

# 2.3 Cleaning User Ratings and Chunking
- Chunked the user ratings into separate folders
- Combined all user ratings in each folder into 1 chunked dataframe (5 chunks by the end)
- Dropped NA, dropped duplicates
- Removed rows with missing data (-1)
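- Removed shows that were not watched yet (the intent was to keep only Completed and Dropped shows; note that the filter below removes only `plan_to_watch` rows, so every other status is retained)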
- Assumed "0" rating meant the show was not watched



In [None]:
import pandas as pd
import os

#store each user rating csvs in separate chunked folders
folder_path = 'anime_user_ratings/chunk_5'

# Load each CSV (sorted for a reproducible order), printing a running file count,
# then concatenate once. DataFrame.append was removed in pandas 2.0 and copied
# the accumulated frame on every call.
frames = []
file_count = 0
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.csv'):
        frames.append(pd.read_csv(os.path.join(folder_path, filename)))
        file_count += 1
        print(file_count)

# An empty folder still yields an empty DataFrame, as before
combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Discard every row that has a missing value in any column
combined_data = combined_data.dropna()

In [None]:
# Discard rows that are exact duplicates across all columns
combined_data = combined_data.drop_duplicates()

In [None]:
#remove all rows that contain the -1 missing-value sentinel
# The collection step fills missing fields with the string '-1'. After a CSV round
# trip that shows up as the number -1 in numeric columns but stays the text '-1'
# in text columns (presumably user_status / updated_at - verify against the data),
# so both forms are matched here.
sentinel_columns = ['user_status', 'user_score', 'user_eps_watched', 'user_rewatch', 'updated_at']
mask = ~combined_data[sentinel_columns].isin([-1, '-1']).any(axis=1)

# Chained reset_index returns a new frame instead of mutating a slice in place
filtered_combined_data = combined_data[mask].reset_index(drop=True)
filtered_combined_data

In [None]:
# Drop the list entries whose status is 'plan_to_watch' (not watched yet) and
# renumber the remaining rows from 0
not_planned = filtered_combined_data['user_status'] != 'plan_to_watch'
filtered_df = filtered_combined_data[not_planned].reset_index(drop=True)

In [None]:
# A score of 0 is assumed to mean the user did not rate the show; keep only the
# rows with a non-zero score and renumber them from 0
has_score = filtered_df['user_score'] != 0
filtered_df = filtered_df[has_score].reset_index(drop=True)

In [None]:
#for counting how many unique usernames in data
# nunique() counts the distinct user_id values left after filtering
unique_usernames_count = filtered_df['user_id'].nunique()
unique_usernames_count

In [None]:
# Save the cleaned ratings for this chunk (index omitted)
filtered_df.to_csv('anime_user_ratings_2_chunk_5.csv', index=False)

# 2.4 One More User Rating Cleaning...
- Discovered discrepancy in the user ratings
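- Users had multiple ratings for the same shows (we assumed the site tracks every instance where a user rates a show). NOTE: duplicates like these can also be produced during collection if the loop in section 1.3 re-emits previously fetched entries under a later username, so verify the collection step before attributing them to the site.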
- Decided to only keep the most updated rating of show for each user

In [None]:
import pandas as pd
import os

folder_path = 'cleaned/anime_user_ratings_chunked_2'

# Load each cleaned chunk (sorted for a reproducible order), printing a running
# file count, then concatenate once. DataFrame.append was removed in pandas 2.0
# and copied the accumulated frame on every call.
frames = []
file_count = 0
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.csv'):
        frames.append(pd.read_csv(os.path.join(folder_path, filename)))
        file_count += 1
        print(file_count)

# An empty folder still yields an empty DataFrame, as before
combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Parse updated_at into datetimes and order the rows newest-first, so the most
# recent entry of any group comes first
combined_data = (
    combined_data
    .assign(updated_at=pd.to_datetime(combined_data['updated_at']))
    .sort_values(by='updated_at', ascending=False)
)
combined_data

In [None]:
# Keep a single row per (user_id, anime_id) pair - the first one in the current
# row order - and renumber the rows from 0
df_no_duplicates = (
    combined_data
    .drop_duplicates(subset=['user_id', 'anime_id'], keep='first')
    .reset_index(drop=True)
)
df_no_duplicates

In [None]:
# Spot check: pull every remaining rating row for anime_id 1 and display it
test3 = (df_no_duplicates[(df_no_duplicates['anime_id'] == 1)])
test3

In [None]:
# Number of distinct user_id values left after de-duplication
unique_usernames_count = df_no_duplicates['user_id'].nunique()
unique_usernames_count

In [None]:
# Save the spot-check rows for inspection outside the notebook (index omitted)
test3.to_csv('test3.csv', index=False)

In [None]:
# Save the de-duplicated ratings table (index omitted)
df_no_duplicates.to_csv('anime_user_ratings_cleaned_full_2.csv', index=False)

# MACHINE LEARNING MODEL (Part 3)

## 3.1 KNN Model to predict Anime Recommendations


In [3]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# read the user ratings (the cells below use its anime_id, user_id and user_score columns)
user_ratings = pd.read_csv('cleaned/anime_user_ratings_cleaned_final.csv')

In [4]:
# Create a user-item matrix with anime_id as rows, user_id as columns, and ratings as values.
# pivot_table averages repeated (anime_id, user_id) pairs by default; pairs with no
# rating are filled with 0.
user_item_matrix = user_ratings.pivot_table(index='anime_id', columns='user_id', values='user_score', fill_value=0)

In [5]:
# Initialize and fit the KNN model
# Every row of the matrix (one anime) is a sample; neighbors are searched by brute
# force using cosine distance, and 5 neighbors are returned per query.
knn_model = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')
knn_model.fit(user_item_matrix)

In [6]:
# Load a DataFrame with anime titles (used to look up title, genres, studios,
# synopsis and media_type for the recommended anime_ids)
anime_data = pd.read_csv('cleaned/all_anime_list_cleaned_1_52000.csv')

In [7]:
# Choose the target anime for which you want to make recommendations
target_anime_id = 40839 # Replace with the desired anime_id

# Locate the target anime's row in the user-item matrix (rows are anime_id)
target_anime_row = user_item_matrix.loc[target_anime_id].values.reshape(1, -1)

# Find the nearest neighbors for the target anime (returned closest-first)
distances, neighbor_indices = knn_model.kneighbors(target_anime_row)

# Map the neighbors' row positions back to anime ids, keeping the closest-first order
recommended_anime_ids = user_item_matrix.index[neighbor_indices.flatten()]

# Exclude the target anime from recommendations
recommended_anime_ids = [anime_id for anime_id in recommended_anime_ids if anime_id != target_anime_id]

# Lookup the titles of the recommended animes. isin() returns the matches in
# anime_data's own row order, so they are re-sorted by neighbor rank to keep the
# most similar title first.
neighbor_rank = {anime_id: rank for rank, anime_id in enumerate(recommended_anime_ids)}
recommended_anime_titles = (
    anime_data.loc[anime_data['anime_id'].isin(recommended_anime_ids)]
    .sort_values(by='anime_id', key=lambda ids: ids.map(neighbor_rank), kind='mergesort')
)

recommended_anime_info = pd.DataFrame(recommended_anime_titles[['title', 'genres','studios','synopsis','media_type']])

# Display the recommended anime, most similar first
recommended_anime_info



Unnamed: 0,title,genres,studios,synopsis,media_type
16148,Kaguya-sama wa Kokurasetai? Tensai-tachi no Re...,"Comedy, Psychological, Romantic Subtext, Schoo...",A-1 Pictures,"After a slow but eventful summer vacation, Shu...",tv
17202,Horimiya,"Romance, School, Shounen",CloverWorks,"On the surface, the thought of Kyouko Hori and...",tv
17574,Kaguya-sama wa Kokurasetai: Ultra Romantic,"Comedy, Romance, School, Seinen",A-1 Pictures,The elite members of Shuchiin Academy's studen...,tv


## 3.2 Let's try Z-score normalization to improve accuracy

In [8]:
# Z-score normalization along axis=0: every column is centered on its own mean and
# scaled by its own standard deviation. In user_item_matrix the columns are user_id
# (rows are anime_id), so this standardizes each user's ratings, not each anime's.
standardized_item_matrix = (user_item_matrix - user_item_matrix.mean(axis=0)) / user_item_matrix.std(axis=0)

In [9]:
# Initialize and fit the KNN model
# Same settings as the first model (5 neighbors, brute-force cosine distance),
# fitted on the standardized matrix instead of the raw ratings.
knn_model2 = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')
knn_model2.fit(standardized_item_matrix)

In [10]:
# Choose the target anime for which you want to make recommendations
target_anime_id = 40839 # Replace with the desired anime_id

# Locate the target anime's row in the standardized matrix (rows are anime_id)
target_anime_row = standardized_item_matrix.loc[target_anime_id].values.reshape(1, -1)

# Find the nearest neighbors for the target anime (returned closest-first)
distances, neighbor_indices = knn_model2.kneighbors(target_anime_row)

# Map the neighbors' row positions back to anime ids, keeping the closest-first order
recommended_anime_ids = standardized_item_matrix.index[neighbor_indices.flatten()]

# Exclude the target anime from recommendations
recommended_anime_ids = [anime_id for anime_id in recommended_anime_ids if anime_id != target_anime_id]

# Lookup the titles of the recommended animes. isin() returns the matches in
# anime_data's own row order, so they are re-sorted by neighbor rank to keep the
# most similar title first.
neighbor_rank = {anime_id: rank for rank, anime_id in enumerate(recommended_anime_ids)}
recommended_anime_titles = (
    anime_data.loc[anime_data['anime_id'].isin(recommended_anime_ids)]
    .sort_values(by='anime_id', key=lambda ids: ids.map(neighbor_rank), kind='mergesort')
)

recommended_anime_info = pd.DataFrame(recommended_anime_titles[['title', 'genres','studios','synopsis','media_type']])

# Display the recommended anime, most similar first
recommended_anime_info




Unnamed: 0,title,genres,studios,synopsis,media_type
14558,Kaguya-sama wa Kokurasetai: Tensai-tachi no Re...,"Comedy, Psychological, Romantic Subtext, Schoo...",A-1 Pictures,"At the renowned Shuchiin Academy, Miyuki Shiro...",tv
15707,5-toubun no Hanayome ∬,"Comedy, Harem, Romance, School, Shounen",Bibury Animation Studios,Through their tutor Fuutarou Uesugi's diligent...,tv
16148,Kaguya-sama wa Kokurasetai? Tensai-tachi no Re...,"Comedy, Psychological, Romantic Subtext, Schoo...",A-1 Pictures,"After a slow but eventful summer vacation, Shu...",tv
17234,"Kanojo, Okarishimasu 2nd Season","Adult Cast, Comedy, Harem, Romance, Shounen",TMS Entertainment,"A year after they met, Kazuya Kinoshita and Ch...",tv


## 3.3 Adding Genres as Dummy Variables

In [11]:
#load genre dummy variables
# Genre indicator columns, one row per anime (merged on anime_id in the next cell)
knn_genres = pd.read_csv('cleaned/knn_genres.csv')

In [12]:
#merge previous standardized item matrix with dummies
# Inner join on anime_id: only anime present in both tables are kept, then
# anime_id is restored as the index.
merged_df = pd.merge(standardized_item_matrix, knn_genres, on='anime_id', how='inner')
standardized_item_matrix2 = merged_df.set_index('anime_id')

In [13]:
# Initialize and fit the KNN model
# Same settings as the earlier models (5 neighbors, brute-force cosine distance),
# fitted on the standardized ratings plus the genre indicator columns.
knn_model3 = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')
knn_model3.fit(standardized_item_matrix2)

In [14]:
# Choose the target anime for which you want to make recommendations
target_anime_id = 40839 # Replace with the desired anime_id

# Locate the target anime's row in the merged matrix (indexed by anime_id)
target_anime_row = standardized_item_matrix2.loc[target_anime_id].values.reshape(1, -1)

# Find the nearest neighbors for the target anime (returned closest-first)
distances, neighbor_indices = knn_model3.kneighbors(target_anime_row)

# Map the neighbors' row positions back to anime ids, keeping the closest-first order
recommended_anime_ids = standardized_item_matrix2.index[neighbor_indices.flatten()]

# Exclude the target anime from recommendations
recommended_anime_ids = [anime_id for anime_id in recommended_anime_ids if anime_id != target_anime_id]

# Lookup the titles of the recommended animes. isin() returns the matches in
# anime_data's own row order, so they are re-sorted by neighbor rank to keep the
# most similar title first.
neighbor_rank = {anime_id: rank for rank, anime_id in enumerate(recommended_anime_ids)}
recommended_anime_titles = (
    anime_data.loc[anime_data['anime_id'].isin(recommended_anime_ids)]
    .sort_values(by='anime_id', key=lambda ids: ids.map(neighbor_rank), kind='mergesort')
)

recommended_anime_info = pd.DataFrame(recommended_anime_titles[['title', 'genres','studios','synopsis','media_type']])

# Display the recommended anime, most similar first
recommended_anime_info




Unnamed: 0,title,genres,studios,synopsis,media_type
14558,Kaguya-sama wa Kokurasetai: Tensai-tachi no Re...,"Comedy, Psychological, Romantic Subtext, Schoo...",A-1 Pictures,"At the renowned Shuchiin Academy, Miyuki Shiro...",tv
15707,5-toubun no Hanayome ∬,"Comedy, Harem, Romance, School, Shounen",Bibury Animation Studios,Through their tutor Fuutarou Uesugi's diligent...,tv
16148,Kaguya-sama wa Kokurasetai? Tensai-tachi no Re...,"Comedy, Psychological, Romantic Subtext, Schoo...",A-1 Pictures,"After a slow but eventful summer vacation, Shu...",tv
17234,"Kanojo, Okarishimasu 2nd Season","Adult Cast, Comedy, Harem, Romance, Shounen",TMS Entertainment,"A year after they met, Kazuya Kinoshita and Ch...",tv


Results:

target id: 38000 (Demon Slayer) - Popular Anime
model 1:
7430	Shingeki no Kyojin	Action, Award Winning, Drama, Gore, Military, ...	Wit Studio	Centuries ago, mankind was slaughtered to near...	tv
13198	Shingeki no Kyojin Season 3	Action, Drama, Gore, Military, Shounen, Survival	Wit Studio	Still threatened by the "Titans" that rob them...	tv
16068	Kimetsu no Yaiba Movie: Mugen Ressha-hen	Action, Fantasy, Historical, Shounen	ufotable	After a string of mysterious disappearances be...	movie
20429	Kimetsu no Yaiba: Yuukaku-hen	Action, Fantasy, Historical, Shounen	ufotable	The devastation of the Mugen Train incident st...	tv

- Recommendation did not change in each model
- Seems like very popular shows are not affected by the changes

target id: 33206 (Kobayashi Dragon Maid) - Semi Popular Anime

model 1:
title	genres	studios	synopsis	media_type
14240	Mob Psycho 100 II	Action, Comedy, Super Power, Supernatural	Bones	Shigeo "Mob" Kageyama is now maturing and unde...	tv
15376	Kobayashi-san Chi no Maid Dragon S	Fantasy, Slice of Life	Kyoto Animation	As Tooru continues on her quest to become the ...	tv
16148	Kaguya-sama wa Kokurasetai? Tensai-tachi no Re...	Comedy, Psychological, Romantic Subtext, Schoo...	A-1 Pictures	After a slow but eventful summer vacation, Shu...	tv
17574	Kaguya-sama wa Kokurasetai: Ultra Romantic	Comedy, Romance, School, Seinen	A-1 Pictures	The elite members of Shuchiin Academy's studen...	tv

model 2:
title	genres	studios	synopsis	media_type
10629	Kono Subarashii Sekai ni Shukufuku wo!	Adventure, Comedy, Fantasy, Isekai, Parody, Re...	Studio Deen	After dying a laughable and pathetic death on ...	tv
11596	Kono Subarashii Sekai ni Shukufuku wo! 2	Adventure, Comedy, Fantasy, Isekai, Parody, Re...	Studio Deen	When Kazuma Satou died, he was given two choic...	tv
14598	Kono Subarashii Sekai ni Shukufuku wo! Movie: ...	Adventure, Comedy, Fantasy, Isekai, Parody, Re...	J.C.Staff	It is not strange that the Demon Lord's forces...	movie
15376	Kobayashi-san Chi no Maid Dragon S	Fantasy, Slice of Life	Kyoto Animation	As Tooru continues on her quest to become the ...	tv

model 3:
title	genres	studios	synopsis	media_type
10629	Kono Subarashii Sekai ni Shukufuku wo!	Adventure, Comedy, Fantasy, Isekai, Parody, Re...	Studio Deen	After dying a laughable and pathetic death on ...	tv
11596	Kono Subarashii Sekai ni Shukufuku wo! 2	Adventure, Comedy, Fantasy, Isekai, Parody, Re...	Studio Deen	When Kazuma Satou died, he was given two choic...	tv
14598	Kono Subarashii Sekai ni Shukufuku wo! Movie: ...	Adventure, Comedy, Fantasy, Isekai, Parody, Re...	J.C.Staff	It is not strange that the Demon Lord's forces...	movie
15376	Kobayashi-san Chi no Maid Dragon S	Fantasy, Slice of Life	Kyoto Animation	As Tooru continues on her quest to become the ...	tv

- Recommendation did not change after model 2
- Genre may not significantly affect the neighbors
- Model 2 generates a more accurate recommendation

target id: 16417 (Tamako Market) - Below average popularity

model 1:
	title	genres	studios	synopsis	media_type
4956	Suzumiya Haruhi no Shoushitsu	Award Winning, Mystery, School, Sci-Fi, Supern...	Kyoto Animation	On a cold December day, Kyon arrives at school...	movie
8474	Tamako Love Story	Award Winning, Romance, Slice of Life	Kyoto Animation	As the seasons pass by, the end of Mochizou Oo...	movie
12582	Yuru Camp△	CGDCT, Iyashikei, Slice of Life	C-Station	While the perfect getaway for most girls her a...	tv
13982	Violet Evergarden: Kitto "Ai" wo Shiru Hi ga K...	Drama, Fantasy	Kyoto Animation	The CH Postal Company has just received a requ...	special

model 2:
title	genres	studios	synopsis	media_type
4956	Suzumiya Haruhi no Shoushitsu	Award Winning, Mystery, School, Sci-Fi, Supern...	Kyoto Animation	On a cold December day, Kyon arrives at school...	movie
8474	Tamako Love Story	Award Winning, Romance, Slice of Life	Kyoto Animation	As the seasons pass by, the end of Mochizou Oo...	movie
9651	Hibike! Euphonium	Drama, Music, Performing Arts, School	Kyoto Animation	Now that Kumiko Oumae has enrolled in Kitauji ...	tv
12582	Yuru Camp△	CGDCT, Iyashikei, Slice of Life	C-Station	While the perfect getaway for most girls her a...	tv

model 3:
title	genres	studios	synopsis	media_type
4956	Suzumiya Haruhi no Shoushitsu	Award Winning, Mystery, School, Sci-Fi, Supern...	Kyoto Animation	On a cold December day, Kyon arrives at school...	movie
8474	Tamako Love Story	Award Winning, Romance, Slice of Life	Kyoto Animation	As the seasons pass by, the end of Mochizou Oo...	movie
9651	Hibike! Euphonium	Drama, Music, Performing Arts, School	Kyoto Animation	Now that Kumiko Oumae has enrolled in Kitauji ...	tv
12582	Yuru Camp△	CGDCT, Iyashikei, Slice of Life	C-Station	While the perfect getaway for most girls her a...	tv

- Model 2 indicates that the z-score normalization does help with accuracy

target id: 47 (Akira) - Let's see if being a movie affects recommendations

model 1:
title	genres	studios	synopsis	media_type
0	Cowboy Bebop	Action, Adult Cast, Award Winning, Sci-Fi, Space	Sunrise	Crime is timeless. By the year 2071, humanity ...	tv
142	Mononoke Hime	Action, Adventure, Award Winning, Fantasy	Studio Ghibli	When an Emishi village is attacked by a fierce...	movie
409	Perfect Blue	Adult Cast, Avant Garde, Drama, Horror, Psycho...	Madhouse	J-pop idol group CHAM! has spent the last two ...	movie
14240	Mob Psycho 100 II	Action, Comedy, Super Power, Supernatural	Bones	Shigeo "Mob" Kageyama is now maturing and unde...	tv

model 2:
title	genres	studios	synopsis	media_type
0	Cowboy Bebop	Action, Adult Cast, Award Winning, Sci-Fi, Space	Sunrise	Crime is timeless. By the year 2071, humanity ...	tv
24	Koukaku Kidoutai	Action, Adult Cast, Award Winning, Detective, ...	Production I.G	In the year 2029, Niihama City has become a te...	movie
142	Mononoke Hime	Action, Adventure, Award Winning, Fantasy	Studio Ghibli	When an Emishi village is attacked by a fierce...	movie
409	Perfect Blue	Adult Cast, Avant Garde, Drama, Horror, Psycho...	Madhouse	J-pop idol group CHAM! has spent the last two ...	movie

model 3:
title	genres	studios	synopsis	media_type
0	Cowboy Bebop	Action, Adult Cast, Award Winning, Sci-Fi, Space	Sunrise	Crime is timeless. By the year 2071, humanity ...	tv
24	Koukaku Kidoutai	Action, Adult Cast, Award Winning, Detective, ...	Production I.G	In the year 2029, Niihama City has become a te...	movie
142	Mononoke Hime	Action, Adventure, Award Winning, Fantasy	Studio Ghibli	When an Emishi village is attacked by a fierce...	movie
409	Perfect Blue	Adult Cast, Avant Garde, Drama, Horror, Psycho...	Madhouse	J-pop idol group CHAM! has spent the last two ...	movie

- Model 2 recommends "Koukaku Kidoutai" in place of "Mob Psycho" which can objectively be seen as a more accurate recommendation

# Conclusions

- Z-score normalization seems to improve the accuracy of our model
- Genre does not seem to have a significant effect due to the nature of our model
- Extremely popular shows with large numbers of ratings may have more stable and dense neighbors, meaning with each iteration of the model, the neighbors will remain relatively similar
- In contrast, mid-low level popularity shows are more sensitive to changes (normalization)

## What our results tell us about the data?
- Achieving some level of accuracy from our recommendations indicates a consistency or pattern in how users rate anime shows
- Users who give similar ratings to shows tend to have somewhat similar preferences
- KNN model uses these similarities to recommend a show that other users with similar taste have enjoyed
- Ex. if a user enjoys romance anime, nearby neighbors in the model may be in the romance genre

## Limitations and Future Applications
- Due to the time constraints and memory allocation, our model is limited by sample size (18k users)
- Model can be improved by increasing our training data
- Incorporate other features such as "drop/complete", "studio", "num of eps", etc.
- Model can be applied to "users" instead of "anime id", recommend shows based on other users who have similar taste