In [1]:
import numpy as np
import pandas as  pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import pickle

In [2]:
df = pd.read_csv("datasets/model_data.csv")

In [3]:
df.head()

Unnamed: 0,genres,id,original_title,overview,production_companies,production_countries,release_date,vote_average,vote_count,cast_names,crew_names
0,"Action,Adventure,Fantasy,Science Fiction",19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","Ingenious Film Partners,Twentieth Century Fox ...","United States of America,United Kingdom",2009-12-10,7.2,11800,"Sam Worthington,Zoe Saldana,Sigourney Weaver,S...","James Cameron,Jon Landau"
1,"Adventure,Fantasy,Action",285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","Walt Disney Pictures,Jerry Bruckheimer Films,S...",United States of America,2007-05-19,6.9,4500,"Johnny Depp,Orlando Bloom,Keira Knightley,Stel...","Gore Verbinski,Jerry Bruckheimer,Ted Elliott,T..."
2,"Action,Adventure,Crime",206647,Spectre,A cryptic message from Bond’s past sends him o...,"Columbia Pictures,Danjaq,B24","United Kingdom,United States of America",2015-10-26,6.3,4466,"Daniel Craig,Christoph Waltz,Léa Seydoux,Ralph...","Sam Mendes,John Logan,Barbara Broccoli,Robert ..."
3,"Action,Crime,Drama,Thriller",49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"Legendary Pictures,Warner Bros.,DC Entertainme...",United States of America,2012-07-16,7.6,9106,"Christian Bale,Michael Caine,Gary Oldman,Anne ...","Charles Roven,Christopher Nolan,Jonathan Nolan..."
4,"Action,Adventure,Science Fiction",49529,John Carter,"John Carter is a war-weary, former military ca...",Walt Disney Pictures,United States of America,2012-03-07,6.1,2124,"Taylor Kitsch,Lynn Collins,Samantha Morton,Wil...","Andrew Stanton,Colin Wilson,Jim Morris,Lindsey..."


In [4]:
df2 = df[["original_title", "id", "production_companies", "vote_average", "vote_count"]]

In [5]:
def join_names(x):
    if pd.isna(x):
        return ""
    else:
        n = x.split(",")
        n = [name.replace(" ", "") for name in n]
        return " ".join(n)

In [6]:
df2["production_companies"]

0       Ingenious Film Partners,Twentieth Century Fox ...
1       Walt Disney Pictures,Jerry Bruckheimer Films,S...
2                            Columbia Pictures,Danjaq,B24
3       Legendary Pictures,Warner Bros.,DC Entertainme...
4                                    Walt Disney Pictures
                              ...                        
4794                                    Columbia Pictures
4795                                                  NaN
4796    Front Street Pictures,Muse Entertainment Enter...
4797                                                  NaN
4798            rusty bear entertainment,lucky crow films
Name: production_companies, Length: 4799, dtype: object

In [7]:
df2["production_companies"] = df2["production_companies"].apply(join_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["production_companies"] = df2["production_companies"].apply(join_names)


In [8]:
df2.head()

Unnamed: 0,original_title,id,production_companies,vote_average,vote_count
0,Avatar,19995,IngeniousFilmPartners TwentiethCenturyFoxFilmC...,7.2,11800
1,Pirates of the Caribbean: At World's End,285,WaltDisneyPictures JerryBruckheimerFilms Secon...,6.9,4500
2,Spectre,206647,ColumbiaPictures Danjaq B24,6.3,4466
3,The Dark Knight Rises,49026,LegendaryPictures WarnerBros. DCEntertainment ...,7.6,9106
4,John Carter,49529,WaltDisneyPictures,6.1,2124


In [9]:
df2.iloc[123]

original_title                                     The Matrix Revolutions
id                                                                    605
production_companies    VillageRoadshowPictures NPVEntertainment Silve...
vote_average                                                          6.4
vote_count                                                           3096
Name: 123, dtype: object

In [10]:
df2[df2["production_companies"] == "WaltDisneyPictures"]

Unnamed: 0,original_title,id,production_companies,vote_average,vote_count
4,John Carter,49529,WaltDisneyPictures,6.1,2124
255,Home on the Range,13700,WaltDisneyPictures,5.7,389
288,The Hunchback of Notre Dame,10545,WaltDisneyPictures,6.8,1129
465,Fantasia 2000,49948,WaltDisneyPictures,7.0,292
646,The Kid,4244,WaltDisneyPictures,6.0,238
752,My Favorite Martian,9849,WaltDisneyPictures,5.1,80
1045,The Princess Diaries 2: Royal Engagement,11130,WaltDisneyPictures,6.0,697
1055,The Muppets,64328,WaltDisneyPictures,6.5,501
1206,The Odd Life of Timothy Green,71864,WaltDisneyPictures,6.5,386
1496,Snow Dogs,11888,WaltDisneyPictures,5.3,185


In [11]:
C = df2['vote_average'].mean()
m = df2['vote_count'].quantile(0.75)

In [12]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [13]:
df2['weighted_rating'] = df2.apply(weighted_rating, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['weighted_rating'] = df2.apply(weighted_rating, axis=1)


In [14]:
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(df2['production_companies'])

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [15]:
with open('models/cosine_sim_prod.pkl', 'wb') as file:
    pickle.dump(cosine_sim, file)

In [16]:
df_main = df2[["original_title", "weighted_rating"]]

In [17]:
content_based_data = pd.read_csv("datasets/content_based_data.csv")

In [18]:
content_based_data.head()

Unnamed: 0,id,original_title
0,19995,Avatar
1,285,Pirates of the Caribbean: At World's End
2,206647,Spectre
3,49026,The Dark Knight Rises
4,49529,John Carter


In [19]:
df_main.head()

Unnamed: 0,original_title,weighted_rating
0,Avatar,7.134952
1,Pirates of the Caribbean: At World's End,6.786532
2,Spectre,6.27083
3,The Dark Knight Rises,7.487181
4,John Carter,6.098502


In [20]:
merged_df = pd.merge(content_based_data, df_main, on='original_title', how='inner')

In [21]:
merged_df.to_csv("datasets/main_df.csv")

In [22]:
# merged_df.set_index('original_title', inplace=True)

In [23]:
merged_df.head()

Unnamed: 0,id,original_title,weighted_rating
0,19995,Avatar,7.134952
1,285,Pirates of the Caribbean: At World's End,6.786532
2,206647,Spectre,6.27083
3,49026,The Dark Knight Rises,7.487181
4,49529,John Carter,6.098502


In [25]:
def get_recommendations(title, cosine_sim=cosine_sim, df=df2):
    idx = df2.index[df2['original_title'] == title][0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:50]  # Get top 50 similar movies

    # Adjust scores based on weighted rating
    movie_indices = [i[0] for i in sim_scores]
    movies = df2.iloc[movie_indices].copy()
    movies = movies.sort_values('weighted_rating', ascending=False)

    movies = movies[movies["original_title"] != title]
    movies = movies[movies["weighted_rating"] > 5.8]
    movies = movies[['original_title', "id", 'weighted_rating']]
    random_movie = df.sample(n=10)
    return list(random_movie["original_title"]),list(random_movie["id"])

In [29]:
get_recommendations("The Avengers", df =merged_df )

(['The Ladies Man',
  'Before I Go to Sleep',
  'Sublime',
  'Blow Out',
  'RockNRolla',
  'Furry Vengeance',
  'Forty Shades of Blue',
  'Secretary',
  'Casablanca',
  'Frailty'],
 [16888, 204922, 9783, 11644, 13809, 35169, 30082, 11013, 289, 12149])