In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix

In [2]:
data=pd.read_csv('/Users/antropravin/Desktop/Bezohminds/Task/Movie Recommendation System/Cleaned Movie Data.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9880 entries, 0 to 9879
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            9880 non-null   int64  
 1   movie_id              9880 non-null   int64  
 2   title                 9880 non-null   object 
 3   overview              9880 non-null   object 
 4   genres                9880 non-null   object 
 5   release_date          9880 non-null   object 
 6   release_year          9880 non-null   int64  
 7   runtime               9880 non-null   int64  
 8   original_language     9880 non-null   object 
 9   poster_path           9880 non-null   object 
 10  vote_average          9880 non-null   float64
 11  vote_count            9880 non-null   int64  
 12  popularity            9880 non-null   float64
 13  production_companies  9880 non-null   object 
dtypes: float64(2), int64(5), object(7)
memory usage: 1.1+ MB


In [4]:
data['combined_features']=data['genres']+' '+data['overview']+' '+data['production_companies']
vectorizer=TfidfVectorizer(stop_words='english')
featurematrix=vectorizer.fit_transform(data['combined_features'])

In [5]:
cosine_sim=cosine_similarity(featurematrix,featurematrix)
print(cosine_sim)

[[1.         0.         0.         ... 0.00466848 0.         0.        ]
 [0.         1.         0.04309514 ... 0.         0.         0.        ]
 [0.         0.04309514 1.         ... 0.0278917  0.         0.01019812]
 ...
 [0.00466848 0.         0.0278917  ... 1.         0.01966245 0.01789503]
 [0.         0.         0.         ... 0.01966245 1.         0.0045213 ]
 [0.         0.         0.01019812 ... 0.01789503 0.0045213  1.        ]]


In [6]:
def content_based_recommendations(movie_title, n=10):
    if movie_title not in data['title'].values:
        return "Movie Not Found"
    
    movie_idx=data[data['title']==movie_title].index[0]
    similarity_scores=list(enumerate(cosine_sim[movie_idx]))
    similarity_scores=sorted(similarity_scores,key=lambda x: x[1], reverse=True)
    recommended_movies=[data.iloc[i[0]]['title'] for i in similarity_scores[1:n+1]]
    return recommended_movies
print(content_based_recommendations('The Lake House'))

['The Night House', 'To All the Boys: P.S. I Still Love You', 'The Last Letter from Your Lover', "To All the Boys I've Loved Before", 'Rebecca', 'Companion', 'Scandal in Sorrento', 'Entrapment', 'Forever My Girl', 'Forever My Girl']


In [None]:
vectorizer = TfidfVectorizer(stop_words='english')
feature_matrix = vectorizer.fit_transform(data['combined_features'])
item_sim_matrix = cosine_similarity(feature_matrix)
def item_based_recommendations(movie_title, n=10):
    if movie_title not in data['title'].values:
        return "Movie Not Found"
    movie_idx = data[data['title'] == movie_title].index[0]
    similarity_scores = list(enumerate(item_sim_matrix[movie_idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    recommended_movies = [data.iloc[i[0]]['title'] for i in similarity_scores[1:n+1]]
    return recommended_movies
print(item_based_recommendations('The Lake House'))

['The Night House', 'To All the Boys: P.S. I Still Love You', 'The Last Letter from Your Lover', "To All the Boys I've Loved Before", 'Rebecca', 'Companion', 'Scandal in Sorrento', 'Entrapment', 'Forever My Girl', 'Forever My Girl']


In [None]:
def hybrid_recommendations(movie_title, n=10, content_weight=0.5, item_weight=0.5):
    if movie_title not in data['title'].values:
        return "Movie Not Found"
    content_recs = item_based_recommendations(movie_title, n * 2)
    item_recs = item_based_recommendations(movie_title, n * 2)
    recommendation_scores = {}
    for i, movie in enumerate(content_recs):
        recommendation_scores[movie] = recommendation_scores.get(movie, 0) + content_weight * (n - i)
    for i, movie in enumerate(item_recs):
        recommendation_scores[movie] = recommendation_scores.get(movie, 0) + item_weight * (n - i)
    hybrid_recs = sorted(recommendation_scores.items(), key=lambda x: x[1], reverse=True)
    return [movie[0] for movie in hybrid_recs[:n]]
print(hybrid_recommendations('The Lake House'))


['The Night House', 'To All the Boys: P.S. I Still Love You', 'The Last Letter from Your Lover', "To All the Boys I've Loved Before", 'Rebecca', 'Companion', 'Scandal in Sorrento', 'Entrapment', 'Forever My Girl', 'Sherlock Holmes']


In [None]:
data['user_id'] = np.random.randint(1, 1000, data.shape[0])
ratings_matrix = data.pivot_table(index='user_id', columns='title', values='vote_average').fillna(0)
sparse_ratings = csr_matrix(ratings_matrix.values)
train_data, test_data = train_test_split(sparse_ratings, test_size=0.2, random_state=42) 
U, sigma, Vt = svds(train_data, k=50)
sigma = np.diag(sigma)
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predictions_df = pd.DataFrame(predicted_ratings, columns=ratings_matrix.columns)
def collaborative_recommendations(user_id, n=10):
    if user_id not in predictions_df.index:
        return "User Not Found"
    user_ratings = predictions_df.iloc[user_id].sort_values(ascending=False)
    return user_ratings.head(n).index.tolist()
print(collaborative_recommendations(10))

['Beauty and the Beast', 'Anonymously Yours', "A Hard Day's Night", 'Notorious', 'Anastasia', "The Devil's Bath", "Babette's Feast", 'The Road', 'Osama', 'Call Jane']


In [31]:
als_model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20)
alpha_val = 20 
als_data = (sparse_ratings * alpha_val).astype('double')
als_model.fit(als_data)
def als_recommendations(user_id, n=10):
    if user_id >= len(ratings_matrix):
        return "User Not Found"
    user_items = sparse_ratings[user_id]
    recommendations = als_model.recommend(user_id, user_items, N=n * 2)
    recommended_movies = [ratings_matrix.columns[int(item[0])] for item in recommendations]
    filtered_movies = [movie for movie in recommended_movies if data.loc[data['title'] == movie, 'vote_count'].values[0] > 100]
    return filtered_movies[:n]
print(als_recommendations(10))

  0%|          | 0/20 [00:00<?, ?it/s]

['Touch of Evil']


In [None]:
def hybrid_recommendations(user_id, movie_title, n=10, als_weight=0.6, content_weight=0.4):
    als_recs = als_recommendations(user_id, n * 2)
    content_recs = collaborative_recommendations(user_id, n * 2)
    
    recommendation_scores = {}
    for i, movie in enumerate(als_recs):
        recommendation_scores[movie] = recommendation_scores.get(movie, 0) + als_weight * (n * 2 - i)
    for i, movie in enumerate(content_recs):
        recommendation_scores[movie] = recommendation_scores.get(movie, 0) + content_weight * (n * 2 - i)
    hybrid_recs = sorted(recommendation_scores.items(), key=lambda x: x[1], reverse=True)
    return [movie[0] for movie in hybrid_recs[:n]]
print(hybrid_recommendations(10, 'The Lake House'))

['Touch of Evil', 'Beauty and the Beast', 'Anonymously Yours', "A Hard Day's Night", 'Notorious', 'Anastasia', "The Devil's Bath", "Babette's Feast", 'The Road', 'Osama']


In [None]:
popularity_trend = data[['title', 'release_year', 'popularity']]
popularity_trend = popularity_trend.groupby(['release_year', 'title']).mean().reset_index()
def forecast_popularity(movie_title, steps=5):
    movie_data = popularity_trend[popularity_trend['title'] == movie_title]
    if len(movie_data) < 3:
        return f"Not enough data to forecast popularity for {movie_title}"
    
    model = ARIMA(movie_data['popularity'], order=(5, 1, 0))
    model_fit = model.fit()
    forecast = model_fit.forecast(steps=steps)
    
    plt.plot(range(len(movie_data['popularity'])), movie_data['popularity'], label='History')
    plt.plot(range(len(movie_data['popularity']), len(movie_data['popularity']) + steps), forecast, label='Forecast', color='red')
    plt.legend()
    plt.title(f"Popularity Forecast for {movie_title}")
    plt.show()
    
    return forecast.tolist()
print(forecast_popularity('Civil War'))

Not enough data to forecast popularity for Civil War


In [None]:
popularity_trend = data[['title', 'release_year', 'popularity']]
popularity_trend = popularity_trend.groupby(['release_year', 'title']).mean().reset_index()
overall_popularity_trend = popularity_trend.groupby('release_year')['popularity'].mean().diff().mean()
def forecast_popularity(movie_title, steps=5):
    movie_data = popularity_trend[popularity_trend['title'] == movie_title]
    if len(movie_data) < 3:
        if len(movie_data) == 1:
            base_popularity = movie_data['popularity'].values[0]
            return [base_popularity + overall_popularity_trend * i for i in range(steps)]
        elif len(movie_data) == 2:
            growth = movie_data['popularity'].values[1] - movie_data['popularity'].values[0]
            return [movie_data['popularity'].values[-1] + growth * i for i in range(1, steps + 1)]
        return f"Not enough data to forecast popularity for {movie_title}"
    
    model = ARIMA(movie_data['popularity'], order=(5, 1, 0))
    model_fit = model.fit()
    forecast, stderr, conf_int = model_fit.forecast(steps=steps, alpha=0.05)
    plt.plot(range(len(movie_data['popularity'])), movie_data['popularity'], label='History')
    plt.plot(range(len(movie_data['popularity']), len(movie_data['popularity']) + steps), forecast, label='Forecast', color='red')
    plt.fill_between(range(len(movie_data['popularity']), len(movie_data['popularity']) + steps),
                     conf_int[:, 0], conf_int[:, 1], color='pink', alpha=0.3, label='95% Confidence Interval')
    plt.legend()
    plt.title(f"Popularity Forecast for {movie_title}")
    plt.show()
    return forecast.tolist()
print(forecast_popularity('The Lake House'))

[3.568, 3.6169021967684816, 3.6658043935369635, 3.714706590305445, 3.7636087870739265]
