In [None]:
# Netflix data visualization: total minutes watched per movie title.
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm

# Viewing history from which all TV series were already removed.
movies = pd.read_csv('data/AmbyViewingMoviesOnly.csv')

# Group by title so repeated viewing sessions of the same movie collapse into
# a single total of minutes spent on that title.
grp = movies.groupby('Title')['Duration_minutes'].sum()

# Same totals as a DataFrame with 'Title' as a regular column; later cells
# iterate over `movief`.
movieframe = grp.to_frame()
movief = movieframe.reset_index()

# Totals ordered from most-watched to least-watched, used for the plot.
groupvalues = grp.sort_values(ascending=False)

# Set font to Arial up front, before any text artists are created.
plt.rcParams['font.sans-serif'] = 'Arial'

# One point per title; both the height and the colour encode minutes spent.
fig, ax = plt.subplots()
sc = ax.scatter(groupvalues.index, groupvalues.values, c=groupvalues.values, cmap='hot')

# Colorbar for the minutes-spent colour scale.
cbar = fig.colorbar(sc, ax=ax)
cbar.set_label('Minutes Spent', rotation=270, labelpad=15)

ax.set_xlabel('Individual title')
ax.set_ylabel('Number of minutes spent')

# Reference lines indicating approximate lengths of Bollywood and Hollywood movies.
ax.axhline(180, color='red', alpha=0.2, label='Approx. length of Bollywood movie')
ax.axhline(105, color='gray', label='Approx. length of Hollywood movie')

# Hide the x-axis ticks: there is one per title, far too many to read.
ax.set_xticks([])

# Single legend call (the original drew a framed legend and immediately replaced it).
ax.legend(frameon=False)

plt.show()

#pip install IMDbPY

#print(movief.head())

In [None]:
# Look up information about the watched titles in the IMDb database.
# Requires the IMDbPY package (e.g. `pip install IMDbPY`).
import imdb

# Create one shared instance of the IMDb class; get_movie_info() below uses it
# for both the title search and the follow-up record update.
ia = imdb.IMDb()

# Function to fetch information about a movie
def get_movie_info(title):
    """Fetch basic IMDb metadata for the first search hit of ``title``.

    Uses the module-level ``ia`` client: searches for the title, takes the
    first search result, asks the client to update that result, and then
    extracts a handful of fields from it.

    Parameters
    ----------
    title : str
        Movie title to search for.

    Returns
    -------
    dict or None
        A dict with the keys ``'Title'``, ``'Year'``, ``'Genres'``,
        ``'Rating'`` and ``'Runtime (min)'``. Missing fields fall back to
        ``''`` (or ``[]`` for genres). Returns ``None`` when the search yields
        no results or when any exception is raised during the lookup (the
        error is printed, not re-raised, so one bad title does not abort a
        batch run).
    """
    try:
        search_results = ia.search_movie(title)
        if not search_results:
            return None

        first_result = search_results[0]
        # Updates the result object in place so the fields read below are
        # available (per IMDbPY usage; search hits alone are presumably sparse).
        ia.update(first_result)

        # 'runtimes' may be absent, None, or an empty list. Indexing an empty
        # list used to raise IndexError, which the handler below swallowed and
        # thereby discarded the whole movie. Fall back to a blank runtime.
        runtimes = first_result.get('runtimes') or ['']

        return {
            'Title': first_result.get('title', ''),
            'Year': first_result.get('year', ''),
            'Genres': first_result.get('genres', []),
            'Rating': first_result.get('rating', ''),
            'Runtime (min)': runtimes[0],  # First listed runtime, or '' if none
        }
    except Exception as e:
        # Deliberately best-effort: report the failure and skip this title.
        print(f"An error occurred while fetching information for '{title}': {e}")
        return None

# Query IMDb once per watched title and keep only the lookups that returned data
# (get_movie_info gives None for titles with no match or a failed request).
movie_info_list = [
    fetched
    for fetched in map(get_movie_info, movief['Title'])
    if fetched
]

# One row per successfully matched movie.
new_movief = pd.DataFrame(movie_info_list)

# Preview the first few rows of the IMDb metadata table.
print(new_movief.head())

#writing the files as csv
#new_movief.to_csv('data/ImdbMovieInfo.csv', index = False)
#movief.to_csv('data/Ambymoviesgrouped.csv', index = False)


In [None]:
# Note: data/ImdbMovieInfo.csv was edited by hand after it was written, because
# spelling and capitalization inconsistencies (and language barriers) prevented
# some titles from matching entries in the IMDb library.
from ast import literal_eval

new_movief = pd.read_csv('data/ImdbMovieInfo.csv')
movief = pd.read_csv('data/Ambymoviesgrouped.csv')

# Get the unique genre labels.
# The 'Genres' column is read back from CSV as text (presumably the string form
# of a Python list, e.g. "['Drama', 'Comedy']"). Parse it with literal_eval
# instead of eval: literal_eval only accepts Python literals, so the contents
# of a hand-edited file can never be executed as code.
new_movief['Genres'] = new_movief['Genres'].apply(literal_eval)

# One row per (movie, genre) pair.
exploded_genres = new_movief['Genres'].explode()

unique_genres = exploded_genres.unique()
print(unique_genres)

# Count the occurrences of each genre label.
genre_counts = exploded_genres.value_counts()

# Bar chart of movies per genre: gives a sense of the genres I enjoy most.
genre_counts.plot(kind='bar', xlabel='Genre', ylabel='Number of Movies', title='Movie Genre Distribution')

In [None]:
# Combine the per-title watch totals with the IMDb metadata into one table.
new_movief = pd.read_csv('data/ImdbMovieInfo.csv')
movief = pd.read_csv('data/Ambymoviesgrouped.csv')

# Normalise the join key in both tables: trim leading/trailing whitespace and
# lower-case it, so formatting differences do not prevent a match.
for frame in (movief, new_movief):
    frame['Title'] = frame['Title'].str.strip().str.lower()

# Outer join on 'Title' keeps titles that appear in only one of the two tables.
merged_df = movief.merge(
    new_movief,
    how='outer',
    on='Title',
    suffixes=('_movief', '_new_movief'),
)

# Preview the merged DataFrame.
print(merged_df.head())

#merged_df.to_csv('data/CompleteMovieDataAmby.csv', index = False)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the merged watch-time + IMDb table.
merged_df = pd.read_csv('data/CompleteMovieDataAmby.csv')

# Engagement = minutes watched divided by the movie's runtime, a rough measure
# of interest in the movie. (This metric was not used in the end to recommend
# similar movies.)
merged_df['Engagement'] = merged_df['Duration_minutes'] / merged_df['Runtime (min)']

# Most-engaging movies first, renumbered 0..n-1.
sorted_movies = (
    merged_df
    .sort_values(by=['Engagement'], ascending=False)
    .reset_index(drop=True)
)

# Histogram giving a sense of the ratings of the movies I watch.
plt.hist(sorted_movies['Rating'], bins=30)
plt.xlabel('Rating')
plt.ylabel('Number of movies')

In [None]:
# First of a couple of algorithms tried for recommending similar movies:
# k-nearest neighbours over numeric features plus one-hot encoded genres.
from ast import literal_eval

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

merged_df = pd.read_csv('data/CompleteMovieDataAmby.csv')

# Remove rows with NA values.
# NOTE: `sorted_movies` is produced by the previous cell (the merged data with
# the 'Engagement' column, sorted by it), so that cell must have run first.
# The index is reset so that row labels equal row positions; the model below
# returns *positions*, and label-based lookups on a gappy index would return
# the wrong title or raise KeyError.
sortednana = sorted_movies.dropna().reset_index(drop=True)

# Extract features from the data.
titles = sortednana['Title']
duration_minutes = sortednana['Duration_minutes']
year = sortednana['Year']
genres = sortednana['Genres']
rating = sortednana['Rating']
runtime_minutes = sortednana['Runtime (min)']

# Find the unique genres.
# literal_eval parses the stringified lists without executing arbitrary code.
geval = genres.apply(literal_eval)  # string -> list of genre labels
exploded_genres = geval.explode()   # one row per (movie, genre) pair

unique_genres = exploded_genres.unique()
# Movies with an empty genre list explode to NaN. Drop NaN explicitly instead
# of slicing to a hard-coded count, which silently breaks when the data changes.
unique_genres1 = exploded_genres.dropna().unique()

# Convert the genres into binary (one-hot) features.
# Membership is tested against the parsed list, not the raw string, so that
# e.g. 'Music' no longer matches a movie whose only genre is 'Musical'.
binary_features = [
    [1 if unique_genre in movie_genres else 0 for unique_genre in unique_genres1]
    for movie_genres in geval
]

# Combine numerical features with genre features - the ones I thought would be relevant.
# NOTE(review): the columns are used unscaled, so features with larger numeric
# ranges weigh more heavily in the distance than the 0/1 genre flags.
movies_with_genres = np.column_stack((duration_minutes, rating, year, binary_features))

# Initialize and fit the NearestNeighbors model.
nn_model = NearestNeighbors(n_neighbors=4, algorithm='ball_tree')
nn_model.fit(movies_with_genres)

# Sample movie to find similar titles for - just the first movie of the dataset.
sample_movie = movies_with_genres[0]
print(titles.iloc[0])

# Find the k nearest neighbours. The query row is part of the fitted data, so
# it is typically returned as its own closest neighbour.
distances, indices = nn_model.kneighbors(sample_movie.reshape(1, -1))

# Output the titles of similar movies; `indices` holds row positions, hence .iloc.
print("Recommended movies:")
for idx in indices[0]:
    print(f"{titles.iloc[idx]}")


In [None]:
# Second recommendation approach: cosine similarity between movie feature vectors.
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the pairwise similarity matrix between movies, computed from the
# rows of `movies_with_genres` (the feature matrix built in the nearest-neighbours
# cell, which must have been run first).
similarity_matrix = cosine_similarity(movies_with_genres)

# Given a movie index 'idx', recommend top similar movies
def recommend_similar_movies(idx, similarity_matrix, top_n=5):
    """Return the positions of the ``top_n`` movies most similar to movie ``idx``.

    Parameters
    ----------
    idx : int
        Row position of the query movie in ``similarity_matrix``.
    similarity_matrix : sequence of sequences of float
        Square matrix where ``similarity_matrix[i][j]`` is the similarity
        between movies ``i`` and ``j`` (higher means more similar).
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of int
        Row positions of the most similar movies, best match first, never
        including ``idx`` itself. Ties keep their original row order.
    """
    # Similarity scores of every movie against the query movie.
    sim_scores = similarity_matrix[idx]
    n_movies = len(sim_scores)

    # Resolve a negative index to its actual row so it can be excluded below.
    query = idx + n_movies if idx < 0 else idx

    # Exclude the query movie explicitly. Simply skipping the first sorted entry
    # is wrong when another movie ties with the query's self-similarity: the
    # tie could sort ahead of it, so the query itself would be recommended and
    # a genuine match dropped.
    candidates = [i for i in range(n_movies) if i != query]
    candidates.sort(key=lambda i: sim_scores[i], reverse=True)
    return candidates[:max(top_n, 0)]

# Example usage:
movie_index = 0  # Row position of the movie for which you want to make recommendations
similar_movies = recommend_similar_movies(movie_index, similarity_matrix)

# `movie_index` and the returned values are row positions in the feature
# matrix, so look titles up by position (.iloc). Plain `titles[...]` is a
# label lookup, which returns the wrong title or raises KeyError whenever the
# Series index has gaps (e.g. after dropna()).
print(titles.iloc[movie_index])
print("Recommended movies:")
for idx in similar_movies:
    print(f"{titles.iloc[idx]}")
