In [None]:
%matplotlib inline

In [None]:
import csv

import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the IMDb movie table from a CSV file located next to the notebook
# (the path has no directory component, so it resolves against the current
# working directory).
file = 'IMDb_All_Genres_etf_clean1.csv'
df = pd.read_csv(file)

# Preview only the first rows rather than rendering the whole frame
df.head()

In [None]:
# Build the title -> value lookups directly from the columns instead of
# walking the frame with DataFrame.iterrows(), which constructs a Series for
# every row. When a title occurs on several rows the last row wins, exactly as
# with repeated dict assignment in a loop.

# Map each movie to its rating ('Rating' column)
movie_rating_map = dict(zip(df['Movie_Title'], df['Rating']))

# Map each movie to its runtime ('Runtime(Mins)' column)
movie_runtime_map = dict(zip(df['Movie_Title'], df['Runtime(Mins)']))



In [None]:
# Reference title: the cells below rank every other movie by how close its
# rating and its runtime are to this one.
target_movie = 'The Dark Knight'

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
def _distances_to_target(frame, column, target, title_column='Movie_Title'):
    """Return every movie's Euclidean distance to `target` on a single column.

    Only the distances between all rows and the target's own row(s) are
    computed, i.e. an (n_rows, k) matrix where k is the number of rows whose
    title equals `target`, instead of the full (n_rows, n_rows) matrix. When
    the title appears on several rows, each movie is scored by its distance to
    the closest of those rows.

    Parameters
    ----------
    frame : pandas.DataFrame holding `title_column` and `column`.
    column : name of the numeric column to compare on.
    target : title of the reference movie.
    title_column : name of the column holding the titles.

    Returns
    -------
    pandas.Series indexed by title and named `target`, sorted ascending.
    The target's own row(s) are included (at distance 0).

    Raises
    ------
    KeyError if no row in `title_column` equals `target`.
    """
    is_target = (frame[title_column] == target).values
    if not is_target.any():
        raise KeyError(f'{target!r} not found in column {title_column!r}')

    values = frame[column].values.reshape(-1, 1)
    # Restricting Y to the target's row(s) keeps the result at (n_rows, k).
    to_target = pairwise_distances(values, values[is_target], metric='euclidean')

    distances = pd.Series(to_target.min(axis=1), index=frame[title_column], name=target)
    # A stable sort keeps the original row order among equal distances, so the
    # ranking is deterministic.
    return distances.sort_values(ascending=True, kind='stable')


# Get the distance scores for the target movie based on rating
distances_rating = _distances_to_target(df, 'Rating', target_movie)

# Get the distance scores for the target movie based on runtime
distances_runtime = _distances_to_target(df, 'Runtime(Mins)', target_movie)

# Exclude the target movie itself from the candidate lists
similar_movies_rating = distances_rating.drop(target_movie)
similar_movies_runtime = distances_runtime.drop(target_movie)

# Print the top 10 similar movies based on rating
top_similar_movies_rating = similar_movies_rating.head(10)
print(top_similar_movies_rating)

# Print the top 10 similar movies based on runtime
top_similar_movies_runtime = similar_movies_runtime.head(10)
print(top_similar_movies_runtime)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Select the features for clustering (ratings and runtime)
# NOTE(review): the features are used un-scaled. Presumably ratings span a
# much narrower numeric range than runtimes in minutes, in which case runtime
# dominates the Euclidean distance used by k-means — confirm and consider
# standardising the columns first.
X = df[['Rating', 'Runtime(Mins)']].values

# Set the number of clusters
num_clusters = 12

# Fixed seed so the cluster assignment and the sampled titles are the same on
# every Restart & Run All.
RANDOM_STATE = 42

# Initialize KMeans model (n_init is pinned so the number of restarts does not
# depend on the installed scikit-learn version's default)
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=RANDOM_STATE)

# Fit the model to the data
kmeans.fit(X)

# Get the cluster labels for each movie and store them on the frame; the
# per-cluster report below selects rows through df['Cluster'].
cluster_labels = kmeans.labels_
df['Cluster'] = cluster_labels

# Visualize the clusters
fig, ax = plt.subplots(figsize=(10, 6))
for cluster in range(num_clusters):
    cluster_data = X[cluster_labels == cluster]
    ax.scatter(cluster_data[:, 0], cluster_data[:, 1], label=f'Cluster {cluster+1}')

ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], marker='x', color='black', label='Centroids')
ax.set_xlabel('Rating')
ax.set_ylabel('Runtime(Mins)')
ax.set_title('Clustering Movies based on Ratings and Runtime')
ax.legend()
plt.show()

# Print the size of each cluster and a random handful of its movies
max_examples = 5
for cluster in range(num_clusters):
    cluster_movies = df.loc[df['Cluster'] == cluster, 'Movie_Title']
    print(f'Cluster {cluster+1}: Size - {cluster_movies.shape[0]}')
    # Series.sample raises when asked for more rows than exist, so cap the
    # sample size at the cluster size.
    examples = cluster_movies.sample(min(max_examples, len(cluster_movies)), random_state=RANDOM_STATE)
    print(", ".join(examples))