In [1]:
import pandas as pd

In [2]:
# Read the data
movies = pd.read_csv('./movies.csv', delimiter=';')
movies.head()

Unnamed: 0,Movie,Duration (minutes),Budget (millions),Release Date,Genre,Unnamed: 5,Unnamed: 6
0,Movie A,120,100,2015,Action,,
1,Movie B,90,20,2010,Romance,,
2,Movie C,150,150,2020,Action,,
3,Movie D,85,10,2005,Romance,,
4,Movie E,140,200,2018,Action,,


Here the data in the last two columns are Nan which needs to be removed. So use the following code to read the data

In [3]:
movies = pd.read_csv('./movies.csv', delimiter=';', usecols= [0, 1, 2, 3,4])
movies.head()

Unnamed: 0,Movie,Duration (minutes),Budget (millions),Release Date,Genre
0,Movie A,120,100,2015,Action
1,Movie B,90,20,2010,Romance
2,Movie C,150,150,2020,Action
3,Movie D,85,10,2005,Romance
4,Movie E,140,200,2018,Action


Movie to classify:

Suppose we have a new movie with the following features, and we want to classify it as either "Action" or "Romance":

- Duration: 110 minutes
- Budget: 50 million dollars
- Release Date: 2016

Select the k nearest neighbors. Let's choose k = 3 for this example.

In [4]:
unknown = [0,110, 50, 2016]

One can also use drop function as follows (provided all the values are Nan)

In [5]:
movies.dropna(axis=1, how='all')

Unnamed: 0,Movie,Duration (minutes),Budget (millions),Release Date,Genre
0,Movie A,120,100,2015,Action
1,Movie B,90,20,2010,Romance
2,Movie C,150,150,2020,Action
3,Movie D,85,10,2005,Romance
4,Movie E,140,200,2018,Action
5,Movie F,95,30,2012,Romance


Let's calculate the distance between two movies

In [6]:
def distance(movie1, movie2):
  squared_difference = 0
  for i in range(len(movie1)):
    squared_difference += (movie1[i] - movie2[i]) ** 2
  final_distance = squared_difference ** 0.5
  return final_distance

For each movie in the dataset, calculate the distance to the new movie:

### Distance to Movie A:

$$
\sqrt{(120 - 110)^2 + (100 - 50)^2 + (2015 - 2016)^2} \approx \sqrt{10^2 + 50^2 + 1^2} \approx \sqrt{2601} \approx 51
$$


In [7]:
def classify(unknown, dataset, k):
  distances = []
  #Looping through all points in the dataset
  for title in dataset:
    movie = dataset[title]
    print(movie, unknown)
    distance_to_point = distance(movie, unknown)
    #Adding the distance and point associated with that distance
    distances.append([distance_to_point, title])
  distances.sort()
  #Taking only the k closest points
  neighbors = distances[0:k]
  return neighbors

In [8]:
def distance(movie1, movie2):
    squared_difference = 0
    for i in range(len(movie1)):
        squared_difference += (movie1[i] - movie2[i]) ** 2
    final_distance = squared_difference ** 0.5
    return final_distance

def classify(unknown, df, k):
    distances = []
    # Looping through all rows in the DataFrame
    for index, row in df.iterrows():
        movie = row[['Duration (minutes)', 'Budget (millions)', 'Release Date']].tolist()
        title = row['Movie']
        genre = row['Genre']
        print(movie, unknown)
        distance_to_point = distance(movie, unknown)
        # Adding the distance and point associated with that distance
        distances.append([distance_to_point, title, genre])
    distances.sort()
    # Taking only the k closest points
    neighbors = distances[:k]
    return neighbors


In [9]:
# Set the number of neighbors to consider
k = 3

# Classify the unknown movie
neighbors = classify(unknown, movies, k)

print("\nNearest Neighbors:")
for neighbor in neighbors:
    print(f"Distance: {neighbor[0]:.2f}, Movie: {neighbor[1]}, Genre: {neighbor[2]}")

[120, 100, 2015] [0, 110, 50, 2016]
[90, 20, 2010] [0, 110, 50, 2016]
[150, 150, 2020] [0, 110, 50, 2016]
[85, 10, 2005] [0, 110, 50, 2016]
[140, 200, 2018] [0, 110, 50, 2016]
[95, 30, 2012] [0, 110, 50, 2016]

Nearest Neighbors:
Distance: 1959.40, Movie: Movie D, Genre: Romance
Distance: 1964.13, Movie: Movie B, Genre: Romance
Distance: 1965.93, Movie: Movie F, Genre: Romance


It appreas that the new movie is a romantic movie.

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [16]:
# Features and labels
X = movies[['Duration (minutes)', 'Budget (millions)', 'Release Date']]
y = movies['Genre']

In [17]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the features
X_scaled = scaler.fit_transform(X)

In [18]:
# Initialize the KNeighborsClassifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the classifier
knn.fit(X_scaled, y)

In [25]:
# Define the unknown movie features
unknown_movie = [110, 50, 2016]  # Ensure it has 3 features: Duration, Budget, Release Date

# Scale the unknown movie features
unknown_movie_scaled = scaler.transform([unknown_movie])

# Predict the genre
predicted_genre = knn.predict(unknown_movie_scaled)
print(f"Predicted Genre: {predicted_genre[0]}")

# Find the nearest neighbors
distances, indices = knn.kneighbors(unknown_movie_scaled)

print("\nNearest Neighbors:")
for i in range(len(indices[0])):
    neighbor_index = indices[0][i]
    print(f"Distance: {distances[0][i]:.2f}, Movie: {movies.iloc[neighbor_index]['Title']}, Genre: {movies.iloc[neighbor_index]['Genre']}")


Predicted Genre: Romance

Nearest Neighbors:
Distance: 0.83, Genre: Action
Distance: 1.03, Genre: Romance
Distance: 1.50, Genre: Romance


