In [8]:
# Import the modules
import pandas as pd
from pathlib import Path
import hvplot.pandas
from sklearn.cluster import KMeans
from pathlib import Path
import requests
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
#Define the URL for the movies list API
url="http://127.0.0.1:5000/api/v1.0/movies_list"

In [3]:
# Send a GET request to the API to retrieve the movie list
response=requests.get(url)

In [4]:
# Checking if the request was successful
if response.status_code == 200:
    # Convert the JSON response to a pandas DataFrame
    movies_combined_df = pd.DataFrame(response.json())
else:
    print("Failed to retrieve data:", response.status_code)

In [5]:
# Display the first few rows of the combined movies DataFrame
movies_combined_df.head()

Unnamed: 0,cleaned_genres,movieId,poster_path,rating,timestamp,title,userId
0,Action|Crime|Drama|Thriller,949,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,4.0,956598942,Heat,102
1,Action|Crime|Drama|Thriller,949,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,4.0,942345464,Heat,363
2,Action|Crime|Drama|Thriller,949,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,4.5,1133735550,Heat,452
3,Action|Crime|Drama|Thriller,949,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,3.5,1340405089,Heat,505
4,Action|Crime|Drama|Thriller,949,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,3.0,879503053,Heat,537


In [6]:
# Extracting movies-related data into a new DataFrame
movies_df = movies_combined_df[['movieId', 'title', 'cleaned_genres','rating','userId']].copy()
movies_df.head()

Unnamed: 0,movieId,title,cleaned_genres,rating,userId
0,949,Heat,Action|Crime|Drama|Thriller,4.0,102
1,949,Heat,Action|Crime|Drama|Thriller,4.0,363
2,949,Heat,Action|Crime|Drama|Thriller,4.5,452
3,949,Heat,Action|Crime|Drama|Thriller,3.5,505
4,949,Heat,Action|Crime|Drama|Thriller,3.0,537


In [7]:
# Transform the extracted_genres column using get_dummies
genres_dummies = pd.get_dummies(movies_df["cleaned_genres"])

# Display the transformed data
genres_dummies.tail()

Unnamed: 0,Unnamed: 1,Action,Action|Adventure,Action|Adventure|Animation|Fantasy,Action|Adventure|Comedy,Action|Adventure|Comedy|Crime|Drama|Thriller,Action|Adventure|Comedy|Crime|Thriller,Action|Adventure|Comedy|Drama,Action|Adventure|Comedy|Drama|Thriller,Action|Adventure|Comedy|Drama|Western,...,War|Drama,War|Drama|History,War|Drama|History|Adventure|Romance|Thriller,War|Drama|Romance,War|Drama|Thriller,War|History|Action|Adventure|Drama|Romance,Western,Western|Adventure,Western|Adventure|Romance,Western|Thriller
42995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Scale 
movie_data_scaled = StandardScaler().fit_transform(
   movies_df[["rating"]]
)

In [11]:
movie_data_scaled=pd.DataFrame(movie_data_scaled,
                               columns=['rating'])
movie_data_scaled.head()

Unnamed: 0,rating
0,0.421278
1,0.421278
2,0.895752
3,-0.053196
4,-0.52767


In [12]:
movie_data_scaled=pd.concat([movie_data_scaled,genres_dummies], axis=1)
movie_data_scaled.head()

Unnamed: 0,rating,Unnamed: 2,Action,Action|Adventure,Action|Adventure|Animation|Fantasy,Action|Adventure|Comedy,Action|Adventure|Comedy|Crime|Drama|Thriller,Action|Adventure|Comedy|Crime|Thriller,Action|Adventure|Comedy|Drama,Action|Adventure|Comedy|Drama|Thriller,...,War|Drama,War|Drama|History,War|Drama|History|Adventure|Romance|Thriller,War|Drama|Romance,War|Drama|Thriller,War|History|Action|Adventure|Drama|Romance,Western,Western|Adventure,Western|Adventure|Romance,Western|Thriller
0,0.421278,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.421278,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.895752,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.053196,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.52767,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Import the PCA module
from sklearn.decomposition import PCA

In [25]:
# Instantiate the PCA instance and declare the number of PCA variables
pca=PCA(n_components=2)

In [26]:
# Fit the PCA model on the transformed credit card DataFrame
genres_pca = pca.fit_transform(movie_data_scaled)

# Review the first 5 rows of the array of list data
genres_pca[:5]

array([[-0.42223269, -0.09709887],
       [-0.42223269, -0.09709887],
       [-0.89662538, -0.09449521],
       [ 0.05215999, -0.09970253],
       [ 0.52655267, -0.1023062 ]])

In [27]:
# Calculate the PCA explained variance ratio
pca.explained_variance_ratio_

array([0.50512181, 0.04759765])

In [28]:
# Create the PCA DataFrame
genres_pca_df = pd.DataFrame(
    genres_pca,
    columns=["PCA1", "PCA2"]
)

# Review the PCA DataFrame
genres_pca_df.head()

Unnamed: 0,PCA1,PCA2
0,-0.422233,-0.097099
1,-0.422233,-0.097099
2,-0.896625,-0.094495
3,0.05216,-0.099703
4,0.526553,-0.102306


In [29]:
# Import the KMeans module from SKLearn
from sklearn.cluster import KMeans

In [30]:
# Create a a list to store inertia values and the values of k
inertia = []
k = list(range(1, 11))

In [31]:
# Create a for-loop where each value of k is evaluated using the K-means algorithm
# Fit the model using the service_ratings DataFrame
# Append the value of the computed inertia from the `inertia_` attribute of the KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(genres_pca_df)
    inertia.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [32]:
# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()

Unnamed: 0,k,inertia
0,1,47067.749941
1,2,18175.051193
2,3,10657.084307
3,4,6827.085253
4,5,4904.367089


In [33]:
# Plot the DataFrame
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

In [34]:
# Define the model with 3 clusters
model = KMeans(n_clusters=4, random_state=3)

# Fit the model
model.fit(genres_pca_df)

# Make predictions
k_4 = model.predict(genres_pca_df)

# Create a copy of the preprocessed data
genresinfo_predictions_df = genres_pca_df.copy()

# Add a class column with the labels
genresinfo_predictions_df['genres_segments'] = k_4

  super()._check_params_vs_input(X, default_n_init=10)


In [36]:
# Plot the clusters
genresinfo_predictions_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="rating"
)

DataError: Supplied data does not contain specified dimensions, the following dimensions were not found: ['rating']

PandasInterface expects tabular data, for more information on supported datatypes see http://holoviews.org/user_guide/Tabular_Datasets.html