In [8]:
import numpy as np # numpy for numerical data
import pandas as pd #pandas for dataframes (structured data)

# Seed NumPy's global RNG so the synthetic data below can be reproduced exactly
np.random.seed(42)

N_USERS = 10  # number of synthetic users (rows)

# Generate a dataset with one row per user and these attributes (columns):
#   age                          - the user's age
#   preference_1 .. preference_4 - how much the user likes each of four topics
# The dict values are evaluated top to bottom, so for a given seed the draw
# order below (age, then preference_1..4) determines each column's values.
data = pd.DataFrame({
    'user_id': range(1, N_USERS + 1),                 # user IDs 1..N_USERS
    'age': np.random.randint(18, 45, size=N_USERS),   # integer ages 18..44 (randint's upper bound is exclusive)
    'preference_1': np.random.rand(N_USERS),          # random preference in [0, 1) for topic 1
    'preference_2': np.random.rand(N_USERS),          # random preference in [0, 1) for topic 2
    'preference_3': np.random.rand(N_USERS),          # random preference in [0, 1) for topic 3
    'preference_4': np.random.rand(N_USERS)           # random preference in [0, 1) for topic 4
}).set_index('user_id')  # index by user_id: each row corresponds to a unique user

# Show the first 5 rows of the generated dataset. Printed explicitly because a
# bare expression is only rendered when it is the last statement of a cell.
print(data.head())

# Normalizing the dataset: rescale every feature to the 0-1 range so that age
# is on the same scale as the preferences
from sklearn.preprocessing import MinMaxScaler

# Columns to normalize (age and the four preferences)
feature_columns = ['age', 'preference_1', 'preference_2', 'preference_3', 'preference_4']
features = data[feature_columns]

# MinMaxScaler scales each feature to a range between 0 and 1
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

# Convert the scaled values back into a DataFrame, re-attaching the original
# column names and the user_id index
scaled_data = pd.DataFrame(scaled_features, columns=features.columns, index=data.index)

# Show the first 5 rows of the normalized dataset. Printed explicitly because a
# bare expression is only rendered when it is the last statement of a cell.
print(scaled_data.head())

from sklearn.cluster import KMeans

# Defining the K-Means model.
# We group the users into 3 clusters for simplicity. n_init is set explicitly
# because its default differs between scikit-learn releases (10 vs. 'auto');
# pinning it keeps the clustering consistent across versions and avoids the
# FutureWarning emitted by releases in the transition window.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit the model on the scaled data and get each user's cluster label in one step
clusters = kmeans.fit_predict(scaled_data)

# Adding the cluster labels to the original (unscaled) DataFrame
data['cluster'] = clusters

# Show the first 5 rows of the updated DataFrame. Printed explicitly because a
# bare expression is only rendered when it is the last statement of a cell.
print(data[['age', 'preference_1', 'preference_2', 'preference_3', 'preference_4', 'cluster']].head())


# Per-cluster mean of every remaining column (age and the four preferences),
# indexed by cluster label
cluster_means = data.groupby('cluster').mean()
print(cluster_means)


def recommend_items(user_data: "pd.Series", cluster_means: "pd.DataFrame"):
    """Return the average profile of the cluster a user belongs to.

    Args:
        user_data: A mapping-like row for one user; must provide a
            'cluster' entry holding the user's cluster label.
        cluster_means: Table of per-cluster averages whose index contains
            the cluster labels.

    Returns:
        The row of ``cluster_means`` selected by the user's cluster label.

    Raises:
        KeyError: If 'cluster' is missing from ``user_data`` or the label
            is not present in the index of ``cluster_means``.
    """
    # Look up the user's cluster label and select that cluster's averages
    return cluster_means.loc[user_data['cluster']]

# Let's say we want to recommend items for the first user in the DataFrame
# (positional row 0)
user_to_recommend = data.iloc[0]

# Get recommendations for this user
recommendations = recommend_items(user_to_recommend, cluster_means)
# The row's .name is its index label, i.e. the user_id, so the message
# identifies the actual user instead of a hard-coded number
print(f"Recommendations for User {user_to_recommend.name}:\n{recommendations}")

import matplotlib.pyplot as plt

# Visualizing the clusters: users plotted by their first two preferences,
# colored by assigned cluster label
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    data['preference_1'],
    data['preference_2'],
    c=data['cluster'],
    cmap='viridis',
    marker='o',
)
ax.set_title('User Clustering')
ax.set_xlabel('Preference 1')
ax.set_ylabel('Preference 2')
fig.colorbar(points, ax=ax, label='Cluster')
ax.grid()
plt.show()


ModuleNotFoundError: No module named 'pandas'