In [None]:
#import statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans



# Loading and Exploring the Dataset

In [None]:
# Read the Netflix userbase CSV from the Kaggle input directory into a DataFrame
csv_path = '/kaggle/input/netflix-userbase-dataset/Netflix Userbase.csv'
data = pd.read_csv(csv_path)

# Preview the top rows of the frame as the cell's output
data.head()


# Data Preparation

In [None]:
# Clustering inputs: two numeric columns plus the 'Device' column, which is
# one-hot encoded (drop_first=True leaves out one dummy level)
feature_columns = ['Age', 'Monthly Revenue', 'Device']
features = data[feature_columns]
features_encoded = pd.get_dummies(features, columns=['Device'], drop_first=True)

# Standardize every column of the encoded matrix; the fitted scaler is kept
# in `scaler`
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_encoded)


# Dimensionality Reduction with PCA

In [None]:
# Apply PCA to reduce data to 2 dimensions.
# `pca` keeps the fitted model; `features_pca` is the projection of the
# standardized features onto the two components, which the K-means and
# plotting cells consume.
pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_scaled)


# K-means Clustering

In [None]:
# Apply K-means clustering with k=4 on the 2-D PCA projection.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and emits a FutureWarning in 1.2/1.3 when it is left
# unset), so relying on the default makes the cluster assignments depend on
# the installed version. random_state fixes the centroid initialisation.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
clusters = kmeans.fit_predict(features_pca)


# Visualizing the Clusters

In [None]:
# Visualize the clusters in the 2-D PCA space using the explicit Axes interface
fig, ax = plt.subplots(figsize=(10, 6))

# Data points, coloured by their K-means cluster label
points = ax.scatter(features_pca[:, 0], features_pca[:, 1], c=clusters, cmap='viridis', s=50)

# Cluster centroids drawn on top as large red X markers
centers = kmeans.cluster_centers_
center_marks = ax.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75, marker='X')

# Legend: one entry per cluster label plus the centroid marker, so the colours
# can be matched to the cluster IDs used in the profiling table
handles, labels = points.legend_elements()
ax.legend(handles + [center_marks], labels + ['Centroid'], title='Cluster')

ax.set_title('Clusters in PCA Reduced Space')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
plt.show()


# Profiling the Clusters

In [None]:
# Attach each row's cluster label to the original frame
data['Cluster'] = clusters


def most_common(values):
    """Return the most frequent value in `values` (top entry of value_counts)."""
    return values.value_counts().index[0]


# Per-cluster summary: every output column is named directly and tied to its
# source column and aggregation function
cluster_profile = (
    data.groupby('Cluster')
    .agg(**{
        'Mean Age': ('Age', 'mean'),
        'Median Age': ('Age', 'median'),
        'Age Std. Dev.': ('Age', 'std'),
        'Mean Monthly Revenue': ('Monthly Revenue', 'mean'),
        'Median Monthly Revenue': ('Monthly Revenue', 'median'),
        'Monthly Revenue Std. Dev.': ('Monthly Revenue', 'std'),
        'Most Common Device': ('Device', most_common),
        'Most Common Subscription Type': ('Subscription Type', most_common),
        'Most Common Country': ('Country', most_common),
    })
    .reset_index()
)

cluster_profile


# Interpretation

**Cluster 0:**
* Age: Users in this cluster have an average age of approximately 38.7 years with a median age of 39. The age distribution is relatively tight, with a standard deviation of around 7.34 years.
* Monthly Revenue: These users contribute an average monthly revenue of about \$12.48, with a median value of \$13.
* Device: The dominant device used by this segment is a Smart TV, suggesting they prefer larger screens for content consumption.
* Subscription Type: The Basic subscription is the most popular among these users.
* Country: A significant portion of users in this segment are from Spain.


**Cluster 1:**
* Age: Users here have an average age of approximately 39 years, with a median also at 39, and a standard deviation around 7.11 years.
* Monthly Revenue: On average, they contribute slightly higher revenue compared to Cluster 0, with a mean of \$12.57 and a median of \$13.
* Device: This segment predominantly uses Laptops for accessing content.
* Subscription Type: Just like Cluster 0, the Basic subscription type is the most popular among these users.
* Country: What differentiates this cluster significantly is the geography; most of these users are from the United States.


**Cluster 2:**
* Age: The average age of users is roughly 38.7 years with a median age of 39.
* Monthly Revenue: They generate an average monthly revenue of about \$12.38, with a median of \$12, slightly lower than in the other clusters.
* Device: Users in this segment prefer to access content via their Smartphones.
* Subscription Type: The dominant subscription type remains Basic.
* Country: Spain is the predominant country for this cluster, similar to Cluster 0.


**Cluster 3:**
* Age: This cluster's age profile is very similar to the others, with an average age of about 38.7 years and a median of 39.
* Monthly Revenue: Users in this segment bring in an average monthly revenue of around \$12.59, with a median of \$13.
* Device: Tablets are the go-to device for content consumption among these users.
* Subscription Type: The Basic subscription type continues to be the most popular.
* Country: Spain is the primary country for this cluster as well.

# Conclusion

**Device Preference** 
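* Each cluster predominantly uses a different device: Smart TVs, Laptops, Smartphones, and Tablets. Because Device was one of the three features used for clustering, this separation is partly by construction, but it is still valuable for optimizing the user interface and experience for each device type.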

**Geography**
* While Spain dominates three of the clusters, the United States stands out in Cluster 1, which primarily uses laptops. This geographic distinction can guide region-specific content recommendations or marketing strategies.

**Subscription Type & Revenue**
* Even though the Basic subscription is dominant across all clusters, there are subtle differences in average monthly revenue (cluster means range from about \$12.38 to \$12.59), which may reflect variations in additional services or content consumption patterns.