In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import accuracy_score

# Fetch the CSV from the UCI repository into a DataFrame
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00488/Live_20210128.csv"
data = pd.read_csv(url)

# Q1: dimensions of the raw frame, reported as (rows, columns)
print("Shape of the data:", data.shape)

# Tally Nulls per column, then keep only the columns with at least one Null
null_counts = data.isnull().sum()
null_features = null_counts.loc[null_counts > 0]
# Q2: number of columns that contain Null values
print("Number of features with Null values:", len(null_features))

# Remove every column that contains Nulls
data = data.drop(null_features.index, axis=1)

# Set 'status_type' aside as the target before it is removed from the features
target = data['status_type']

# Remove the columns that are not used as model inputs
features_to_drop = ["status_id", "status_type", "status_published"]
data = data.drop(features_to_drop, axis=1)

# Q3: Column 'status_type' has _ _ _ _ (number) unique values.
# Fit the encoder first, then transform, so the learned classes can be counted.
label_encoder = LabelEncoder()
label_encoder.fit(target)
encoded_target = label_encoder.transform(target)
print("Number of unique values in 'status_type':", len(label_encoder.classes_))

# Standardise the remaining feature columns with StandardScaler
scaler = StandardScaler()
scaler.fit(data)
data_scaled = scaler.transform(data)

# Q4: Train the model using KMeans clustering (Take Random state=10)
# NOTE(review): n_init is left at the library default, which is not the same
# in every scikit-learn release; pin it explicitly if the inertia below must
# be reproducible across versions.
kmeans_2 = KMeans(n_clusters=2, random_state=10)
kmeans_2.fit(data_scaled)

# Q4: Inertia score at k=2
print("Inertia score at k=2:", kmeans_2.inertia_)

# Q5: Predict labels at k=4
kmeans_4 = KMeans(n_clusters=4, random_state=10)
kmeans_4.fit(data_scaled)
kmeans_labels = kmeans_4.labels_

# Raw element-wise agreement between cluster ids and label-encoded classes.
# Kept for backward compatibility with the original answer, but note that
# cluster ids are arbitrary: id 0 from KMeans has no relation to class 0 from
# the LabelEncoder, so this count depends on how the clusters happen to be
# numbered.
true_labels = encoded_target
num_correct_labels = np.sum(kmeans_labels == true_labels)
print("Number of labels predicted accurately at k=4:", num_correct_labels)

# Numbering-independent count: build the cluster-by-class contingency table,
# map every cluster to the class that occurs most often inside it, and compare
# the mapped labels with the true ones.
kmeans_contingency = pd.crosstab(kmeans_labels, true_labels)
kmeans_cluster_to_class = kmeans_contingency.idxmax(axis=1)
kmeans_aligned_labels = kmeans_cluster_to_class.loc[kmeans_labels].to_numpy()
num_correct_aligned = int(np.sum(kmeans_aligned_labels == true_labels))
print(
    "Number of labels predicted accurately at k=4 "
    "(clusters mapped to majority class):",
    num_correct_aligned,
)



# Q6/Q7/Q8: Train the model using Agglomerative Clustering
agg_clustering = AgglomerativeClustering(n_clusters=4, metric='euclidean', linkage='ward')
agg_labels = agg_clustering.fit_predict(data_scaled)

# Q6: Label predicted for the first row
print("Label predicted for first row:", agg_labels[0])

# Q7: Number of leaves in the hierarchical tree
print("Number of leaves in the hierarchical tree:", agg_clustering.n_leaves_)

# Q8: Accuracy of the model (percentage)
# Raw agreement between cluster ids and label-encoded classes. Kept for
# backward compatibility, but cluster ids are arbitrary, so this figure
# depends on how the clusters happen to be numbered.
accuracy_k4 = accuracy_score(true_labels, agg_labels) * 100
print("Accuracy of the model (percentage):", accuracy_k4)

# Numbering-independent accuracy: map every cluster to the class that occurs
# most often inside it (via the cluster-by-class contingency table) and score
# the mapped labels against the true ones.
agg_contingency = pd.crosstab(agg_labels, true_labels)
agg_cluster_to_class = agg_contingency.idxmax(axis=1)
agg_aligned_labels = agg_cluster_to_class.loc[agg_labels].to_numpy()
accuracy_k4_aligned = accuracy_score(true_labels, agg_aligned_labels) * 100
print(
    "Accuracy of the model after mapping clusters to majority class (percentage):",
    accuracy_k4_aligned,
)


Shape of the data: (7050, 16)
Number of features with Null values: 4
Number of unique values in 'status_type': 4
Inertia score at k=2: 48802.47810852059
Number of labels predicted accurately at k=4: 176
Label predicted for first row: 3
Number of leaves in the hierarchical tree: 7050
Accuracy of the model (percentage): 28.496453900709216
