In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings

import matplotlib.pyplot as plt

# Load the Iris dataset bunch once so that later cells can read `iris` directly.
iris = datasets.load_iris()

# Install a global "ignore" filter: every warning raised after this point is hidden.
# NOTE(review): this also hides library deprecation/convergence warnings — confirm
# that blanket suppression is really intended for this notebook.
warnings.filterwarnings("ignore")

In [2]:
# ---------------------------------------------------------------------------
# Data preparation: unpack the Iris bunch (loaded in the setup cell) and build
# a DataFrame holding the measurement columns plus the integer species code.
# ---------------------------------------------------------------------------
X, y = iris.data, iris.target
feature_names, target_names = iris.feature_names, iris.target_names

df = pd.DataFrame(X, columns=feature_names).assign(species=y)

# ---------------------------------------------------------------------------
# Linear regression: predict petal length from the remaining measurement
# columns (the species code is excluded from the predictors).
# ---------------------------------------------------------------------------
y_linear = df['petal length (cm)']
X_linear = df.drop(columns=['petal length (cm)', 'species'])

# 70/30 train/test split with a fixed seed so the reported score is repeatable.
linear_split = train_test_split(X_linear, y_linear, test_size=0.3, random_state=42)
X_train_linear, X_test_linear, y_train_linear, y_test_linear = linear_split

linear_model = LinearRegression().fit(X_train_linear, y_train_linear)
y_pred_linear = linear_model.predict(X_test_linear)
mse = mean_squared_error(y_test_linear, y_pred_linear)
print("Linear Regression")
print("Mean Squared Error:", mse)

# ---------------------------------------------------------------------------
# Logistic regression: binary target that is 1 where the species code equals 2
# (the class this notebook calls "virginica") and 0 otherwise.
# ---------------------------------------------------------------------------
df['is_virginica'] = (df['species'] == 2).astype(int)
y_logistic = df['is_virginica']
# Predictors are every column except the raw species code and the derived target.
X_logistic = df.drop(columns=['species', 'is_virginica'])

logistic_split = train_test_split(X_logistic, y_logistic, test_size=0.3, random_state=42)
X_train_logistic, X_test_logistic, y_train_logistic, y_test_logistic = logistic_split

logistic_model = LogisticRegression().fit(X_train_logistic, y_train_logistic)
y_pred_logistic = logistic_model.predict(X_test_logistic)
accuracy = accuracy_score(y_test_logistic, y_pred_logistic)
print("\nLogistic Regression")
print("Accuracy:", accuracy)

Linear Regression
Mean Squared Error: 0.10913071951125929

Logistic Regression
Accuracy: 1.0


In [None]:
# K-means clustering of the Iris measurements, choosing k by average silhouette score.
#
# The data is reloaded here so the cell starts from a clean `df` and does not
# depend on columns added by earlier cells.
iris = datasets.load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names
target_names = iris.target_names

df = pd.DataFrame(X, columns=feature_names)
df['species'] = y

# Candidate cluster counts: k = 2 .. 10 (silhouette is undefined for a single cluster).
range_n_clusters = list(range(2, 11))
silhouette_avg_scores = []
# Keep each fitted estimator so the winning one can be reused instead of refit.
fitted_models = {}

for n_clusters in range_n_clusters:
    # n_init is set explicitly: its default differs between scikit-learn releases,
    # so relying on the default makes the scores (and the chosen k) version-dependent.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette_avg_scores.append(silhouette_avg)
    fitted_models[n_clusters] = kmeans
    print(f"For n_clusters = {n_clusters}, the average silhouette score is {silhouette_avg}")

# Silhouette score as a function of k.
plt.figure(figsize=(10, 6))
plt.plot(range_n_clusters, silhouette_avg_scores, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Average Silhouette Score')
plt.title('Silhouette Method for Optimal k')
plt.show()

# Pick the k with the highest average silhouette score (first one wins on ties,
# because np.argmax returns the first maximal index).
best_k = range_n_clusters[np.argmax(silhouette_avg_scores)]
print(f"Best number of clusters: {best_k}")

# Reuse the estimator already fitted for best_k: refitting with the same data,
# k and random_state would only repeat the same computation.
kmeans = fitted_models[best_k]
clusters = kmeans.labels_
df['cluster'] = clusters


# Scatter of the first two feature columns, coloured by assigned cluster.
# Clustering itself used every column of X; only two are shown here.
plt.figure(figsize=(12, 6))
for cluster in range(best_k):
    in_cluster = clusters == cluster
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], label=f'Cluster {cluster + 1}')
# Label the axes with the real column names rather than generic placeholders.
plt.xlabel(feature_names[0])
plt.ylabel(feature_names[1])
plt.title('K-means Clustering on Iris Dataset')
plt.legend()
plt.show()