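Dataset: Use the famous Iris dataset, which is available in many machine learning libraries. This dataset consists of 150 samples of iris flowers, each belonging to one of three species: setosa, versicolor, or virginica. Note: the code cells below load `./dataset.xlsx` and refer to columns such as `Area`, `Perimeter` and `Class`, which are not Iris feature names — confirm which dataset is intended and adjust the file path and column names accordingly.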

In [None]:
import pandas as pd

# Path to the Excel workbook that holds the dataset; replace with your own file
file_path = './dataset.xlsx'

# Read the Excel file into a DataFrame
df = pd.read_excel(file_path)

# Display the first few rows of the DataFrame
print("First few rows of the DataFrame:")
print(df.head())

# Display basic statistics about the dataset
print("\nDataset Statistics:")
print(df.describe())

# Display information about the dataset, including data types and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset Information:")
df.info()


The next cell draws a pair plot (coloured by the `Class` column) to visualize relationships between features, followed by a heatmap of the correlation matrix, both computed on the first 100 rows of the dataset. Adjust the column names to match your actual dataset structure.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Work on the first 100 rows only, which keeps the pair plot quick to render.
# NOTE(review): if the file is ordered by class, head(100) may contain very few
# classes — confirm that a leading slice (rather than a random sample) is intended.
subset_df = df.head(100)

# Pair plot to visualize relationships between features, coloured by the
# 'Class' column that holds the class labels
sns.pairplot(subset_df, hue='Class')
plt.show()

# Correlation matrix heatmap.
# Correlation is only defined for numeric columns. Since pandas 2.0,
# DataFrame.corr() raises on non-numeric columns instead of silently dropping
# them, so the frame is restricted to numeric dtypes first (the label column
# is presumably non-numeric — verify against the data).
correlation_matrix = subset_df.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()


In [None]:
import pandas as pd

def handle_missing_values(df, strategy="drop"):
    """
    Handle missing values in a DataFrame and report the counts before/after.

    The input frame is never modified; a new DataFrame is returned.

    Parameters:
    - df: pandas DataFrame
    - strategy: how to treat missing values
        * "drop" (default): remove every row that contains at least one
          missing value.
        * "mean": keep all rows and fill missing values in numeric columns
          with that column's mean. Missing values in non-numeric columns are
          left untouched because a mean is not defined for them.

    Returns:
    - df_cleaned: DataFrame with missing values handled

    Raises:
    - ValueError: if `strategy` is not "drop" or "mean"
    """
    if strategy not in ("drop", "mean"):
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected 'drop' or 'mean'"
        )

    # Display the count of missing values in each column
    print("Missing Values Before Handling:")
    print(df.isnull().sum())

    if strategy == "drop":
        # Drop rows with any missing values
        df_cleaned = df.dropna()
    else:
        # numeric_only=True is required: on pandas >= 2.0, DataFrame.mean()
        # raises TypeError when the frame contains non-numeric columns
        # (e.g. a string label column).
        df_cleaned = df.fillna(df.mean(numeric_only=True))

    # Display the count of missing values after handling
    print("\nMissing Values After Handling:")
    print(df_cleaned.isnull().sum())

    return df_cleaned

# Location of the Excel workbook that holds the dataset
file_path = './dataset.xlsx'

# Load the raw data from the workbook, then derive the cleaned frame from it
original_df = pd.read_excel(io=file_path)
cleaned_df = handle_missing_values(df=original_df)


In [66]:
# train_test_split is imported here because this cell calls it; without the
# import the cell raises NameError on a fresh kernel.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Features (X) and target variable (y), taken from the cleaned DataFrame.
# 'Cluster' is dropped as well (when present) so that re-running this cell
# after the K-Means cell has added that column does not leak the cluster ids
# into the feature matrix; errors="ignore" keeps a first run working.
X = cleaned_df.drop(columns=["Class", "Cluster"], errors="ignore")
y = cleaned_df["Class"]

# Split the data into training and testing sets (80/20), stratified on the
# label so both parts keep the class proportions; the seed makes it repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [None]:
# Inspect the held-out feature rows produced by the train/test split
X_test

k-means

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics import adjusted_rand_score



# Use all feature columns for clustering. 'Cluster' is dropped too (when it
# exists) so that re-running this cell does not feed the previous run's
# cluster ids back in as a feature.
X_clustering = cleaned_df.drop(columns=["Class", "Cluster"], errors="ignore")
true_labels = cleaned_df['Class']

# NOTE(review): the features are clustered unscaled. K-Means is distance
# based, so columns with large magnitudes dominate — consider standardising
# the features first; confirm the intended behaviour.

# Implement K-Means clustering.
# NOTE(review): n_clusters=7 presumably matches the number of distinct 'Class'
# labels — verify against the data.
# n_init is set explicitly because its default changed between scikit-learn
# releases; pinning it keeps results comparable across versions.
kmeans = KMeans(n_clusters=7, n_init=10, random_state=42)
kmeans.fit(X_clustering)

# Analyze the clusters formed: store the assignment next to the data and
# compare it with the true labels
cleaned_df['Cluster'] = kmeans.labels_
ari_score = adjusted_rand_score(true_labels, cleaned_df['Cluster'])
print(f"Adjusted Rand Index: {ari_score}")

# The silhouette score must be computed on the same matrix the model was
# fitted on (X_clustering), not on a variable left over from another cell.
silhouette_avg = silhouette_score(X_clustering, cleaned_df['Cluster'])
print(f"Silhouette Score: {silhouette_avg}")

# Visualize the clusters on the two plotted features; the axis labels name
# the columns that are actually drawn
plt.scatter(X_clustering['Area'], X_clustering['Perimeter'], c=cleaned_df['Cluster'], cmap='viridis', s=50)
plt.title('K-Means Clustering')
plt.xlabel('Area')
plt.ylabel('Perimeter')
plt.show()




In [None]:


# Feature columns to compare against each other. The points are coloured by
# the 'Cluster' column of cleaned_df; rename it below if yours differs.
attributes = [
    'Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation',
    'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity', 'roundness',
    'Compactness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4',
]

# One scatter plot per unordered pair of attributes: each attribute is paired
# only with the ones listed after it, so no pair is drawn twice.
for position, x_name in enumerate(attributes):
    for y_name in attributes[position + 1:]:
        plt.scatter(
            X_clustering[x_name],
            X_clustering[y_name],
            c=cleaned_df['Cluster'],
            cmap='viridis',
            s=50,
        )
        plt.title(f'K-Means Clustering - {x_name} vs {y_name}')
        plt.xlabel(x_name)
        plt.ylabel(y_name)
        plt.show()
