In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Read the source table from disk
file_path = "mldatasetfinal numbers.csv"
data = pd.read_csv(file_path)

# Every column except "Price" is used as an (unlabeled) feature
features = data.drop("Price", axis=1)

# Replace missing entries with the mean of their column
imputer = SimpleImputer(strategy="mean")
imputer.fit(features)
features_imputed = imputer.transform(features)

# Partition the rows into k groups with k-means (seeded for repeatability)
k = 3
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit(features_imputed).labels_

# Reduce to two principal components; these are used only as plot coordinates
pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_imputed)

# Gather the 2-D coordinates and the cluster label of every row in one frame
df_pca = pd.DataFrame(
    {
        "PC1": features_pca[:, 0],
        "PC2": features_pca[:, 1],
        "Cluster": clusters,
    }
)

# Scatter the rows in PCA space, coloured by their k-means cluster
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df_pca,
    x="PC1",
    y="PC2",
    hue="Cluster",
    palette="viridis",
    legend="full",
    ax=ax,
)
ax.set_title("K-Means Clustering Visualization (PCA)")
plt.show()


In [None]:
#2
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# Load the dataset
file_path = "mldatasetfinal numbers.csv"
data = pd.read_csv(file_path)

# Every column except "Price" is used as an (unlabeled) feature
features = data.drop(columns=["Price"])  # Adjust the column name if needed

# Handle missing values by imputing with the column mean
imputer = SimpleImputer(strategy="mean")
features_imputed = imputer.fit_transform(features)

# Calculate the average Euclidean distance for a range of k values
k_values = range(1, 32)
distortions = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(features_imputed)
    # KMeans.inertia_ is the *sum of squared* distances, which is not what the
    # plot below is labelled with. Compute the mean (unsquared) Euclidean
    # distance from each sample to the centroid it was assigned to instead.
    assigned_centers = kmeans.cluster_centers_[kmeans.labels_]
    distances = np.linalg.norm(features_imputed - assigned_centers, axis=1)
    distortions.append(distances.mean())

# Plot the elbow curve: the "elbow" is where adding clusters stops paying off
plt.figure(figsize=(10, 6))
plt.plot(k_values, distortions, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Average Euclidean Distance')
plt.show()


In [None]:
#3
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.impute import SimpleImputer
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# Read the source table from disk
file_path = "mldatasetfinal numbers.csv"
data = pd.read_csv(file_path)

# Every column except "Price" is used as an (unlabeled) feature
features = data.drop("Price", axis=1)

# Replace missing entries with the mean of their column
imputer = SimpleImputer(strategy="mean")
imputer.fit(features)
features_imputed = imputer.transform(features)

# Fit Ward agglomerative clustering with no fixed cluster count.
# NOTE(review): `clusters` is not consumed by the dendrogram below, which is
# built from the scipy linkage matrix instead.
agg_cluster = AgglomerativeClustering(
    linkage='ward',
    n_clusters=None,
    distance_threshold=0,
)
clusters = agg_cluster.fit(features_imputed).labels_

# Ward linkage matrix that the dendrogram is drawn from
linkage_matrix = linkage(features_imputed, 'ward')

# Draw the dendrogram, truncated to the top levels of the merge tree
plt.figure(figsize=(15, 8))
dendrogram(
    linkage_matrix,
    p=3,
    truncate_mode='level',
    show_contracted=True,
    leaf_rotation=45,
    leaf_font_size=12,
)
plt.gca().set_title('Hierarchical Clustering Dendrogram')
plt.gca().set_xlabel('Sample Index or (Cluster Size)')
plt.gca().set_ylabel('Distance')
plt.show()



[2023-11-28 03:51:22] Features: 1/9 -- score: 0.5511450381679389
[2023-11-28 03:51:50] Features: 2/9 -- score: 0.5748091603053436
[2023-11-28 03:52:11] Features: 3/9 -- score: 0.6259541984732824
[2023-11-28 03:52:32] Features: 4/9 -- score: 0.6553435114503816
[2023-11-28 03:52:49] Features: 5/9 -- score: 0.666793893129771
[2023-11-28 03:53:03] Features: 6/9 -- score: 0.6801526717557251
[2023-11-28 03:53:14] Features: 7/9 -- score: 0.6805343511450382
[2023-11-28 03:53:23] Features: 8/9 -- score: 0.6729007633587786
[2023-11-28 03:53:28] Features: 9/9 -- score: 0.668702290076336

NameError: ignored

In [None]:
#4
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

# Load the dataset
# (the path assignment had been fused into the comment, so this cell only ran
# when `file_path` was left over from an earlier cell)
file_path = "mldatasetfinal numbers.csv"
data = pd.read_csv(file_path)

# "Price" is the class label; every other column is a candidate feature
X = data.drop(columns=["Price"])  # Features
y = data["Price"]  # Target variable

# Split BEFORE imputing so that test-set statistics never leak into training
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values: learn the column means on the training rows only,
# then apply those same means to the held-out rows
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Use a classifier (Random Forest in this case)
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Forward Sequential Selection; "best" keeps the subset with the highest
# 5-fold cross-validated accuracy
sfs = SequentialFeatureSelector(clf, k_features="best", forward=True, floating=False, verbose=2, scoring='accuracy', cv=5)

# Fit the Sequential Feature Selector to the training data
sfs.fit(X_train, y_train)

# Plot the performance of feature subsets
fig = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
# Show the figure explicitly so a later plotting cell cannot draw on top of it
plt.show()

# Print the selected feature indices
print("Selected feature indices:", sfs.k_feature_idx_)

# Transform the data to include only the selected features
X_train_selected = sfs.transform(X_train)
X_test_selected = sfs.transform(X_test)


In [None]:
#5
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Load the dataset
file_path = "mldatasetfinal numbers.csv"
data = pd.read_csv(file_path)

# Every column except "Price" is used as an (unlabeled) feature
features = data.drop(columns=["Price"])  # Adjust the column name if needed

# Handle missing values by imputing with the column mean
imputer = SimpleImputer(strategy="mean")
features_imputed = imputer.fit_transform(features)

# Perform PCA keeping every component so the full variance profile is available
pca = PCA()
features_pca = pca.fit_transform(features_imputed)

# Calculate the cumulative explained variance ratio
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

# Find the number of components needed to capture 95% of the variance.
# argmax on a boolean array returns the first True index; +1 turns that
# zero-based index into a component count.
num_components_95_percent = np.argmax(cumulative_variance_ratio >= 0.95) + 1

# Print the results
print("Number of features needed to capture 95% of data variance:", num_components_95_percent)

# Plot the cumulative explained variance on a fresh figure so it never draws
# on top of a figure left open by an earlier cell
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), cumulative_variance_ratio, marker='o')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance Ratio vs. Number of Principal Components')
plt.show()


In [None]:
#6
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset
file_path = "mldatasetfinal numbers.csv"
data = pd.read_csv(file_path)

# "Price" is the class label; every other column is a feature
X = data.drop(columns=["Price"])  # Features
y = data["Price"]  # Target variable

# Split BEFORE fitting any transformer so that nothing learned from the test
# rows (column means, principal axes) can leak into training
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values: learn the column means on the training rows only
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train_raw)
X_test_imputed = imputer.transform(X_test_raw)

# Perform PCA, keeping as many components as needed to explain 95% of the
# variance. A float n_components lets PCA pick that count itself, so this cell
# no longer depends on a variable computed in another cell.
pca = PCA(n_components=0.95, svd_solver="full")
X_train = pca.fit_transform(X_train_imputed)
X_test = pca.transform(X_test_imputed)
print("Principal components retained:", pca.n_components_)

# Use a classifier (Random Forest in this case)
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy using transformed dataset:", accuracy)
