#  Mini Project 1
### Operations performed
- Loaded the collected data from a CSV file
- Applied **K-Means clustering** to label the data
- Trained **SVM** and **Decision Tree** classifiers
- Visualized the results and confusion matrices
- Saved the labeled dataset and output plots

## First, I imported the required libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import train_test_split

# Ensure the folder that receives the saved figures exists. exist_ok=True
# makes re-running this cell a no-op instead of raising FileExistsError.
os.makedirs(name="outputs", exist_ok=True)

## Step 1: Then I loaded my dataset

In [None]:
# Read the collected data into a DataFrame. The file name is resolved
# relative to the current working directory, so point it at your own CSV if
# it is named or located differently.
df = pd.read_csv(filepath_or_buffer="stroop_test_data.csv")

# Show the first five rows as a quick sanity check of what was loaded.
df.head(n=5)

## Step 2: Then I performed feature selection & scaling

In [None]:
# The two columns used as inputs for both clustering and classification.
feature_columns = ["Accuracy_Percent", "Avg_Reaction_Time_Seconds"]
X = df[feature_columns]

# Standardize the features: learn the per-column statistics first, then
# apply them to the same data.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

## Step 3: Then I applied K-Means clustering

In [None]:
# Partition the standardized samples into two clusters. random_state makes
# the result reproducible and n_init=10 runs ten initialisations.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(X_scaled)

# Store each row's cluster index in the DataFrame as its label.
df["Cluster_Label"] = kmeans.labels_

# Preview the data together with the new label column.
df.head(n=5)

## Step 4: Then I plotted the clustered data to visualize it

In [None]:
# Draw the two features against each other, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(
    data=df,
    x="Accuracy_Percent",
    y="Avg_Reaction_Time_Seconds",
    hue="Cluster_Label",
    palette='viridis',
    s=80,
    ax=ax,
)
ax.set_title("Clustered Data")

# Write the figure to disk before showing it.
fig.savefig("outputs/Data_Visualize.jpg")
plt.show()

## Step 5: After that, I performed the train/test split

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the
# partition reproducible between runs.
split_parts = train_test_split(
    X_scaled,
    df['Cluster_Label'],
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Train size: {len(X_train)} | Test size: {len(X_test)}")

## Step 6: Then I trained the SVM classifier

In [None]:
# Fit a linear-kernel SVM on the training split and predict the held-out rows.
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)
pred_svm = svm.predict(X_test)

# Confusion Matrix for SVM
# Pass the complete set of cluster ids explicitly. Without `labels`,
# confusion_matrix only includes classes that occur in y_test/pred_svm, and
# without `display_labels` the plot numbers its axes 0..n-1, so a test split
# that happens to miss a cluster would yield a smaller, mislabelled matrix.
cluster_ids = sorted(df["Cluster_Label"].unique())
cm_svm = confusion_matrix(y_test, pred_svm, labels=cluster_ids)
ConfusionMatrixDisplay(confusion_matrix=cm_svm,
                       display_labels=cluster_ids).plot()
plt.title("Confusion Matrix - SVM")
# Save before show() so the written file contains the rendered figure.
plt.savefig("outputs/Confusion_Matrix_SVM.jpg")
plt.show()

print("✅ SVM Accuracy:", accuracy_score(y_test, pred_svm))

## Step 7: Then I trained the Decision Tree classifier

In [None]:
# Fit a decision tree on the training split and predict the held-out rows.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
pred_dt = dt.predict(X_test)

# Confusion Matrix for Decision Tree
# Pass the complete set of cluster ids explicitly. Without `labels`,
# confusion_matrix only includes classes that occur in y_test/pred_dt, and
# without `display_labels` the plot numbers its axes 0..n-1, so a test split
# that happens to miss a cluster would yield a smaller, mislabelled matrix.
cluster_ids = sorted(df["Cluster_Label"].unique())
cm_dt = confusion_matrix(y_test, pred_dt, labels=cluster_ids)
ConfusionMatrixDisplay(confusion_matrix=cm_dt,
                       display_labels=cluster_ids).plot()
plt.title("Confusion Matrix - Decision Tree")
# Save before show() so the written file contains the rendered figure.
plt.savefig("outputs/Confusion_Matrix_DT.jpg")
plt.show()

print("✅ Decision Tree Accuracy:", accuracy_score(y_test, pred_dt))

## Step 8: Then I plotted the Decision Tree to visualize it

In [None]:
# Render the fitted decision tree. Feature names are listed in the same
# column order that was used to build the training matrix.
plt.figure(figsize=(10, 6))
plot_tree(dt,
          feature_names=["Accuracy_Percent", "Avg_Reaction_Time_Seconds"],
          # Derive the class names from the classes the tree was actually
          # fitted on (ascending order) instead of hard-coding two of them.
          class_names=[f"Cluster {label}" for label in dt.classes_],
          filled=True)
plt.title("Decision Tree")
# Save this figure alongside the other output plots; it was previously the
# only plot that was displayed but never written to the outputs folder.
plt.savefig("outputs/Decision_Tree.jpg")
plt.show()

## Step 9: Finally, I saved the labeled dataset

In [None]:
# Write the DataFrame, including the Cluster_Label column, to a new CSV.
# index=False keeps the row index out of the file.
df.to_csv(path_or_buf="stroop_test_data_labeled.csv", index=False)

# Confirm where the labelled data was written.
print("Labeled data saved as stroop_test_data_labeled.csv")