# Task 1: Exploratory Data Analysis (EDA)

In [5]:
# Step 1: Load the Mushroom dataset
import pandas as pd

In [6]:
mushroom_data = pd.read_csv("mushroom.csv")

In [7]:
# Step 2: Data Exploration
print(mushroom_data.head())  # Display the first few rows of the dataset

   Unnamed: 0 cap_shape cap_surface cap_color bruises   odor gill_attachment  \
0        1167    sunken       scaly     white      no  anise      descending   
1        1037    sunken     fibrous       red      no  anise         notched   
2         309      flat     grooves    purple     yes   foul      descending   
3         282      bell       scaly      pink     yes  fishy         notched   
4         820      flat      smooth    yellow     yes  musty            free   

  gill_spacing gill_size gill_color  ...  veil_type veil_color ring_number  \
0      distant     broad       pink  ...    partial      brown         two   
1      crowded    narrow  chocolate  ...  universal      brown         two   
2      crowded     broad     purple  ...  universal     yellow         two   
3        close     broad     orange  ...    partial     yellow         two   
4      crowded    narrow     orange  ...  universal      white        none   

   ring_type spore_print_color population  habitat

In [None]:
print(mushroom_data.describe())  # Get basic statistics of the dataset

In [None]:
# Step 3: Feature Distributions
import matplotlib.pyplot as plt

In [None]:
mushroom_data.hist(figsize=(10, 10))  # Plot histograms for all features
plt.show()

# Task 2: Data Preprocessing

In [8]:
# Step 1: Encode Categorical Variables
from sklearn.preprocessing import LabelEncoder

In [9]:
label_encoder = LabelEncoder()
for column in mushroom_data.columns:
    mushroom_data[column] = label_encoder.fit_transform(mushroom_data[column])

In [10]:
mushroom_data.head()

Unnamed: 0.1,Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat,class,stalk_height,cap_diameter
0,889,4,2,7,0,1,1,2,0,7,...,0,0,2,6,3,1,5,1,916,190
1,786,4,0,6,0,1,3,1,1,3,...,1,0,2,6,1,2,5,0,128,750
2,246,3,1,5,1,4,1,1,0,8,...,1,3,2,6,6,0,5,1,537,281
3,221,0,2,4,1,3,3,0,0,6,...,0,3,2,0,4,1,0,1,234,829
4,630,3,3,8,1,5,2,1,1,6,...,1,2,0,4,8,1,4,1,895,484


In [11]:
# Step 2: Split Dataset into Training and Testing Sets
from sklearn.model_selection import train_test_split

In [12]:
X = mushroom_data.drop('class', axis=1)
y = mushroom_data['class']

In [13]:
X

Unnamed: 0.1,Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat,stalk_height,cap_diameter
0,889,4,2,7,0,1,1,2,0,7,...,8,0,0,2,6,3,1,5,916,190
1,786,4,0,6,0,1,3,1,1,3,...,0,1,0,2,6,1,2,5,128,750
2,246,3,1,5,1,4,1,1,0,8,...,3,1,3,2,6,6,0,5,537,281
3,221,0,2,4,1,3,3,0,0,6,...,5,0,3,2,0,4,1,0,234,829
4,630,3,3,8,1,5,2,1,1,6,...,7,1,2,0,4,8,1,4,895,484
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,656,4,2,0,1,4,0,1,1,3,...,4,0,1,1,2,6,0,1,283,680
1996,885,4,2,5,1,5,0,2,1,9,...,4,0,2,1,7,8,2,5,335,521
1997,205,2,1,3,1,3,3,0,1,7,...,6,0,0,2,1,8,0,6,859,438
1998,329,0,3,7,0,7,2,0,1,6,...,5,1,3,1,0,1,5,6,679,320


In [14]:
y

0       1
1       0
2       1
3       1
4       1
       ..
1995    1
1996    1
1997    1
1998    1
1999    1
Name: class, Length: 2000, dtype: int32

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Task 3: Data Visualization

In [None]:
# Step 1: Visualize Feature Distributions and Relationships
import seaborn as sns

sns.pairplot(mushroom_data)
plt.show()

In [None]:
# Step 2: Visualize Class Distribution
sns.countplot(x='class', data=mushroom_data)
plt.show()

# Task 4: SVM Implementation

In [None]:
# Step 1: Implement Basic SVM Classifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [None]:
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

In [None]:
# Step 2: Model Evaluation
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Task 5: Visualization of SVM Results

In [None]:
# Visualize Classification Results
sns.scatterplot(x=X_test.iloc[:, 0], y=X_test.iloc[:, 1], hue=y_pred)
plt.title('SVM Classification Results')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

In [None]:
# Step 1: Experiment with different SVM hyperparameters
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf', 'poly', 'linear']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(X_train, y_train)

In [None]:
print("Best Parameters:", grid.best_params_)
print("Best Estimator:", grid.best_estimator_)

In [None]:
# Retrain SVM with best parameters
best_svm_model = grid.best_estimator_
best_svm_model.fit(X_train, y_train)

In [None]:
# Evaluate model performance
y_pred_best = best_svm_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print("Accuracy (Best Parameters):", accuracy_best)
print("Classification Report (Best Parameters):\n", classification_report(y_test, y_pred_best))

In [None]:
# Step 1: Compare SVM performance with different kernels
svm_kernels = ['linear', 'poly', 'rbf']
for kernel in svm_kernels:
    svm_model = SVC(kernel=kernel)
    svm_model.fit(X_train, y_train)
    y_pred_kernel = svm_model.predict(X_test)
    accuracy_kernel = accuracy_score(y_test, y_pred_kernel)
    print("Kernel:", kernel)
    print("Accuracy:", accuracy_kernel)
    print("Classification Report:\n", classification_report(y_test, y_pred_kernel))
    print()

In [None]:
# Step 2: Analyze SVM strengths and weaknesses
# - Linear kernel might perform well with high-dimensional datasets like the Mushroom dataset.
# - Polynomial and RBF kernels might capture more complex relationships but could be prone to overfitting.

# Step 3: Discuss practical implications
# - SVM can be used for classification tasks where the decision boundary is not necessarily linear.
# - SVM's ability to handle high-dimensional data and nonlinear relationships makes it suitable for various real-world applications, including image recognition, text classification, and bioinformatics.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Sigmoid function definition
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Simulate weight range (feature) for fruits
weights = np.linspace(100, 200, 100)  # grams
color = 1  # let's say it's red (binary feature)

# Let's assume logistic regression weights for weight and color
w_weight = 0.05
w_color = 0.8
bias = -9

# Compute z and sigmoid for each weight
z = w_weight * weights + w_color * color + bias
probabilities = sigmoid(z)

# Plot the sigmoid curve
plt.figure(figsize=(10, 6))
plt.plot(weights, probabilities, label="P(Fruit is Apple | Weight)", color="green")
plt.axhline(y=0.5, color='red', linestyle='--', label='Decision Threshold = 0.5')
plt.xlabel("Fruit Weight (grams)")
plt.ylabel("Probability of being Apple")
plt.title("Logistic Regression: Apple vs Orange Based on Weight")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
