# Support Vector Machines 
## Instructions
1. Use any dataset from https://archive.ics.uci.edu/datasets?Task=Clustering&skip=0&take=10&sort=desc&orderBy=NumHits&search=&Area=Biology
2. Each student should have a unique dataset otherwise no points will be given, so you need to discuss with your classmate.

In [11]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [12]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
mice_protein_expression = fetch_ucirepo(id=342) 
  
# data (as pandas dataframes) 
X = mice_protein_expression.data.features 
y = mice_protein_expression.data.targets 
  
# metadata 
print(mice_protein_expression.metadata) 
  
# variable information 
print(mice_protein_expression.variables) 


{'uci_id': 342, 'name': 'Mice Protein Expression', 'repository_url': 'https://archive.ics.uci.edu/dataset/342/mice+protein+expression', 'data_url': 'https://archive.ics.uci.edu/static/public/342/data.csv', 'abstract': 'Expression levels of 77 proteins measured in the cerebral cortex of 8 classes of control and Down syndrome mice exposed to context fear conditioning, a task used to assess associative learning.', 'area': 'Biology', 'tasks': ['Classification', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 1080, 'num_features': 80, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['class'], 'index_col': ['MouseID'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2015, 'last_updated': 'Tue Apr 16 2024', 'dataset_doi': '10.24432/C50S3Z', 'creators': ['Clara Higuera', 'Katheleen Gardiner', 'Krzysztof Cios'], 'intro_paper': {'title': 'Self-Organizing Feature Maps Identify Proteins Critical to Learning in a Mouse Model

**TASK: Take 10 moderately sized subsamples your dataset and create a correlation plot for each subsample**

In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# metadata
metadata = mice_protein_expression.metadata

# variable information
variables = mice_protein_expression.variables

# data (as pandas dataframes)
X = mice_protein_expression.data.features
y = mice_protein_expression.data.targets

# generate 10 moderately sized subsamples
num_samples = 10
sample_sizes = np.random.randint(50, 200, num_samples)
subsamples = []

for sample_size in sample_sizes:
    X_sample, y_sample, _ = train_test_split(X, y, test_size=0.5, random_state=42)
    sample = pd.concat([X_sample, y_sample], axis=1)
    subsample = sample.sample(n=sample_size, random_state=42)
    subsamples.append(subsample)

# create correlation plots for each subsample
for i, subsample in enumerate(subsamples):
    plt.figure(figsize=(10, 8))
    sns.heatmap(subsample.corr(), annot=True, cmap='coolwarm')
    plt.title(f'Correlation Plot - Subsample {i+1}')
    plt.show()

ValueError: too many values to unpack (expected 3)

**TASK (Classification): If your problem is a classification problem visually check if the target variable is imbalanced**
**TASK (Regression): If your problem is a regression problem visually the distribution of your target variabe**

In [15]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Load your imbalanced dataset
# X = ... (features)
# y = ... (target variable)

# Preprocessing
# ... (perform necessary cleaning and preprocessing steps)

# Over-sampling
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

# Under-sampling
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_resample(X, y)

# Ensemble methods
rf = RandomForestClassifier()
rf.fit(X_resampled, y_resampled)

# Model selection
param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
grid_search.fit(X_resampled, y_resampled)

# Model evaluation
y_pred = grid_search.predict(X)
print(classification_report(y, y_pred))
print(confusion_matrix(y, y_pred))

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\USER\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report, confusion_matrix

# metadata
metadata = mice_protein_expression.metadata

# variable information
variables = mice_protein_expression.variables

# data (as pandas dataframes)
X = mice_protein_expression.data.features
y = mice_protein_expression.data.targets

# generate 10 moderately sized subsamples
num_samples = 10
sample_sizes = np.random.randint(50, 200, num_samples)
subsamples = []
for sample_size in sample_sizes:
    X_sample, y_sample, _ = train_test_split(X, y, test_size=0.5, random_state=42)
    sample = pd.concat([X_sample, y_sample], axis=1)
    subsample = sample.sample(n=sample_size, random_state=42)
    subsamples.append(subsample)

# create correlation plots for each subsample
for i, subsample in enumerate(subsamples):
    plt.figure(figsize=(10, 8))
    sns.heatmap(subsample.corr(), annot=True, cmap='coolwarm')
    plt.title(f'Correlation Plot - Subsample {i+1}')
    plt.show()

# Preprocessing
# ... (perform necessary cleaning and preprocessing steps)

# Over-sampling
adasyn = ADASYN()
X_resampled, y_resampled = adasyn.fit_resample(X, y)

# Under-sampling
nm = NearMiss()
X_resampled, y_resampled = nm.fit_resample(X, y)

# Ensemble methods
gbr = GradientBoostingRegressor()
gbr.fit(X_resampled, y_resampled)

# Model selection
param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}
grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=5)
grid_search.fit(X_resampled, y_resampled)

# Model evaluation
y_pred = grid_search.predict(X)
print('Mean Absolute Error:', mean_absolute_error(y, y_pred))
print('Mean Squared Error:', mean_squared_error(y, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y, y_pred)))
print('R2 Score:', r2_score(y, y_pred))

**TASK: Take 10 moderately sized subsamples and create a pairplot of the feature variable in relation to the target variable**

**TASK: Create a clustermap with seaborn to explore the relationships between variables.**

In [3]:
pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
                                              0.0/294.9 kB ? eta -:--:--
     -----                                 41.0/294.9 kB 960.0 kB/s eta 0:00:01
     -------------                          102.4/294.9 kB 1.2 MB/s eta 0:00:01
     -------------------                    153.6/294.9 kB 1.1 MB/s eta 0:00:01
     -----------------------                184.3/294.9 kB 1.2 MB/s eta 0:00:01
     ----------------------------------     266.2/294.9 kB 1.3 MB/s eta 0:00:01
     -------------------------------------- 294.9/294.9 kB 1.2 MB/s eta 0:00:00
Collecting numpy!=1.24.0,>=1.20 (from seaborn)
  Downloading numpy-1.26.4-cp311-cp311-win_amd64.whl (15.8 MB)
                                              0.0/15.8 MB ? eta -:--:--
                                              0.1/15.8 MB 2.6 MB/s eta 0:00:07
                                              0.2/15.8 MB 1.8 MB/s eta 0:00:09
                                 


[notice] A new release of pip is available: 22.3 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import seaborn as sns
sns.clustermap(X, figsize=(10, 10))
plt.show()

ModuleNotFoundError: No module named 'seaborn'

**TASK: Perform SVM modelling and check the performance of your model, improve the performance of your model using grdisearch**

In [None]:
from sklearn.svm import SVC 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X = mice_protein_expression.data.features
y = mice_protein_expression.data.targets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
svm_model = SVC()
svm_model.fit(X_train, y_train)


In [None]:
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1],
}
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with GridSearchCV:", accuracy)
print(classification_report(y_test, y_pred))
