<a href="https://colab.research.google.com/github/aidanbolinger/MachineLearning/blob/main/ML_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
Grid Search
Exhaustively searches all possible combinations of hyperparameters specified in a parameter grid.
Deterministic: Given the same data, grid, and cross-validation splits, it always finds the same best parameters (assuming the estimator itself is deterministic).
Can be computationally expensive, especially with a large parameter grid or dataset.
Best when the search space is relatively small.

Random Search
Randomly samples hyperparameter combinations from a distribution over possible values.
Non-deterministic: Can find different best parameters in different runs, unless the random seed is fixed (e.g. random_state=42, as used in the code below).
Usually more efficient than grid search, particularly when the search space is large.
More likely to find good hyperparameter combinations in less time, especially if some hyperparameters are less important than others.

In [1]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# --- Data: Iris feature matrix and class labels ---
iris = load_iris()
X, y = iris.data, iris.target

# Keep 20% of the rows aside as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Exhaustive search ---
# Every combination below is evaluated: 30 * 2 * 2 = 120 candidates.
param_grid = {
    'n_neighbors': np.arange(1, 31),      # k = 1 .. 30
    'weights': ['uniform', 'distance'],   # weighting schemes to compare
    'p': [1, 2],                          # 1 = Manhattan, 2 = Euclidean
}

# Base estimator handed to both searches.
knn = KNeighborsClassifier()

# 5-fold cross-validated accuracy, run on all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    knn,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning configuration and its mean cross-validated accuracy.
print("GridSearchCV best parameters:", grid_search.best_params_)
print("GridSearchCV best accuracy:", grid_search.best_score_)


# --- Randomized search ---
# Same candidate values as the grid; only n_iter=10 of them are sampled.
param_dist = {
    'n_neighbors': np.arange(1, 31),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

random_search = RandomizedSearchCV(
    knn,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,  # fixes which candidates get sampled
)
random_search.fit(X_train, y_train)

print("\nRandomizedSearchCV best parameters:", random_search.best_params_)
print("RandomizedSearchCV best accuracy:", random_search.best_score_)

# --- Held-out check (optional): score each search's best model on the test split ---
print("\nGridSearchCV test accuracy:", grid_search.score(X_test, y_test))
print("RandomizedSearchCV test accuracy:", random_search.score(X_test, y_test))


GridSearchCV best parameters: {'n_neighbors': np.int64(14), 'p': 1, 'weights': 'uniform'}
GridSearchCV best accuracy: 0.9666666666666666

RandomizedSearchCV best parameters: {'weights': 'uniform', 'p': 2, 'n_neighbors': np.int64(3)}
RandomizedSearchCV best accuracy: 0.9583333333333334

GridSearchCV test accuracy: 1.0
RandomizedSearchCV test accuracy: 1.0


In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

# Path of the CSV on the mounted Drive; kept in one place so the error
# message below always names the file that was actually requested.
DIABETES_CSV = '/content/drive/MyDrive/diabetes_012_health_indicators_BRFSS2015.csv'

# Load the dataset. On a missing file, report it and fall through with
# data = None so the modelling steps below are skipped.
# NOTE: when the load fails, X_train / X_test / y_train / y_test keep whatever
# values an earlier cell assigned, so later cells would run on stale data.
try:
    data = pd.read_csv(DIABETES_CSV)
except FileNotFoundError:
    print(f"Error: '{DIABETES_CSV}' not found. Please upload your data file.")
    data = None

if data is not None:
    # 'Diabetes_012' is the target column; every other column is a feature.
    X = data.drop('Diabetes_012', axis=1)
    y = data['Diabetes_012']

    # Split data into training and testing sets (80/20, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the Decision Tree Classifier with default hyperparameters
    clf = DecisionTreeClassifier(random_state=42)

    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Evaluate the model: fraction of test rows predicted correctly
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Accuracy: 0.7673446862188584


In [7]:
# Exhaustive hyperparameter search for a decision tree.
#
# NOTE: this cell fits on X_train / y_train and scores on X_test / y_test as
# left in the namespace by whichever data-loading cell ran most recently;
# run the intended loading cell first.

# Imported here so the cell does not depend on imports made in other cells.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Define the parameter grid: 2 * 4 * 3 * 3 = 72 candidate settings
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=42)

# Perform GridSearchCV for Decision Tree.
# 72 candidates * 5 folds = 360 fits, so spread them over all cores
# (n_jobs=-1), matching the other searches in this notebook.
grid_search_dt = GridSearchCV(dt, param_grid_dt, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_dt.fit(X_train, y_train)

# Print the best parameters and accuracy for Decision Tree GridSearchCV
print("\nDecision Tree GridSearchCV best parameters:", grid_search_dt.best_params_)
print("Decision Tree GridSearchCV best accuracy:", grid_search_dt.best_score_)

# Evaluate the best Decision Tree model on the test set
print("Decision Tree GridSearchCV test accuracy:", grid_search_dt.score(X_test, y_test))



Decision Tree GridSearchCV best parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Decision Tree GridSearchCV best accuracy: 0.9818181818181818
Decision Tree GridSearchCV test accuracy: 1.0


In [8]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate values that the randomized search samples from for each
# decision-tree hyperparameter.
param_dist_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30, 50, 100],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
    max_features=[None, 'sqrt', 'log2'],
)

# Base tree with a fixed random_state.
dt = DecisionTreeClassifier(random_state=42)

# Try 100 sampled settings, each scored by 5-fold cross-validated accuracy
# on all cores; random_state fixes which settings are drawn.
random_search_dt = RandomizedSearchCV(
    dt,
    param_distributions=param_dist_dt,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search_dt.fit(X_train, y_train)

# Report the winning setting and its mean cross-validated accuracy.
print("\nDecision Tree RandomizedSearchCV best parameters:", random_search_dt.best_params_)
print("Decision Tree RandomizedSearchCV best accuracy:", random_search_dt.best_score_)

# Score the best model from the search on the held-out test split.
print("Decision Tree RandomizedSearchCV test accuracy:", random_search_dt.score(X_test, y_test))



Decision Tree RandomizedSearchCV best parameters: {'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 50, 'criterion': 'gini'}
Decision Tree RandomizedSearchCV best accuracy: 0.9818181818181818
Decision Tree RandomizedSearchCV test accuracy: 1.0


In [1]:
from sklearn import svm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from google.colab import drive
drive.mount('/content/drive')

# Path of the CSV on the mounted Drive; kept in one place so the error
# message below always names the file that was actually requested.
DATA_CSV = '/content/drive/MyDrive/Data.csv'

# Load the dataset. A missing file raises instead of calling exit(): exit()
# shuts down the notebook kernel, which discards all state rather than just
# stopping this cell with a readable error.
try:
    data = pd.read_csv(DATA_CSV)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"'{DATA_CSV}' not found. Please upload your data file."
    ) from err


# 'Grade' is the target column; every other column is a feature.
X = data.drop('Grade', axis=1)
y = data['Grade']

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an SVM classifier
clf = svm.SVC(kernel='linear') # You can change the kernel (e.g., 'rbf', 'poly')

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model: fraction of test rows predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Accuracy: 1.0


In [5]:
import pandas as pd  # used by read_csv below; previously only available via another cell
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report
from google.colab import drive
drive.mount('/content/drive')

# Path of the CSV on the mounted Drive; kept in one place so the error
# message below always names the file that was actually requested.
DATA_CSV = '/content/drive/MyDrive/Data.csv'

# Load the dataset. A missing file raises instead of calling exit(): exit()
# shuts down the notebook kernel, which discards all state rather than just
# stopping this cell with a readable error.
try:
    data = pd.read_csv(DATA_CSV)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"'{DATA_CSV}' not found. Please upload your data file."
    ) from err

# 'Grade' is the target column; every other column is a feature.
X = data.drop('Grade', axis=1)
y = data['Grade']

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with StandardScaler and SVC.
# The step name 'svm' is what the 'svm__' prefix in the grid keys refers to.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', svm.SVC())
])

# Define the parameter grid for GridSearchCV: 4 * 3 * 5 = 60 candidates.
# NOTE: `param_grid` and `grid_search` rebind the names used earlier for the
# KNN search.
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__gamma': ['scale', 'auto', 0.01, 0.1, 1]
}


# Perform GridSearchCV (5-fold cross-validated accuracy, all cores)
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and accuracy for GridSearchCV
print("SVM GridSearchCV best parameters:", grid_search.best_params_)
print("SVM GridSearchCV best accuracy:", grid_search.best_score_)

# Evaluate the best SVM model on the test set
print("SVM GridSearchCV test accuracy:", grid_search.score(X_test, y_test))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
SVM GridSearchCV best parameters: {'svm__C': 0.1, 'svm__gamma': 'scale', 'svm__kernel': 'linear'}
SVM GridSearchCV best accuracy: 0.9818181818181818
SVM GridSearchCV test accuracy: 0.8571428571428571


In [6]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Candidate values for the randomized SVM search. Keys carry the 'svm__'
# prefix, matching the 'svm' step name used where `pipeline` is built.
param_dist_svm = dict(
    svm__C=np.logspace(-3, 2, 6),  # 6 log-spaced values from 0.001 to 100
    svm__kernel=['linear', 'rbf', 'poly'],
    # The two string presets plus 5 log-spaced values from 0.001 to 10.
    svm__gamma=['scale', 'auto', *np.logspace(-3, 1, 5)],
)

# Sample 10 settings and score each by 5-fold cross-validated accuracy on all
# cores; random_state fixes which settings are drawn.
random_search_svm = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist_svm,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search_svm.fit(X_train, y_train)

# Report the winning setting and its mean cross-validated accuracy.
print("\nSVM RandomizedSearchCV best parameters:", random_search_svm.best_params_)
print("SVM RandomizedSearchCV best accuracy:", random_search_svm.best_score_)

# Score the best pipeline from the search on the held-out test split.
print("SVM RandomizedSearchCV test accuracy:", random_search_svm.score(X_test, y_test))



SVM RandomizedSearchCV best parameters: {'svm__kernel': 'rbf', 'svm__gamma': np.float64(0.01), 'svm__C': np.float64(10.0)}
SVM RandomizedSearchCV best accuracy: 0.9636363636363636
SVM RandomizedSearchCV test accuracy: 0.8571428571428571
