# Scikit-learn

**Overview**

A comprehensive library for machine learning in Python, focusing on simple and efficient tools for data mining and data analysis.

**Advantages**
- Wide range of machine learning algorithms for classification, regression, clustering, and more.
- Excellent documentation and ease of use.
- Built-in tools for model selection, evaluation, and preprocessing.

**Best Use Cases**
- General-purpose machine learning tasks.
- Rapid prototyping and experimentation.
- Educational purposes due to its simplicity and extensive documentation.

## Cheat Sheet

### Importing

In [None]:
import sklearn
from sklearn import datasets, model_selection, preprocessing, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

### Data Loading and Preprocessing

In [None]:
# Loading Datasets
# Iris: keep the returned bunch and pull the feature matrix / target vector off it
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Digits: same pattern, stored under separate names
digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target


# Splitting Data
# Hold out 20% of the iris samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Scaling Data
# Learn the scaling statistics from the training split only, then apply them to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Model Building and Training

In [None]:
# Importing Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Training a Model
# Logistic Regression
# The model is fitted on the unscaled features; the default iteration limit
# (max_iter=100) can be too low for the solver to converge on them, so raise it.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

### Model Evaluation

In [None]:
# Making Predictions
y_pred = model.predict(X_test)


# Evaluating Performance
# Printed in order: accuracy, confusion matrix, classification report
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, y_pred))

### Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the full dataset: print the per-fold scores, then their mean
scores = cross_val_score(model, X, y, cv=5)
print(scores, scores.mean(), sep="\n")

### Hyperparameter Tuning

In [None]:
# Grid Search
from sklearn.model_selection import GridSearchCV

# Define parameter grid
# Both solvers handle the 3-class iris target natively; 'liblinear' is avoided
# because recent scikit-learn releases deprecate it for multiclass problems.
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs', 'newton-cg']
}

# Instantiate GridSearchCV
# max_iter is raised because the search runs on unscaled features, where the
# default limit of 100 iterations can stop the solver before it converges.
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)

# Fit the model (one 5-fold cross-validation per parameter combination)
grid_search.fit(X_train, y_train)

# Best parameters and best mean cross-validated score
print(grid_search.best_params_)
print(grid_search.best_score_)


# Random Search
from sklearn.model_selection import RandomizedSearchCV

# Define the candidate values to sample from
# 'auto' is not accepted for max_features by current scikit-learn releases
# (for classifiers it meant the same as 'sqrt'); None considers every feature.
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [4, 6, 8, 10, 12],
    'criterion': ['gini', 'entropy']
}

# Instantiate RandomizedSearchCV: 10 sampled combinations, each scored with 5-fold CV
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist, n_iter=10, cv=5)

# Fit the model
random_search.fit(X_train, y_train)

# Best parameters and best mean cross-validated score
print(random_search.best_params_)
print(random_search.best_score_)

### Pipelines

In [None]:
# Creating a Pipeline
from sklearn.pipeline import Pipeline

# Create a pipeline: standardize the features, then classify with an SVM
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', SVC()),
])

# Fit the pipeline
pipeline.fit(X_train, y_train)


# Pipeline with Grid Search
# Keys use the '<step name>__<parameter>' form to address a step's parameter
param_grid = dict(
    classifier__C=[0.1, 1, 10],
    classifier__kernel=['linear', 'rbf'],
)

# Search over the whole pipeline with 5-fold cross-validation
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best parameters and best score
print(grid_search.best_params_)
print(grid_search.best_score_)

### Common Preprocessing Techniques

In [None]:
# Imputation
from sklearn.impute import SimpleImputer

# Create an imputer that fills missing values with the column mean
imputer = SimpleImputer(strategy='mean')

# Learn the means on the training split, then fill both splits with them
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)


# Encoding Categorical Features
from sklearn.preprocessing import OneHotEncoder

# Create an encoder
# NOTE(review): X here is the numeric iris feature matrix, so this only shows the
# API shape; in real use, pass the categorical columns to the encoder instead.
encoder = OneHotEncoder()

# Learn the categories, then encode the data
encoder.fit(X)
X_encoded = encoder.transform(X)

### Feature Selection

In [None]:
# Univariate Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 2 features that score highest under f_classif
selector = SelectKBest(score_func=f_classif, k=2)

# Score the features against y, then drop all but the selected columns
selector.fit(X, y)
X_new = selector.transform(X)


# Recursive Feature Elimination
from sklearn.feature_selection import RFE

# Recursively eliminate features, ranked by a logistic regression, until 2 remain
rfe = RFE(LogisticRegression(), n_features_to_select=2)

# Run the elimination, then keep only the surviving columns
rfe.fit(X, y)
X_rfe = rfe.transform(X)

### Clustering

In [None]:
# K-Means Clustering
from sklearn.cluster import KMeans

# Build a 3-cluster K-Means model and fit it in one expression (fit returns the model)
kmeans = KMeans(n_clusters=3).fit(X)

# Assign every sample to a fitted cluster
labels = kmeans.predict(X)


# Agglomerative Clustering
from sklearn.cluster import AgglomerativeClustering

# Build a 3-cluster agglomerative model, fit it, and read the labels it assigned
agg_clustering = AgglomerativeClustering(n_clusters=3)
agg_clustering.fit(X)
labels = agg_clustering.labels_

### Dimensionality Reduction

In [None]:
# Principal Component Analysis (PCA)
from sklearn.decomposition import PCA

# Instantiate and fit the model
# Project X onto its first 2 principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Explained variance ratio
# One value per kept component: the share of the data's variance it captures
print(pca.explained_variance_ratio_)


# t-Distributed Stochastic Neighbor Embedding (t-SNE)
from sklearn.manifold import TSNE

# Instantiate and fit the model
# Embed X into 2 dimensions; the embedding is computed for exactly the data passed in
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)

### Model Persistence

In [None]:
# Saving and Loading Models
import joblib

# Save the model
# Writes the fitted estimator to 'model.pkl' in the current working directory
joblib.dump(model, 'model.pkl')

# Load the model
# joblib files are pickle-based: only load files that come from a trusted source
loaded_model = joblib.load('model.pkl')

### Example

In [3]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# Load the dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training data only, then scale both splits with it
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the model and fit it on the scaled training data
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Predict on the scaled test data
y_pred = model.predict(X_test_scaled)

# Report accuracy, confusion matrix and classification report, in that order
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred))

# Define the parameter grid
# Both solvers handle the 3-class iris target natively; 'liblinear' is avoided
# because recent scikit-learn releases deprecate it for multiclass problems.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['lbfgs', 'newton-cg']
}

# Initialize GridSearchCV (5-fold cross-validation per parameter combination)
grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5)

# Fit the grid search to the data
grid_search.fit(X_train_scaled, y_train)

# Best parameters and best mean cross-validated score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Get the best model from grid search
# GridSearchCV refits the winning configuration on the whole training set by
# default (refit=True), so best_estimator_ is ready to use without another fit.
best_model = grid_search.best_estimator_

# Predict on the test data with the best model
y_pred_best = best_model.predict(X_test_scaled)

# Accuracy of the best model
print("Accuracy of Best Model:", accuracy_score(y_test, y_pred_best))

# Save the best model
# The file must be written before it is loaded below; with this line disabled
# the load fails on any run where the file does not already exist.
joblib.dump(best_model, 'best_logistic_regression_model.pkl')

# Load the model
# joblib files are pickle-based: only load files that come from a trusted source
loaded_model = joblib.load('best_logistic_regression_model.pkl')

# Predict using the loaded model
y_pred_loaded = loaded_model.predict(X_test_scaled)

# Accuracy of the loaded model
print("Accuracy of Loaded Model:", accuracy_score(y_test, y_pred_loaded))



Accuracy: 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Best Parameters: {'C': 1, 'solver': 'lbfgs'}
Best Score: 0.9583333333333334
Accuracy of Best Model: 1.0
Accuracy of Loaded Model: 1.0


# TensorFlow

**Overview**

An open-source platform developed by Google for machine learning and deep learning.

**Advantages**
- Highly flexible and scalable.
- Support for both high-level and low-level API operations.
- Strong support for deploying models in production, including mobile and embedded devices.
- Extensive ecosystem including TensorBoard for visualization, TensorFlow Lite for mobile, and TensorFlow Serving for production.

**Best Use Cases**
- Deep learning, especially for neural networks.
- Large-scale machine learning tasks.
- Production-grade deployment.

## Cheat Sheet

### Importing

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

### Basic TensorFlow Concepts

In [None]:
# Tensors
# Create a constant tensor
# Float literals are used so that `a` shares a dtype with `b` below: tf.add and
# tf.matmul do not mix an int32 operand with a float32 one.
a = tf.constant([[1.0, 2.0], [3.0, 4.0]])

# Create a tensor with random values (float32 by default)
b = tf.random.normal([2, 2], mean=0, stddev=1)

# Perform basic operations
c = tf.add(a, b)      # element-wise sum
d = tf.matmul(a, b)   # matrix product


# Variables
# Create a variable initialised with random values
v = tf.Variable(tf.random.normal([2, 2], mean=0, stddev=1))

# Assign a new value (in place; the variable keeps its shape and dtype)
v.assign([[1.0, 2.0], [3.0, 4.0]])

# Perform operations: add to the current value in place
v.assign_add([[1.0, 1.0], [1.0, 1.0]])

### Building Neural Networks

In [None]:
# Sequential Model
# Build the network layer by layer: two hidden ReLU layers and a 10-way softmax output
model = Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(784,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))

# Compiling the Model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Training the Model: 10 epochs, with the last 20% of the training data held out for validation
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_split=0.2,
)

# Evaluating the Model
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

# Making Predictions
predictions = model.predict(X_test)

### Common Layers and Activation Functions

In [None]:
# NOTE: this cell is a catalogue of constructor calls. `layer` is rebound on
# every line, so only the last layer created is still referenced afterwards.

# Dense Layer
# Fully connected layer with 64 units and ReLU activation
layer = layers.Dense(64, activation='relu')

# Activation Functions
# Same layer shape with other built-in activations selected by name
layer = layers.Dense(64, activation='sigmoid')
layer = layers.Dense(64, activation='tanh')
layer = layers.Dense(64, activation='softmax')

# Convolutional Layers
# Conv2D(filters, kernel_size); the first one also declares a 28x28x1 input
layer = layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1))
layer = layers.MaxPooling2D((2, 2))
layer = layers.Conv2D(64, (3, 3), activation='relu')
layer = layers.MaxPooling2D((2, 2))
layer = layers.Conv2D(64, (3, 3), activation='relu')

# Recurrent Layers
# return_sequences=True makes the LSTM emit an output for every timestep
layer = layers.LSTM(64, return_sequences=True)
layer = layers.GRU(64)

### Data Preprocessing

In [None]:
# Normalization
# Divide by 255 to rescale the inputs (assumes 8-bit pixel data in 0-255 — confirm for your dataset)
X_train = X_train / 255.0
X_test = X_test / 255.0

# One-hot encode labels
# NOTE: one-hot labels pair with loss='categorical_crossentropy'. The
# 'sparse_categorical_crossentropy' loss used in the compile() calls in this
# sheet expects the original integer labels, so skip this step when using it.
y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

### Callbacks

In [None]:
# Early Stopping
# Stop training once val_loss has gone 3 epochs without improving
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Model Checkpoint
# Keep only the best model seen so far, written in the native `.keras` format
# (the legacy HDF5 `.h5` format is not preferred by current Keras releases)
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True)

# Using Callbacks in Training
history = model.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping, model_checkpoint])

### Model Saving and Loading

In [None]:
# Saving a Model
# Use the native `.keras` format; HDF5 (`.h5`) is a legacy format in current Keras
model.save('my_model.keras')

# Loading a Model
model = tf.keras.models.load_model('my_model.keras')

### Advanced Topics

In [None]:
# Custom Layers
class MyLayer(layers.Layer):
    """Bias-free linear layer: multiplies its input by a trainable kernel.

    Args:
        output_dim: Size of the last axis of the layer's output.
        **kwargs: Forwarded unchanged to the base ``Layer`` constructor.
    """

    def __init__(self, output_dim, **kwargs):
        # Initialise the base layer before assigning any attribute, so the
        # base class is fully set up when attributes are stored on the instance.
        super().__init__(**kwargs)
        self.output_dim = output_dim

    def build(self, input_shape):
        """Create the kernel once the size of the input's feature axis is known."""
        # Index the last axis rather than axis 1 so the kernel is sized from the
        # feature axis whatever the rank of the input.
        self.kernel = self.add_weight(name='kernel',
                                      shape=(input_shape[-1], self.output_dim),
                                      initializer='uniform',
                                      trainable=True)
        super().build(input_shape)

    def call(self, inputs):
        """Return the matrix product of ``inputs`` and the kernel."""
        return tf.matmul(inputs, self.kernel)

    def get_config(self):
        """Return the constructor arguments so a saved layer can be re-created."""
        config = super().get_config()
        config.update({'output_dim': self.output_dim})
        return config

# Use custom layer
# The custom layer takes the 784-feature input; a 10-way softmax sits on top
model = Sequential()
model.add(MyLayer(64, input_shape=(784,)))
model.add(layers.Dense(10, activation='softmax'))


# Custom Training Loops
# Objects shared by the training step: optimizer, loss, and running metrics
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
train_loss = tf.keras.metrics.Mean(name='train_loss')

@tf.function
def train_step(images, labels):
    """Run one optimisation step on a batch and update the running metrics.

    Uses the module-level ``model``, ``loss_object``, ``optimizer``,
    ``train_loss`` and ``train_accuracy``.

    Args:
        images: Batch of model inputs.
        labels: Targets for ``images``, in the form ``loss_object`` expects.
    """
    # Record the forward pass so the loss can be differentiated afterwards
    with tf.GradientTape() as tape:
        batch_predictions = model(images, training=True)
        batch_loss = loss_object(labels, batch_predictions)

    # Differentiate w.r.t. every trainable variable and apply the update
    weights = model.trainable_variables
    grads = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    # Fold this batch into the running loss / accuracy
    train_loss(batch_loss)
    train_accuracy(labels, batch_predictions)

# Prerequisites: EPOCHS (an int) and train_dataset (an iterable yielding
# (images, labels) batches) must be defined before this loop runs.
for epoch in range(EPOCHS):
    # Start every epoch with empty metrics; without the reset, the printed values
    # are running averages over all epochs so far rather than this epoch's results.
    train_loss.reset_state()
    train_accuracy.reset_state()

    for images, labels in train_dataset:
        train_step(images, labels)
    print(f'Epoch {epoch+1}, Loss: {train_loss.result()}, Accuracy: {train_accuracy.result()*100}')

### TensorBoard

In [None]:
# Load TensorBoard
# (lines starting with % are IPython/Jupyter magics and only work inside a notebook)
%load_ext tensorboard

# Define a callback for TensorBoard
# Logs are written under ./logs, the same directory the viewer is pointed at below
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs')

# Use TensorBoard callback during training
model.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[tensorboard_callback])

# Start TensorBoard
%tensorboard --logdir logs

### Distributed Training

In [None]:
# Define a distribution strategy
strategy = tf.distribute.MirroredStrategy()

# Build and compile the model inside the strategy scope
with strategy.scope():
    model = Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(784,)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(10, activation='softmax'))
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

# Train the model (called outside the scope, exactly like a non-distributed model)
model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_split=0.2,
)

### Example

In [None]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
import numpy as np


# Load the dataset
(X_train, y_train), (X_test, y_test) = datasets.mnist.load_data()

# Normalize the pixel values by dividing by 255
X_train = X_train / 255.0
X_test = X_test / 255.0

# Append a trailing channel axis so each image matches the model's (28, 28, 1) input
X_train = np.expand_dims(X_train, -1)
X_test = np.expand_dims(X_test, -1)

# Define the model: three convolution stages, then a dense classifier head
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Define callbacks: stop after 3 epochs without val_loss improvement (restoring the
# best weights), and keep the best model on disk
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True)

# Train the model, holding out 20% of the training data for validation
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint],
)

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

# Plot training & validation curves: one figure for accuracy, then one for loss
for metric, title, ylabel in (
    ('accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'Model loss', 'Loss'),
):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

# Make predictions on the test data (one row of class scores per image)
predictions = model.predict(X_test)

# Display the first prediction: the highest-scoring class next to the true label
print("Predicted label:", np.argmax(predictions[0]))
print("True label:", y_test[0])

# Plot the first test image, dropping the channel axis for display
plt.imshow(np.squeeze(X_test[0], axis=-1), cmap='gray')
plt.title(f"True Label: {y_test[0]}, Predicted: {np.argmax(predictions[0])}")
plt.show()

# Save the model
# Use the native `.keras` format, matching the checkpoint written during training;
# HDF5 (`.h5`) is a legacy format in current Keras releases.
model.save('mnist_cnn_model.keras')

# Load the model
loaded_model = tf.keras.models.load_model('mnist_cnn_model.keras')

# Evaluate the loaded model; it should reproduce the test metrics of the saved one
loaded_test_loss, loaded_test_acc = loaded_model.evaluate(X_test, y_test)
print('Loaded model test accuracy:', loaded_test_acc)
