# Linear transformations and equations

These equations and transformations play crucial roles in various aspects of drug discovery, from data preprocessing and feature extraction to similarity measurement and statistical analysis. They provide different ways to represent, compare, and analyze the complex data encountered in these fields.

The examples below demonstrate the versatility and importance of PLS Regression in various aspects of drug discovery:

* **QSAR modeling** for predicting molecular activity
* **Protein-ligand binding affinity prediction**
* **Protein structure quality assessment**
* **Drug solubility prediction**
* **Protein-protein interaction prediction**

PLS Regression is particularly useful in these contexts because it can handle high-dimensional data with correlated features, which is common in biological and chemical datasets. It performs dimensionality reduction and regression simultaneously, making it efficient for modeling complex relationships in biomolecular data.

## Instances where we use Partial Least Squares (PLS) Regression — Equation: $Y = XB + E$

### QSAR (Quantitative Structure-Activity Relationship) Modeling 

In [None]:
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# Build a synthetic QSAR dataset: random descriptors X and an activity Y that
# is a linear function of X (weights true_B) plus small Gaussian noise.
np.random.seed(42)
n_samples, n_features = 100, 50
X = np.random.randn(n_samples, n_features)
true_B = np.random.randn(n_features, 1)
noise = np.random.randn(n_samples, 1) * 0.1
Y = np.dot(X, true_B) + noise

# Hold out 20% of the samples for evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Fit a 5-component PLS model on the training split, then predict the
# held-out samples.
pls = PLSRegression(n_components=5)
pls.fit(X_train, Y_train)
Y_pred = pls.predict(X_test)

# Report the coefficient of determination on the test split.
r2 = r2_score(Y_test, Y_pred)
print(f"R-squared score: {r2:.4f}")

# Predicted vs. actual activity; the dashed red diagonal marks perfect
# agreement between the two.
activity_range = [Y_test.min(), Y_test.max()]
plt.figure(figsize=(10, 6))
plt.scatter(Y_test, Y_pred)
plt.plot(activity_range, activity_range, 'r--', lw=2)
plt.xlabel("Actual Activity")
plt.ylabel("Predicted Activity")
plt.title("QSAR Model using PLS Regression")
plt.show()

### Protein-Ligand Binding Affinity Prediction:

In [None]:
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Generate synthetic data: protein and ligand descriptors are drawn
# independently and concatenated column-wise into one feature matrix.
np.random.seed(42)
n_samples, n_protein_features, n_ligand_features = 200, 30, 20
X_protein = np.random.randn(n_samples, n_protein_features)
X_ligand = np.random.randn(n_samples, n_ligand_features)
X = np.hstack((X_protein, X_ligand))
# Synthetic affinity: the sum of all features plus small Gaussian noise.
Y = np.sum(X, axis=1) + np.random.randn(n_samples) * 0.1

# Sweep the number of PLS components and score each setting with 5-fold
# cross-validation. The scorer returns *negative* MSE, so the sign is flipped
# before storing the mean across folds.
n_components_range = range(1, 21)
mse_scores = []
for n_components in n_components_range:
    pls = PLSRegression(n_components=n_components)
    scores = cross_val_score(pls, X, Y, cv=5, scoring='neg_mean_squared_error')
    mse_scores.append(-scores.mean())

# Plot mean cross-validated MSE against the number of components.
plt.figure(figsize=(10, 6))
plt.plot(n_components_range, mse_scores, marker='o')
plt.xlabel("Number of PLS Components")
plt.ylabel("Mean Squared Error")
plt.title("PLS Components vs MSE in Binding Affinity Prediction")
plt.show()

### Protein Structure Quality Assessment

In [None]:
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Generate synthetic data: only the first 10 of the 40 features contribute to
# the target, so those are the features expected to rank as most important.
np.random.seed(42)
n_samples, n_features = 150, 40
X = np.random.randn(n_samples, n_features)
Y = np.sum(X[:, :10], axis=1) + np.random.randn(n_samples) * 0.1

# Split data (20% held out for evaluation)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# PLS regression
pls = PLSRegression(n_components=5)
pls.fit(X_train, Y_train)

# Predictions
Y_pred = pls.predict(X_test)

# Evaluate
mse = mean_squared_error(Y_test, Y_pred)
print(f"Mean Squared Error: {mse:.4f}")

# Plot feature importance as the absolute regression coefficient per feature.
# The layout of `coef_` differs between scikit-learn releases
# ((n_features, n_targets) before 1.3, (n_targets, n_features) from 1.3 on).
# With a single target, flattening yields exactly one value per feature under
# either layout, whereas summing over axis 0 would collapse the older layout
# to a single number.
feature_importance = np.abs(pls.coef_).ravel()
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

plt.figure(figsize=(12, 6))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, sorted_idx)
plt.xlabel('Absolute Importance')
plt.ylabel('Feature Index')
plt.title('Feature Importance in Protein Structure Quality Assessment')
plt.show()

### Drug Solubility Prediction:

In [None]:
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Synthetic solubility data: the target is the exponential of the sum of the
# first five features plus small Gaussian noise, i.e. non-linear in X.
np.random.seed(42)
n_samples, n_features = 300, 25
X = np.random.randn(n_samples, n_features)
Y = np.exp(X[:, :5].sum(axis=1)) + np.random.randn(n_samples) * 0.1

# Model whose learning behaviour is being inspected.
pls = PLSRegression(n_components=10)

# 5-fold learning curve over ten training-set fractions from 10% to 100%.
train_sizes, train_scores, test_scores = learning_curve(
    pls,
    X,
    Y,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring='neg_mean_squared_error',
)

# Aggregate across folds. The scorer yields negative MSE, so the means are
# negated to obtain MSE; the standard deviation is unaffected by the sign.
train_scores_mean = -train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = -test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

# Draw +/- one standard deviation bands first, then the mean curves.
curves = [
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
]
plt.figure(figsize=(10, 6))
for mean, std, color, _ in curves:
    plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=color)
for mean, _, color, label in curves:
    plt.plot(train_sizes, mean, 'o-', color=color, label=label)
plt.xlabel("Training examples")
plt.ylabel("Mean Squared Error")
plt.title("Learning Curve for Drug Solubility Prediction")
plt.legend(loc="best")
plt.show()

### Protein-Protein Interaction Prediction:

In [None]:
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Synthetic interaction data: the label is 1 when the first ten features sum
# to a positive value and 0 otherwise.
np.random.seed(42)
n_samples, n_features = 500, 60
X = np.random.randn(n_samples, n_features)
informative_sum = X[:, :10].sum(axis=1)
Y = (informative_sum > 0).astype(int)

# Keep 30% of the samples aside for the ROC analysis.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Regress the 0/1 labels with a 15-component PLS model; the continuous
# predictions are used as ranking scores for the ROC curve.
pls = PLSRegression(n_components=15)
pls.fit(X_train, Y_train)
Y_pred = pls.predict(X_test)

# ROC curve and the area under it.
fpr, tpr, _ = roc_curve(Y_test, Y_pred)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve; the dashed navy diagonal is the chance-level reference.
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Protein-Protein Interaction Prediction')
plt.legend(loc="lower right")
plt.show()

## Instances where we use Linear Discriminant Analysis (LDA) — Equation: $y = w^\top x + w_0$

* **Protein Structure Classification**:  This snippet uses LDA to classify protein structures into alpha helices and beta sheets based on their features. It showcases LDA's ability to find the optimal linear combination of features that separates different structural classes.

* **Drug Activity Classification**:  This example demonstrates how LDA can be used to classify drugs as active or inactive based on their molecular properties. It uses cross-validation to assess the model's performance and visualizes the results with a confusion matrix.

* **Protein-Ligand Binding Site Prediction**:  This snippet uses LDA to predict protein-ligand binding sites. It demonstrates how LDA can be used for binary classification tasks in structural biology and visualizes the model's performance using a ROC curve.

* **Protein Solubility Prediction**:  This example applies LDA to predict protein solubility. It generates a learning curve to show how the model's performance changes with increasing training data, which is useful for understanding the model's behavior and potential overfitting.

* **Drug Target Classification**:  This snippet uses LDA for multi-class classification of drug targets. It demonstrates LDA's capability to handle multiple classes, which is often necessary in drug discovery when dealing with different types of drug targets. The classification report provides detailed performance metrics, and the visualization shows how LDA projects the data onto a 2D space for separation.

### Protein Structure Classification: 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Synthetic structure features. Class 0 stands for alpha helices and class 1
# for beta sheets; the label is 1 when the first three features sum to a
# positive value.
np.random.seed(42)
n_samples, n_features = 300, 10
X = np.random.randn(n_samples, n_features)
y = (X[:, :3].sum(axis=1) > 0).astype(int)

# Hold out 30% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit LDA on the training split.
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

# Score the held-out predictions.
y_pred = lda.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Project the full dataset onto the discriminant axis and draw each class
# along a horizontal line at height zero.
X_lda = lda.transform(X)
is_alpha = y == 0
is_beta = y == 1
plt.figure(figsize=(10, 6))
plt.scatter(X_lda[is_alpha], np.zeros(is_alpha.sum()), c='r', label='Alpha helices')
plt.scatter(X_lda[is_beta], np.zeros(is_beta.sum()), c='b', label='Beta sheets')
plt.legend()
plt.title("LDA Projection of Protein Structures")
plt.xlabel("LDA Component")
plt.show()

### Drug Activity Classification:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Synthetic molecular properties; a compound is labelled active (1) when its
# five features sum to a positive value and inactive (0) otherwise.
np.random.seed(42)
n_samples, n_features = 200, 5
X = np.random.randn(n_samples, n_features)
y = (X.sum(axis=1) > 0).astype(int)  # 0: Inactive, 1: Active

# Estimate accuracy with 5-fold cross-validation.
lda = LinearDiscriminantAnalysis()
cv_scores = cross_val_score(lda, X, y, cv=5)
print(f"Cross-validation accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Refit on the full dataset and predict the same samples. NOTE: the resulting
# confusion matrix is therefore computed on the training data, not on
# held-out samples.
y_pred = lda.fit(X, y).predict(X)
cm = confusion_matrix(y, y_pred)

# Render the confusion matrix with readable class names.
class_names = ['Inactive', 'Active']
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix for Drug Activity Classification")
plt.show()

### Protein-Ligand Binding Site Prediction: 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

# Synthetic residue features; a site is labelled binding (1) when the first
# four features sum to a positive value and non-binding (0) otherwise.
np.random.seed(42)
n_samples, n_features = 500, 8
X = np.random.randn(n_samples, n_features)
y = (X[:, :4].sum(axis=1) > 0).astype(int)  # 0: Non-binding, 1: Binding

# Hold out 30% of the samples for the ROC analysis.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit LDA on the training split.
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

# Probability assigned to class 1 (second column) for each test sample.
class_probabilities = lda.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# ROC curve and the area under it.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve; the dashed navy diagonal is the chance-level reference.
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Protein-Ligand Binding Site Prediction')
plt.legend(loc="lower right")
plt.show()

### Protein Solubility Prediction:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import learning_curve

# Synthetic protein features; a protein is labelled soluble (1) when its six
# features sum to a positive value and insoluble (0) otherwise.
np.random.seed(42)
n_samples, n_features = 300, 6
X = np.random.randn(n_samples, n_features)
y = (X.sum(axis=1) > 0).astype(int)  # 0: Insoluble, 1: Soluble

# Classifier whose learning behaviour is being inspected.
lda = LinearDiscriminantAnalysis()

# 5-fold learning curve over ten training-set fractions from 10% to 100%,
# scored by accuracy.
train_sizes, train_scores, test_scores = learning_curve(
    lda,
    X,
    y,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring="accuracy",
)

# Aggregate the per-fold scores at each training size.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

# Draw +/- one standard deviation bands first, then the mean curves.
curves = [
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
]
plt.figure(figsize=(10, 6))
for mean, std, color, _ in curves:
    plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=color)
for mean, _, color, label in curves:
    plt.plot(train_sizes, mean, 'o-', color=color, label=label)
plt.xlabel("Training examples")
plt.ylabel("Accuracy")
plt.title("Learning Curve for Protein Solubility Prediction")
plt.legend(loc="best")
plt.show()

### Drug Target Classification:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Synthetic drug-target features; the class of each sample is the index of
# the largest of its first three features, giving three classes (0, 1, 2).
np.random.seed(42)
n_samples, n_features = 400, 10
X = np.random.randn(n_samples, n_features)
y = np.argmax(X[:, :3], axis=1)  # 3 classes of drug targets

# Hold out 30% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit LDA on the training split and predict the held-out samples.
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)

# Per-class precision/recall/F1 summary.
target_names = ['Class 0', 'Class 1', 'Class 2']
print(classification_report(y_test, y_pred, target_names=target_names))

# Project the full dataset with LDA and scatter the first two discriminant
# components, one colour per class.
X_lda = lda.transform(X)
plt.figure(figsize=(10, 6))
colors = ['r', 'g', 'b']
for class_idx, (color, target_name) in enumerate(zip(colors, target_names)):
    members = y == class_idx
    plt.scatter(X_lda[members, 0], X_lda[members, 1], alpha=.8, color=color, label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('LDA of Drug Target Dataset')
plt.show()