# Linear transformations and equations 

In each of these examples, the linear transformation serves as a fundamental building block that can be part of more complex models or analyses. In practice, these transformations are often combined with non-linear activations and other layers in deep learning models for improved performance and capability to capture complex relationships in biological data. Remember that while these linear transformations are useful, many biological processes are inherently non-linear. Therefore, these linear models are often used as components within more complex, non-linear models (like neural networks) in advanced bioinformatics applications.

I have put together some of these equations, along with visualizations and code snippets in Python.

These visualizations provide different ways to interpret the data and results:

* Histograms show the distribution of predicted scores or values.
* Heatmaps visualize 2D data, useful for showing relationships between features or samples.
* Box plots display the distribution of values across different categories or groups.
* Scatter plots can reveal relationships or clusters between two variables.

Remember to adjust the figure sizes, color schemes, and other parameters as needed to best represent your specific data. These visualizations can be very helpful in understanding the patterns in your data and the outputs of your models.

## Instances where we use the Position-Specific Scoring Matrix (PSSM) Equation: S = Σ(i=1 to L) log(P_i / B_i)

### Transcription Factor Binding Site Prediction

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def pssm_score(sequence, pssm, background_probs):
    """Return the summed log2-odds score of ``sequence`` under a dict PSSM.

    ``pssm`` maps each symbol to a per-position list of probabilities and
    ``background_probs`` maps each symbol to its background probability.
    The same tiny constant is added to the numerator and the denominator so
    that a probability of exactly 0 never reaches the logarithm.
    """
    epsilon = 1e-10  # Small pseudo-count to avoid log(0)
    log_odds = [
        np.log2((pssm[symbol][position] + epsilon)
                / (background_probs[symbol] + epsilon))
        for position, symbol in enumerate(sequence)
    ]
    total = 0
    for term in log_odds:
        total += term
    return total

# Example PSSM for a transcription factor binding site.
# Keys are nucleotides; each list holds that nucleotide's probability at
# motif positions 0-4 (the four values at any one position sum to 1).
pssm = {
    'A': [0.1, 0.6, 0.1, 0.1, 0.9],
    'C': [0.1, 0.1, 0.1, 0.8, 0.0],
    'G': [0.7, 0.1, 0.1, 0.0, 0.1],
    'T': [0.1, 0.2, 0.7, 0.1, 0.0]
}

# Uniform background model.
background_probs = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

# Generate random DNA sequences (1000 sequences, 5 nucleotides each, matching
# the PSSM length).
sequences = [''.join(np.random.choice(['A', 'C', 'G', 'T'], 5)) for _ in range(1000)]

scores = [pssm_score(seq, pssm, background_probs) for seq in sequences]

# Filter out any remaining -inf values (should be rare or none with pseudo-count)
scores = [score for score in scores if np.isfinite(score)]

plt.figure(figsize=(10, 6))
plt.hist(scores, bins=30, edgecolor='black')
plt.title('Distribution of PSSM Scores for Random Sequences')
plt.xlabel('PSSM Score')
plt.ylabel('Frequency')
plt.show()

# Visualize PSSM
# pd.DataFrame(pssm) uses the dict keys (nucleotides) as columns and the list
# indices (positions) as rows. Transpose it so that rows are nucleotides and
# columns are positions, which is what the axis labels below describe.
plt.figure(figsize=(10, 6))
sns.heatmap(pd.DataFrame(pssm).T, annot=True, cmap='YlGnBu')
plt.title('Position-Specific Scoring Matrix (PSSM)')
plt.xlabel('Position')
plt.ylabel('Nucleotide')
plt.show()

### Protein Family Classification

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def generate_pssm(sequences):
    """Build a (length, 20) residue-frequency matrix from protein sequences.

    The length is taken from the first sequence. Entry ``[i, j]`` is the
    fraction of sequences whose residue at position ``i`` equals the ``j``-th
    letter of ``'ACDEFGHIKLMNPQRSTVWY'``; other characters are not counted.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    length = len(sequences[0])
    depth = len(sequences)
    pssm = np.zeros((length, 20))
    for pos in range(length):
        # Gather the alignment column once, then count each residue in it.
        column = [seq[pos] for seq in sequences]
        for col, residue in enumerate(amino_acids):
            pssm[pos, col] = column.count(residue) / depth
    return pssm

def pssm_score(sequence, pssm, background_probs):
    """Return the summed log2-odds score of ``sequence`` under ``pssm``.

    Args:
        sequence: Iterable of residue letters; position ``i`` is looked up in
            row ``i`` of ``pssm``.
        pssm: 2-D array indexed as ``pssm[position, residue_index]`` where the
            residue index follows ``'ACDEFGHIKLMNPQRSTVWY'``.
        background_probs: Mapping from residue letter to background
            probability.

    Returns:
        The sum over positions of ``log2(p / background)``. Residues outside
        the 20-letter alphabet contribute nothing.

    A PSSM entry of 0 would otherwise yield ``log2(0) = -inf`` (and a NumPy
    RuntimeWarning). Such entries are scored with a tiny floor probability
    instead, so the result stays finite while remaining strongly negative.
    Scores for non-zero entries are unchanged.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    floor = 1e-300  # stand-in probability for residues never seen at a position
    score = 0
    for i, aa in enumerate(sequence):
        if aa in amino_acids:
            j = amino_acids.index(aa)
            prob = pssm[i, j]
            if not prob > 0:
                prob = floor
            score += np.log2(prob / background_probs[aa])
    return score

# Generate synthetic data
def generate_family(length, num_sequences):
    """Return ``num_sequences`` noisy copies of one random consensus sequence.

    A consensus of ``length`` residues is drawn first. For every position of
    every copy, a random replacement residue is drawn and then chosen over the
    consensus residue with probability 0.1 (the consensus is kept with
    probability 0.9).
    """
    alphabet = list('ACDEFGHIKLMNPQRSTVWY')
    consensus = ''.join(np.random.choice(alphabet, length))
    family = []
    for _ in range(num_sequences):
        residues = []
        for conserved in consensus:
            # The replacement is drawn before the keep/replace decision.
            replacement = np.random.choice(alphabet)
            residues.append(np.random.choice([conserved, replacement], p=[0.9, 0.1]))
        family.append(''.join(residues))
    return family

# Two synthetic families, each made of 100 sequences of length 50.
family1 = generate_family(50, 100)
family2 = generate_family(50, 100)

# Create PSSMs
# NOTE(review): each PSSM is built from the complete family, i.e. before the
# train/test split below, so held-out sequences also contribute to the
# profiles they are later scored against.
pssm1 = generate_pssm(family1)
pssm2 = generate_pssm(family2)

# Uniform background: each of the 20 amino acids gets probability 0.05.
background_probs = {aa: 0.05 for aa in 'ACDEFGHIKLMNPQRSTVWY'}

# Score sequences
# X holds the raw sequences; y holds the family label (0 = family1, 1 = family2).
X = family1 + family2
y = [0] * len(family1) + [1] * len(family2)

# Hold out 20% of the sequences. X_train and y_train are not used by the
# rest of this section.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Simple classifier
def classify(sequence):
    """Return 0 when ``sequence`` scores strictly higher under ``pssm1`` than
    under ``pssm2``; return 1 otherwise (ties included)."""
    family_scores = (
        pssm_score(sequence, pssm1, background_probs),
        pssm_score(sequence, pssm2, background_probs),
    )
    if family_scores[0] > family_scores[1]:
        return 0
    return 1

# Predict a family label (0 or 1) for every held-out sequence.
y_pred = [classify(seq) for seq in X_test]

# Visualize results
# Labels 0 and 1 are displayed as 'Family 1' and 'Family 2' respectively.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Family 1', 'Family 2'])
disp.plot(cmap='Blues')
plt.title('Confusion Matrix for Protein Family Classification')
plt.show()

### Multiple Sequence Alignment Scoring

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def generate_pssm(alignment):
    """Return per-column residue frequencies of ``alignment`` as a (length, 20) array.

    The number of columns comes from the first aligned sequence; the 20
    matrix columns follow the order of ``'ACDEFGHIKLMNPQRSTVWY'``.
    Characters outside that alphabet are not counted.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    width = len(alignment[0])
    depth = len(alignment)
    pssm = np.zeros((width, 20))
    for pos in range(width):
        # Fill one whole row (all 20 residue frequencies) per alignment column.
        pssm[pos] = [
            sum(row[pos] == aa for row in alignment) / depth
            for aa in amino_acids
        ]
    return pssm

def pssm_score(sequence, pssm, background_probs):
    """Return the summed log2-odds score of ``sequence`` under ``pssm``.

    Args:
        sequence: Iterable of residue letters; position ``i`` is looked up in
            row ``i`` of ``pssm``.
        pssm: 2-D array indexed as ``pssm[position, residue_index]`` where the
            residue index follows ``'ACDEFGHIKLMNPQRSTVWY'``.
        background_probs: Mapping from residue letter to background
            probability.

    Returns:
        The sum over positions of ``log2(p / background)``. Letters outside
        the 20-letter alphabet are skipped.

    Zero-probability entries are replaced by a tiny floor before taking the
    logarithm, so scoring a sequence that contains a residue absent from an
    alignment column yields a very low finite score rather than ``-inf`` and
    a NumPy RuntimeWarning. Scores for non-zero entries are unchanged.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    floor = 1e-300  # stand-in probability for residues never seen in a column
    score = 0
    for i, aa in enumerate(sequence):
        if aa not in amino_acids:
            continue
        prob = pssm[i, amino_acids.index(aa)]
        if not prob > 0:
            prob = floor
        score += np.log2(prob / background_probs[aa])
    return score

# Generate a mock multiple sequence alignment
# Four equal-length protein strings; generate_pssm treats index i of every
# string as alignment column i.
alignment = [
    "MAEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP",
    "MEEPQSDPSVEPPLSQETFSDLWKLLPPNNVLSTLPSSDSIEELFLSENVAGWLEDPDEAP",
    "MEEPQSDLSEEPPLSQETFSDLWNLLPENKLLDLPLSPEDIEQWLSEDPGPDEAPRMPEAA",
    "MEEPQSDLSIELPLSQETFSGLWKLLPPEDILPSPHCMDDLLLPQDVEEFFEGPSEALRVS"
]

# Column-wise residue frequencies and a uniform (0.05 per residue) background.
pssm = generate_pssm(alignment)
background_probs = {aa: 0.05 for aa in 'ACDEFGHIKLMNPQRSTVWY'}

# Score each sequence in the alignment
# Every sequence is scored against the PSSM it helped to build, so each of
# its residues has a non-zero frequency in its own column.
scores = [pssm_score(seq, pssm, background_probs) for seq in alignment]

# Visualize PSSM
# Rows are alignment positions, columns are the 20 amino acids.
plt.figure(figsize=(15, 8))
sns.heatmap(pssm, cmap='YlGnBu', xticklabels=list('ACDEFGHIKLMNPQRSTVWY'))
plt.title('Position-Specific Scoring Matrix (PSSM) for Multiple Sequence Alignment')
plt.xlabel('Amino Acid')
plt.ylabel('Position')
plt.show()

# Visualize scores
# One bar per aligned sequence, in the order they appear in `alignment`.
plt.figure(figsize=(10, 6))
plt.bar(range(len(scores)), scores)
plt.title('PSSM Scores for Sequences in Multiple Sequence Alignment')
plt.xlabel('Sequence Index')
plt.ylabel('PSSM Score')
plt.show()

### Motif Discovery

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def generate_pssm(motifs):
    """Return a (length, 4) matrix of A/C/G/T frequencies per motif position.

    The length is taken from the first motif. Entry ``[i, j]`` is the share of
    motifs whose base at position ``i`` equals the ``j``-th letter of ``'ACGT'``.
    """
    nucleotides = 'ACGT'
    length = len(motifs[0])
    n_motifs = len(motifs)
    pssm = np.zeros((length, 4))
    for i in range(length):
        for j in range(4):
            matches = 0
            for seq in motifs:
                if seq[i] == nucleotides[j]:
                    matches += 1
            pssm[i, j] = matches / n_motifs
    return pssm

def pssm_score(sequence, pssm, background_probs):
    """Return the summed log2-odds score of a DNA ``sequence`` under ``pssm``.

    ``pssm`` is indexed as ``pssm[position, base_index]`` with bases ordered
    ``'ACGT'``. Entries that are not greater than 0 are scored with a floor
    probability of 1e-300 so the result stays finite.
    """
    nucleotides = 'ACGT'
    floor = 1e-300  # Very low score for zero probability
    score = 0
    for i, nt in enumerate(sequence):
        prob = pssm[i, nucleotides.index(nt)]
        numerator = prob if prob > 0 else floor
        score += np.log2(numerator / background_probs[nt])
    return score

def generate_sequences_with_motif(num_sequences, seq_length, motif):
    """Return random DNA strings that each have ``motif`` planted in them.

    Every sequence starts as ``seq_length`` random bases; the motif then
    overwrites the bases beginning at a random start position chosen so that
    the motif fits inside the sequence.
    """
    alphabet = ['A', 'C', 'G', 'T']
    motif_len = len(motif)
    planted = []
    for _ in range(num_sequences):
        background = ''.join([np.random.choice(alphabet) for _ in range(seq_length)])
        offset = np.random.randint(0, seq_length - motif_len + 1)
        planted.append(background[:offset] + motif + background[offset + motif_len:])
    return planted

def scan_sequence(sequence, pssm, background_probs, window_size):
    """Score every ``window_size``-long window of ``sequence`` with ``pssm_score``.

    Returns one score per window start, in left-to-right order; the list is
    empty when the sequence is shorter than the window.
    """
    n_windows = len(sequence) - window_size + 1
    return [
        pssm_score(sequence[start:start + window_size], pssm, background_probs)
        for start in range(n_windows)
    ]

# Plant one 8-base motif into each of 100 random sequences of length 50.
motif = "ATGCATGC"
sequences = generate_sequences_with_motif(100, 50, motif)

# The PSSM is built from the single motif instance, so each position has
# frequency 1 for the motif base and 0 for the other three; the zero entries
# are handled by the floor branch of pssm_score.
pssm = generate_pssm([motif])
background_probs = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

# Collect the scores of every motif-length window across all sequences.
all_scores = []
for seq in sequences:
    scores = scan_sequence(seq, pssm, background_probs, len(motif))
    all_scores.extend(scores)

# Visualize PSSM
# Rows are motif positions, columns are the bases A, C, G, T.
plt.figure(figsize=(10, 6))
sns.heatmap(pssm, annot=True, cmap='YlGnBu', xticklabels=['A', 'C', 'G', 'T'])
plt.title('Position-Specific Scoring Matrix (PSSM) for Motif')
plt.xlabel('Nucleotide')
plt.ylabel('Position')
plt.show()

# Visualize score distribution
plt.figure(figsize=(10, 6))
plt.hist(all_scores, bins=50, edgecolor='black')
plt.title('Distribution of PSSM Scores for Sequence Windows')
plt.xlabel('PSSM Score')
plt.ylabel('Frequency')
plt.show()

### Protein Secondary Structure Prediction

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def generate_pssm(sequences, structures):
    """Estimate P(structure | position, residue) from labelled sequences.

    Args:
        sequences: Protein sequences; the profile length is taken from the
            first one.
        structures: Per-residue structure strings paired with ``sequences``
            by position in the list, using 'H' (helix), 'E' (sheet/extended)
            and 'C' (coil).

    Returns:
        Array of shape ``(length, 20, 3)``. Entry ``[i, j, k]`` is the number
        of paired sequences with residue ``j`` and structure ``k`` at position
        ``i``, divided by the number of sequences with residue ``j`` at
        position ``i``; it is 0 when that residue never occurs there.

    Counts are accumulated in a single pass over the data instead of
    rescanning every sequence for each (position, residue, structure) cell.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    structure_types = 'HEC'  # Helix, Sheet (Extended), Coil
    aa_index = {aa: j for j, aa in enumerate(amino_acids)}
    st_index = {st: k for k, st in enumerate(structure_types)}
    length = len(sequences[0])

    # Denominator: how often each residue occurs at each position.
    totals = np.zeros((length, 20))
    for seq in sequences:
        for i in range(length):
            j = aa_index.get(seq[i])
            if j is not None:
                totals[i, j] += 1

    # Numerator: joint residue/structure counts over the paired inputs.
    counts = np.zeros((length, 20, 3))
    for seq, struct in zip(sequences, structures):
        for i in range(length):
            j = aa_index.get(seq[i])
            if j is None:
                continue
            k = st_index.get(struct[i])
            if k is not None:
                counts[i, j, k] += 1

    pssm = np.zeros((length, 20, 3))
    observed = totals > 0  # leave unseen (position, residue) pairs at 0
    pssm[observed] = counts[observed] / totals[observed][:, None]
    return pssm

def pssm_score(sequence, pssm, background_probs):
    """Return a ``(len(sequence), 3)`` array of per-residue log2 ratios.

    Row ``i`` holds ``log2(pssm[i, j, k] / background_probs[aa])`` for the
    three structure classes ``k``, where ``j`` is the index of residue ``aa``
    in ``'ACDEFGHIKLMNPQRSTVWY'``. Rows for letters outside that alphabet
    stay at 0.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    scores = np.zeros((len(sequence), 3))
    for i, aa in enumerate(sequence):
        if aa not in amino_acids:
            continue
        per_structure = pssm[i, amino_acids.index(aa)]
        for k in range(3):
            scores[i, k] = np.log2(per_structure[k] / background_probs[aa])
    return scores

# Generate synthetic data
def generate_data(num_sequences, length):
    """Return ``(sequences, structures)``: two lists of random strings.

    Both lists hold ``num_sequences`` strings of ``length`` characters.
    Sequences are drawn from the 20 amino-acid letters and structures from
    'H', 'E', 'C'; the two are drawn independently of each other.
    """
    def random_strings(alphabet):
        letters = list(alphabet)
        return [
            ''.join(np.random.choice(letters) for _ in range(length))
            for _ in range(num_sequences)
        ]

    # All sequences are drawn before any structure string.
    sequences = random_strings('ACDEFGHIKLMNPQRSTVWY')
    structures = random_strings('HEC')
    return sequences, structures

# 1000 random sequences of length 50 with independently drawn random
# structure labels (see generate_data).
sequences, structures = generate_data(1000, 50)

# Create PSSM
# NOTE(review): the PSSM is estimated from all sequences, before the
# train/test split below, so the held-out sequences contribute to it.
pssm = generate_pssm(sequences, structures)
# Uniform background: 0.05 for each of the 20 amino acids.
background_probs = {aa: 0.05 for aa in 'ACDEFGHIKLMNPQRSTVWY'}

# Split data
# 20% of the (sequence, structure) pairs are held out for evaluation.
X_train, X_test, y_train, y_test = train_test_split(sequences, structures, test_size=0.2, random_state=42)

# Predict secondary structure
def predict_structure(sequence):
    """Return a string of 'H'/'E'/'C' labels, one per residue of ``sequence``.

    Each residue gets the structure class with the highest score from
    ``pssm_score``; ties go to the earliest class in 'HEC' order.
    """
    labels = 'HEC'
    per_residue = pssm_score(sequence, pssm, background_probs)
    best_classes = np.argmax(per_residue, axis=1)
    return ''.join(labels[k] for k in best_classes)

# Predict a structure string for every held-out sequence.
y_pred = [predict_structure(seq) for seq in X_test]

# Flatten predictions and true values
y_test_flat = ''.join(y_test)
y_pred_flat = ''.join(y_pred)

# Visualize results
# Pass the label order explicitly: without `labels=`, confusion_matrix sorts
# the labels (C, E, H), which would not match the H/E/C tick labels.
structure_labels = ['H', 'E', 'C']
cm = confusion_matrix(list(y_test_flat), list(y_pred_flat), labels=structure_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=structure_labels)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix for Secondary Structure Prediction')
plt.show()

# Visualize PSSM for a specific position
# pssm[position] is a (20, 3) slice: rows are amino acids, columns are the
# structure classes H, E, C.
position = 25  # Choose a specific position to visualize
plt.figure(figsize=(12, 8))
sns.heatmap(pssm[position], annot=True, cmap='YlGnBu',
            xticklabels=['H', 'E', 'C'],
            yticklabels=list('ACDEFGHIKLMNPQRSTVWY'))
plt.title(f'PSSM for Position {position} in Secondary Structure Prediction')
plt.xlabel('Secondary Structure')
plt.ylabel('Amino Acid')
plt.show()

## Instances where we use the Hidden Markov Model (HMM) Forward Algorithm Equation: α_t(i) = [Σ_j α_(t-1)(j) * a_ji] * b_i(o_t)

* **Protein Secondary Structure Prediction**: Helps in understanding protein folding and potential drug binding sites.
* **Protein Disorder Prediction**: Identifies flexible regions that could be important for protein-protein interactions or drug binding.
* **Protein-Ligand Binding Site Prediction**: Directly applicable to drug discovery by predicting where small molecules might bind to a protein.
* **Protein Stability Prediction**: Useful for assessing the impact of mutations or drug binding on protein stability.
* **Protein-Protein Interaction Site Prediction**: Important for understanding protein function and identifying potential sites for therapeutic intervention.

### Protein Secondary Structure Prediction:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import entropy

def forward_algorithm(obs, states, start_p, trans_p, emit_p):
    """Compute the unscaled HMM forward matrix.

    ``alpha[t, i]`` is the sum over ``j`` of ``alpha[t-1, j] * trans_p[j, i]``
    multiplied by ``emit_p[i, obs[t]]``; row 0 is
    ``start_p * emit_p[:, obs[0]]``. ``states`` is only used for its length.
    Returns an array of shape ``(len(obs), len(states))``.
    """
    n_steps, n_states = len(obs), len(states)
    alpha = np.zeros((n_steps, n_states))
    alpha[0] = start_p * emit_p[:, obs[0]]
    for t in range(1, n_steps):
        previous = alpha[t - 1]
        # Fill the whole row for step t from the row for step t-1.
        alpha[t] = [
            np.sum(previous * trans_p[:, i]) * emit_p[i, obs[t]]
            for i in range(n_states)
        ]
    return alpha

# Define states and observations
# `states` is passed to forward_algorithm (which only uses its length) and
# reused for plot labels. `observations` names the integer codes used in
# protein_seq and is not referenced again in this section.
states = ['Helix', 'Sheet', 'Coil']
observations = ['Hydrophobic', 'Polar', 'Charged']

# Example parameters (these should be learned from data in practice)
# start_p[i]: initial probability of state i.
# trans_p[j, i]: probability of moving from state j to state i (rows sum to 1).
# emit_p[i, o]: probability that state i emits observation o (rows sum to 1).
start_p = np.array([0.3, 0.4, 0.3])
trans_p = np.array([[0.7, 0.2, 0.1],
                    [0.1, 0.8, 0.1],
                    [0.2, 0.2, 0.6]])
emit_p = np.array([[0.4, 0.3, 0.3],
                   [0.3, 0.5, 0.2],
                   [0.2, 0.3, 0.5]])

# Example protein sequence
protein_seq = [0, 1, 2, 1, 0, 2, 1, 0]  # 0: Hydrophobic, 1: Polar, 2: Charged

# Run forward algorithm
alpha = forward_algorithm(protein_seq, states, start_p, trans_p, emit_p)

# Visualize results
# alpha is (position, state); it is transposed so states run along the y-axis.
plt.figure(figsize=(12, 6))
plt.imshow(alpha.T, aspect='auto', cmap='viridis')
plt.colorbar(label='Forward Probability')
plt.title('HMM Forward Algorithm: Protein Secondary Structure Prediction')
plt.xlabel('Position in Sequence')
plt.ylabel('State')
plt.yticks(range(len(states)), states)
plt.show()

# Calculate structure probabilities
# Normalising each row of alpha gives the distribution over states at that
# position given the observations up to and including it.
structure_probs = alpha / alpha.sum(axis=1, keepdims=True)

plt.figure(figsize=(12, 6))
plt.stackplot(range(len(protein_seq)), structure_probs.T, labels=states)
plt.title('Protein Secondary Structure Probabilities')
plt.xlabel('Position in Sequence')
plt.ylabel('Probability')
plt.legend(loc='upper left')
plt.show()

### Protein Disorder Prediction

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def forward_algorithm(obs, states, start_p, trans_p, emit_p):
    """Return the unscaled forward probabilities of an HMM.

    The result has one row per observation and one column per state. Row 0
    is ``start_p * emit_p[:, obs[0]]``; every later row is obtained from the
    previous one through the transition columns and the emission probability
    of the current observation. ``states`` is only used for its length.
    """
    n_states = len(states)
    alpha = np.zeros((len(obs), n_states))
    alpha[0] = start_p * emit_p[:, obs[0]]
    t = 1
    while t < len(obs):
        symbol = obs[t]
        for state in range(n_states):
            reach = np.sum(alpha[t - 1] * trans_p[:, state])
            alpha[t, state] = reach * emit_p[state, symbol]
        t += 1
    return alpha

# Define states and observations
# Index 0 = 'Ordered', index 1 = 'Disordered'. `observations` names the
# integer codes used in protein_seq.
states = ['Ordered', 'Disordered']
observations = ['Hydrophobic', 'Polar', 'Charged']

# Example parameters
# start_p[i]: initial probability of state i.
# trans_p[j, i]: probability of moving from state j to state i.
# emit_p[i, o]: probability that state i emits observation o.
start_p = np.array([0.6, 0.4])
trans_p = np.array([[0.8, 0.2],
                    [0.3, 0.7]])
emit_p = np.array([[0.5, 0.3, 0.2],
                   [0.2, 0.4, 0.4]])

# Example protein sequence
protein_seq = [0, 1, 2, 1, 0, 2, 1, 0, 2, 1]  # 0: Hydrophobic, 1: Polar, 2: Charged

# Run forward algorithm
alpha = forward_algorithm(protein_seq, states, start_p, trans_p, emit_p)

# Calculate disorder probability
# Column 1 of alpha is the 'Disordered' state; dividing by the row sum
# normalises it to a probability at each position.
disorder_prob = alpha[:, 1] / alpha.sum(axis=1)

# Visualize results
plt.figure(figsize=(12, 6))
plt.plot(disorder_prob, label='Disorder Probability')
plt.fill_between(range(len(protein_seq)), 0, disorder_prob, alpha=0.3)
plt.title('Protein Disorder Prediction using HMM Forward Algorithm')
plt.xlabel('Position in Sequence')
plt.ylabel('Disorder Probability')
plt.legend()
plt.ylim(0, 1)
plt.show()

# Visualize state probabilities
# Same normalisation applied to both states, shown as a stacked area plot.
state_probs = alpha / alpha.sum(axis=1, keepdims=True)
plt.figure(figsize=(12, 6))
plt.stackplot(range(len(protein_seq)), state_probs.T, labels=states)
plt.title('Protein Order/Disorder State Probabilities')
plt.xlabel('Position in Sequence')
plt.ylabel('Probability')
plt.legend(loc='upper left')
plt.show()

### Protein-Ligand Binding Site Prediction

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def forward_algorithm(obs, states, start_p, trans_p, emit_p):
    """Run the HMM forward recursion without scaling.

    Returns ``alpha`` with shape ``(len(obs), len(states))`` where
    ``alpha[0] = start_p * emit_p[:, obs[0]]`` and, for ``t >= 1``,
    ``alpha[t, i] = sum_j(alpha[t-1, j] * trans_p[j, i]) * emit_p[i, obs[t]]``.
    Only the length of ``states`` is used.
    """
    def step(previous, symbol):
        # One recursion step: new row computed from the previous row.
        return [
            np.sum(previous * trans_p[:, i]) * emit_p[i, symbol]
            for i in range(len(states))
        ]

    alpha = np.zeros((len(obs), len(states)))
    alpha[0] = start_p * emit_p[:, obs[0]]
    for t in range(1, len(obs)):
        alpha[t] = step(alpha[t - 1], obs[t])
    return alpha

# Define states and observations
# Index 0 = 'Non-binding', index 1 = 'Binding'. The five observation names
# label the integer codes 0-4 used in protein_seq and the emission plot.
states = ['Non-binding', 'Binding']
observations = ['Hydrophobic', 'Polar', 'Charged', 'Glycine', 'Proline']

# Example parameters
# start_p[i]: initial probability of state i.
# trans_p[j, i]: probability of moving from state j to state i.
# emit_p[i, o]: probability that state i emits observation o.
start_p = np.array([0.8, 0.2])
trans_p = np.array([[0.9, 0.1],
                    [0.2, 0.8]])
emit_p = np.array([[0.3, 0.3, 0.2, 0.1, 0.1],
                   [0.1, 0.2, 0.4, 0.2, 0.1]])

# Example protein sequence
# Each entry is an index into `observations`.
protein_seq = [0, 1, 2, 3, 4, 2, 1, 0, 2, 1, 3, 2, 0, 1, 4]

# Run forward algorithm
alpha = forward_algorithm(protein_seq, states, start_p, trans_p, emit_p)

# Calculate binding site probability
# Column 1 of alpha is the 'Binding' state, normalised by the row sum.
binding_prob = alpha[:, 1] / alpha.sum(axis=1)

# Visualize results
plt.figure(figsize=(12, 6))
plt.plot(binding_prob, label='Binding Site Probability')
plt.fill_between(range(len(protein_seq)), 0, binding_prob, alpha=0.3)
plt.title('Protein-Ligand Binding Site Prediction using HMM Forward Algorithm')
plt.xlabel('Position in Sequence')
plt.ylabel('Binding Site Probability')
plt.legend()
plt.ylim(0, 1)
plt.show()

# Visualize emission probabilities
# Rows are states, columns are observation types; each cell is annotated
# with its probability.
plt.figure(figsize=(10, 6))
plt.imshow(emit_p, aspect='auto', cmap='YlOrRd')
plt.colorbar(label='Emission Probability')
plt.title('Emission Probabilities for Binding Site Prediction')
plt.xlabel('Amino Acid Type')
plt.ylabel('State')
plt.xticks(range(len(observations)), observations)
plt.yticks(range(len(states)), states)
for i in range(len(states)):
    for j in range(len(observations)):
        plt.text(j, i, f'{emit_p[i, j]:.2f}', ha='center', va='center')
plt.show()

### Protein Stability Prediction

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def forward_algorithm(obs, states, start_p, trans_p, emit_p):
    """Fill and return the forward matrix ``alpha`` of an HMM (no scaling).

    ``alpha`` has one row per observation and one column per state. The
    first row combines ``start_p`` with the emission probabilities of
    ``obs[0]``; each later row combines the previous row, the transition
    column into each state and the emission probability of that state for
    the current observation. ``states`` only supplies the number of states.
    """
    state_ids = range(len(states))
    alpha = np.zeros((len(obs), len(states)))
    alpha[0] = start_p * emit_p[:, obs[0]]
    for t in range(1, len(obs)):
        for i in state_ids:
            incoming = alpha[t - 1] * trans_p[:, i]
            emission = emit_p[i, obs[t]]
            alpha[t, i] = np.sum(incoming) * emission
    return alpha

# Define states and observations
# State indices: 0 = 'Unstable', 1 = 'Moderately Stable', 2 = 'Highly Stable'.
states = ['Unstable', 'Moderately Stable', 'Highly Stable']
observations = ['Hydrophobic', 'Polar', 'Charged']

# Example parameters
# start_p[i]: initial probability of state i.
# trans_p[j, i]: probability of moving from state j to state i.
# emit_p[i, o]: probability that state i emits observation o.
start_p = np.array([0.2, 0.5, 0.3])
trans_p = np.array([[0.7, 0.2, 0.1],
                    [0.1, 0.7, 0.2],
                    [0.1, 0.2, 0.7]])
emit_p = np.array([[0.5, 0.3, 0.2],
                   [0.3, 0.4, 0.3],
                   [0.2, 0.3, 0.5]])

# Example protein sequence
# Each entry is an index into `observations`.
protein_seq = [0, 1, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 2, 0]

# Run forward algorithm
alpha = forward_algorithm(protein_seq, states, start_p, trans_p, emit_p)

# Calculate stability probabilities
# Row-normalised forward values: one distribution over states per position.
stability_probs = alpha / alpha.sum(axis=1, keepdims=True)

# Visualize results
plt.figure(figsize=(12, 6))
plt.stackplot(range(len(protein_seq)), stability_probs.T, labels=states)
plt.title('Protein Stability Prediction using HMM Forward Algorithm')
plt.xlabel('Position in Sequence')
plt.ylabel('Probability')
plt.legend(loc='upper left')
plt.show()

# Calculate overall stability score
# Weighted average over positions with weights 0, 1 and 2 for the three
# states, so the score lies between 0 (all unstable) and 2 (all highly stable).
stability_score = np.sum(stability_probs[:, 1] + 2 * stability_probs[:, 2]) / len(protein_seq)

# Bar chart of the mean probability of each state across the sequence.
plt.figure(figsize=(8, 6))
plt.bar(['Unstable', 'Moderately Stable', 'Highly Stable'], 
        [np.mean(stability_probs[:, 0]), np.mean(stability_probs[:, 1]), np.mean(stability_probs[:, 2])])
plt.title(f'Average Stability Distribution\nOverall Stability Score: {stability_score:.2f}')
plt.ylabel('Average Probability')
plt.show()

### Protein-Protein Interaction Site Prediction

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import find_peaks

def forward_algorithm(obs, states, start_p, trans_p, emit_p):
    """Compute forward probabilities for an observation sequence (unscaled).

    Returns a ``(len(obs), len(states))`` array. Row 0 equals
    ``start_p * emit_p[:, obs[0]]``; row ``t`` is derived from row ``t-1`` via
    ``sum_j(alpha[t-1, j] * trans_p[j, i]) * emit_p[i, obs[t]]`` for each
    state ``i``. ``states`` is consulted only for its length.
    """
    num_states = len(states)
    num_obs = len(obs)
    alpha = np.zeros((num_obs, num_states))
    alpha[0] = start_p * emit_p[:, obs[0]]
    for t in range(1, num_obs):
        row = alpha[t]  # view into alpha; writes below land in the matrix
        for i in range(num_states):
            row[i] = np.sum(alpha[t - 1] * trans_p[:, i]) * emit_p[i, obs[t]]
    return alpha

# Define states and observations
# Index 0 = 'Non-interacting', index 1 = 'Interacting'. The four observation
# names label the integer codes 0-3 used in protein_seq.
states = ['Non-interacting', 'Interacting']
observations = ['Hydrophobic', 'Polar', 'Charged', 'Special']

# Example parameters
# start_p[i]: initial probability of state i.
# trans_p[j, i]: probability of moving from state j to state i.
# emit_p[i, o]: probability that state i emits observation o.
start_p = np.array([0.7, 0.3])
trans_p = np.array([[0.8, 0.2],
                    [0.3, 0.7]])
emit_p = np.array([[0.4, 0.3, 0.2, 0.1],
                   [0.2, 0.3, 0.4, 0.1]])

# Example protein sequence
# Each entry is an index into `observations`.
protein_seq = [0, 1, 2, 3, 1, 2, 0, 1, 2, 3, 0, 1, 2, 1, 0, 2, 1, 3, 2, 1]

# Run forward algorithm
alpha = forward_algorithm(protein_seq, states, start_p, trans_p, emit_p)

# Calculate interaction probability
# Column 1 of alpha is the 'Interacting' state, normalised by the row sum.
interaction_prob = alpha[:, 1] / alpha.sum(axis=1)

# Visualize results
plt.figure(figsize=(12, 6))
plt.plot(interaction_prob, label='Interaction Probability')
plt.fill_between(range(len(protein_seq)), 0, interaction_prob, alpha=0.3)
plt.title('Protein-Protein Interaction Site Prediction using HMM Forward Algorithm')
plt.xlabel('Position in Sequence')
plt.ylabel('Interaction Probability')
plt.legend()
plt.ylim(0, 1)

# Find peaks (potential interaction sites)
# Keep local maxima of at least 0.5 that are at least 3 positions apart and
# mark them with red crosses on the same figure.
peaks, _ = find_peaks(interaction_prob, height=0.5, distance=3)
plt.plot(peaks, interaction_prob[peaks], "rx", label='Potential Interaction Sites')
# Called a second time so the legend also lists the peak markers.
plt.legend()
plt.show()

# Visualize state probabilities
state_probs = alpha / alpha.sum(axis=1, keepdims=True)
plt.figure(figsize=(12, 6))
plt.stackplot(range(len(protein_seq)), state_probs.T, labels=states)
plt.title('Protein-Protein Interaction State Probabilities')
plt.xlabel('Position in Sequence')
plt.ylabel('Probability')
plt.legend(loc='upper left')
plt.show()

## Instances where we use the Needleman-Wunsch Algorithm (for global sequence alignment) Equation: F(i,j) = max(F(i-1,j-1) + s(x_i, y_j), F(i-1,j) + g, F(i,j-1) + g)

* **Protein Homology Detection**: Helps identify similar proteins that might have similar functions or be affected by the same drugs.
* **Drug Target Identification**: Assists in finding proteins that are most similar to known drug targets.
* **Protein-Ligand Binding Site Prediction**: Aids in identifying potential binding sites for drugs on a protein sequence.
* **Protein Family Classification**: Helps categorize proteins into families, which can be useful for predicting function and drug interactions.
* **Drug Resistance Mutation Analysis**: Identifies mutations that might confer drug resistance and assesses their impact on the protein sequence.

### Protein Homology Detection

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def needleman_wunsch(seq1, seq2, match_score=1, mismatch_score=-1, gap_penalty=-2):
    """Return the full Needleman-Wunsch score matrix for two sequences.

    The matrix has shape ``(len(seq1) + 1, len(seq2) + 1)``. Row 0 and column
    0 hold cumulative gap penalties; every other cell is the best of a
    diagonal move (match or mismatch), a move down (gap) and a move right
    (gap). The bottom-right cell is the global alignment score.
    """
    rows, cols = len(seq1) + 1, len(seq2) + 1
    score_matrix = np.zeros((rows, cols))

    # Border cells: aligning a prefix against nothing costs one gap per symbol.
    for i in range(rows):
        score_matrix[i, 0] = i * gap_penalty
    for j in range(cols):
        score_matrix[0, j] = j * gap_penalty

    for i, a in enumerate(seq1, start=1):
        for j, b in enumerate(seq2, start=1):
            substitution = match_score if a == b else mismatch_score
            score_matrix[i, j] = max(
                score_matrix[i - 1, j - 1] + substitution,
                score_matrix[i - 1, j] + gap_penalty,
                score_matrix[i, j - 1] + gap_penalty,
            )

    return score_matrix

# Example protein sequences
protein1 = "MAEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP"
protein2 = "MEEPQSDLSIELPLSQETFSGLWKLLPPEDILPSPHCMDDLLLPQDVEEFFEGPSEALRVS"

# Perform alignment
# Returns the full dynamic-programming score matrix (default scoring:
# match +1, mismatch -1, gap -2).
alignment_matrix = needleman_wunsch(protein1, protein2)

# Visualize alignment matrix
# Rows correspond to prefixes of protein1, columns to prefixes of protein2.
plt.figure(figsize=(12, 10))
sns.heatmap(alignment_matrix, cmap='YlGnBu')
plt.title('Needleman-Wunsch Alignment Matrix')
plt.xlabel('Protein 2')
plt.ylabel('Protein 1')
plt.show()

# Calculate similarity score
# Global alignment score (bottom-right cell) divided by the longer length.
similarity_score = alignment_matrix[-1, -1] / max(len(protein1), len(protein2))

print(f"Similarity Score: {similarity_score:.2f}")

# Visualize sequence similarity
# NOTE(review): this compares residues at the same index of the two raw
# strings (green = identical, red = different or past the end of protein2);
# it does not use the alignment computed above.
plt.figure(figsize=(12, 4))
for i in range(len(protein1)):
    if i < len(protein2) and protein1[i] == protein2[i]:
        plt.axvline(x=i, color='green', alpha=0.5)
    else:
        plt.axvline(x=i, color='red', alpha=0.2)
plt.title('Sequence Similarity Visualization')
plt.xlabel('Sequence Position')
plt.yticks([])
plt.show()

### Drug Target Identification

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
from Bio.Seq import Seq

def needleman_wunsch(seq1, seq2, match_score=1, mismatch_score=-1, gap_penalty=-2):
    """Return the global (Needleman-Wunsch) alignment score of two sequences.

    A ``(len(seq1) + 1, len(seq2) + 1)`` table is filled with the standard
    recurrence (diagonal match/mismatch, vertical gap, horizontal gap) and
    only its bottom-right value is returned.
    """
    n, m = len(seq1), len(seq2)
    grid = np.zeros((n + 1, m + 1))

    # Border cells: cumulative gap penalties.
    for i in range(n + 1):
        grid[i, 0] = i * gap_penalty
    for j in range(m + 1):
        grid[0, j] = j * gap_penalty

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            if seq1[i - 1] == seq2[j - 1]:
                best = grid[i - 1, j - 1] + match_score
            else:
                best = grid[i - 1, j - 1] + mismatch_score
            from_above = grid[i - 1, j] + gap_penalty
            if from_above > best:
                best = from_above
            from_left = grid[i, j - 1] + gap_penalty
            if from_left > best:
                best = from_left
            grid[i, j] = best

    return grid[-1, -1]

# Example drug target sequence
drug_target = "MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPNEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPVSSD"

# Example protein database (simplified for demonstration)
# NOTE(review): "Protein B" contains the letter 'O' ("...YCKNOGGHF..."), which
# is not one of the 20 amino-acid letters used elsewhere in this file; it may
# be a typo. It is only scored as a mismatch or gap by needleman_wunsch.
protein_database = {
    "Protein A": "MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPNEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPVSSD",
    "Protein B": "MSAEGPHPPTLQFNLPPGNYKRPKRLYCKNOGGHFLRILPNGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPNEECLFLERLEENHYNTYTSKKHADKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPVSSD",
    "Protein C": "MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPNEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV",
    "Protein D": "MTTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPNEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPVSSD",
    "Protein E": "MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPNEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPVSSDCVP"
}

# Calculate alignment scores
# needleman_wunsch here returns only the final global alignment score.
alignment_scores = {}
for name, sequence in protein_database.items():
    alignment_scores[name] = needleman_wunsch(drug_target, sequence)

# Visualize alignment scores
plt.figure(figsize=(10, 6))
sns.barplot(x=list(alignment_scores.keys()), y=list(alignment_scores.values()))
plt.title('Alignment Scores for Potential Drug Targets')
plt.xlabel('Proteins')
plt.ylabel('Alignment Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Identify top drug target
# The database entry with the highest alignment score against drug_target.
top_target = max(alignment_scores, key=alignment_scores.get)
print(f"Top drug target: {top_target}")

### Protein-Ligand Binding Site Prediction

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def needleman_wunsch(seq1, seq2, match_score=1, mismatch_score=-1, gap_penalty=-2):
    """Fill and return the Needleman-Wunsch dynamic-programming table.

    The returned array has shape ``(len(seq1) + 1, len(seq2) + 1)``; its
    bottom-right entry is the global alignment score under the given match,
    mismatch and (linear) gap scores.
    """
    height = len(seq1) + 1
    width = len(seq2) + 1
    score_matrix = np.zeros((height, width))

    # First column and first row: leading gaps only.
    for r in range(height):
        score_matrix[r][0] = r * gap_penalty
    for c in range(width):
        score_matrix[0][c] = c * gap_penalty

    for r in range(1, height):
        above = score_matrix[r - 1]
        current = score_matrix[r]  # view; assignments update score_matrix
        for c in range(1, width):
            pair_score = match_score if seq1[r - 1] == seq2[c - 1] else mismatch_score
            candidates = (
                above[c - 1] + pair_score,
                above[c] + gap_penalty,
                current[c - 1] + gap_penalty,
            )
            current[c] = max(candidates)

    return score_matrix

def predict_binding_sites(protein_seq, ligand_seq, window_size=10):
    """Score every ``window_size``-long protein window against ``ligand_seq``.

    Each window is globally aligned to the ligand sequence with
    ``needleman_wunsch`` and the final alignment score is recorded. Returns
    one score per window start, in order; the list is empty when the protein
    is shorter than the window.
    """
    last_start = len(protein_seq) - window_size
    return [
        needleman_wunsch(protein_seq[start:start + window_size], ligand_seq)[-1, -1]
        for start in range(last_start + 1)
    ]

# Example protein and ligand sequences
protein_sequence = "MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPNEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPVSSD"
ligand_sequence = "YCSNGG"

# Width of the sliding protein window. Defined once so that the scan and the
# printed window below always use the same length.
binding_window = 10

# Predict binding sites
# One alignment score per window start position.
binding_scores = predict_binding_sites(protein_sequence, ligand_sequence, window_size=binding_window)

# Visualize binding site predictions
plt.figure(figsize=(12, 6))
plt.plot(binding_scores)
plt.title('Protein-Ligand Binding Site Prediction')
plt.xlabel('Protein Sequence Position')
plt.ylabel('Binding Score')
plt.show()

# Identify top binding sites
# Indices of the three highest-scoring window starts, best first.
top_sites = sorted(range(len(binding_scores)), key=lambda i: binding_scores[i], reverse=True)[:3]

print("Top 3 predicted binding sites:")
for site in top_sites:
    print(f"Position {site}: {protein_sequence[site:site+binding_window]}")

# Visualize protein sequence with highlighted binding sites
# The scores are drawn as a one-row heat strip; dashed lines mark the start
# of each top-scoring window.
plt.figure(figsize=(12, 4))
plt.imshow([binding_scores], cmap='YlOrRd', aspect='auto')
plt.colorbar(label='Binding Score')
plt.title('Protein Sequence with Predicted Binding Sites')
plt.xlabel('Protein Sequence Position')
plt.yticks([])
for site in top_sites:
    plt.axvline(x=site, color='blue', linestyle='--', alpha=0.7)
plt.show()

### Protein Family Classification

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

def needleman_wunsch(seq1, seq2, match_score=1, mismatch_score=-1, gap_penalty=-2):
    """Return the optimal global alignment score of two sequences.

    Fills the Needleman-Wunsch dynamic-programming table with a linear gap
    penalty and returns only its bottom-right cell, i.e. the score of the
    best end-to-end alignment of ``seq1`` against ``seq2``.
    """
    n_rows, n_cols = len(seq1) + 1, len(seq2) + 1
    table = np.zeros((n_rows, n_cols))

    # Borders: aligning a prefix against nothing costs one gap per residue
    table[:, 0] = np.arange(n_rows) * gap_penalty
    table[0, :] = np.arange(n_cols) * gap_penalty

    for row, residue1 in enumerate(seq1, start=1):
        for col, residue2 in enumerate(seq2, start=1):
            substitution = match_score if residue1 == residue2 else mismatch_score
            table[row, col] = max(
                table[row - 1, col - 1] + substitution,  # align the two residues
                table[row - 1, col] + gap_penalty,       # gap in seq2
                table[row, col - 1] + gap_penalty,       # gap in seq1
            )

    return table[-1, -1]

# Example protein sequences from different families
# Maps a family label to a list of amino-acid sequences (one-letter codes).
# Dictionary order matters: downstream code flattens the families in this order.
# NOTE(review): the family labels are taken as given here; confirm them against
# the original sequence sources before drawing biological conclusions.
protein_families = {
    "Kinase Family": [
        "MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPNEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPVSSD",
        "MSAEGPHPPTLQFNLPPGNYKRPKRLYCKNGGHFLRILPNGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPNEECLFLERLEENHYNTYTSKKHADKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPVSSD",
        "MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPNEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV"
    ],
    "Receptor Family": [
        "MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSNFGPIFMTIPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA",
        "MDVLPGNDSLALPWPLEYELDLHSSEQKELLRRQLSGLLPPGSYLYSLTVVFVALFVLGVTNSLVVSVILWFRKLHCTRNYFIINLYVVNLLLADDLLVSVSLPFKIAYHLGENTEFYVDLCVDCFFVTDLLSVSASVMNLLIISFDRYFSVTRPLSYRAKRTPRRAALMIGLAWVISFVLWAPAILFWQYIVGMEAEGGTVCTVLSFIVYALGYSLLFIAVISLDRYLAIVHPLKPRLSQRTLYLVGTYPESTKTWDVHIQMTLNDLLWFLNFVLALIICNAIVGVVSHEVRKRKWRVHLHIFKVTIQSAGPWTINRVFLDTLRRLRKARVVTPPRKVQQ",
        "MKTIIALSYIFCLVFADYKDDDDLEVLFQGPGSRIHSQEDFNFLNGSRDLDRQFTFSVEFDGELQVTLGEPEGMVRGADATGGTLQVVYGTGGPILPSQVFTDGELKEVQLVLDGSFKAGVDAGQYQADFSLNLRSFYAALGKEGEAQGLTSILEGKSCNLRSFDVSDVYGRGAGTGNPGMTGLDGAGGFIDVEQNGAPGVPAGAPGNLLDIQVAPAGAPLPGDILDKPDAPIKSAPLPLQTAPGVPGFSDEELGTEVPAGHAPNIQVTRGAKDALTAVGLDMPPDKPYGRTVMVTAQSLDKYFGIAPTVKVTFLNIRNGENAQGVVGPWWDVETRDSARDFHINLTVGVDKLGELVIELNANDPKVEGADISFTYDSKRDKWVLIGTDDGSVSVGRNERLYVNVDPSAQRAVAALREKLDGTPVFAVSVDLPSSSVSGVKPIAVDTKPNDSALKVEYTTAGKKASTVTFALRPSNLNVRAVTAAPAGNVTLDGNQRVLTVNFTVTPEIPAAGKRVLVLQSAEVNWGYGRRFEVDKAVNLNAGALLIDSGQRSVTFVSDGARVALSALEFLTAESVAAGADLRLTFDGRNGRWVLV"
    ]
}

# Calculate pairwise alignment scores
all_sequences = [seq for family in protein_families.values() for seq in family]
n_sequences = len(all_sequences)
alignment_matrix = np.zeros((n_sequences, n_sequences))

# Record which rows/columns of the matrix belong to each family. The ranges are
# derived from the dictionary itself (same order as the flattening above), so
# families of any size are labelled correctly instead of assuming 3 per family.
family_index_map = {}
offset = 0
for family, sequences in protein_families.items():
    family_index_map[family] = list(range(offset, offset + len(sequences)))
    offset += len(sequences)

# The alignment score is symmetric, so each pair is aligned once and the
# result is mirrored across the diagonal.
for i in range(n_sequences):
    for j in range(i, n_sequences):
        score = needleman_wunsch(all_sequences[i], all_sequences[j])
        alignment_matrix[i, j] = score
        alignment_matrix[j, i] = score

# Perform dimensionality reduction using PCA
# Each sequence is represented by its row of alignment scores against all sequences.
pca = PCA(n_components=2)
embedded = pca.fit_transform(alignment_matrix)

# Visualize protein family classification
plt.figure(figsize=(12, 8))
colors = ['r', 'b']
markers = ['o', 's']
for i, (family, family_indices) in enumerate(family_index_map.items()):
    # Cycle through the style lists so extra families do not raise IndexError
    plt.scatter(embedded[family_indices, 0], embedded[family_indices, 1],
                c=colors[i % len(colors)], marker=markers[i % len(markers)], label=family, s=100)

plt.title('Protein Family Classification using Needleman-Wunsch Algorithm')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

# Visualize alignment score matrix
plt.figure(figsize=(10, 8))
sns.heatmap(alignment_matrix, cmap='YlGnBu', annot=True, fmt='.0f')
plt.title('Pairwise Alignment Score Matrix')
plt.xlabel('Protein Sequence Index')
plt.ylabel('Protein Sequence Index')
plt.show()

# Calculate and print average within-family and between-family scores
kinase_indices = family_index_map["Kinase Family"]
receptor_indices = family_index_map["Receptor Family"]

# Self-alignments (i == j) are excluded from the within-family averages
kinase_within = np.mean([alignment_matrix[i, j] for i in kinase_indices for j in kinase_indices if i != j])
receptor_within = np.mean([alignment_matrix[i, j] for i in receptor_indices for j in receptor_indices if i != j])
between_family = np.mean([alignment_matrix[i, j] for i in kinase_indices for j in receptor_indices])

print(f"Average Kinase Family Score: {kinase_within:.2f}")
print(f"Average Receptor Family Score: {receptor_within:.2f}")
print(f"Average Between-Family Score: {between_family:.2f}")

#### Drug Resistance Mutation Analysis:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def needleman_wunsch(seq1, seq2, match_score=1, mismatch_score=-1, gap_penalty=-2):
    """Build the Needleman-Wunsch score matrix for two sequences.

    Returns the full ``(len(seq1) + 1, len(seq2) + 1)`` dynamic-programming
    table computed with a linear gap penalty. Cell ``[i, j]`` holds the best
    score for globally aligning the first ``i`` items of ``seq1`` with the
    first ``j`` items of ``seq2``.
    """
    height, width = len(seq1) + 1, len(seq2) + 1
    scores = np.zeros((height, width))

    # First column / first row: a prefix aligned purely against gaps
    scores[:, 0] = np.arange(height) * gap_penalty
    scores[0, :] = np.arange(width) * gap_penalty

    for i, item1 in enumerate(seq1, start=1):
        for j, item2 in enumerate(seq2, start=1):
            pair_score = match_score if item1 == item2 else mismatch_score
            via_diagonal = scores[i - 1, j - 1] + pair_score
            via_above = scores[i - 1, j] + gap_penalty
            via_left = scores[i, j - 1] + gap_penalty
            scores[i, j] = max(via_diagonal, via_above, via_left)

    return scores

def backtrack(score_matrix, seq1, seq2, match_score=1, mismatch_score=-1, gap_penalty=-2):
    """Recover one optimal global alignment from a Needleman-Wunsch matrix.

    Walks from the bottom-right cell of ``score_matrix`` back to the origin,
    at each step choosing the move (diagonal, then gap in ``seq2``, then gap
    in ``seq1``) whose score reproduces the current cell.

    Args:
        score_matrix: table indexed as ``score_matrix[i][j]`` with
            ``len(seq1) + 1`` rows and ``len(seq2) + 1`` columns.
        seq1, seq2: the sequences the matrix was built from.
        match_score, mismatch_score, gap_penalty: the scoring used to build
            ``score_matrix``. The defaults match ``needleman_wunsch``'s
            defaults, so existing three-argument calls behave as before.

    Returns:
        A tuple ``(aligned_seq1, aligned_seq2)`` of equal-length strings in
        which ``'-'`` marks a gap.

    Raises:
        ValueError: if a cell cannot be explained by any move under the given
            scoring (e.g. the matrix was built with different parameters).
            Previously this situation made the loop spin forever.
    """
    align1, align2 = [], []
    i, j = len(seq1), len(seq2)

    while i > 0 and j > 0:
        score_current = score_matrix[i][j]
        substitution = match_score if seq1[i-1] == seq2[j-1] else mismatch_score

        # Comparisons are exact, which is reliable for integer-valued scores.
        if score_current == score_matrix[i-1][j-1] + substitution:
            # Diagonal move: the two residues are aligned to each other
            align1.append(seq1[i-1])
            align2.append(seq2[j-1])
            i -= 1
            j -= 1
        elif score_current == score_matrix[i-1][j] + gap_penalty:
            # Came from the row above: seq1 residue aligned to a gap
            align1.append(seq1[i-1])
            align2.append('-')
            i -= 1
        elif score_current == score_matrix[i][j-1] + gap_penalty:
            # Came from the column to the left: seq2 residue aligned to a gap
            align1.append('-')
            align2.append(seq2[j-1])
            j -= 1
        else:
            raise ValueError(
                f"score_matrix[{i}][{j}] is inconsistent with the given scoring parameters"
            )

    # One sequence is exhausted: pad the remainder of the other with gaps
    while i > 0:
        align1.append(seq1[i-1])
        align2.append('-')
        i -= 1
    while j > 0:
        align1.append('-')
        align2.append(seq2[j-1])
        j -= 1

    # The alignment was collected end-to-start, so reverse it
    return ''.join(align1[::-1]), ''.join(align2[::-1])

# Example protein sequences (wild type and mutant)
# The two strings have the same length and differ only in their final residue.
wild_type = "MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPNEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPVSSD"
mutant = "MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPNEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPVSSY"

# Perform alignment
# Build the score matrix with the default scoring, then trace back one alignment.
score_matrix = needleman_wunsch(wild_type, mutant)
aligned_wild, aligned_mutant = backtrack(score_matrix, wild_type, mutant)

# Visualize alignment
# One vertical line per alignment column:
#   green  = identical residues, red = gap on either side, orange = substitution.
plt.figure(figsize=(15, 6))
for i in range(len(aligned_wild)):
    if aligned_wild[i] == aligned_mutant[i]:
        plt.axvline(x=i, color='green', alpha=0.3)
    elif aligned_wild[i] == '-' or aligned_mutant[i] == '-':
        plt.axvline(x=i, color='red', alpha=0.3)
    else:
        plt.axvline(x=i, color='orange', alpha=0.7)

plt.title('Alignment of Wild Type and Mutant Sequences')
plt.xlabel('Sequence Position')
plt.yticks([])
plt.show()

# Identify mutations
# Positions are numbered by wild-type residue (1-based), not by alignment
# column, so the labels stay correct when the alignment contains gaps.
# An insertion (gap in the wild type) is reported at the position of the
# preceding wild-type residue.
mutations = []
wt_position = 0
for wt, mt in zip(aligned_wild, aligned_mutant):
    if wt != '-':
        wt_position += 1
    if wt != mt:
        mutations.append(f"{wt}{wt_position}{mt}")

print("Detected mutations:", ', '.join(mutations))

# Calculate mutation impact scores (simplified example)
# Takes the main diagonal of the score matrix, from cell [1, 1] up to the
# length of the shorter sequence.
# NOTE(review): each diagonal cell is a cumulative alignment score for the
# first i residues of both sequences, not a per-residue quantity; treating it
# as an "impact" measure is a simplification — confirm this is intended.
impact_scores = [score_matrix[i, i] for i in range(1, min(len(wild_type), len(mutant)) + 1)]

# Visualize mutation impact
plt.figure(figsize=(12, 6))
plt.plot(impact_scores)
plt.title('Mutation Impact Scores Along the Sequence')
plt.xlabel('Sequence Position')
plt.ylabel('Impact Score')
plt.show()

## Instances where we use the Support Vector Machine (SVM) Decision Function Equation: f(x) = Σ(i=1 to n) [α_i * y_i * K(x_i, x)] + b

* **Protein-Ligand Binding Affinity Prediction**:
This example uses SVR (Support Vector Regression), an extension of SVM for regression problems. The decision function here is used to predict a continuous value (binding affinity) rather than a class. While we don't explicitly see the decision function, it's used internally by the model.predict() method to generate predictions. The scatter plot of actual vs. predicted values visualizes how well the SVM's decision function approximates the true binding affinities.

* **Protein Solubility Prediction**:
This uses SVC (Support Vector Classification) for binary classification. The decision function separates soluble from insoluble proteins in the feature space. We don't directly visualize the decision function, but its effect is shown in the confusion matrix, which displays how well the SVM's decision boundary classifies the proteins.

* **Drug-Target Interaction Prediction**:
This example also uses SVC, but with probability estimates. The decision function is transformed into probabilities, which are then used to create the ROC curve. The curve essentially shows how the decision function's threshold affects the true positive and false positive rates.

* **Protein Structure Quality Assessment**:
Similar to the first example, this uses SVR. The decision function is used to predict a continuous quality score. The scatter plot of actual vs. predicted scores visualizes how well the SVM's decision function approximates the true quality scores across the range of values.

* **Protein-Protein Interaction Prediction**:
This example most explicitly showcases the SVM decision function. After training the SVC model, we use the decision_function() method to visualize the decision boundary and the margin in a 2D projection of the feature space. The contour plot shows the decision function values, with the decision boundary at 0. Points on either side of this boundary are classified differently, and the magnitude of the decision function indicates the confidence of the classification.

#### Protein-Ligand Binding Affinity Prediction:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Generate synthetic data
np.random.seed(42)  # fixed seed so the synthetic data set is reproducible
n_samples = 100
protein_features = np.random.rand(n_samples, 5)  # 5 protein features
ligand_features = np.random.rand(n_samples, 3)   # 3 ligand features
# Concatenate per-sample protein and ligand features into one 8-column matrix
X = np.hstack((protein_features, ligand_features))
# Target = sum of all features plus Gaussian noise (standard deviation 0.1)
y = np.sum(X, axis=1) + np.random.normal(0, 0.1, n_samples)  # Binding affinity

# Split data (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM model (support vector regression with an RBF kernel)
model = svm.SVR(kernel='rbf', C=1.0, epsilon=0.1)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate on the held-out test split
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Visualize predictions vs actual
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred)
# Dashed red diagonal = perfect prediction (predicted == actual)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel("Actual Binding Affinity")
plt.ylabel("Predicted Binding Affinity")
plt.title("SVM Prediction of Protein-Ligand Binding Affinity")
plt.show()

#### Protein Solubility Prediction:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Generate synthetic data
np.random.seed(42)  # fixed seed so the synthetic data set is reproducible
n_samples = 200
X = np.random.rand(n_samples, 10)  # 10 protein features
# Label is 1 when the 10 features sum to more than 5, otherwise 0
y = (np.sum(X, axis=1) > 5).astype(int)  # Binary solubility (0: insoluble, 1: soluble)

# Split data (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train SVM model (RBF-kernel classifier)
model = svm.SVC(kernel='rbf', C=1.0)
model.fit(X_train_scaled, y_train)

# Predict
y_pred = model.predict(X_test_scaled)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")

# Visualize confusion matrix
plt.figure(figsize=(8, 6))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title("Confusion Matrix for Protein Solubility Prediction")
plt.colorbar()
tick_marks = np.arange(2)
plt.xticks(tick_marks, ["Insoluble", "Soluble"])
plt.yticks(tick_marks, ["Insoluble", "Soluble"])
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
# Write each count into its cell; rows of cm are plotted along y, columns along x
for i in range(2):
    for j in range(2):
        plt.text(j, i, str(cm[i, j]), horizontalalignment="center")
plt.show()

#### Drug-Target Interaction Prediction:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler

# Generate synthetic data
np.random.seed(42)  # fixed seed so the synthetic data set is reproducible
n_samples = 300
drug_features = np.random.rand(n_samples, 5)   # 5 drug features
target_features = np.random.rand(n_samples, 7) # 7 target features
# Concatenate drug and target features into one 12-column matrix
X = np.hstack((drug_features, target_features))
# Label is 1 when the 12 features sum to more than 6, otherwise 0
y = (np.sum(X, axis=1) > 6).astype(int)  # Binary interaction (0: no interaction, 1: interaction)

# Split data (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train SVM model
# probability=True is required for predict_proba below
model = svm.SVC(kernel='rbf', C=1.0, probability=True)
model.fit(X_train_scaled, y_train)

# Predict probabilities (column 1 = probability of class 1)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

# Compute ROC curve and AUC
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Visualize ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Dashed diagonal = reference line where TPR equals FPR
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Drug-Target Interaction Prediction')
plt.legend(loc="lower right")
plt.show()

#### Protein Structure Quality Assessment:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Generate synthetic data
np.random.seed(42)  # fixed seed so the synthetic data set is reproducible
n_samples = 150
X = np.random.rand(n_samples, 8)  # 8 structure quality features
# Target = 10 x (sum of the features) plus Gaussian noise (standard deviation 1)
y = 10 * np.sum(X, axis=1) + np.random.normal(0, 1, n_samples)  # Quality score

# Split data (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train SVM model (support vector regression with an RBF kernel)
# NOTE(review): the target is not rescaled while C and epsilon are small
# relative to its range — confirm these hyperparameters are intended.
model = svm.SVR(kernel='rbf', C=1.0, epsilon=0.1)
model.fit(X_train_scaled, y_train)

# Predict
y_pred = model.predict(X_test_scaled)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.4f}")

# Visualize predictions vs actual
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
# Dashed red diagonal = perfect prediction (predicted == actual)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel("Actual Quality Score")
plt.ylabel("Predicted Quality Score")
plt.title("SVM Prediction of Protein Structure Quality")
plt.show()

#### Protein-Protein Interaction Prediction:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Generate synthetic data
np.random.seed(42)  # fixed seed so the synthetic data set is reproducible
n_samples = 250
X = np.random.rand(n_samples, 12)  # 12 features (6 for each protein)
# Label is 1 when the 12 features sum to more than 6, otherwise 0
y = (np.sum(X, axis=1) > 6).astype(int)  # Binary interaction (0: no interaction, 1: interaction)

# Split data (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train SVM model (RBF-kernel classifier on all 12 features)
model = svm.SVC(kernel='rbf', C=1.0)
model.fit(X_train_scaled, y_train)

# Predict
y_pred = model.predict(X_test_scaled)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Perform cross-validation (5 folds over the training split)
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {np.mean(cv_scores):.4f}")

# Use PCA to reduce to 2 dimensions for visualization
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)

# Create a mesh to plot in (1 unit of padding around the data, 0.02 step)
x_min, x_max = X_train_pca[:, 0].min() - 1, X_train_pca[:, 0].max() + 1
y_min, y_max = X_train_pca[:, 1].min() - 1, X_train_pca[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))

# Train a new SVM model on the PCA-transformed data for visualization
# (the 12-feature model above cannot be evaluated on a 2D grid)
model_2d = svm.SVC(kernel='rbf', C=1.0)
model_2d.fit(X_train_pca, y_train)

# Plot the decision boundary: evaluate the decision function on every grid point
Z = model_2d.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(figsize=(10, 8))
# Keep a handle on the image so the colorbar can be bound to it explicitly
decision_image = plt.imshow(Z, interpolation='nearest',
                            extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
                            origin='lower', cmap=plt.cm.PuOr_r)
# Dashed contour at level 0 = the decision boundary
contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2, linestyles='dashed')
plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=y_train, cmap=plt.cm.Paired,
            edgecolors='black')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('SVM Decision Function for Protein-Protein Interaction (PCA)')
# Without an explicit mappable, colorbar() would describe the most recently
# drawn artist (the class-coloured scatter) rather than the decision function.
plt.colorbar(decision_image, label='Decision function value')
plt.show()

## Instances where we use the Principal Component Analysis (PCA) Equations: X' = X - μ;  [U, S, V] = SVD(X');  Y = X' · V

Each of these examples shows how PCA can take high-dimensional biological data and transform it into a more manageable form, allowing for visualization, analysis of important features, and use in further machine learning tasks. They collectively demonstrate the versatility of PCA in handling various types of data relevant to protein quality scoring and drug discovery, from structural information to gene expression and drug sensitivity data. The uses of PCA illustrated in the examples below include:
* 1. Reducing the dimensionality of complex biological data
* 2. Visualizing high-dimensional data in 2D or 3D space
* 3. Identifying the most important features or directions of variation
* 4. Preprocessing data for machine learning models
* 5. Analyzing conformational changes and structural variations in proteins
* 6. Exploring gene expression patterns related to drug response
 
     

 **Protein Structure Analysis**

 **Gene Expression Analysis for Drug Discovery**:

 **Protein-Ligand Interaction Analysis**:

 **Protein Conformational Changes Analysis**:

 **Drug Sensitivity Prediction**:


### Protein Structure Analysis:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

# Generate synthetic protein structure data
np.random.seed(42)  # fixed seed so the synthetic data set is reproducible
n_residues = 100
n_features = 20  # e.g., coordinates, angles, distances

# One row per residue, one column per feature (uniform random values)
X = np.random.rand(n_residues, n_features)

# Perform PCA (no n_components given, so all components are kept)
pca = PCA()
X_pca = pca.fit_transform(X)

# Plot explained variance ratio (cumulative over components)
plt.figure(figsize=(10, 6))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance Ratio in Protein Structure Analysis')
plt.show()

# Visualize first 3 principal components
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# Points are coloured by their row index (residue index)
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], c=range(n_residues), cmap='viridis')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
plt.colorbar(scatter, label='Residue Index')
plt.title('Protein Structure in PCA Space')
plt.show()

### Gene Expression Analysis for Drug Discovery:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Generate synthetic gene expression data
np.random.seed(42)  # fixed seed so the synthetic data set is reproducible
n_genes = 1000
n_samples = 50

# One row per sample, one column per gene (uniform random values)
X = np.random.rand(n_samples, n_genes)
# Random per-sample response, generated independently of X
drug_response = np.random.rand(n_samples)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform PCA (no n_components given, so all components are kept)
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Plot explained variance ratio (cumulative over components)
plt.figure(figsize=(10, 6))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance Ratio in Gene Expression Analysis')
plt.show()

# Visualize first 2 principal components with drug response
plt.figure(figsize=(12, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=drug_response, cmap='viridis')
plt.colorbar(scatter, label='Drug Response')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Gene Expression PCA with Drug Response')
plt.show()

# Heatmap of top contributing genes
n_top_genes = 20
# Indices of the genes with the largest absolute loading on PC1, largest first
top_genes = np.abs(pca.components_[0]).argsort()[-n_top_genes:][::-1]
plt.figure(figsize=(12, 8))
# NOTE(review): the heatmap shows the raw X, while PCA was fitted on X_scaled —
# confirm that plotting unscaled values is intended.
sns.heatmap(X[:, top_genes], cmap='YlGnBu')
plt.xlabel('Top Contributing Genes')
plt.ylabel('Samples')
plt.title('Heatmap of Top Contributing Genes')
plt.show()

### Protein-Ligand Interaction Analysis:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Generate synthetic protein-ligand interaction data
np.random.seed(42)  # fixed seed so the synthetic data set is reproducible
n_complexes = 200
n_features = 50  # e.g., interaction energies, distances, angles

# One row per complex, one column per feature (uniform random values)
X = np.random.rand(n_complexes, n_features)
# Random per-complex affinity, generated independently of X
binding_affinity = np.random.rand(n_complexes)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform PCA (no n_components given, so all components are kept)
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Plot explained variance ratio (cumulative over components)
plt.figure(figsize=(10, 6))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance Ratio in Protein-Ligand Interaction Analysis')
plt.show()

# Visualize first 2 principal components with binding affinity
plt.figure(figsize=(12, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=binding_affinity, cmap='viridis')
plt.colorbar(scatter, label='Binding Affinity')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Protein-Ligand Interactions in PCA Space')
plt.show()

# Feature importance analysis
# Importance = absolute loading of each original feature on PC1
feature_importance = np.abs(pca.components_[0])
sorted_idx = np.argsort(feature_importance)  # ascending, so the largest bar is drawn last
pos = np.arange(sorted_idx.shape[0]) + .5    # bar centres

plt.figure(figsize=(12, 6))
plt.barh(pos, feature_importance[sorted_idx], align='center')
# Tick labels are the original feature indices in sorted order
plt.yticks(pos, sorted_idx)
plt.xlabel('Absolute Importance')
plt.ylabel('Feature Index')
plt.title('Feature Importance in First Principal Component')
plt.show()

### Protein Conformational Changes Analysis:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

# Generate synthetic protein conformational data
np.random.seed(42)  # fixed seed so the synthetic data set is reproducible
n_conformations = 1000
n_atoms = 50

# One row per conformation; columns are read below as interleaved
# (x, y, z) triples, one triple per atom.
X = np.random.rand(n_conformations, n_atoms * 3)  # 3D coordinates for each atom

# Perform PCA (no n_components given, so all components are kept)
pca = PCA()
X_pca = pca.fit_transform(X)

# Plot explained variance ratio (cumulative over components)
plt.figure(figsize=(10, 6))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance Ratio in Protein Conformational Analysis')
plt.show()

# Visualize first 3 principal components
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
# Points are coloured by their row index (conformation index)
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], c=range(n_conformations), cmap='viridis')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
plt.colorbar(scatter, label='Conformation Index')
plt.title('Protein Conformational Space in PCA')
plt.show()

# Visualize the most significant motion
# Reshape PC1 into one (dx, dy, dz) displacement vector per atom
most_significant_motion = pca.components_[0].reshape(-1, 3)
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
# Arrows start at the atom positions of the first conformation (x = every 3rd
# column from 0, y from 1, z from 2) and point along the PC1 displacement.
ax.quiver(X[0, ::3], X[0, 1::3], X[0, 2::3], 
          most_significant_motion[:, 0], most_significant_motion[:, 1], most_significant_motion[:, 2],
          length=1.0, normalize=True)
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
plt.title('Most Significant Motion (First Principal Component)')
plt.show()

### Drug Sensitivity Prediction:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Generate synthetic drug sensitivity data
np.random.seed(42)  # fixed seed so the synthetic data set is reproducible
n_samples = 500
n_features = 100  # e.g., gene expression levels, mutation status

X = np.random.rand(n_samples, n_features)
# Target depends only on the first 10 features, plus Gaussian noise (std 0.1)
y = np.sum(X[:, :10], axis=1) + np.random.normal(0, 0.1, n_samples)  # Drug sensitivity

# Split the data first, so that nothing learned from the test samples
# (including the PCA basis) can leak into the model being evaluated.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform PCA: fit the projection on the training split only, then apply the
# same projection to the test split.
pca = PCA(n_components=10)
X_train = pca.fit_transform(X_train_raw)
X_test = pca.transform(X_test_raw)

# Train a linear regression model on the PCA-reduced features
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Plot predicted vs actual values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
# Dashed red diagonal = perfect prediction (predicted == actual)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel("Actual Drug Sensitivity")
plt.ylabel("Predicted Drug Sensitivity")
plt.title("Drug Sensitivity Prediction using PCA-reduced Features")
plt.show()

# Feature importance in PCA space
# Importance = absolute regression coefficient of each principal component
feature_importance = np.abs(model.coef_)
sorted_idx = np.argsort(feature_importance)  # ascending, so the largest bar is drawn last
pos = np.arange(sorted_idx.shape[0]) + .5    # bar centres

plt.figure(figsize=(12, 6))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, [f'PC{i+1}' for i in sorted_idx])
plt.xlabel('Absolute Importance')
plt.ylabel('Principal Component')
plt.title('Importance of Principal Components in Drug Sensitivity Prediction')
plt.show()

## Instances where we use the Affine Transformation equation: y = x · A^T + b 

The Affine Transformation equation is a fundamental equation in linear algebra and has numerous applications in various fields, including bioinformatics, machine learning, and computational biology.
In the context of protein quality characterization and affinity binding studies for drug discovery, this equation is extremely important for several reasons:

**Feature Transformation**:

It can transform input features (x) into a new feature space (y) that might be more informative or suitable for analysis.
In protein analysis, it can transform amino acid sequences or structural features into a space that better represents protein quality or drug interactions.


**Dimensionality Reduction**:

When the matrix A has fewer rows than columns, this equation can reduce the dimensionality of the data, similar to PCA but in a more general form.
This is crucial for handling high-dimensional protein or drug data.


**Linear Regression**:

In its simplest form (when A is a single row vector, so that y is a scalar), this equation represents linear regression, which is widely used in quantitative structure-activity relationship (QSAR) studies in drug discovery.


**Neural Network Layers**:

This equation forms the basis of fully connected layers in neural networks, which are increasingly used in protein structure prediction and drug-target interaction prediction.


**Protein Structure Alignment**:

When used with 3D coordinates, this equation can represent rotations and translations, which are fundamental operations in protein structure alignment algorithms.


**Encoding Biological Information**:

The matrix A can encode biological knowledge or learned patterns about protein quality or drug interactions.


**Scoring Functions**:

It can be used to create scoring functions for protein-ligand docking, where x might represent features of the protein-ligand complex, and y the binding score.


**Projection of Data**:

It allows for the projection of protein or drug data onto specific directions that might be biologically relevant.


**Data Augmentation**:

In machine learning applications for drug discovery, this transformation can be used for data augmentation to generate new, synthetic data points.


**Feature Extraction**:

It can be used to extract relevant features from raw protein or drug data, making subsequent analysis more effective.

### Gene expression analysis: Transforming gene expression data

In [None]:
import torch
import numpy as np

# Simulated gene expression data (one row per sample, one column per gene)
num_samples = 100
num_genes = 1000
gene_expression = torch.randn(num_samples, num_genes)

# Linear transformation for gene expression analysis
# A maps the num_genes inputs to output_dim outputs; b is the per-output offset
output_dim = 50
A = torch.randn(output_dim, num_genes)
b = torch.randn(output_dim)

# Transform gene expression data: y = x · A^T + b
transformed_expression = torch.matmul(gene_expression, A.t()) + b

print("Original shape:", gene_expression.shape)
print("Transformed shape:", transformed_expression.shape)

# Example analysis: rank the transformed features by their mean over samples.
# After the projection every column is a weighted mix of all genes, so the
# indices below identify output features (0 .. output_dim - 1), not genes.
# The variable names are kept unchanged for compatibility.
avg_expression = torch.mean(transformed_expression, dim=0)
top_genes = torch.argsort(avg_expression, descending=True)[:10]
print("Top 10 transformed feature indices:", top_genes.tolist())

### Protein Structure Prediction:

In [None]:
import torch

# Simulated per-residue protein features (e.g., amino acid properties),
# stored as a batch of one protein: (1, residues, features)
num_residues, feature_dim = 200, 20
protein_features = torch.randn(1, num_residues, feature_dim)

# Affine map from feature space to 3D coordinates
output_dim = 3  # x, y, z
A = torch.randn(output_dim, feature_dim)
b = torch.randn(output_dim)

# Apply y = x · A^T + b to every residue at once
predicted_structure = protein_features @ A.T + b

print("Protein features shape:", protein_features.shape)
print("Predicted structure shape:", predicted_structure.shape)

# Show the coordinates predicted for the first five residues
print("First 5 predicted 3D coordinates:")
first_five = predicted_structure[0, :5]
print(first_five.numpy())

### Sequence Analysis (DNA encoding):

In [None]:
import torch

# DNA sequence to encode
dna_seq = "ATCGATCGATCG"

# One-hot encoding: one row per base, a single 1 in the column for that base
nucleotide_to_index = {'A': 0, 'T': 1, 'C': 2, 'G': 3}
column_for_base = torch.tensor([nucleotide_to_index[base] for base in dna_seq])
one_hot = torch.zeros(len(dna_seq), 4)
one_hot[torch.arange(len(dna_seq)), column_for_base] = 1

# Affine map from the 4-dimensional one-hot space to 8 output features
output_dim = 8
A = torch.randn(output_dim, 4)
b = torch.randn(output_dim)

# Apply y = x · A^T + b to every position of the sequence
transformed_seq = one_hot @ A.T + b

print("One-hot encoded shape:", one_hot.shape)
print("Transformed sequence shape:", transformed_seq.shape)

# Example analysis: take each position's largest output value, then report
# the position whose largest value is the highest overall
per_position_max = transformed_seq.max(dim=1).values
max_pos = per_position_max.argmax()
print("Position with highest transformed value:", max_pos.item())

### Dimensionality Reduction:

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Enable interactive mode for matplotlib so that the plt.show() calls issued
# after each figure return immediately instead of blocking until the window
# is closed; interactive mode is switched off again at the end of this cell.
plt.ion()

# Seed both the torch and NumPy global RNGs so the run is reproducible.
torch.manual_seed(42)
np.random.seed(42)

# Simulated high-dimensional data: one row per sample, one column per feature.
num_samples, original_dim = 1000, 100
data = torch.randn((num_samples, original_dim))

# Random affine map from the original feature space down to two dimensions.
reduced_dim = 2
A = torch.randn((reduced_dim, original_dim))
b = torch.randn((reduced_dim,))

# Project every sample: y = x A^T + b.
reduced_data = data @ A.T + b

print("Original data shape:", data.size())
print("Reduced data shape:", reduced_data.size())

# Reference projection: scikit-learn PCA down to two components.
pca = PCA(n_components=2)
pca_reduced_data = pca.fit_transform(data.numpy())

# Figure 1: the raw data (first two coordinates only) next to the output of
# the random linear map.
fig1, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

ax1.scatter(data[:, 0].numpy(), data[:, 1].numpy(), alpha=0.5)
ax1.set(
    title="Original Data (First 2 Dimensions)",
    xlabel="Dimension 1",
    ylabel="Dimension 2",
)

ax2.scatter(reduced_data[:, 0].numpy(), reduced_data[:, 1].numpy(), alpha=0.5)
ax2.set(
    title="Reduced Data (Our Method)",
    xlabel="Dimension 1",
    ylabel="Dimension 2",
)

fig1.tight_layout()
plt.show()

# Figure 2: the same raw-data panel next to the PCA projection.
fig2, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

ax1.scatter(data[:, 0].numpy(), data[:, 1].numpy(), alpha=0.5)
ax1.set(
    title="Original Data (First 2 Dimensions)",
    xlabel="Dimension 1",
    ylabel="Dimension 2",
)

ax2.scatter(pca_reduced_data[:, 0], pca_reduced_data[:, 1], alpha=0.5)
ax2.set(
    title="Reduced Data (PCA)",
    xlabel="Principal Component 1",
    ylabel="Principal Component 2",
)

fig2.tight_layout()
plt.show()

print("Explained variance ratio (PCA):", pca.explained_variance_ratio_)

# Leave interactive mode so this final show() blocks and the figure windows
# stay open until they are closed.
plt.ioff()
plt.show()

### Protein-Protein Interaction Prediction:

#### This could be used as part of a larger model to predict protein-protein interactions based on protein features.

In [None]:
import torch
import matplotlib.pyplot as plt

# Simulated descriptors for 100 proteins with 50 features each (placeholders
# for properties such as amino acid composition or hydrophobicity).
protein_features = torch.randn((100, 50))

# Affine scoring model: one weight per feature plus a scalar bias.
A = torch.randn((1, 50))
b = torch.randn((1,))

# One score per protein, x A^T + b, giving a (100, 1) column of scores.
interaction_scores = protein_features @ A.T + b

# Histogram of the predicted scores (20 bins).
fig_scores, ax_scores = plt.subplots(figsize=(10, 6))
ax_scores.hist(interaction_scores.numpy(), bins=20, edgecolor='black')
ax_scores.set(
    title='Distribution of Protein-Protein Interaction Scores',
    xlabel='Interaction Score',
    ylabel='Frequency',
)
plt.show()

# Heatmap of the raw feature matrix: rows are proteins, columns are features.
fig_features, ax_features = plt.subplots(figsize=(12, 8))
feature_image = ax_features.imshow(protein_features.numpy(), aspect='auto', cmap='viridis')
fig_features.colorbar(feature_image, ax=ax_features, label='Feature Value')
ax_features.set(
    title='Heatmap of Protein Features',
    xlabel='Feature Index',
    ylabel='Protein Index',
)
plt.show()

### Genomic Variant Effect Prediction:

#### This could be used to predict the functional impact of genomic variants on different cellular processes.

In [None]:
import torch
import matplotlib.pyplot as plt

# Simulated descriptors for 1000 variants with 30 features each (placeholders
# for properties such as position or surrounding sequence context).
variant_features = torch.randn((1000, 30))

# Affine model with one weight row and one bias per effect category (5 total).
A = torch.randn((5, 30))
b = torch.randn((5,))

# Score every variant against every category: x A^T + b, shape (1000, 5).
effect_predictions = variant_features @ A.T + b

# Box plot of the predicted scores. boxplot treats each column of the 2-D
# array as its own dataset, so this draws one box per effect category.
plt.figure(figsize=(10, 6))
plt.boxplot(effect_predictions.numpy())
plt.title('Distribution of Predicted Effects Across Categories')
plt.xlabel('Effect Category')
plt.ylabel('Predicted Effect Score')
plt.show()

# Scatter plot of the first two effect categories against each other.
# The tensor columns are converted to NumPy explicitly, matching the other
# plotting calls in this notebook, instead of relying on Matplotlib's implicit
# array conversion of torch tensors.
plt.figure(figsize=(8, 8))
plt.scatter(
    effect_predictions[:, 0].numpy(),
    effect_predictions[:, 1].numpy(),
    alpha=0.5,
)
plt.title('Scatter Plot of Two Effect Categories')
plt.xlabel('Effect Category 1')
plt.ylabel('Effect Category 2')
plt.show()

### Drug Response Prediction:

#### This could be used to predict drug responses across different cell lines based on molecular features.

In [None]:
import torch
import matplotlib.pyplot as plt
import seaborn as sns

# Simulated molecular descriptors: one row per drug and one row per cell line.
drug_features = torch.randn(50, 100)  # 50 drugs, 100 features each
cell_features = torch.randn(20, 50)   # 20 cell lines, 50 features each

# Read the sizes off the tensors so that changing the simulated data above
# cannot leave a stale hard-coded dimension further down.
num_drugs, drug_dim = drug_features.shape
num_cells, cell_dim = cell_features.shape
combined_dim = drug_dim + cell_dim

# Build every (drug, cell line) pair: broadcast both feature sets to
# (num_drugs, num_cells, *) and concatenate them along the feature axis.
combined_features = torch.cat(
    [
        drug_features.unsqueeze(1).expand(-1, num_cells, -1),
        cell_features.unsqueeze(0).expand(num_drugs, -1, -1),
    ],
    dim=2,
)

# Linear transformation for response prediction: one weight per combined
# feature (drug features first, then cell-line features) plus a scalar bias.
A = torch.randn(1, combined_dim)
b = torch.randn(1)

# Flatten the pair grid to rows, apply x A^T + b, then fold the scores back
# into a (num_drugs, num_cells) matrix.
response_predictions = torch.matmul(combined_features.view(-1, combined_dim), A.t()) + b
response_predictions = response_predictions.view(num_drugs, num_cells)

# Heatmap of the score matrix (rows: drugs, columns: cell lines), drawn with a
# diverging palette centred on zero.
fig_map, ax_map = plt.subplots(figsize=(12, 8))
sns.heatmap(response_predictions.numpy(), cmap='coolwarm', center=0, ax=ax_map)
ax_map.set(
    title='Heatmap of Predicted Drug Responses',
    xlabel='Cell Line Index',
    ylabel='Drug Index',
)
plt.show()

# Histogram over all drug / cell-line scores pooled into one 1-D array.
fig_dist, ax_dist = plt.subplots(figsize=(10, 6))
ax_dist.hist(response_predictions.numpy().ravel(), bins=30, edgecolor='black')
ax_dist.set(
    title='Distribution of Predicted Drug Responses',
    xlabel='Predicted Response',
    ylabel='Frequency',
)
plt.show()

### Metabolic Pathway Flux Analysis:

#### This could be used to estimate reaction fluxes in metabolic pathways based on metabolite concentrations.

In [None]:
import torch
import matplotlib.pyplot as plt

# Simulated measurements: 100 samples by 50 metabolites.
metabolite_concentrations = torch.randn((100, 50))

# Affine model with one weight row and one bias per reaction (20 reactions).
A = torch.randn((20, 50))
b = torch.randn((20,))

# Estimate every reaction's flux for every sample: x A^T + b, shape (100, 20).
predicted_fluxes = metabolite_concentrations @ A.T + b

# Box plot of the predicted fluxes; each column of the 2-D array is drawn as
# its own box, i.e. one box per reaction.
fig_flux, ax_flux = plt.subplots(figsize=(12, 6))
ax_flux.boxplot(predicted_fluxes.numpy())
ax_flux.set(
    title='Distribution of Predicted Fluxes Across Reactions',
    xlabel='Reaction Index',
    ylabel='Predicted Flux',
)
plt.show()

# Heatmap of the input matrix: rows are samples, columns are metabolites.
fig_conc, ax_conc = plt.subplots(figsize=(12, 8))
conc_image = ax_conc.imshow(metabolite_concentrations.numpy(), aspect='auto', cmap='viridis')
fig_conc.colorbar(conc_image, ax=ax_conc, label='Concentration')
ax_conc.set(
    title='Heatmap of Metabolite Concentrations',
    xlabel='Metabolite Index',
    ylabel='Sample Index',
)
plt.show()

### Transcription Factor Binding Site Prediction:

#### This could be part of a model to predict transcription factor binding sites in DNA sequences.

In [None]:
import torch
import matplotlib.pyplot as plt

# Simulated DNA sequences, genuinely one-hot encoded: draw a random nucleotide
# index for every position, then expand it into a 4-channel indicator vector.
# (Gaussian noise here would not be a valid encoding and would make the
# per-nucleotide visualisation of a sequence meaningless.)
num_sequences, seq_len, num_nucleotides = 1000, 100, 4
nucleotide_indices = torch.randint(0, num_nucleotides, (num_sequences, seq_len))
# one_hot returns an integer tensor; cast to float so it can be multiplied
# with the float weight matrix below.
dna_sequences = torch.nn.functional.one_hot(
    nucleotide_indices, num_classes=num_nucleotides
).float()  # shape: (num_sequences, seq_len, num_nucleotides)

# Linear transformation for binding prediction: a single filter spanning the
# whole flattened sequence (seq_len * num_nucleotides weights) plus a bias.
A = torch.randn(1, seq_len * num_nucleotides)  # Flattened filter
b = torch.randn(1)

# Predict binding scores: flatten each sequence to one row and apply
# x A^T + b, giving one score per sequence with shape (num_sequences, 1).
binding_scores = torch.matmul(dna_sequences.view(num_sequences, -1), A.t()) + b

# Histogram of the predicted binding scores (30 bins).
fig_scores, ax_scores = plt.subplots(figsize=(10, 6))
ax_scores.hist(binding_scores.numpy(), bins=30, edgecolor='black')
ax_scores.set(
    title='Distribution of Predicted Binding Scores',
    xlabel='Binding Score',
    ylabel='Frequency',
)
plt.show()

# Image of the first sequence's encoding, transposed so that each row is one
# nucleotide channel and each column is one position in the sequence.
sample_sequence = dna_sequences[0].numpy().T
fig_seq, ax_seq = plt.subplots(figsize=(12, 4))
seq_image = ax_seq.imshow(sample_sequence, aspect='auto', cmap='viridis')
fig_seq.colorbar(seq_image, ax=ax_seq, label='Nucleotide Encoding')
ax_seq.set(
    title='Visualization of a Sample DNA Sequence',
    xlabel='Position in Sequence',
    ylabel='Nucleotide (A, T, C, G)',
)
# Label the four rows with the nucleotide each channel stands for.
ax_seq.set_yticks([0, 1, 2, 3])
ax_seq.set_yticklabels(['A', 'T', 'C', 'G'])
plt.show()