# Linear transformations and equations 

In each of these examples, the linear transformation serves as a fundamental building block that can be part of more complex models or analyses. In practice, these transformations are often combined with non-linear activations and other layers in deep learning models for improved performance and capability to capture complex relationships in biological data. Remember that while these linear transformations are useful, many biological processes are inherently non-linear. Therefore, these linear models are often used as components within more complex, non-linear models (like neural networks) in advanced bioinformatics applications.

I have put together some of these equations, along with visualizations and Python code snippets.

These visualizations provide different ways to interpret the data and results:

- Histograms show the distribution of predicted scores or values.
- Heatmaps visualize 2D data, useful for showing relationships between features or samples.
- Box plots display the distribution of values across different categories or groups.
- Scatter plots can reveal relationships or clusters between two variables.

Remember to adjust the figure sizes, color schemes, and other parameters as needed to best represent your specific data. These visualizations can be very helpful in understanding the patterns in your data and the outputs of your models.

## Instances where we use the Position-Specific Scoring Matrix (PSSM) equation:   S = Σ(i=1 to L) log(P_i / B_i)

### Transcription Factor Binding Site Prediction

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def pssm_score(sequence, pssm, background_probs):
    """Return the log2-odds score of a nucleotide sequence under a PSSM.

    Args:
        sequence: Iterable of nucleotide symbols; each must be a key of both
            ``pssm`` and ``background_probs``.
        pssm: Mapping from nucleotide to a per-position probability sequence.
        background_probs: Mapping from nucleotide to its background probability.

    Returns:
        The sum over positions of ``log2(P_i / B_i)``, where a tiny constant
        is added to both probabilities so a zero entry cannot produce
        ``log(0)``. An empty sequence scores ``0``.
    """
    eps = 1e-10  # Small pseudo-count to avoid log(0)

    def log_odds(position, base):
        # Ratio of the smoothed position-specific and background probabilities.
        return np.log2((pssm[base][position] + eps) / (background_probs[base] + eps))

    return sum(log_odds(position, base) for position, base in enumerate(sequence))

# Example PSSM for a transcription factor binding site:
# one list of per-position probabilities for each nucleotide (5 positions).
pssm = {
    'A': [0.1, 0.6, 0.1, 0.1, 0.9],
    'C': [0.1, 0.1, 0.1, 0.8, 0.0],
    'G': [0.7, 0.1, 0.1, 0.0, 0.1],
    'T': [0.1, 0.2, 0.7, 0.1, 0.0]
}

# Uniform background nucleotide distribution
background_probs = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

# Generate random DNA sequences (1000 sequences of length 5, matching the PSSM width)
sequences = [''.join(np.random.choice(['A', 'C', 'G', 'T'], 5)) for _ in range(1000)]

# Log-odds score of every random sequence under the PSSM
scores = [pssm_score(seq, pssm, background_probs) for seq in sequences]

# Filter out any remaining -inf values (should be rare or none with pseudo-count)
scores = [score for score in scores if np.isfinite(score)]

plt.figure(figsize=(10, 6))
plt.hist(scores, bins=30, edgecolor='black')
plt.title('Distribution of PSSM Scores for Random Sequences')
plt.xlabel('PSSM Score')
plt.ylabel('Frequency')
plt.show()

# Visualize PSSM
# pd.DataFrame(pssm) has one COLUMN per nucleotide and one ROW per position.
# Transpose it so rows are nucleotides (y-axis) and columns are positions
# (x-axis), which is what the axis labels below say.
plt.figure(figsize=(10, 6))
sns.heatmap(pd.DataFrame(pssm).T, annot=True, cmap='YlGnBu')
plt.title('Position-Specific Scoring Matrix (PSSM)')
plt.xlabel('Position')
plt.ylabel('Nucleotide')
plt.show()

### Protein Family Classification

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def generate_pssm(sequences):
    """Build a position-by-residue frequency matrix from equal-length sequences.

    Row ``i`` holds, for each of the 20 standard amino acids (columns ordered
    as ``'ACDEFGHIKLMNPQRSTVWY'``), the fraction of ``sequences`` that carry
    that residue at position ``i``. The number of rows is taken from the
    first sequence; characters outside the alphabet are not counted.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    n_sequences = len(sequences)
    n_positions = len(sequences[0])
    frequencies = np.zeros((n_positions, 20))
    for pos in range(n_positions):
        # All residues observed at this position, one per sequence.
        column = [seq[pos] for seq in sequences]
        frequencies[pos] = [column.count(residue) / n_sequences for residue in alphabet]
    return frequencies

def pssm_score(sequence, pssm, background_probs, pseudo_count=1e-10):
    """Return the log2-odds score of a protein sequence under a PSSM.

    Args:
        sequence: Iterable of one-letter residue codes. Symbols outside the
            20 standard amino acids are skipped and contribute nothing.
        pssm: 2-D array indexed as ``pssm[position, residue_index]`` with
            columns ordered as ``'ACDEFGHIKLMNPQRSTVWY'``.
        background_probs: Mapping from residue to its background probability.
        pseudo_count: Small constant added to both the PSSM entry and the
            background probability so that a residue never observed at a
            position yields a very low but finite score instead of ``-inf``.

    Returns:
        The sum over scored positions of ``log2(P_i / B_i)``; ``0`` when
        nothing was scored.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    column_of = {aa: j for j, aa in enumerate(amino_acids)}
    score = 0
    for i, aa in enumerate(sequence):
        j = column_of.get(aa)
        if j is None:
            # Unknown / non-standard residue: ignore, as before.
            continue
        p = pssm[i, j] + pseudo_count
        b = background_probs[aa] + pseudo_count
        score += np.log2(p / b)
    return score

# Generate synthetic data
def generate_family(length, num_sequences):
    """Generate a synthetic protein family around a random consensus.

    A consensus of ``length`` residues is drawn uniformly from the 20 standard
    amino acids. Each of the ``num_sequences`` members copies the consensus
    position by position, keeping the consensus residue with probability 0.9
    and otherwise using a uniformly drawn residue.

    Returns:
        A list of ``num_sequences`` strings.
    """
    alphabet = list('ACDEFGHIKLMNPQRSTVWY')
    consensus = ''.join(np.random.choice(alphabet, length))

    def noisy_copy(residue):
        # The substitute is always drawn first, then the keep/replace decision.
        substitute = np.random.choice(alphabet)
        return np.random.choice([residue, substitute], p=[0.9, 0.1])

    return [
        ''.join(noisy_copy(residue) for residue in consensus)
        for _ in range(num_sequences)
    ]

# Two synthetic families: 100 noisy copies each of a random 50-residue consensus
family1 = generate_family(50, 100)
family2 = generate_family(50, 100)

# Uniform background distribution over the 20 amino acids
background_probs = {aa: 0.05 for aa in 'ACDEFGHIKLMNPQRSTVWY'}

# Labelled dataset: 0 = family 1, 1 = family 2
X = family1 + family2
y = [0] * len(family1) + [1] * len(family2)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create PSSMs from the TRAINING split only, so the held-out sequences used
# for evaluation never contribute to the model they are scored against.
train_family1 = [seq for seq, label in zip(X_train, y_train) if label == 0]
train_family2 = [seq for seq, label in zip(X_train, y_train) if label == 1]

# Add a small constant to every cell and renormalize (rows still sum to 1).
# A test sequence can carry a residue never seen at that position in training;
# without smoothing both family scores would be -inf and the comparison in
# classify() would be decided by the tie rule rather than by the data.
smoothing = 0.01
pssm1 = (generate_pssm(train_family1) + smoothing) / (1 + 20 * smoothing)
pssm2 = (generate_pssm(train_family2) + smoothing) / (1 + 20 * smoothing)


# Simple classifier
def classify(sequence):
    """Return 0 if ``sequence`` scores higher under family 1's PSSM, else 1."""
    score1 = pssm_score(sequence, pssm1, background_probs)
    score2 = pssm_score(sequence, pssm2, background_probs)
    return 0 if score1 > score2 else 1


y_pred = [classify(seq) for seq in X_test]

# Visualize results
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Family 1', 'Family 2'])
disp.plot(cmap='Blues')
plt.title('Confusion Matrix for Protein Family Classification')
plt.show()

### Multiple Sequence Alignment Scoring

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def generate_pssm(alignment):
    """Compute per-column amino-acid frequencies of a sequence alignment.

    The result has one row per alignment column (width taken from the first
    sequence) and 20 columns ordered as ``'ACDEFGHIKLMNPQRSTVWY'``. Each cell
    is the fraction of aligned sequences showing that residue in that column;
    symbols outside the alphabet are not counted.
    """
    residues = 'ACDEFGHIKLMNPQRSTVWY'
    depth = len(alignment)
    width = len(alignment[0])
    profile = np.zeros((width, 20))
    for col in range(width):
        # Residues found in this alignment column, one per sequence.
        observed = [row[col] for row in alignment]
        for k, residue in enumerate(residues):
            profile[col, k] = observed.count(residue) / depth
    return profile

def pssm_score(sequence, pssm, background_probs, pseudo_count=1e-10):
    """Score a protein sequence against an alignment-derived PSSM (log2-odds).

    Args:
        sequence: Iterable of one-letter residue codes. Symbols that are not
            one of the 20 standard amino acids are skipped.
        pssm: 2-D array indexed as ``pssm[position, residue_index]`` with
            columns ordered as ``'ACDEFGHIKLMNPQRSTVWY'``.
        background_probs: Mapping from residue to its background probability.
        pseudo_count: Small constant added to numerator and denominator so a
            residue with zero frequency at a position gives a large negative
            finite score rather than ``-inf`` (and no divide-by-zero warning).

    Returns:
        The sum of ``log2(P_i / B_i)`` over the scored positions; ``0`` if no
        position was scored.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    column_of = {aa: j for j, aa in enumerate(amino_acids)}
    score = 0
    for i, aa in enumerate(sequence):
        j = column_of.get(aa)
        if j is None:
            # Non-standard symbol: contributes nothing to the score.
            continue
        p = pssm[i, j] + pseudo_count
        b = background_probs[aa] + pseudo_count
        score += np.log2(p / b)
    return score

# Mock multiple sequence alignment: four protein sequences of equal length
alignment = [
    "MAEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP",
    "MEEPQSDPSVEPPLSQETFSDLWKLLPPNNVLSTLPSSDSIEELFLSENVAGWLEDPDEAP",
    "MEEPQSDLSEEPPLSQETFSDLWNLLPENKLLDLPLSPEDIEQWLSEDPGPDEAPRMPEAA",
    "MEEPQSDLSIELPLSQETFSGLWKLLPPEDILPSPHCMDDLLLPQDVEEFFEGPSEALRVS",
]

# Column frequencies of the alignment, and a uniform background over 20 residues
pssm = generate_pssm(alignment)
background_probs = dict.fromkeys('ACDEFGHIKLMNPQRSTVWY', 0.05)

# Score every aligned sequence against the PSSM built from the same alignment
scores = [pssm_score(aligned, pssm, background_probs) for aligned in alignment]

# Heatmap of the PSSM: rows are alignment positions, columns are amino acids
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(pssm, cmap='YlGnBu', xticklabels=list('ACDEFGHIKLMNPQRSTVWY'), ax=ax)
ax.set_title('Position-Specific Scoring Matrix (PSSM) for Multiple Sequence Alignment')
ax.set_xlabel('Amino Acid')
ax.set_ylabel('Position')
plt.show()

# Bar chart with one bar per aligned sequence
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(range(len(scores)), scores)
ax.set_title('PSSM Scores for Sequences in Multiple Sequence Alignment')
ax.set_xlabel('Sequence Index')
ax.set_ylabel('PSSM Score')
plt.show()

### Motif Discovery

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def generate_pssm(motifs):
    """Build a position-by-nucleotide frequency matrix from motif instances.

    The matrix has one row per motif position (length taken from the first
    motif) and four columns ordered ``A, C, G, T``. Each cell is the fraction
    of ``motifs`` that have that nucleotide at that position.
    """
    bases = 'ACGT'
    n_motifs = len(motifs)
    width = len(motifs[0])
    matrix = np.zeros((width, 4))
    for pos in range(width):
        # Nucleotides seen at this position across all motif instances.
        column = [instance[pos] for instance in motifs]
        matrix[pos] = [column.count(base) / n_motifs for base in bases]
    return matrix

def pssm_score(sequence, pssm, background_probs):
    """Return the log2-odds score of a DNA window under a PSSM.

    Args:
        sequence: Iterable of nucleotides, each one of ``A``, ``C``, ``G``,
            ``T`` (``str.index`` raises ``ValueError`` for anything else).
        pssm: 2-D array indexed as ``pssm[position, base_index]`` with columns
            ordered ``A, C, G, T``.
        background_probs: Mapping from nucleotide to background probability.

    Returns:
        The sum over positions of ``log2(P_i / B_i)``. Entries that are not
        strictly positive are scored as if the probability were ``1e-300``,
        which gives a very low but finite contribution.
    """
    bases = 'ACGT'
    floor = 1e-300  # Stand-in probability for zero entries
    total = 0
    for pos, base in enumerate(sequence):
        prob = pssm[pos, bases.index(base)]
        numerator = prob if prob > 0 else floor
        total += np.log2(numerator / background_probs[base])
    return total

def generate_sequences_with_motif(num_sequences, seq_length, motif):
    """Generate random DNA sequences that each contain one planted motif.

    Every sequence starts as ``seq_length`` uniformly drawn nucleotides; the
    motif then overwrites the bases at a uniformly chosen start offset, so the
    final length stays ``seq_length``.

    Returns:
        A list of ``num_sequences`` strings.
    """
    bases = ['A', 'C', 'G', 'T']
    motif_len = len(motif)
    planted = []
    for _ in range(num_sequences):
        background = ''.join(np.random.choice(bases) for _ in range(seq_length))
        # Any start that leaves room for the whole motif is allowed.
        offset = np.random.randint(0, seq_length - motif_len + 1)
        planted.append(background[:offset] + motif + background[offset + motif_len:])
    return planted

def scan_sequence(sequence, pssm, background_probs, window_size):
    """Score every window of ``window_size`` bases along ``sequence``.

    Returns:
        A list with one ``pssm_score`` value per window start, in order from
        the first window to the last one that still fits. The list is empty
        when the sequence is shorter than the window.
    """
    last_start = len(sequence) - window_size
    return [
        pssm_score(sequence[start:start + window_size], pssm, background_probs)
        for start in range(last_start + 1)
    ]

# Plant a known motif into 100 random sequences of length 50
motif = "ATGCATGC"
sequences = generate_sequences_with_motif(100, 50, motif)

# PSSM built from the single motif instance, plus a uniform background
pssm = generate_pssm([motif])
background_probs = dict.fromkeys('ACGT', 0.25)

# Slide a motif-sized window over every sequence and pool all window scores
all_scores = []
for seq in sequences:
    scores = scan_sequence(seq, pssm, background_probs, len(motif))
    all_scores += scores

# Heatmap of the PSSM: rows are motif positions, columns are nucleotides
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(pssm, annot=True, cmap='YlGnBu', xticklabels=['A', 'C', 'G', 'T'], ax=ax)
ax.set_title('Position-Specific Scoring Matrix (PSSM) for Motif')
ax.set_xlabel('Nucleotide')
ax.set_ylabel('Position')
plt.show()

# Histogram of the pooled window scores
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(all_scores, bins=50, edgecolor='black')
ax.set_title('Distribution of PSSM Scores for Sequence Windows')
ax.set_xlabel('PSSM Score')
ax.set_ylabel('Frequency')
plt.show()

### Protein Secondary Structure Prediction

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def generate_pssm(sequences, structures):
    """Estimate, per position and residue, the distribution of structure states.

    Args:
        sequences: Equal-length amino-acid strings.
        structures: Strings parallel to ``sequences`` labelling every position
            as ``H`` (helix), ``E`` (sheet / extended) or ``C`` (coil).

    Returns:
        An array of shape ``(length, 20, 3)`` where entry ``[i, j, k]`` is the
        fraction of sequences having residue ``j`` at position ``i`` whose
        structure label there is state ``k``. Residues never seen at a
        position keep an all-zero row. Residues are ordered as
        ``'ACDEFGHIKLMNPQRSTVWY'`` and states as ``'HEC'``.
    """
    residues = 'ACDEFGHIKLMNPQRSTVWY'
    states = 'HEC'  # Helix, Sheet (Extended), Coil
    n_positions = len(sequences[0])
    table = np.zeros((n_positions, 20, 3))  # 20 amino acids, 3 structure types
    for pos in range(n_positions):
        for r, residue in enumerate(residues):
            occurrences = sum(1 for seq in sequences if seq[pos] == residue)
            if occurrences == 0:
                # Residue never observed here: leave the zeros in place.
                continue
            for s, state in enumerate(states):
                joint = sum(
                    1
                    for seq, struct in zip(sequences, structures)
                    if seq[pos] == residue and struct[pos] == state
                )
                table[pos, r, s] = joint / occurrences
    return table

def pssm_score(sequence, pssm, background_probs, pseudo_count=1e-10):
    """Return per-position log2 scores for the three structure states.

    Args:
        sequence: Iterable of one-letter residue codes.
        pssm: 3-D array indexed as ``pssm[position, residue_index, state]``
            with residues ordered as ``'ACDEFGHIKLMNPQRSTVWY'``.
        background_probs: Mapping from residue to its background probability.
        pseudo_count: Small constant added to the PSSM entry and to the
            background probability, so that a (residue, state) pair with zero
            frequency gives a very low finite score instead of ``-inf`` and no
            divide-by-zero warning is emitted.

    Returns:
        Array of shape ``(len(sequence), 3)``. Row ``i`` holds
        ``log2(pssm[i, j, k] / background)`` for each state ``k``. Rows for
        symbols outside the 20 standard amino acids are left as zeros.
    """
    scores = np.zeros((len(sequence), 3))
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    column_of = {aa: j for j, aa in enumerate(amino_acids)}
    for i, aa in enumerate(sequence):
        j = column_of.get(aa)
        if j is None:
            # Unknown residue: keep the zero row, as before.
            continue
        background = background_probs[aa] + pseudo_count
        # Score the first three states at once.
        scores[i] = np.log2((pssm[i, j, :3] + pseudo_count) / background)
    return scores

# Generate synthetic data
def generate_data(num_sequences, length):
    """Generate random protein sequences with random structure labels.

    Returns:
        A pair ``(sequences, structures)`` of lists with ``num_sequences``
        strings of ``length`` characters each. Sequences are drawn uniformly
        from the 20 standard amino acids and structures uniformly from
        ``H``, ``E`` and ``C``. All sequences are drawn before any structure.
    """
    def random_strings(symbols):
        # One uniform draw per character, one string per requested sequence.
        pool = list(symbols)
        return [
            ''.join(np.random.choice(pool) for _ in range(length))
            for _ in range(num_sequences)
        ]

    sequences = random_strings('ACDEFGHIKLMNPQRSTVWY')
    structures = random_strings('HEC')
    return sequences, structures

# seaborn is used for the heatmap at the end of this cell but is not among the
# cell's own imports; import it here so the cell also runs on its own.
import seaborn as sns

# 1000 random sequences of length 50 with random per-residue structure labels
sequences, structures = generate_data(1000, 50)

# Uniform background distribution over the 20 amino acids
background_probs = {aa: 0.05 for aa in 'ACDEFGHIKLMNPQRSTVWY'}

# Split data
X_train, X_test, y_train, y_test = train_test_split(sequences, structures, test_size=0.2, random_state=42)

# Create PSSM from the training split only, so the evaluation below is not
# computed on sequences that were also used to fit the model.
pssm = generate_pssm(X_train, y_train)


# Predict secondary structure
def predict_structure(sequence):
    """Return the highest-scoring state ('H', 'E' or 'C') for each position."""
    scores = pssm_score(sequence, pssm, background_probs)
    return ''.join('HEC'[np.argmax(score)] for score in scores)


y_pred = [predict_structure(seq) for seq in X_test]

# Flatten predictions and true values
y_test_flat = ''.join(y_test)
y_pred_flat = ''.join(y_pred)

# Visualize results
# Without an explicit `labels` argument confusion_matrix orders the classes
# alphabetically (C, E, H), which would not match the H, E, C tick labels.
structure_labels = ['H', 'E', 'C']
cm = confusion_matrix(list(y_test_flat), list(y_pred_flat), labels=structure_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=structure_labels)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix for Secondary Structure Prediction')
plt.show()

# Visualize PSSM for a specific position
position = 25  # Choose a specific position to visualize
plt.figure(figsize=(12, 8))
sns.heatmap(pssm[position], annot=True, cmap='YlGnBu', 
            xticklabels=['H', 'E', 'C'], 
            yticklabels=list('ACDEFGHIKLMNPQRSTVWY'))
plt.title(f'PSSM for Position {position} in Secondary Structure Prediction')
plt.xlabel('Secondary Structure')
plt.ylabel('Amino Acid')
plt.show()

## Instances where we use:     y = x · A^T + b

### Gene expression analysis: Transforming gene expression data

In [None]:
import torch
import numpy as np

# Simulated gene expression data: one row per sample, one column per gene
num_samples = 100
num_genes = 1000
gene_expression = torch.randn(num_samples, num_genes)

# Linear transformation for gene expression analysis (random weights and bias)
output_dim = 50
A = torch.randn(output_dim, num_genes)
b = torch.randn(output_dim)

# Transform gene expression data: y = x · A^T + b
transformed_expression = torch.matmul(gene_expression, A.t()) + b

print("Original shape:", gene_expression.shape)
print("Transformed shape:", transformed_expression.shape)

# Example analysis: rank the transformed components by their mean over samples.
# After the projection each column is a linear mix of all genes, so these
# indices run over the output_dim components and do NOT identify genes.
avg_expression = torch.mean(transformed_expression, dim=0)
top_components = torch.argsort(avg_expression, descending=True)[:10]
top_genes = top_components  # Backward-compatible alias for the old name
print("Top 10 transformed component indices (not gene indices):", top_components.tolist())

# Protein Structure Prediction:

In [None]:
import torch

# Simulated per-residue features for a single protein (batch of one)
num_residues = 200
feature_dim = 20
protein_features = torch.randn(1, num_residues, feature_dim)

# Random weights and bias mapping every residue's features to 3 values
output_dim = 3  # 3D coordinates
A = torch.randn(output_dim, feature_dim)
b = torch.randn(output_dim)

# Apply y = x · A^T + b to every residue
predicted_structure = protein_features @ A.T + b

print("Protein features shape:", protein_features.shape)
print("Predicted structure shape:", predicted_structure.shape)

# Show the coordinates predicted for the first five residues
print("First 5 predicted 3D coordinates:")
print(predicted_structure[0, :5].numpy())

# Sequence Analysis (DNA encoding):

In [None]:
import torch

# DNA sequence
dna_seq = "ATCGATCGATCG"

# One-hot encoding: one row per base, one column per nucleotide
nucleotide_to_index = {'A': 0, 'T': 1, 'C': 2, 'G': 3}
one_hot = torch.zeros(len(dna_seq), 4)
rows = torch.arange(len(dna_seq))
cols = torch.tensor([nucleotide_to_index[base] for base in dna_seq])
one_hot[rows, cols] = 1

# Random weights and bias mapping the 4 nucleotide channels to 8 features
output_dim = 8
A = torch.randn(output_dim, 4)
b = torch.randn(output_dim)

# Apply y = x · A^T + b to every position
transformed_seq = one_hot @ A.t() + b

print("One-hot encoded shape:", one_hot.shape)
print("Transformed sequence shape:", transformed_seq.shape)

# Example analysis: position whose largest transformed feature is the highest
max_pos = transformed_seq.max(dim=1).values.argmax()
print("Position with highest transformed value:", max_pos.item())

# Dimensionality Reduction:

Explanation of what happens during dimensionality reduction:

Original Data:

We start with high-dimensional data (100 dimensions in this case).
Each data point is represented by 100 features, which is difficult to visualize directly.
The first plot shows only the first 2 dimensions of this 100-dimensional space.


Dimensionality Reduction:

We reduce the 100-dimensional data to 2 dimensions.
This process attempts to preserve the most important information or patterns in the data.


Our Linear Transformation Method:

We use the equation y = x · A^T + b to transform the data.
This method projects the high-dimensional data onto a 2D plane.
The resulting plot shows how the data points are distributed in this new 2D space.
However, this random linear transformation may not optimally preserve the data structure.


PCA (Principal Component Analysis) Method:

PCA finds the directions (principal components) of maximum variance in the data.
It then projects the data onto these principal components.
The resulting plot shows the data distributed along the two most significant principal components.
PCA is often more effective at preserving the overall structure of the data.

We added plt.ion() at the beginning to enable interactive mode. We removed the plt.savefig() calls and kept the plt.show() calls. We added plt.ioff() and a final plt.show() at the end to keep the plot windows open.

When you run this script in PyCharm:

The plots should appear in the "SciView" tool window (usually on the right side of the PyCharm window).
If the plots don't appear automatically, you might need to click on the "Python Scientific" tab in the tool window.
You can interact with the plots, zoom in/out, and pan around.

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Compare a random linear projection (y = x · A^T + b) with PCA for reducing
# 100-dimensional random data to 2 dimensions, plotting each result next to
# the first two raw dimensions of the data.

# Enable interactive mode for matplotlib
plt.ion()

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Simulated high-dimensional biological data
# (standard-normal values: one row per sample, one column per dimension)
num_samples = 1000
original_dim = 100
data = torch.randn(num_samples, original_dim)

# Linear transformation for dimensionality reduction
# (A and b are random, not fitted to the data)
reduced_dim = 2
A = torch.randn(reduced_dim, original_dim)
b = torch.randn(reduced_dim)

# Reduce dimensionality using our linear transformation
reduced_data = torch.matmul(data, A.t()) + b

print("Original data shape:", data.shape)
print("Reduced data shape:", reduced_data.shape)

# Use PCA for comparison
# (PCA is fitted on the same data, converted to a NumPy array)
pca = PCA(n_components=2)
pca_reduced_data = pca.fit_transform(data.numpy())

# Plot 1: Original vs Our Method
fig1, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Original data (first 2 dimensions)
ax1.scatter(data[:, 0].numpy(), data[:, 1].numpy(), alpha=0.5)
ax1.set_title("Original Data (First 2 Dimensions)")
ax1.set_xlabel("Dimension 1")
ax1.set_ylabel("Dimension 2")

# Reduced data (our method)
ax2.scatter(reduced_data[:, 0].numpy(), reduced_data[:, 1].numpy(), alpha=0.5)
ax2.set_title("Reduced Data (Our Method)")
ax2.set_xlabel("Dimension 1")
ax2.set_ylabel("Dimension 2")

plt.tight_layout()
plt.show()

# Plot 2: Original vs PCA
# (ax1 and ax2 are rebound to the axes of the second figure)
fig2, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Original data (first 2 dimensions)
ax1.scatter(data[:, 0].numpy(), data[:, 1].numpy(), alpha=0.5)
ax1.set_title("Original Data (First 2 Dimensions)")
ax1.set_xlabel("Dimension 1")
ax1.set_ylabel("Dimension 2")

# Reduced data (PCA)
ax2.scatter(pca_reduced_data[:, 0], pca_reduced_data[:, 1], alpha=0.5)
ax2.set_title("Reduced Data (PCA)")
ax2.set_xlabel("Principal Component 1")
ax2.set_ylabel("Principal Component 2")

plt.tight_layout()
plt.show()

# Fraction of the total variance captured by each of the two components
print("Explained variance ratio (PCA):", pca.explained_variance_ratio_)

# Keep the plot windows open
# (interactive mode is switched off first, then show() is called once more)
plt.ioff()
plt.show()

# Protein-Protein Interaction Prediction:

### This could be used as part of a larger model to predict protein-protein interactions based on protein features.

In [None]:
import torch
import matplotlib.pyplot as plt

# Random stand-in for protein descriptors: 100 proteins, 50 features each
protein_features = torch.randn(100, 50)  # 100 proteins, 50 features each

# Random weights and bias producing one score per protein
A = torch.randn(1, 50)
b = torch.randn(1)

# Apply y = x · A^T + b
interaction_scores = protein_features @ A.t() + b

# Histogram of the scores
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(interaction_scores.numpy(), bins=20, edgecolor='black')
ax.set_title('Distribution of Protein-Protein Interaction Scores')
ax.set_xlabel('Interaction Score')
ax.set_ylabel('Frequency')
plt.show()

# Feature matrix as an image: rows are proteins, columns are features
fig, ax = plt.subplots(figsize=(12, 8))
image = ax.imshow(protein_features.numpy(), aspect='auto', cmap='viridis')
fig.colorbar(image, ax=ax, label='Feature Value')
ax.set_title('Heatmap of Protein Features')
ax.set_xlabel('Feature Index')
ax.set_ylabel('Protein Index')
plt.show()

# Genomic Variant Effect Prediction:

### This could be used to predict the functional impact of genomic variants on different cellular processes.

In [None]:
import torch
import matplotlib.pyplot as plt

# Genomic variant features (e.g., position, surrounding sequence properties)
# Random stand-in values: one row per variant, one column per feature.
variant_features = torch.randn(1000, 30)  # 1000 variants, 30 features each

# Linear transformation for effect prediction (random weights and bias)
A = torch.randn(5, 30)  # 5 different effect categories
b = torch.randn(5)

# Predict variant effects: y = x · A^T + b, one score per variant and category
effect_predictions = torch.matmul(variant_features, A.t()) + b

# Visualize effect predictions (one box per effect category)
plt.figure(figsize=(10, 6))
plt.boxplot(effect_predictions.numpy())
plt.title('Distribution of Predicted Effects Across Categories')
plt.xlabel('Effect Category')
plt.ylabel('Predicted Effect Score')
plt.show()

# Scatter plot of two effect categories
# Convert to NumPy explicitly, as every other plotting call in this file does,
# instead of relying on matplotlib's implicit tensor-to-array conversion.
plt.figure(figsize=(8, 8))
plt.scatter(effect_predictions[:, 0].numpy(), effect_predictions[:, 1].numpy(), alpha=0.5)
plt.title('Scatter Plot of Two Effect Categories')
plt.xlabel('Effect Category 1')
plt.ylabel('Effect Category 2')
plt.show()

# Drug Response Prediction:

### This could be used to predict drug responses across different cell lines based on molecular features.

In [None]:
import torch
import matplotlib.pyplot as plt
import seaborn as sns

# Random stand-ins for drug descriptors and cell-line descriptors
drug_features = torch.randn(50, 100)  # 50 drugs, 100 features each
cell_features = torch.randn(20, 50)   # 20 cell lines, 50 features each

# Pair every drug with every cell line and concatenate their feature vectors
drug_part = drug_features[:, None, :].expand(-1, 20, -1)
cell_part = cell_features[None, :, :].expand(50, -1, -1)
combined_features = torch.cat((drug_part, cell_part), dim=2)

# Random weights and bias producing one response value per (drug, cell line)
A = torch.randn(1, 150)  # 100 drug features + 50 cell features
b = torch.randn(1)

# Apply y = x · A^T + b to the flattened pairs, then restore the 50 x 20 grid
flat_pairs = combined_features.view(-1, 150)
response_predictions = (flat_pairs @ A.t() + b).view(50, 20)

# Heatmap of predicted responses: rows are drugs, columns are cell lines
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(response_predictions.numpy(), cmap='coolwarm', center=0, ax=ax)
ax.set_title('Heatmap of Predicted Drug Responses')
ax.set_xlabel('Cell Line Index')
ax.set_ylabel('Drug Index')
plt.show()

# Histogram over all drug / cell-line pairs
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(response_predictions.numpy().flatten(), bins=30, edgecolor='black')
ax.set_title('Distribution of Predicted Drug Responses')
ax.set_xlabel('Predicted Response')
ax.set_ylabel('Frequency')
plt.show()

# Metabolic Pathway Flux Analysis:

### This could be used to estimate reaction fluxes in metabolic pathways based on metabolite concentrations.

In [None]:
import torch
import matplotlib.pyplot as plt

# Random stand-in for metabolite concentrations
metabolite_concentrations = torch.randn(100, 50)  # 100 samples, 50 metabolites

# Random weights and bias mapping 50 metabolites to 20 reaction fluxes
A = torch.randn(20, 50)  # 20 different reactions
b = torch.randn(20)

# Apply y = x · A^T + b
predicted_fluxes = metabolite_concentrations @ A.t() + b

# Box plot with one box per reaction
fig, ax = plt.subplots(figsize=(12, 6))
ax.boxplot(predicted_fluxes.numpy())
ax.set_title('Distribution of Predicted Fluxes Across Reactions')
ax.set_xlabel('Reaction Index')
ax.set_ylabel('Predicted Flux')
plt.show()

# Concentration matrix as an image: rows are samples, columns are metabolites
fig, ax = plt.subplots(figsize=(12, 8))
image = ax.imshow(metabolite_concentrations.numpy(), aspect='auto', cmap='viridis')
fig.colorbar(image, ax=ax, label='Concentration')
ax.set_title('Heatmap of Metabolite Concentrations')
ax.set_xlabel('Metabolite Index')
ax.set_ylabel('Sample Index')
plt.show()

# Transcription Factor Binding Site Prediction:

### This could be part of a model to predict transcription factor binding sites in DNA sequences.

In [None]:
import torch
import matplotlib.pyplot as plt

# DNA sequence features, one-hot encoded.
# Draw a random nucleotide index (0..3, plotted below as A, T, C, G) for every
# position and expand it to a real one-hot vector, so the data matches the
# "one-hot" description and the per-nucleotide rows of the plot.
nucleotide_indices = torch.randint(0, 4, (1000, 100))
dna_sequences = torch.nn.functional.one_hot(nucleotide_indices, num_classes=4).float()  # 1000 sequences, 100 bp length, 4 nucleotides

# Linear transformation for binding prediction (random weights and bias)
A = torch.randn(1, 400)  # Flattened filter
b = torch.randn(1)

# Predict binding scores: flatten each sequence to 400 values, then y = x · A^T + b
binding_scores = torch.matmul(dna_sequences.reshape(1000, -1), A.t()) + b

# Visualize binding scores
plt.figure(figsize=(10, 6))
plt.hist(binding_scores.numpy(), bins=30, edgecolor='black')
plt.title('Distribution of Predicted Binding Scores')
plt.xlabel('Binding Score')
plt.ylabel('Frequency')
plt.show()

# Visualize a sample DNA sequence
# Transposed so rows are the four nucleotide channels and columns are positions.
sample_sequence = dna_sequences[0].numpy().T
plt.figure(figsize=(12, 4))
plt.imshow(sample_sequence, aspect='auto', cmap='viridis')
plt.colorbar(label='Nucleotide Encoding')
plt.title('Visualization of a Sample DNA Sequence')
plt.xlabel('Position in Sequence')
plt.ylabel('Nucleotide (A, T, C, G)')
plt.yticks([0, 1, 2, 3], ['A', 'T', 'C', 'G'])
plt.show()