In [4]:
import pandas as pd
import random

# Parameters
num_samples = 1000  # Number of DNA samples
num_snps = 10000    # Total number of SNPs
cancer_snps = 100   # Number of SNPs that are correlated with cancer

# Step 1: Generate SNPs
def generate_genotype():
    """Generate random genotype (AA, AG, GG)."""
    alleles = ['A', 'G']  # Let's keep it simple with only A and G
    return random.choice(alleles) + random.choice(alleles)

# Step 2: Create SNP IDs
snp_ids = [f"rs{random.randint(1000000, 9999999)}" for _ in range(num_snps)]

# Step 3: Generate SNPs correlated with cancer
# Select specific SNPs that will have some correlation with cancer
correlated_snps = random.sample(snp_ids, cancer_snps)

# Step 4: Generate DNA samples
def generate_sample(correlated_snps, has_cancer):
    sample_data = {}
    for snp in snp_ids:
        # If the SNP is correlated with cancer, give a higher probability of certain genotypes based on the disease
        if snp in correlated_snps:
            if has_cancer:
                # Higher probability of "GG" (cancer-associated genotype)
                sample_data[snp] = random.choices(['AA', 'AG', 'GG'], weights=[0.1, 0.3, 0.6])[0]
            else:
                # Lower probability of "GG" in non-cancer samples
                sample_data[snp] = random.choices(['AA', 'AG', 'GG'], weights=[0.4, 0.4, 0.2])[0]
        else:
            # Randomly assign genotypes to non-cancer-associated SNPs
            sample_data[snp] = generate_genotype()
    
    return sample_data

# Step 5: Create the dataset
samples = []
cancer_labels = []

for i in range(num_samples):
    # Randomly assign some samples to have cancer (let's say 30% of samples have cancer)
    has_cancer = random.random() < 0.3
    sample_data = generate_sample(correlated_snps, has_cancer)
    samples.append(sample_data)
    cancer_labels.append(has_cancer)

# Step 6: Convert to DataFrame
samples_df = pd.DataFrame(samples)
samples_df['Has_Cancer'] = cancer_labels

# Display the first few rows of the generated data
samples_df.head()

# Step 7: Save the generated dataset to a CSV file
samples_df.to_csv('generated_dna_samples.csv', index=False)
print("Generated DNA samples saved to 'generated_dna_samples.csv'.")


Generated DNA samples saved to 'generated_dna_samples.csv'.


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Load the generated dataset
data = pd.read_csv('generated_dna_samples.csv')

# Step 2: Encode the genotypes
def encode_genotype(genotype):
    if genotype == 'AA':
        return 0
    elif genotype == 'AG':
        return 1
    elif genotype == 'GG':
        return 2
    else:
        return -1  # In case of any unknown genotype (shouldn't happen in this context)

# Apply the encoding to all SNP columns
for snp in data.columns[:-1]:  # Exclude 'Has_Cancer' column from encoding
    data[snp] = data[snp].apply(encode_genotype)

# Step 3: Split features and labels
X = data.drop('Has_Cancer', axis=1)
y = data['Has_Cancer']

# Step 4: Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Step 6: Make predictions and evaluate accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Step 7: Feature importance
feature_importances = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Top SNPs correlated with cancer:")
print(feature_importances.head(10))


Accuracy: 0.87
Top SNPs correlated with cancer:
rs1119748    0.010685
rs5657838    0.008877
rs2356057    0.008278
rs8316234    0.008271
rs8600934    0.007886
rs8661899    0.007613
rs5065895    0.007534
rs4319829    0.007490
rs6722285    0.007251
rs5949485    0.007088
dtype: float64
