In [4]:
import pandas as pd
import random

num_samples = 1000  
num_snps = 10000   
cancer_snps = 100   

def generate_genotype():
    """Generate random genotype (AA, AG, GG)."""
    alleles = ['A', 'G']  
    return random.choice(alleles) + random.choice(alleles)

snp_ids = [f"rs{random.randint(1000000, 9999999)}" for _ in range(num_snps)]


correlated_snps = random.sample(snp_ids, cancer_snps)


def generate_sample(correlated_snps, has_cancer):
    sample_data = {}
    for snp in snp_ids:
       
        if snp in correlated_snps:
            if has_cancer:
                
                sample_data[snp] = random.choices(['AA', 'AG', 'GG'], weights=[0.1, 0.3, 0.6])[0]
            else:
               
                sample_data[snp] = random.choices(['AA', 'AG', 'GG'], weights=[0.4, 0.4, 0.2])[0]
        else:
            sample_data[snp] = generate_genotype()
    
    return sample_data

samples = []
cancer_labels = []

for i in range(num_samples):
    has_cancer = random.random() < 0.3
    sample_data = generate_sample(correlated_snps, has_cancer)
    samples.append(sample_data)
    cancer_labels.append(has_cancer)

samples_df = pd.DataFrame(samples)
samples_df['Has_Cancer'] = cancer_labels

samples_df.head()

samples_df.to_csv('generated_dna_samples.csv', index=False)
print("Generated DNA samples saved to 'generated_dna_samples.csv'.")


Generated DNA samples saved to 'generated_dna_samples.csv'.


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

data = pd.read_csv('generated_dna_samples.csv')

def encode_genotype(genotype):
    if genotype == 'AA':
        return 0
    elif genotype == 'AG':
        return 1
    elif genotype == 'GG':
        return 2
    else:
        return -1  

for snp in data.columns[:-1]:  
    data[snp] = data[snp].apply(encode_genotype)

X = data.drop('Has_Cancer', axis=1)
y = data['Has_Cancer']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

feature_importances = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Top SNPs correlated with cancer:")
print(feature_importances.head(10))


Accuracy: 0.87
Top SNPs correlated with cancer:
rs1119748    0.010685
rs5657838    0.008877
rs2356057    0.008278
rs8316234    0.008271
rs8600934    0.007886
rs8661899    0.007613
rs5065895    0.007534
rs4319829    0.007490
rs6722285    0.007251
rs5949485    0.007088
dtype: float64
