# Hypothetical HBV Clinical Data Generation

Note: To mimic the data processing used in the study, I have generated hypothetical HBV clinical data. The data shown below have no relation to the original clinical data.

In [1]:
import pandas as pd
import numpy as np
import random

## Step 1: Generate a random dataframe for Sample (i.e. the patient numbers used during the clinical study).

In [2]:
# Build the Sample (S) dataframe: 1000 rows of random integers
# drawn from 1 to 99 (the upper bound 100 is exclusive)
dataframe_sample = pd.DataFrame(
    data=np.random.randint(low=1, high=100, size=(1000, 1)),
    columns=["S"],
)

## Step 2: Generate a random dataframe for Reference Genome (i.e. genotype-specific reference sequence (GenBank accession numbers: AF090842, AB033554, AB033556, AF121240, and AB032431, for genotypes A to E, respectively))

In [3]:
# GenBank accession numbers of the genotype-specific reference sequences
# (genotypes A to E, in that order) used in the clinical study
GenBank_sequences = [
    "AF090842",
    "AB033554",
    "AB033556",
    "AF121240",
    "AB032431",
]

# Build the Reference Genome (R) dataframe: 1000 accession numbers
# sampled with replacement (p=None means every entry is equally likely)
dataframe_reference_genome = pd.DataFrame(
    data=np.random.choice(GenBank_sequences, size=1000, replace=True, p=None),
    columns=["R"],
)

## Step 3: Generate a random dataframe for Gene (i.e. genes encoded by the genome: C (PreCore, Core), P (Polymerase, RT), S (PreS1, PreS2, HBsAg), X)

In [4]:
# Gene names used in the clinical study, grouped by the gene that encodes them
gene_genome = [
    # C gene
    "PreCore",
    "Core",
    # P gene
    "Polymerase",
    "RT",
    # S gene
    "PreS1",
    "PreS2",
    "HBsAg",
    # X gene
    "X",
]

In [5]:
# Build the Genes (G) dataframe: 1000 gene names sampled with
# replacement from gene_genome (p=None means every entry is equally likely)
dataframe_gene_genome = pd.DataFrame(
    data=np.random.choice(gene_genome, size=1000, replace=True, p=None),
    columns=["G"],
)

## Step 4: Generate a random dataframe for Effect (i.e. intragenic or missense variant)

In [6]:
# The two variant effects used in the clinical study
effect_variant = [
    "intragenic_variant",
    "missense_variant",
]

# Build the Effect (E) dataframe: 1000 effects sampled with
# replacement (p=None means both effects are equally likely)
dataframe_effect = pd.DataFrame(
    data=np.random.choice(effect_variant, size=1000, replace=True, p=None),
    columns=["E"],
)

## Step 5: Generate a random dataframe for amino acid variant (e.g. p.Pro156Ser). 

In [7]:
# Three-letter amino acid codes carrying the "p." prefix
# (e.g. p.Pro in p.Pro156Ser)
p_amino_acid = [
    "p." + residue
    for residue in (
        "Ala", "Arg", "Asn", "Asp", "Cys",
        "Gln", "Glu", "Gly", "His", "Ile", "Leu",
        "Lys", "Met", "Phe", "Pro", "Ser",
        "Thr", "Trp", "Tyr", "Val", "Asx", "Glx",
    )
]

# Pick 1000 prefixed amino acid codes at random
random_p_amino_acid = [random.choice(p_amino_acid) for _ in range(1000)]

In [8]:
# Generate 1000 random amino acid positions.
# Residue positions are 1-based, so draw from 1..99 inclusive; the previous
# int(100 * random.random()) could also yield the invalid position 0.
random_digit_amino_acid_int = [random.randint(1, 99) for _ in range(1000)]

# Convert the list of integers to a list of strings so each position can be
# concatenated with the amino acid codes
random_digit_amino_acid = [
    str(position) for position in random_digit_amino_acid_int
]

In [9]:
# Three-letter amino acid codes without prefix (e.g. Ser in p.Pro156Ser)
amino_acid = [
    "Ala", "Arg", "Asn", "Asp", "Cys", "Gln", "Glu", "Gly",
    "His", "Ile", "Leu", "Lys", "Met", "Phe", "Pro", "Ser",
    "Thr", "Trp", "Tyr", "Val", "Asx", "Glx",
]

# Pick 1000 amino acid codes at random
random_amino_acid = [random.choice(amino_acid) for _ in range(1000)]

In [10]:
# Join, index by index, the prefixed amino acid, the position and the
# second amino acid into a single string (e.g. p.Pro156Ser).
# zip pairs the three lists element-wise; "".join glues each triple together.
amino_acid_variant = map(
    "".join,
    zip(random_p_amino_acid, random_digit_amino_acid, random_amino_acid),
)

# Build the amino acid variant (A) dataframe from the joined strings
dataframe_amino_acid_variant = pd.DataFrame(amino_acid_variant, columns=["A"])

## Step 6: Generate a random dataframe for nucleotide variant (e.g. c.314C>T). 

In [11]:
# List holding the "c." prefix 1000 times, one entry per generated variant
c_repeat_1000 = ["c." for _ in range(1000)]

In [12]:
# Generate 1000 random nucleotide positions.
# Nucleotide positions are 1-based, so draw from 1..99 inclusive; the previous
# int(100 * random.random()) could also yield the invalid position 0.
random_digit_nucleotide_int = [random.randint(1, 99) for _ in range(1000)]

# Convert the list of integers to a list of strings so each position can be
# concatenated with the "c." prefix and the nucleotide mutation
random_digit_nucleotide = [
    str(position) for position in random_digit_nucleotide_int
]

In [13]:
# Single-letter nucleotide substitutions (reference>alternate) to sample from
nucleotide_mutation = [
    "C>T",
    "T>C",
    "A>G",
    "G>A",
]

# Pick 1000 nucleotide substitutions at random
random_nucleotide_mutation = [
    random.choice(nucleotide_mutation) for _ in range(1000)
]

In [14]:
# Join, index by index, the "c." prefix, the position and the nucleotide
# substitution into a single string (e.g. c.314C>T).
# zip pairs the three lists element-wise; "".join glues each triple together.
nucleotide_variant = map(
    "".join,
    zip(c_repeat_1000, random_digit_nucleotide, random_nucleotide_mutation),
)

# Build the nucleotide variant (N) dataframe from the joined strings
dataframe_nucleotide_variant = pd.DataFrame(nucleotide_variant, columns=["N"])

## Step 6: Concatenate all dataframes made in the preceding steps into one pandas dataframe

In [15]:
# Place the six single-column dataframes side by side, giving one
# dataframe with the columns S, R, G, E, A and N (in that order)
clinical_data_hypothetical = pd.concat(
    objs=[
        dataframe_sample,
        dataframe_reference_genome,
        dataframe_gene_genome,
        dataframe_effect,
        dataframe_amino_acid_variant,
        dataframe_nucleotide_variant,
    ],
    axis="columns",
)

In [16]:
# Bare expression as the last line of the cell: the notebook renders the
# combined dataframe as the table that follows
clinical_data_hypothetical

Unnamed: 0,S,R,G,E,A,N
0,48,AF090842,PreCore,missense_variant,p.His8Glu,c.90A>G
1,25,AB032431,Core,intragenic_variant,p.Leu99Glx,c.70A>G
2,90,AB032431,Core,intragenic_variant,p.Leu97Ile,c.36A>G
3,67,AB033554,PreS1,intragenic_variant,p.Asn58Lys,c.51T>C
4,15,AB032431,RT,missense_variant,p.Ser84His,c.69T>C
5,87,AB033554,HBsAg,missense_variant,p.Tyr14Ser,c.37A>G
6,97,AB033554,HBsAg,intragenic_variant,p.Gln67Thr,c.65A>G
7,6,AB033554,PreCore,intragenic_variant,p.Arg88Val,c.54G>A
8,41,AF090842,RT,missense_variant,p.Asx59Pro,c.52T>C
9,93,AF121240,HBsAg,intragenic_variant,p.Pro32Gln,c.7A>G


## Step 7: Write pandas dataframe in Step 6 into a CSV file

In [17]:
# Save the combined dataframe as a CSV file in the current directory
# (the row index is written as well, since index is left at its default)
clinical_data_hypothetical.to_csv(
    path_or_buf='./clinical_data_hypothetical.csv',
)