In [1]:
import pandas as pd

In [6]:
import pandas as pd

def convert_clinvar_txt_to_csv(input_file, output_file):
    """
    Re-save a tab-delimited ClinVar export as a comma-separated file.

    The table is loaded with pandas, a short preview (shape, column names,
    first rows) is printed, the frame is written to ``output_file`` without
    the index column, and a summary of the conversion is printed.

    Args:
        input_file (str): Path to the tab-delimited text file to read.
        output_file (str): Path of the CSV file to write.

    Returns:
        None. The results are the written file and the printed report.
    """
    variants = pd.read_csv(input_file, sep="\t", low_memory=False)

    # Preview of what was loaded, emitted one item per line.
    print(
        f"Original data shape: {variants.shape}",
        f"Columns: {list(variants.columns)}",
        "First few rows:",
        variants.head(),
        sep="\n",
    )

    # index=False keeps the pandas row index out of the CSV.
    variants.to_csv(output_file, index=False)

    # The leading "\n" leaves a blank line between preview and summary.
    print(
        "\nConversion complete!",
        f"Input file: {input_file}",
        f"Output file: {output_file}",
        f"Rows converted: {variants.shape[0]}",
        sep="\n",
    )

def analyze_clinvar_data(csv_file):
    """
    Print frequency tables for the converted ClinVar CSV.

    Reads ``csv_file`` and prints the total row count followed by the
    ``value_counts()`` of three columns, in this order:
    'Germline classification', 'Variant type', 'Molecular consequence'.
    Each cell value is counted as-is (no splitting of combined values), and
    missing values are left out, as ``value_counts()`` does by default.

    Args:
        csv_file (str): Path to the CSV file to summarise.

    Raises:
        KeyError: If one of the three columns is absent from the file.
    """
    variants = pd.read_csv(csv_file)

    print("\n=== Data Analysis ===")
    print(f"Total variants: {len(variants)}")

    # Each section heading is the column name followed by " counts:".
    summarised_columns = (
        "Germline classification",
        "Variant type",
        "Molecular consequence",
    )
    for column in summarised_columns:
        print(f"\n{column} counts:")
        for value, n_variants in variants[column].value_counts().items():
            print(f"  {value}: {n_variants}")

if __name__ == "__main__":
    # Source export and destination CSV, both resolved against the current
    # working directory.
    input_txt_file, output_csv_file = "clinvar_fbn1.txt", "clinvar_fbn1.csv"

    # Step 1: tab-delimited text -> CSV (also prints a preview and summary).
    convert_clinvar_txt_to_csv(input_txt_file, output_csv_file)

    # Step 2: print frequency tables from the CSV that was just written.
    analyze_clinvar_data(output_csv_file)

Original data shape: (4319, 25)
Columns: ['Name', 'Gene(s)', 'Protein change', 'Condition(s)', 'Accession', 'GRCh37Chromosome', 'GRCh37Location', 'GRCh38Chromosome', 'GRCh38Location', 'VariationID', 'AlleleID(s)', 'dbSNP ID', 'Canonical SPDI', 'Variant type', 'Molecular consequence', 'Germline classification', 'Germline date last evaluated', 'Germline review status', 'Somatic clinical impact', 'Somatic clinical impact date last evaluated', 'Somatic clinical impact review status', 'Oncogenicity classification', 'Oncogenicity date last evaluated', 'Oncogenicity review status', 'Unnamed: 24']
First few rows:
                                         Name Gene(s) Protein change  \
0  NM_000138.5(FBN1):c.8608C>T (p.Leu2870Phe)    FBN1         L2870F   
1  NM_000138.5(FBN1):c.8606T>C (p.Leu2869Ser)    FBN1         L2869S   
2  NM_000138.5(FBN1):c.8603T>G (p.Val2868Gly)    FBN1         V2868G   
3  NM_000138.5(FBN1):c.8603T>A (p.Val2868Asp)    FBN1         V2868D   
4  NM_000138.5(FBN1):c.8602

In [13]:
# Load the converted CSV and keep only the germline-related columns: the
# somatic-impact and oncogenicity columns are removed, together with
# 'Unnamed: 24'.
# NOTE(review): 'Unnamed: 24' is presumably an empty column produced by a
# trailing tab in the original export - confirm before relying on that.
df = (
    pd.read_csv('clinvar_fbn1.csv')
    .drop(columns=[
        'Somatic clinical impact',
        'Somatic clinical impact date last evaluated',
        'Somatic clinical impact review status',
        'Oncogenicity classification',
        'Oncogenicity date last evaluated',
        'Oncogenicity review status',
        'Unnamed: 24',
    ])
)
df

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Variant type,Molecular consequence,Germline classification,Germline date last evaluated,Germline review status
0,NM_000138.5(FBN1):c.8608C>T (p.Leu2870Phe),FBN1,L2870F,Familial thoracic aortic aneurysm and aortic d...,VCV000927302,15,48703195,15,48410998,927302,912630,rs2042856175,NC_000015.10:48410997:G:A,single nucleotide variant,missense variant,Uncertain significance,"Dec 5, 2023","criteria provided, single submitter"
1,NM_000138.5(FBN1):c.8606T>C (p.Leu2869Ser),FBN1,L2869S,Marfan syndrome|Familial thoracic aortic aneur...,VCV001414872,15,48703197,15,48411000,1414872,1474486,rs363848,NC_000015.10:48410999:A:G,single nucleotide variant,missense variant,Uncertain significance,"Apr 16, 2024","criteria provided, multiple submitters, no con..."
2,NM_000138.5(FBN1):c.8603T>G (p.Val2868Gly),FBN1,V2868G,not provided,VCV001175867,15,48703200,15,48411003,1175867,1166131,rs1555393485,NC_000015.10:48411002:A:C,single nucleotide variant,missense variant,Uncertain significance,"Apr 1, 2021","criteria provided, single submitter"
3,NM_000138.5(FBN1):c.8603T>A (p.Val2868Asp),FBN1,V2868D,Familial thoracic aortic aneurysm and aortic d...,VCV000519710,15,48703200,15,48411003,519710,510524,rs1555393485,NC_000015.10:48411002:A:T,single nucleotide variant,missense variant,Uncertain significance,"Apr 18, 2016","criteria provided, single submitter"
4,NM_000138.5(FBN1):c.8602G>A (p.Val2868Ile),FBN1,V2868I,Familial thoracic aortic aneurysm and aortic d...,VCV001171543,15,48703201,15,48411004,1171543,1161120,rs1365239366,NC_000015.10:48411003:C:T,single nucleotide variant,missense variant,Conflicting classifications of pathogenicity,"Sep 12, 2024","criteria provided, conflicting classifications"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4314,NM_000138.5(FBN1):c.2T>C (p.Met1Thr),FBN1,M1T,Familial thoracic aortic aneurysm and aortic d...,VCV000549124,15,48936965,15,48644768,549124,539956,rs886041536,NC_000015.10:48644767:A:G,single nucleotide variant,missense variant|initiator_codon_variant,Pathogenic/Likely pathogenic,"Aug 15, 2025","criteria provided, multiple submitters, no con..."
4315,NM_000138.5(FBN1):c.2T>G (p.Met1Arg),FBN1,M1R,Marfan syndrome|Familial thoracic aortic aneur...,VCV000280307,15,48936965,15,48644768,280307,264741,rs886041536,NC_000015.10:48644767:A:C,single nucleotide variant,missense variant|initiator_codon_variant,Pathogenic,"Sep 28, 2024","criteria provided, multiple submitters, no con..."
4316,NM_000138.5(FBN1):c.1A>T (p.Met1Leu),FBN1,M1L,Familial thoracic aortic aneurysm and aortic d...,VCV000577426,15,48936966,15,48644769,577426,569532,rs730880097,NC_000015.10:48644768:T:A,single nucleotide variant,missense variant|initiator_codon_variant,Pathogenic,"Sep 5, 2025","criteria provided, multiple submitters, no con..."
4317,NM_000138.5(FBN1):c.1A>C (p.Met1Leu),FBN1,M1L,Marfan syndrome,VCV000495569,15,48936966,15,48644769,495569,487865,rs730880097,NC_000015.10:48644768:T:G,single nucleotide variant,missense variant|initiator_codon_variant,Pathogenic,"Sep 26, 2023",reviewed by expert panel


In [14]:
# Split 'Name' at the FIRST colon only: the part before it goes to the
# scratch column 'trash' and the remainder to 'mutation'
# (e.g. "NM_000138.5(FBN1):c.8608C>T (p.Leu2870Phe)" ->
#  "NM_000138.5(FBN1)" and "c.8608C>T (p.Leu2870Phe)").
# n=1 stops a name containing extra colons from producing more than two
# pieces, and reindex(columns=[0, 1]) guarantees exactly two columns even when
# no name contains a colon; either case would otherwise make this assignment
# fail with "Columns must be same length as key". Rows without a colon get a
# missing value in 'mutation'.
df[['trash', 'mutation']] = (
    df['Name'].str.split(':', n=1, expand=True).reindex(columns=[0, 1])
)


In [16]:
# Split 'mutation' at the FIRST "(": the text before it goes to 'mutation_1'
# and the rest to the scratch column 'trashh'
# (e.g. "c.8608C>T (p.Leu2870Phe)" -> "c.8608C>T " and "p.Leu2870Phe)").
# n=1 plus reindex(columns=[0, 1]) guarantees exactly two result columns, so
# the assignment cannot fail when a value has several "(" or when no value
# has one. Rows without "(" get a missing value in 'trashh'.
df[['mutation_1', 'trashh']] = (
    df['mutation'].str.split('(', n=1, expand=True).reindex(columns=[0, 1])
)

# The text before "(" ends with the space that separated the two parts;
# strip it so 'mutation_1' holds the bare change ("c.8608C>T") and matches
# exactly in later lookups or joins.
df['mutation_1'] = df['mutation_1'].str.strip()


In [18]:
# Remove the scratch columns created while splitting 'Name'; 'mutation_1'
# is the only derived column that is kept.
df = df.drop(columns=['trash', 'trashh', 'mutation'])

In [20]:
# Write the cleaned table. index=True is the to_csv default, spelled out
# here because it adds the row index as an extra, unnamed first column.
# NOTE(review): convert_clinvar_txt_to_csv writes with index=False; confirm
# whether readers of fbn_snp.csv actually need the index column.
df.to_csv('fbn_snp.csv', index=True)