In [2]:
import pandas as pd

# Series-matrix text file on the mounted Google Drive
soft_file = "/content/drive/MyDrive/Bioinformatyka/modelowanie/projekt_new/GSE25066_series_matrix.txt"

with open(soft_file, encoding='utf-8') as f:
    lines = f.readlines()

# Partition the file into its two parts:
#   meta_lines - "!Sample_" rows found outside the data table
#   expr_lines - every row between the table begin/end markers
meta_lines, expr_lines = [], []
in_expr = False

for raw_line in lines:
    text = raw_line.strip()
    if text.startswith("!series_matrix_table_begin"):
        # marker row itself is not data
        in_expr = True
    elif text.startswith("!series_matrix_table_end"):
        in_expr = False
    elif in_expr:
        expr_lines.append(text)
    elif text.startswith("!Sample_"):
        meta_lines.append(text)

#metadata
def _parse_sample_metadata(rows):
    """Turn ``!Sample_*`` header rows into a ``{column name: values}`` dict.

    Each row is ``!Sample_<field>`` followed by one value per sample.

    * Rows containing a tab are split on tabs only, so a quoted value that
      contains spaces stays a single value even when the row describes just
      one sample. Rows without any tab fall back to whitespace splitting.
    * A field name can occur on several rows (series-matrix files typically
      repeat ``!Sample_characteristics_ch1``, one row per characteristic).
      The first occurrence keeps the plain name and later ones are stored as
      ``<name>_2``, ``<name>_3``, ... instead of overwriting each other.
    * Rows that carry a field name but no values are skipped.

    Surrounding double quotes are removed from every value.
    """
    columns = {}
    for row in rows:
        if '\t' in row:
            field, _, remainder = row.partition('\t')
            if not remainder.strip():
                continue
            raw_values = remainder.split('\t')
        else:
            parts = row.split(None, 1)
            if len(parts) != 2:
                continue
            field, raw_values = parts[0], parts[1].split()

        base = field.strip().replace('!Sample_', '')

        # Pick the first unused name so repeated fields are all kept.
        name, occurrence = base, 1
        while name in columns:
            occurrence += 1
            name = "{}_{}".format(base, occurrence)

        columns[name] = [value.strip('"') for value in raw_values]
    return columns


meta_dict = _parse_sample_metadata(meta_lines)

# One row per sample, one column per metadata row of the file.
metadata_df = pd.DataFrame(meta_dict)

#expression
from io import StringIO

# Rebuild the tab-separated table text and let pandas parse it.
expr_text = '\n'.join(expr_lines)
expression_df = pd.read_csv(StringIO(expr_text), sep='\t')

# Transpose so each row is one table column of the file, keyed by a
# 'geo_accession' column, with one column per ID_REF value.
expr_df_t = (
    expression_df
    .set_index('ID_REF')
    .T
    .rename_axis(index='geo_accession')
    .reset_index()
)

# Join metadata with the transposed expression table on 'geo_accession';
# without that metadata column the combined frame is just a metadata copy.
if 'geo_accession' not in metadata_df.columns:
    full_df = metadata_df.copy()
else:
    metadata_df['geo_accession'] = metadata_df['geo_accession'].str.strip()
    full_df = metadata_df.merge(expr_df_t, how='inner', on='geo_accession')

#saving
# Write the three tables to the working directory, without the index column.
for frame, target in (
    (metadata_df, "sample_metadata.csv"),
    (expression_df, "expression_matrix.csv"),
    (full_df, "combined_data.csv"),
):
    frame.to_csv(target, index=False)

print(
    "Saved:",
    "- sample_metadata.csv",
    "- expression_matrix.csv",
    "- combined_data.csv",
    sep="\n",
)


Saved:
- sample_metadata.csv
- expression_matrix.csv
- combined_data.csv


In [1]:
# Mount Google Drive at /content/drive.
# NOTE(review): the data-loading cell reads its input file from
# /content/drive/MyDrive/..., so this cell has to be executed before it
# (its execution count is 1, the loader's is 2) even though it is listed
# after it in this notebook.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive
