In [1]:
import pandas as pd

In [2]:
import pandas as pd
# First attempt: read the tumour radiomics table assuming tab-separated values.
# NOTE(review): the recorded outputs show 1 column from this read but 1678
# comma-separated names in the header, so the file looks comma-delimited — confirm.
df = pd.read_csv("radiomics_features_all_patients_TUMOR.csv", sep="\t")

In [3]:
# The column count of the loaded frame is the number of features pandas parsed
num_features = len(df.columns)
print("Number of features (columns):", num_features)

Number of features (columns): 1


---

# Count the features and save the feature names to a CSV

In [4]:
# Read only the header row as text; bytes that are not valid UTF-8 are dropped
with open("radiomics_features_all_patients_TUMOR.csv", mode="r", encoding="utf-8", errors="ignore") as f:
    raw_header = f.readline()
first_line = raw_header.strip()

# Column names are the comma-separated tokens of the header row
feature_names = first_line.split(",")

print("✅ Number of features:", len(feature_names))
print("🔤 First 10 feature names:", feature_names[:10])

✅ Number of features: 1678
🔤 First 10 feature names: ['PatientID', 'T1c_diagnostics_Versions_PyRadiomics', 'T1c_diagnostics_Versions_Numpy', 'T1c_diagnostics_Versions_SimpleITK', 'T1c_diagnostics_Versions_PyWavelet', 'T1c_diagnostics_Versions_Python', 'T1c_diagnostics_Configuration_Settings', 'T1c_diagnostics_Configuration_EnabledImageTypes', 'T1c_diagnostics_Image-original_Hash', 'T1c_diagnostics_Image-original_Dimensionality']


In [5]:
import csv

# Step 1: Grab the header row of the original file (undecodable bytes are dropped)
with open("radiomics_features_all_patients_TUMOR.csv", mode="r", encoding="utf-8", errors="ignore") as f:
    header_line = f.readline().strip()

# Step 2: Break the header into individual feature names on commas
feature_names = header_line.split(",")

# Step 3: Write the names out as one CSV row (newline="" lets csv control line endings)
with open("radiomics_feature_names.csv", mode="w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerow(feature_names)

print("✅ Saved feature names to 'radiomics_feature_names.csv' as a single header row.")

✅ Saved feature names to 'radiomics_feature_names.csv' as a single header row.


In [6]:
# Reload the header-only CSV written in the previous cell; it holds column
# names but no data rows, so the resulting frame has zero entries.
df = pd.read_csv('radiomics_feature_names.csv')

In [7]:
# Print a summary of the frame: index range, column count, dtypes and memory use
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Columns: 1678 entries, PatientID to MD_original_ngtdm_Strength
dtypes: object(1678)
memory usage: 132.0+ bytes


---

# Clean the data rows and save them together with the header

In [8]:
import csv
import io

# Step 1: Parse every record with the csv module (ignoring encoding errors).
# Counting raw commas per line treats commas inside quoted fields as delimiters,
# which rejects rows that are actually well-formed. The recorded run of the
# comma-counting version kept 0 data rows.
# NOTE(review): presumably the quoted *_diagnostics_Configuration_Settings
# values contain commas — confirm against the raw file.
with open("radiomics_features_all_patients_TUMOR.csv", "r", encoding="utf-8", errors="ignore", newline="") as f:
    rows = list(csv.reader(f))

if not rows:
    raise ValueError("radiomics_features_all_patients_TUMOR.csv is empty; no header row found")

# Step 2: Get the correct number of columns from the header
expected_columns = len(rows[0])

# Step 3: Keep only records with the expected number of fields
# (the header itself always qualifies; blank lines parse to [] and are dropped)
valid_rows = [row for row in rows if len(row) == expected_columns]
dropped_rows = len(rows) - len(valid_rows)

# Step 4: Reconstruct a valid CSV string, re-quoting fields where necessary
buffer = io.StringIO()
csv.writer(buffer, lineterminator="\n").writerows(valid_rows)
reconstructed_csv = buffer.getvalue()

# Step 5: Load into DataFrame (pandas infers dtypes as it would from the file)
df = pd.read_csv(io.StringIO(reconstructed_csv))

print("✅ Cleaned DataFrame shape:", df.shape)
print("Rows dropped for having an unexpected number of fields:", dropped_rows)

# Step 6: Save to a new CSV file
df.to_csv("radiomics_cleaned.csv", index=False)

print("📁 Saved cleaned dataset to 'radiomics_cleaned.csv'")

✅ Cleaned DataFrame shape: (0, 1678)
📁 Saved cleaned dataset to 'radiomics_cleaned.csv'


In [9]:
# Reload the cleaned CSV from disk to check what was actually written
df = pd.read_csv("radiomics_cleaned.csv")

In [10]:
# Print a summary of the reloaded frame: index range, column count, dtypes and memory use
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Columns: 1678 entries, PatientID to MD_original_ngtdm_Strength
dtypes: object(1678)
memory usage: 132.0+ bytes


In [11]:
# Preview the first rows of the reloaded frame
df.head()

Unnamed: 0,PatientID,T1c_diagnostics_Versions_PyRadiomics,T1c_diagnostics_Versions_Numpy,T1c_diagnostics_Versions_SimpleITK,T1c_diagnostics_Versions_PyWavelet,T1c_diagnostics_Versions_Python,T1c_diagnostics_Configuration_Settings,T1c_diagnostics_Configuration_EnabledImageTypes,T1c_diagnostics_Image-original_Hash,T1c_diagnostics_Image-original_Dimensionality,...,MD_original_glszm_SmallAreaHighGrayLevelEmphasis,MD_original_glszm_SmallAreaLowGrayLevelEmphasis,MD_original_glszm_ZoneEntropy,MD_original_glszm_ZonePercentage,MD_original_glszm_ZoneVariance,MD_original_ngtdm_Busyness,MD_original_ngtdm_Coarseness,MD_original_ngtdm_Complexity,MD_original_ngtdm_Contrast,MD_original_ngtdm_Strength


In [5]:
import pandas as pd

def standardize_id(pid, prefix="UCSF-PDGM-", width=4):
    """Zero-pad the numeric suffix of a patient ID to a fixed width.

    Parameters
    ----------
    pid : object
        Candidate patient identifier. Anything that is not a string starting
        with ``prefix`` is returned unchanged.
    prefix : str, optional
        Leading part of the identifiers to standardize.
    width : int, optional
        Minimum number of digits in the standardized suffix.

    Returns
    -------
    object
        ``prefix`` followed by the zero-padded suffix when ``pid`` is a string
        of the form ``prefix + digits``; otherwise ``pid`` itself.
    """
    if not (isinstance(pid, str) and pid.startswith(prefix)):
        return pid

    # Take everything after the prefix, so an ID with extra "-" segments is not
    # truncated to its last segment.
    suffix = pid[len(prefix):]

    # Pad purely numeric suffixes only. An empty or non-numeric suffix is left
    # alone, so a malformed ID is never turned into a plausible one (e.g. "...-0000").
    if not suffix.isdigit():
        return pid

    return f"{prefix}{suffix.zfill(width)}"

# Read the cleaned radiomics table back from disk
df = pd.read_csv("radiomics_cleaned.csv")

# Replace the PatientID column with its standardized form
df = df.assign(PatientID=df["PatientID"].apply(standardize_id))

# Overwrite the same file with the standardized IDs (no index column)
df.to_csv("radiomics_cleaned.csv", index=False)

print("✅ PatientIDs standardized and saved to radiomics_cleaned.csv")

✅ PatientIDs standardized and saved to radiomics_cleaned.csv


In [6]:
def standardize_id(pid, prefix="UCSF-PDGM-", width=4):
    """Zero-pad the numeric suffix of a patient ID to a fixed width.

    Parameters
    ----------
    pid : object
        Candidate patient identifier. Anything that is not a string starting
        with ``prefix`` is returned unchanged.
    prefix : str, optional
        Leading part of the identifiers to standardize.
    width : int, optional
        Minimum number of digits in the standardized suffix.

    Returns
    -------
    object
        ``prefix`` followed by the zero-padded suffix when ``pid`` is a string
        of the form ``prefix + digits``; otherwise ``pid`` itself.
    """
    if not (isinstance(pid, str) and pid.startswith(prefix)):
        return pid

    # Take everything after the prefix, so an ID with extra "-" segments is not
    # truncated to its last segment.
    suffix = pid[len(prefix):]

    # Pad purely numeric suffixes only. An empty or non-numeric suffix is left
    # alone, so a malformed ID is never turned into a plausible one (e.g. "...-0000").
    if not suffix.isdigit():
        return pid

    return f"{prefix}{suffix.zfill(width)}"

# Read the metadata and expose its 'ID' column as 'PatientID' for consistency
df_meta = pd.read_csv("UCSF-PDGM-metadata_v2.csv").rename(columns={"ID": "PatientID"})

# Replace the PatientID column with its standardized form
df_meta = df_meta.assign(PatientID=df_meta["PatientID"].apply(standardize_id))

# Overwrite the metadata file with the renamed, standardized column (no index column)
df_meta.to_csv("UCSF-PDGM-metadata_v2.csv", index=False)

print("✅ Standardized and saved UCSF-PDGM-metadata_v2.csv with corrected PatientIDs")


✅ Standardized and saved UCSF-PDGM-metadata_v2.csv with corrected PatientIDs


In [7]:
# Read both prepared tables from disk
df_radiomics = pd.read_csv("radiomics_cleaned.csv")
df_metadata = pd.read_csv("UCSF-PDGM-metadata_v2.csv")

# Inner join: keep only patients whose standardized PatientID appears in both tables
df_merged = df_radiomics.merge(df_metadata, how="inner", on="PatientID")

# Persist the joined table as a new CSV (no index column)
df_merged.to_csv("radiomics_cleaned_merged.csv", index=False)

print(f"✅ Merged dataset shape: {df_merged.shape}")
print("📁 Saved merged file as 'radiomics_cleaned_merged.csv'")


✅ Merged dataset shape: (76, 20569)
📁 Saved merged file as 'radiomics_cleaned_merged.csv'
