In [1]:
import os
import csv
import zipfile
import pandas as pd
from pyspark.sql import SparkSession
from tqdm import tqdm

# 1) Create or get an existing SparkSession
spark = SparkSession.builder.appName("GetFannieMaeDataset").getOrCreate()

# 2) Optional: configure Pandas for nicer display
pd.set_option("display.max_columns", None)

# 3) Define the columns we want to apply as headers
headers = [
    "POOL_ID",
    "LOAN_ID",
    "ACT_PERIOD",
    "CHANNEL",
    "SELLER",
    "SERVICER",
    "MASTER_SERVICER",
    "ORIG_RATE",
    "CURR_RATE",
    "ORIG_UPB",
    "ISSUANCE_UPB",
    "CURRENT_UPB",
    "ORIG_TERM",
    "ORIG_DATE",
    "FIRST_PAY",
    "LOAN_AGE",
    "REM_MONTHS",
    "ADJ_REM_MONTHS",
    "MATR_DT",
    "OLTV",
    "OCLTV",
    "NUM_BO",
    "DTI",
    "CSCORE_B",
    "CSCORE_C",
    "FIRST_FLAG",
    "PURPOSE",
    "PROP",
    "NO_UNITS",
    "OCC_STAT",
    "STATE",
    "MSA",
    "ZIP",
    "MI_PCT",
    "PRODUCT",
    "PPMT_FLG",
    "IO",
    "FIRST_PAY_IO",
    "MNTHS_TO_AMTZ_IO",
    "DLQ_STATUS",
    "PMT_HISTORY",
    "MOD_FLAG",
    "MI_CANCEL_FLAG",
    "Zero_Bal_Code",
    "ZB_DTE",
    "LAST_UPB",
    "RPRCH_DTE",
    "CURR_SCHD_PRNCPL",
    "TOT_SCHD_PRNCPL",
    "UNSCHD_PRNCPL_CURR",
    "LAST_PAID_INSTALLMENT_DATE",
    "FORECLOSURE_DATE",
    "DISPOSITION_DATE",
    "FORECLOSURE_COSTS",
    "PROPERTY_PRESERVATION_AND_REPAIR_COSTS",
    "ASSET_RECOVERY_COSTS",
    "MISCELLANEOUS_HOLDING_EXPENSES_AND_CREDITS",
    "ASSOCIATED_TAXES_FOR_HOLDING_PROPERTY",
    "NET_SALES_PROCEEDS",
    "CREDIT_ENHANCEMENT_PROCEEDS",
    "REPURCHASES_MAKE_WHOLE_PROCEEDS",
    "OTHER_FORECLOSURE_PROCEEDS",
    "NON_INTEREST_BEARING_UPB",
    "PRINCIPAL_FORGIVENESS_AMOUNT",
    "ORIGINAL_LIST_START_DATE",
    "ORIGINAL_LIST_PRICE",
    "CURRENT_LIST_START_DATE",
    "CURRENT_LIST_PRICE",
    "ISSUE_SCOREB",
    "ISSUE_SCOREC",
    "CURR_SCOREB",
    "CURR_SCOREC",
    "MI_TYPE",
    "SERV_IND",
    "CURRENT_PERIOD_MODIFICATION_LOSS_AMOUNT",
    "CUMULATIVE_MODIFICATION_LOSS_AMOUNT",
    "CURRENT_PERIOD_CREDIT_EVENT_NET_GAIN_OR_LOSS",
    "CUMULATIVE_CREDIT_EVENT_NET_GAIN_OR_LOSS",
    "HOMEREADY_PROGRAM_INDICATOR",
    "FORECLOSURE_PRINCIPAL_WRITE_OFF_AMOUNT",
    "RELOCATION_MORTGAGE_INDICATOR",
    "ZERO_BALANCE_CODE_CHANGE_DATE",
    "LOAN_HOLDBACK_INDICATOR",
    "LOAN_HOLDBACK_EFFECTIVE_DATE",
    "DELINQUENT_ACCRUED_INTEREST",
    "PROPERTY_INSPECTION_WAIVER_INDICATOR",
    "HIGH_BALANCE_LOAN_INDICATOR",
    "ARM_5_YR_INDICATOR",
    "ARM_PRODUCT_TYPE",
    "MONTHS_UNTIL_FIRST_PAYMENT_RESET",
    "MONTHS_BETWEEN_SUBSEQUENT_PAYMENT_RESET",
    "INTEREST_RATE_CHANGE_DATE",
    "PAYMENT_CHANGE_DATE",
    "ARM_INDEX",
    "ARM_CAP_STRUCTURE",
    "INITIAL_INTEREST_RATE_CAP",
    "PERIODIC_INTEREST_RATE_CAP",
    "LIFETIME_INTEREST_RATE_CAP",
    "MARGIN",
    "BALLOON_INDICATOR",
    "PLAN_NUMBER",
    "FORBEARANCE_INDICATOR",
    "HIGH_LOAN_TO_VALUE_HLTV_REFINANCE_OPTION_INDICATOR",
    "DEAL_NAME",
    "RE_PROCS_FLAG",
    "ADR_TYPE",
    "ADR_COUNT",
    "ADR_UPB",
]

# 4) Absolute path to the single ZIP file
zip_path = r"D:\mae_files\Performance_All.zip"

# 5) Where to extract the CSV files
extract_folder = r"D:\kala"

# 6) Final Excel output path
output_excel = r"D:\kala\selected_fm_rows.xlsx"

# This list will hold the sampled data from each CSV
sampled_frames = []

# 8) Build a schema with all columns as StringType
# schema = StructType([StructField(col, StringType(), True) for col in headers])

# 9) Open the single ZIP file
with zipfile.ZipFile(zip_path, 'r') as zf:
    # List all files in this ZIP
    all_files = zf.namelist()
    # We only want CSV files that end with "Q4.csv"
    q4_files = [f for f in all_files if f.endswith("2Q4.csv")]
    
    # Iterate over these Q4 files with a progress bar
    for q4 in tqdm(q4_files, desc="Processing Q4 CSVs"):
        # Extract CSV to extract_folder
        extracted_path = zf.extract(q4, extract_folder)
        
        # Read into Spark
        spark_df = (
            spark.read
            .csv(extracted_path, sep='|', header=False, inferSchema=True)
            # .csv(extracted_path, sep='|', header=False, schema=schema)
        )
        # Apply our headers
        spark_df = spark_df.toDF(*headers)
        
        # Take a sample (adjust fraction as needed)
        sampled_spark = spark_df.sample(fraction=0.01, seed=42)
        
        # Convert Spark DataFrame to pandas
        sampled_pd = sampled_spark.toPandas()
        
        # Append to the list
        sampled_frames.append(sampled_pd)
        
        # Delete the extracted CSV to save space
        os.remove(extracted_path)

# 8) Concatenate all sampled dataframes
if sampled_frames:
    final_df = pd.concat(sampled_frames, ignore_index=True)
    
    # 9) Export the result to Excel
    final_df.head(100_000).to_excel(output_excel, index=False)
    print(f"Final concatenated sample saved to: {output_excel}")
else:
    print("No Q4.csv files found in the ZIP archive.")


Processing Q4 CSVs: 100%|███████████████████████████████████████████████████████████████| 3/3 [08:34<00:00, 171.52s/it]
  final_df = pd.concat(sampled_frames, ignore_index=True)


Final concatenated sample saved to: D:\kala\selected_fm_rows.xlsx


In [2]:
final_df.head()

Unnamed: 0,POOL_ID,LOAN_ID,ACT_PERIOD,CHANNEL,SELLER,SERVICER,MASTER_SERVICER,ORIG_RATE,CURR_RATE,ORIG_UPB,ISSUANCE_UPB,CURRENT_UPB,ORIG_TERM,ORIG_DATE,FIRST_PAY,LOAN_AGE,REM_MONTHS,ADJ_REM_MONTHS,MATR_DT,OLTV,OCLTV,NUM_BO,DTI,CSCORE_B,CSCORE_C,FIRST_FLAG,PURPOSE,PROP,NO_UNITS,OCC_STAT,STATE,MSA,ZIP,MI_PCT,PRODUCT,PPMT_FLG,IO,FIRST_PAY_IO,MNTHS_TO_AMTZ_IO,DLQ_STATUS,PMT_HISTORY,MOD_FLAG,MI_CANCEL_FLAG,Zero_Bal_Code,ZB_DTE,LAST_UPB,RPRCH_DTE,CURR_SCHD_PRNCPL,TOT_SCHD_PRNCPL,UNSCHD_PRNCPL_CURR,LAST_PAID_INSTALLMENT_DATE,FORECLOSURE_DATE,DISPOSITION_DATE,FORECLOSURE_COSTS,PROPERTY_PRESERVATION_AND_REPAIR_COSTS,ASSET_RECOVERY_COSTS,MISCELLANEOUS_HOLDING_EXPENSES_AND_CREDITS,ASSOCIATED_TAXES_FOR_HOLDING_PROPERTY,NET_SALES_PROCEEDS,CREDIT_ENHANCEMENT_PROCEEDS,REPURCHASES_MAKE_WHOLE_PROCEEDS,OTHER_FORECLOSURE_PROCEEDS,NON_INTEREST_BEARING_UPB,PRINCIPAL_FORGIVENESS_AMOUNT,ORIGINAL_LIST_START_DATE,ORIGINAL_LIST_PRICE,CURRENT_LIST_START_DATE,CURRENT_LIST_PRICE,ISSUE_SCOREB,ISSUE_SCOREC,CURR_SCOREB,CURR_SCOREC,MI_TYPE,SERV_IND,CURRENT_PERIOD_MODIFICATION_LOSS_AMOUNT,CUMULATIVE_MODIFICATION_LOSS_AMOUNT,CURRENT_PERIOD_CREDIT_EVENT_NET_GAIN_OR_LOSS,CUMULATIVE_CREDIT_EVENT_NET_GAIN_OR_LOSS,HOMEREADY_PROGRAM_INDICATOR,FORECLOSURE_PRINCIPAL_WRITE_OFF_AMOUNT,RELOCATION_MORTGAGE_INDICATOR,ZERO_BALANCE_CODE_CHANGE_DATE,LOAN_HOLDBACK_INDICATOR,LOAN_HOLDBACK_EFFECTIVE_DATE,DELINQUENT_ACCRUED_INTEREST,PROPERTY_INSPECTION_WAIVER_INDICATOR,HIGH_BALANCE_LOAN_INDICATOR,ARM_5_YR_INDICATOR,ARM_PRODUCT_TYPE,MONTHS_UNTIL_FIRST_PAYMENT_RESET,MONTHS_BETWEEN_SUBSEQUENT_PAYMENT_RESET,INTEREST_RATE_CHANGE_DATE,PAYMENT_CHANGE_DATE,ARM_INDEX,ARM_CAP_STRUCTURE,INITIAL_INTEREST_RATE_CAP,PERIODIC_INTEREST_RATE_CAP,LIFETIME_INTEREST_RATE_CAP,MARGIN,BALLOON_INDICATOR,PLAN_NUMBER,FORBEARANCE_INDICATOR,HIGH_LOAN_TO_VALUE_HLTV_REFINANCE_OPTION_INDICATOR,DEAL_NAME,RE_PROCS_FLAG,ADR_TYPE,ADR_COUNT,ADR_UPB
0,,100003164216,62016,R,Other,Other,,3.625,3.625,478000.0,,445095.54,360,112012,12013,42.0,318.0,318.0,122042.0,68,68.0,2.0,25.0,763.0,742.0,N,R,PU,1,P,CA,31080,913,,FRM,N,N,,,0,,N,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,,,,,7,,N,,,,,,Y,,,,,,,,,,,,,,,,N,,,,,
1,,100008120784,102012,R,"Wells Fargo Bank, N.A.","Wells Fargo Bank, N.A.",,3.375,3.375,344000.0,,0.0,180,122011,32012,8.0,172.0,170.0,22027.0,56,68.0,1.0,14.0,793.0,,N,R,SF,1,P,NY,35620,109,,FRM,N,N,,,0,,N,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,,,,,7,,N,,,,,,N,,,,,,,,,,,,,,,,N,,,,,
2,,100008218145,122012,C,Flagstar Capital Markets Corporation,Flagstar Capital Markets Corporation,,3.625,3.625,403000.0,,0.0,360,92012,112012,2.0,358.0,357.0,102042.0,54,61.0,2.0,33.0,749.0,747.0,N,R,SF,1,P,WA,42660,981,,FRM,N,N,,,0,,N,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,,,,,7,,N,,,,,,N,,,,,,,,,,,,,,,,N,,,,,
3,,100013477387,122012,C,"Jpmorgan Chase Bank, National Association","Jp Morgan Chase Bank, Na",,3.375,3.375,136000.0,,0.0,180,82012,102012,3.0,177.0,176.0,92027.0,54,54.0,1.0,29.0,810.0,,N,R,PU,1,P,IL,16980,605,,FRM,N,N,,,0,,N,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,,,,,7,,N,,,,,,N,,,,,,,,,,,,,,,,N,,,,,
4,,100019487094,32020,C,Other,Other,,3.625,3.625,65000.0,,37589.85,180,102012,122012,88.0,92.0,92.0,112027.0,67,67.0,1.0,42.0,674.0,,N,C,SF,1,P,MS,25060,395,,FRM,N,N,,,0,,N,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,,,,,7,,N,,,,,,N,,,,,,,,,,,,,,,,N,,,,,


In [3]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1347114 entries, 0 to 1347113
Columns: 108 entries, POOL_ID to ADR_UPB
dtypes: float64(36), int32(8), int64(1), object(63)
memory usage: 1.0+ GB
