In [None]:
import pandas as pd
import re
# from bs4 import BeautifulSoup # Uncomment if needed for HTML stripping
from google.colab import drive # Import drive library
import os # To check if path exists

In [None]:
# Attach Google Drive at /content/drive so later cells can read the scraped CSV.
try:
    drive.mount('/content/drive')
except Exception as mount_error:
    # Nothing below can run without Drive access, so report and stop here.
    print(f"Error mounting Google Drive: {mount_error}")
    print("Please ensure you authorize Colab to access your Drive.")
    exit()
else:
    print("Google Drive mounted successfully.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.


In [None]:
import pandas as pd
import re
# from bs4 import BeautifulSoup # Uncomment if needed for HTML stripping

# Folder on Google Drive holding the scraped data (note the trailing space in the folder name).
drive_base_path = '/content/drive/MyDrive/Web Scraping /'
input_csv_file = os.path.join(drive_base_path, 'egypt_institutions_programs_flattened_with_amenities.csv')
# Write the cleaned file next to the input on Drive. A bare filename would land in the
# ephemeral Colab working directory and be lost when the runtime is recycled.
output_csv_file = os.path.join(drive_base_path, 'egypt_institutions_programs_cleaned.csv')

try:
    # Read the CSV explicitly as UTF-8 so Arabic text is decoded correctly.
    df = pd.read_csv(input_csv_file, encoding='utf-8')
    print(f"Successfully loaded {len(df)} rows from {input_csv_file}")
    print("Initial DataFrame info:")
    df.info()
    print("\nSample rows before cleaning (should show correct Arabic now):")
    print(df.head())

except Exception as e:
    print(f"Error loading CSV: {e}")
    # A UnicodeDecodeError here means the file is NOT UTF-8. Loading it as 'latin-1' or
    # 'windows-1252' would avoid the error, but the text would then be Mojibake, so the
    # source file (or a re-encoding step) is what needs fixing.
    print("If you encountered a UnicodeDecodeError, the source CSV might not be UTF-8 encoded.")
    exit()

Successfully loaded 17581 rows from /content/drive/MyDrive/Web Scraping /egypt_institutions_programs_flattened_with_amenities.csv
Initial DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17581 entries, 0 to 17580
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   University               17581 non-null  object 
 1   Category                 17581 non-null  object 
 2   Year Built               17581 non-null  object 
 3   Description              17581 non-null  object 
 4   Location                 17532 non-null  object 
 5   Phone                    17581 non-null  object 
 6   Email                    17581 non-null  object 
 7   Coordinator              15303 non-null  object 
 8   Amenities                17581 non-null  object 
 9   Detail Page URL          17581 non-null  object 
 10  Program Category         17353 non-null  object 
 11  Faculty                  17353

In [None]:
# Placeholder strings that should be treated as "no value" (extend as new ones show up).
missing_markers = [
    'N/A',
    'Not Found',
    'None Listed',
    'None Provided',
    '',
    'Error: Missing Detail URL',
    'Error: Request Timeout',
    'Error: HTTP Request Failed',
]

# Only object (string) columns can contain the placeholders above.
object_columns = df.select_dtypes(include='object').columns
for col in object_columns:
    # Two steps: placeholders -> NA, then every NA (new or pre-existing) -> ''.
    without_markers = df[col].replace(missing_markers, pd.NA, regex=False)
    df[col] = without_markers.fillna('')
print("\nStandardized missing markers.")


Standardized missing markers.


In [None]:
# Free-text columns that get whitespace/character cleanup (add others as needed).
# NOTE(review): df.info() above lists the first column as 'University'; confirm that a
# 'Name' column really exists, otherwise the institution name is never cleaned.
text_columns_to_clean = [
    'Name',
    'Category',
    'Description',
    'Location',
    'Coordinator',
    'Amenities',
    'Program Category',
    'Faculty',
    'Program Name',
    'Program Description',
    'Prerequisites',
    'Affiliated Universities',
]

def clean_text(text):
    """Return `text` as a single-line, trimmed string with control characters removed.

    Args:
        text: Any scalar cell value; non-strings are converted with ``str()``.

    Returns:
        The cleaned string, or ``''`` when `text` is missing (NA/None) or empty.
    """
    if pd.isna(text) or text == '':
        return ''
    text = str(text)
    # Optional: Strip lingering HTML tags (if necessary)
    # soup = BeautifulSoup(text, "lxml")
    # text = soup.get_text()

    # Drop non-whitespace control characters FIRST. Doing it after the whitespace
    # collapse (as before) left double or leading spaces behind, e.g. 'a \x00 b' -> 'a  b'.
    # Whitespace controls (\t \n \v \f \r and \x1c-\x1f) are left for the next step so
    # that they still separate words.
    text = re.sub(r'[\x00-\x08\x0e-\x1b\x7f]', '', text)
    # Normalize whitespace: any run of spaces/newlines/tabs becomes a single space
    text = re.sub(r'\s+', ' ', text)
    # Optional: Normalize Unicode characters (helps with different quote types etc.)
    # import unicodedata
    # text = unicodedata.normalize('NFKC', text)
    # Trim leading/trailing whitespace
    return text.strip()

# Generic single-line cleanup for every listed text column except 'Amenities'.
# 'Amenities' is skipped here on purpose: clean_text() collapses newlines into spaces,
# which previously destroyed the line structure before the newline-preserving
# clean_amenities() below ever ran.
for col in text_columns_to_clean:
    if col not in df.columns:
        print(f"Warning: Column '{col}' not found for cleaning.")
        continue
    if col == 'Amenities':
        continue
    print(f"Cleaning column: {col}")
    df[col] = df[col].apply(clean_text)
print("\nPerformed whitespace and basic text cleaning.")

# Special handling for Amenities: keep one amenity per line for readability later,
# while still cleaning excess whitespace inside and around each line.
if 'Amenities' in df.columns:
    def clean_amenities(text):
        """Normalize whitespace within each line and drop blank lines, keeping newlines."""
        if pd.isna(text) or text == '':
            return ''
        lines = str(text).split('\n')
        cleaned_lines = [' '.join(line.split()) for line in lines if line.strip()]
        return "\n".join(cleaned_lines)  # Re-join with single newline
    df['Amenities'] = df['Amenities'].apply(clean_amenities)
    print("Performed specific newline cleaning for Amenities.")

Cleaning column: Category
Cleaning column: Description
Cleaning column: Location
Cleaning column: Coordinator
Cleaning column: Amenities
Cleaning column: Program Category
Cleaning column: Faculty
Cleaning column: Program Name
Cleaning column: Program Description
Cleaning column: Prerequisites
Cleaning column: Affiliated Universities

Performed whitespace and basic text cleaning.
Performed specific newline cleaning for Amenities.


In [None]:
# Columns whose scraped text should be reduced to a single number.
numeric_columns = [
    'Year Built',
    'Fee In USD',
    'Fee In EGP',
    'Years Of Study',
    'Number Of Semesters',
    'Credit Hours',
    'Max Study Years',
    'Semesters Abroad',
]

def extract_and_convert_numeric(value):
    """Pull the first unsigned number out of `value` and return it as int or float.

    '$' and ',' are removed before searching. Returns None when `value` is
    missing/empty or contains no digits.
    """
    if pd.isna(value) or value == '':
        return None  # missing input -> missing number (NaN once in a numeric column)

    # Strip currency symbol and thousands separators, then look for digits[.digits]
    stripped = re.sub(r'[$,]', '', str(value))
    found = re.search(r'\d+(\.\d+)?', stripped)
    if found is None:
        return None  # No number found

    num_str = found.group(0)
    try:
        # A decimal point decides between float and int
        return float(num_str) if '.' in num_str else int(num_str)
    except ValueError:
        return None  # Conversion failed

# Convert each listed column to a real numeric dtype.
for col in numeric_columns:
    if col not in df.columns:
        print(f"Warning: Column '{col}' not found for numeric conversion.")
        continue
    print(f"Converting column to numeric: {col}")
    # to_numeric guarantees a numeric dtype even when every value is missing
    # (apply() alone would leave an all-None column as dtype object).
    parsed = pd.to_numeric(df[col].apply(extract_and_convert_numeric), errors='coerce')
    # Use the nullable Int64 dtype when every PRESENT value is a whole number.
    # The old check required the column to have no missing values at all, so any
    # column with gaps (e.g. 'Year Built') was stuck as float64 (1950.0).
    present = parsed.dropna()
    if not present.empty and (present % 1 == 0).all():
        parsed = parsed.astype('Int64')
    df[col] = parsed
print("\nPerformed numeric conversion.")
df.info() # Check dtypes after conversion

Converting column to numeric: Year Built
Converting column to numeric: Fee In USD
Converting column to numeric: Fee In EGP
Converting column to numeric: Years Of Study
Converting column to numeric: Number Of Semesters
Converting column to numeric: Credit Hours
Converting column to numeric: Max Study Years
Converting column to numeric: Semesters Abroad

Performed numeric conversion.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17581 entries, 0 to 17580
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   University               17581 non-null  object 
 1   Category                 17581 non-null  object 
 2   Year Built               17180 non-null  float64
 3   Description              17581 non-null  object 
 4   Location                 17581 non-null  object 
 5   Phone                    17581 non-null  object 
 6   Email                    17581 non-null  object 
 7   Coordinator    

In [None]:
# Quick visual check of the cleaned frame.
print("\nSample rows after cleaning:")
print(df.head())

# Report columns that ended up with no data at all.
print("\nColumns with all missing values after cleaning:")
fully_missing = df.isnull().all()
print(df.columns[fully_missing])

# Write the cleaned DataFrame out as UTF-8 CSV (without the index column).
try:
    df.to_csv(output_csv_file, index=False, encoding='utf-8')
except Exception as save_error:
    print(f"Error saving cleaned CSV: {save_error}")
else:
    print(f"\nCleaned data successfully saved to {output_csv_file}")


Sample rows after cleaning:
                                          University           Category  \
0  Cairo Higher Institute for Languages, Interpre...     High institute   
1                           Ain Shams University ASU  Public university   
2                           Ain Shams University ASU  Public university   
3                           Ain Shams University ASU  Public university   
4                           Ain Shams University ASU  Public university   

   Year Built                                        Description  \
0         NaN                                                      
1      1950.0  Ain Shams University is the third Egyptian uni...   
2      1950.0  Ain Shams University is the third Egyptian uni...   
3      1950.0  Ain Shams University is the third Egyptian uni...   
4      1950.0  Ain Shams University is the third Egyptian uni...   

                                            Location        Phone  \
0  Mokattam - the commercial station - 5 S

In [None]:
# === Import Necessary Libraries ===
import pandas as pd
import re
from bs4 import BeautifulSoup # Keep import in case needed later, but commented out by default
from google.colab import drive
import os
import logging

# === Configuration ===
# Configure logging
# Log file specific to this cleaning script (written to the current working directory).
LOG_FILE_CLEAN = 'data_cleaning.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - [%(funcName)s] %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE_CLEAN, mode='w', encoding='utf-8'), # Save log to file
        logging.StreamHandler() # Also print logs to console
    ],
    # basicConfig() silently does nothing when the root logger already has handlers,
    # which is the case after any earlier basicConfig() call in the same kernel (and in
    # hosted notebooks that pre-install a handler). force=True (Python 3.8+) replaces
    # the existing handlers so this configuration actually takes effect.
    force=True,
)

# File locations on Google Drive. The folder name must match Drive exactly,
# including the trailing space in 'Web Scraping '.
DRIVE_BASE_PATH = '/content/drive/MyDrive/Web Scraping /'
INPUT_CSV_FILE = os.path.join(
    DRIVE_BASE_PATH, 'egypt_institutions_programs_flattened_with_amenities.csv'
)
OUTPUT_CSV_FILE = os.path.join(
    DRIVE_BASE_PATH, 'egypt_institutions_programs_cleaned.csv'
)

# Placeholder strings that mean "no usable value" (extend with any others observed).
MISSING_MARKERS = [
    'N/A',
    'Not Found',
    'None Listed',
    'None Provided',
    '',
    'Error: Missing Detail URL',
    'Error: Request Timeout',
    'Error: HTTP Request Failed',
    'Error: Parsing Failed',
]

# Free-text columns that receive whitespace/character cleanup.
# NOTE(review): the loaded frame's df.info() shows a 'University' column; confirm a
# 'Name' column exists, otherwise the institution name is skipped by the text steps.
TEXT_COLUMNS_TO_CLEAN = [
    'Name',
    'Category',
    'Description',
    'Location',
    'Coordinator',
    'Amenities',
    'Program Category',
    'Faculty',
    'Program Name',
    'Program Description',
    'Prerequisites',
    'Affiliated Universities',
]

# Columns that should end up holding numbers.
NUMERIC_COLUMNS = [
    'Year Built',
    'Fee In USD',
    'Fee In EGP',
    'Years Of Study',
    'Number Of Semesters',
    'Credit Hours',
    'Max Study Years',
    'Semesters Abroad',
]

# === Helper Functions ===

def fix_mojibake(text):
    """Undo UTF-8-read-as-Latin-1 Mojibake; return the input unchanged if that fails.

    Non-strings and empty strings are returned as they are.
    """
    if not isinstance(text, str) or not text:
        return text
    try:
        # Recover the original bytes via the WRONG codec, then decode with the RIGHT one.
        raw_bytes = text.encode('latin-1')
        return raw_bytes.decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Round trip not possible (text already correct, or a different problem).
        return text

def clean_text(text):
    """Return `text` as a single-line, trimmed string with control characters removed.

    Args:
        text: Any scalar cell value; non-strings are converted with ``str()``.

    Returns:
        The cleaned string, or ``''`` when `text` is missing (NA/None) or empty.
    """
    if pd.isna(text) or text == '':
        return ''
    text = str(text)
    # --- Optional HTML Stripping (Uncomment if needed) ---
    # try:
    #     soup = BeautifulSoup(text, "lxml")
    #     text = soup.get_text()
    # except Exception as e_html:
    #     logging.warning(f"HTML parsing failed for text snippet: {text[:50]}... Error: {e_html}")
    #     # Fallback to regex or just continue if BeautifulSoup fails
    #     text = re.sub(r'<[^>]+>', '', text) # Basic regex fallback for tags

    # Drop non-whitespace control characters FIRST. Doing it after the whitespace
    # collapse (as before) left double or leading spaces behind, e.g. 'a \x00 b' -> 'a  b'.
    # Whitespace controls (\t \n \v \f \r and \x1c-\x1f) are left for the next step so
    # that they still separate words.
    text = re.sub(r'[\x00-\x08\x0e-\x1b\x7f]', '', text)
    # Normalize whitespace: any run of spaces/newlines/tabs becomes a single space
    text = re.sub(r'\s+', ' ', text)
    # Trim leading/trailing whitespace
    return text.strip()

def clean_amenities(text):
    """Tidy a multi-line amenities value while keeping one entry per line.

    Whitespace runs inside each line collapse to one space, blank lines are
    dropped, and missing/empty input yields ''.
    """
    if pd.isna(text) or text == '':
        return ''
    kept_lines = []
    for raw_line in str(text).split('\n'):
        words = raw_line.split()
        if words:  # a line with no words is blank -> skip it
            kept_lines.append(' '.join(words))
    return "\n".join(kept_lines)

def extract_and_convert_numeric(value):
    """Pull the first (optionally negative) number out of `value` as int or float.

    '$' and ',' are removed before searching. Returns pd.NA when `value` is
    missing/empty or holds no number.
    """
    if pd.isna(value) or value == '':
        return pd.NA  # missing input -> missing number

    # Strip currency symbol and thousands separators, then look for [-]digits[.digits]
    stripped = re.sub(r'[$,]', '', str(value))
    found = re.search(r'-?\d+(\.\d+)?', stripped)
    if found is None:
        # Purely textual values such as 'Contact Faculty' end up here
        logging.debug(f"No numeric pattern found in value: '{value}'")
        return pd.NA

    num_str = found.group(0)
    try:
        # A decimal point decides between float and int
        return float(num_str) if '.' in num_str else int(num_str)
    except ValueError:
        logging.warning(f"Could not convert extracted number string '{num_str}' to numeric.")
        return pd.NA  # Conversion failed

# === Main Script Execution ===

if __name__ == "__main__":
    # End-to-end cleaning pipeline: mount Drive, load the scraped CSV, repair encoding
    # if needed, standardize missing values, clean text, convert numbers, save.
    logging.info("--- Data Cleaning Script Started ---")

    # --- 1. Mount Google Drive ---
    try:
        drive.mount('/content/drive')
        logging.info("Google Drive mounted successfully.")
    except Exception as e:
        logging.error(f"Error mounting Google Drive: {e}. Please authorize access.", exc_info=True)
        exit()

    # --- 2. Load Data (Attempting UTF-8 fix) ---
    logging.info(f"Attempting to load data from: {INPUT_CSV_FILE}")
    if not os.path.exists(INPUT_CSV_FILE):
        logging.error(f"Input file not found: {INPUT_CSV_FILE}")
        try:
            # Listing the folder helps spot a misspelled file or folder name.
            logging.info(f"Listing contents of '{DRIVE_BASE_PATH}': {os.listdir(DRIVE_BASE_PATH)}")
        except Exception as e_ls:
            logging.error(f"Could not list directory contents: {e_ls}")
        exit()

    try:
        # ** Try reading with UTF-8 first **
        df = pd.read_csv(INPUT_CSV_FILE, encoding='utf-8')
        logging.info(f"Successfully loaded {len(df)} rows using UTF-8 encoding.")
    except UnicodeDecodeError:
        logging.warning("UTF-8 decoding failed. Trying with 'latin-1' and will apply Mojibake fix later.")
        try:
            df = pd.read_csv(INPUT_CSV_FILE, encoding='latin-1')
            logging.info(f"Successfully loaded {len(df)} rows using latin-1 encoding (Mojibake fix needed).")
            needs_mojibake_fix = True
        except Exception as e_load:
            logging.error(f"Error loading CSV with fallback encoding: {e_load}", exc_info=True)
            exit()
    except Exception as e_load:
        logging.error(f"Generic error loading CSV: {e_load}", exc_info=True)
        exit()
    else:
        needs_mojibake_fix = False # UTF-8 worked, likely no fix needed unless file itself is corrupted

    logging.info("Initial DataFrame info:")
    df.info()
    logging.info("Sample rows before cleaning:")
    logging.info(df.head())


    # --- 3. Fix Mojibake (if needed) ---
    # NOTE(review): only TEXT_COLUMNS_TO_CLEAN are repaired here; a latin-1 fallback load
    # would garble every string column, so confirm whether the others need fixing too.
    if needs_mojibake_fix:
        logging.info("Applying Mojibake fix (assuming UTF-8 misinterpreted as latin-1)...")
        mojibake_pattern = r'[ØÙÖÝ]' # Common starting characters in this type of Mojibake
        applied_fix_count = 0
        for col in TEXT_COLUMNS_TO_CLEAN:
            if col in df.columns and df[col].dtype == 'object':
                # Check more thoroughly if fix is needed for this column
                if df[col].astype(str).str.contains(mojibake_pattern, regex=True, na=False).any():
                    logging.info(f"Applying Mojibake fix to column: {col}")
                    df[col] = df[col].apply(fix_mojibake)
                    applied_fix_count += 1
                else:
                    logging.debug(f"No Mojibake pattern detected in column: {col}")
        if applied_fix_count == 0:
            logging.warning("Loaded with latin-1, but no common Mojibake patterns detected. Check data manually.")
        logging.info("Mojibake fix attempt complete.")
        logging.info("Sample rows after Mojibake fix attempt:")
        logging.info(df.head())

    # --- 4. Standardize Missing Values ---
    logging.info(f"Standardizing missing values (replacing {MISSING_MARKERS}) with empty strings...")
    num_cols_processed = 0
    for col in df.select_dtypes(include='object').columns: # Only process object (usually string) columns
        # Create a boolean mask for values to replace
        mask = df[col].isin(MISSING_MARKERS)
        if mask.any():
            logging.debug(f"Replacing missing markers in column: {col}")
            df[col] = df[col].replace(MISSING_MARKERS, pd.NA) # Replace with Pandas NA
            num_cols_processed += 1

    df = df.fillna('') # Fill all NA values (from replace or original) with empty string
    logging.info(f"Standardized missing values in {num_cols_processed} object columns.")


    # --- 5. Clean Text Fields ---
    logging.info("Cleaning whitespace and special characters in text columns...")
    for col in TEXT_COLUMNS_TO_CLEAN:
        if col in df.columns and df[col].dtype == 'object':
            logging.debug(f"Cleaning text column: {col}")
            if col == 'Amenities':
                # Amenities keep their one-entry-per-line layout.
                df[col] = df[col].apply(clean_amenities)
            else:
                df[col] = df[col].apply(clean_text)
    logging.info("Text cleaning complete.")


    # --- 6. Clean and Convert Numeric Fields ---
    logging.info("Converting specified columns to numeric types...")
    for col in NUMERIC_COLUMNS:
        if col not in df.columns:
            logging.warning(f"Numeric column '{col}' not found in DataFrame.")
            continue
        logging.debug(f"Converting column to numeric: {col}")
        original_dtype = df[col].dtype
        converted = df[col].apply(extract_and_convert_numeric)
        try:
            # Whole-number columns become nullable Int64; anything else becomes nullable
            # Float64. Previously a column containing even one non-integral value was
            # left as a mixed-type object column, i.e. not numeric at all.
            all_integral = converted.dropna().apply(lambda x: float(x).is_integer()).all()
            converted = converted.astype('Int64' if all_integral else 'Float64')
        except (TypeError, ValueError) as e_cast:
            # Keep the extracted values, but make the failed cast visible.
            logging.warning(f"Column '{col}' could not be cast to a nullable numeric dtype: {e_cast}")
        df[col] = converted
        logging.debug(f"Column '{col}' type changed from {original_dtype} to {df[col].dtype}")
    logging.info("Numeric conversion complete.")


    # --- 7. Final Review ---
    logging.info("Cleaned DataFrame info:")
    df.info()
    logging.info("Sample rows after all cleaning:")
    logging.info(df.head())
    logging.info("Checking for columns that are now entirely empty/NA:")
    all_na_cols = df.columns[df.isna().all()].tolist()
    if all_na_cols:
        logging.info(f"Columns with all NA values: {all_na_cols}")
    else:
        logging.info("No columns found with all NA values.")


    # --- 8. Save Cleaned Data ---
    logging.info(f"Attempting to save cleaned data to: {OUTPUT_CSV_FILE}")
    try:
        # Use 'utf-8-sig' to include BOM for better Excel compatibility with UTF-8
        df.to_csv(OUTPUT_CSV_FILE, index=False, encoding='utf-8-sig')
        logging.info(f"Cleaned data successfully saved to {OUTPUT_CSV_FILE}")
        logging.info("You can find the file in your Google Drive.")
    except Exception as e:
        logging.error(f"Error saving cleaned CSV: {e}", exc_info=True)

    logging.info("--- Data Cleaning Script Finished ---")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17581 entries, 0 to 17580
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   University               17581 non-null  object 
 1   Category                 17581 non-null  object 
 2   Year Built               17581 non-null  object 
 3   Description              17581 non-null  object 
 4   Location                 17532 non-null  object 
 5   Phone                    17581 non-null  object 
 6   Email                    17581 non-null  object 
 7   Coordinator              15303 non-null  object 
 8   Amenities                17581 non-null  object 
 9   Detail Page URL          17581 non-null  object 
 10  Program Category         17353 non-null  object 
 11  Faculty                  17353 non-null  object 
 12  P

In [None]:
import pandas as pd
import re
from google.colab import drive
import os
import logging
from langchain_text_splitters import RecursiveCharacterTextSplitter # Import the splitter

# === Configuration ===
# Configure logging
# Log file for the chunking step (written to the current working directory).
LOG_FILE_CHUNK = 'text_chunking.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - [%(funcName)s] %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE_CHUNK, mode='w', encoding='utf-8'),
        logging.StreamHandler()
    ],
    # basicConfig() silently does nothing when the root logger already has handlers,
    # e.g. after an earlier basicConfig() call in the same kernel, so this log file
    # would never receive any records. force=True (Python 3.8+) replaces the existing
    # handlers so this configuration actually takes effect.
    force=True,
)

# File locations on Google Drive (folder name includes a trailing space).
# NOTE(review): the input here is 'Cleaned_File.csv', which is not the filename the
# cleaning cell writes ('egypt_institutions_programs_cleaned.csv'); confirm which
# file is intended.
DRIVE_BASE_PATH = '/content/drive/MyDrive/Web Scraping /'
INPUT_CLEANED_CSV = os.path.join(
    DRIVE_BASE_PATH, 'Cleaned_File.csv'
)
OUTPUT_CHUNKS_CSV = os.path.join(
    DRIVE_BASE_PATH, 'egypt_institutions_chunks.csv'
)  # Destination for the generated chunks

# Chunking parameters, measured in characters (the splitter's default length unit).
# Tune these for the target model.
CHUNK_SIZE = 800     # upper bound on characters in one chunk
CHUNK_OVERLAP = 100  # characters shared by neighbouring chunks

# === Helper Function ===

def format_document_text(row):
    """Combine the relevant columns of one row into a single text passage.

    Args:
        row: A mapping-like object exposing ``.get(key, default)`` (a pandas
            Series from ``DataFrame.iterrows()`` or a plain dict).

    Returns:
        A newline-joined string with the university details, followed by a
        "Program Details" section when the row has a usable program name.
        Fields that are missing, NaN/NA, blank or falsy are left out.
    """

    def _value(key):
        """Return the field as display text, or '' when it is missing or blank."""
        raw = row.get(key)
        # NaN/NA are truthy in Python, so they must be filtered explicitly;
        # otherwise the passage would contain literal 'nan' values.
        if raw is None or (pd.api.types.is_scalar(raw) and pd.isna(raw)):
            return ''
        if not raw:
            return ''
        # Numeric columns with gaps are read back as floats; show 1950.0 as '1950'.
        if isinstance(raw, float) and raw.is_integer():
            return str(int(raw))
        text = str(raw)
        return text if text.strip() else ''

    def _add(parts, label, key, prefix=''):
        """Append '<prefix><label>: <value>' to `parts` when the field has a value."""
        value = _value(key)
        if value:
            parts.append(f"{prefix}{label}: {value}")

    text_parts = []

    # --- University Info ---
    text_parts.append(f"University Name: {_value('Name') or 'N/A'}")
    _add(text_parts, "Category", 'Category')
    _add(text_parts, "Established", 'Year Built')
    _add(text_parts, "Location", 'Location')
    _add(text_parts, "University Description", 'Description', prefix="\n")

    amenities = _value('Amenities')
    if amenities and amenities not in ('Not Found', 'None Listed'):
        # One bullet per non-blank line; blank lines would produce empty "- " bullets.
        amenity_lines = [line.strip() for line in amenities.split('\n') if line.strip()]
        if amenity_lines:
            text_parts.append("\nAmenities:")
            text_parts.extend(f"- {amenity}" for amenity in amenity_lines)

    # --- Program Info (only if a program name exists for this row) ---
    program_name = _value('Program Name')
    if program_name and program_name not in ('N/A', 'Not Found'):
        text_parts.append("\n---\nProgram Details:") # Separator
        _add(text_parts, "Program Category", 'Program Category')
        _add(text_parts, "Faculty", 'Faculty')
        text_parts.append(f"Program Name: {program_name}")
        _add(text_parts, "Program Description", 'Program Description')
        _add(text_parts, "Years of Study", 'Years Of Study')
        _add(text_parts, "Semesters", 'Number Of Semesters')
        _add(text_parts, "Credit Hours", 'Credit Hours')
        _add(text_parts, "Max Study Years", 'Max Study Years')
        _add(text_parts, "Semesters Abroad", 'Semesters Abroad')

        # Both fees share one line, separated by a comma.
        fees = []
        _add(fees, "Fee (USD)", 'Fee In USD')
        _add(fees, "Fee (EGP)", 'Fee In EGP')
        if fees:
            text_parts.append(", ".join(fees))

        _add(text_parts, "Prerequisites", 'Prerequisites')
        _add(text_parts, "Affiliated Universities", 'Affiliated Universities')

    # --- Contact Info (Optional, add if useful context) ---
    # _add(text_parts, "Phone", 'Phone')
    # _add(text_parts, "Email", 'Email')
    # _add(text_parts, "Coordinator", 'Coordinator')

    return "\n".join(filter(None, text_parts)) # Join non-empty parts with newline

# === Main Script Execution ===

if __name__ == "__main__":
    # Pipeline: mount Drive -> load cleaned CSV -> build one text document per row ->
    # split each document into overlapping chunks -> save all chunks with metadata.
    logging.info("--- Text Chunking Script Started ---")

    # --- 1. Mount Google Drive ---
    try:
        drive.mount('/content/drive')
        logging.info("Google Drive mounted successfully.")
    except Exception as mount_error:
        logging.error(f"Error mounting Google Drive: {mount_error}. Please authorize access.", exc_info=True)
        exit()

    # --- 2. Load Cleaned Data ---
    logging.info(f"Loading cleaned data from: {INPUT_CLEANED_CSV}")
    if not os.path.exists(INPUT_CLEANED_CSV):
        logging.error(f"Input file not found: {INPUT_CLEANED_CSV}")
        exit()
    try:
        # 'utf-8-sig' also accepts a leading BOM; empty cells come back as NaN,
        # so they are turned back into empty strings right away.
        df_cleaned = pd.read_csv(INPUT_CLEANED_CSV, encoding='utf-8-sig')
        df_cleaned = df_cleaned.fillna('')
        logging.info(f"Successfully loaded {len(df_cleaned)} rows from cleaned file.")
    except Exception as load_error:
        logging.error(f"Error loading cleaned CSV: {load_error}", exc_info=True)
        exit()

    # --- 3. Initialize Text Splitter ---
    logging.info(f"Initializing RecursiveCharacterTextSplitter with chunk_size={CHUNK_SIZE}, chunk_overlap={CHUNK_OVERLAP}")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,  # measure chunks in characters
        is_separator_regex=False,  # separators below are literal strings
        separators=["\n\n", "\n", ". ", ", ", " ", ""],  # tried in this order
    )

    # --- 4. Process Rows and Create Chunks ---
    all_chunks_data = []
    logging.info("Starting chunking process...")
    total_rows = len(df_cleaned)

    for index, row in df_cleaned.iterrows():
        # Progress message on every 100th row
        if index % 100 == 0:
            logging.info(f"Processing row {index + 1} of {total_rows}...")

        # One combined passage per input row
        document_text = format_document_text(row)
        if not document_text.strip():
            logging.warning(f"Skipping row {index + 1} (University: {row.get('Name', 'N/A')}) due to empty combined text.")
            continue

        # Metadata repeated on every chunk produced from this row
        metadata = {
            'university_name': row.get('Name', ''),
            'program_name': row.get('Program Name', ''),  # empty for rows without a program
            'faculty': row.get('Faculty', ''),
            'source_url': row.get('Detail Page URL', ''),
            'original_row_index': index,  # pointer back to the cleaned row
        }

        try:
            chunks = text_splitter.split_text(document_text)
        except Exception as split_error:
            logging.error(f"Error splitting text for row {index+1} (University: {metadata['university_name']}): {split_error}", exc_info=True)
            chunks = []  # a failed row contributes no chunks

        # chunk_sequence is the 1-based position of the chunk within its document
        all_chunks_data.extend(
            {'chunk_text': chunk_text, 'chunk_sequence': sequence, **metadata}
            for sequence, chunk_text in enumerate(chunks, start=1)
        )

    logging.info(f"Chunking complete. Generated {len(all_chunks_data)} chunks from {total_rows} input rows.")

    # --- 5. Save Chunks to CSV ---
    if not all_chunks_data:
        logging.warning("No chunks were generated to save.")
    else:
        df_chunks = pd.DataFrame(all_chunks_data)
        logging.info(f"Attempting to save {len(df_chunks)} chunks to: {OUTPUT_CHUNKS_CSV}")
        try:
            # Column order of the output file
            chunk_fieldnames = [
                'university_name',
                'faculty',
                'program_name',
                'chunk_sequence',
                'chunk_text',
                'source_url',
                'original_row_index',
            ]
            df_chunks = df_chunks[chunk_fieldnames]
            df_chunks.to_csv(OUTPUT_CHUNKS_CSV, index=False, encoding='utf-8-sig')
            logging.info(f"Chunks successfully saved to {OUTPUT_CHUNKS_CSV}")
        except Exception as save_error:
            logging.error(f"Error saving chunks CSV: {save_error}", exc_info=True)

    logging.info("--- Text Chunking Script Finished ---")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
