# In [2]:
import pandas as pd

def split_series_title(series_title):
    """Split a series title into Series, Activity, Age group and Gender.

    Titles are expected to look like
    ``"<series> - <activity>, <age group>, <gender>"``, where the
    ``- <activity>`` portion and the trailing ``, <gender>`` portion are
    optional.

    Args:
        series_title: The title text. Non-string values (for example the NaN
            a left merge leaves for an unmatched ``series_id``) are treated
            as unparseable instead of raising.

    Returns:
        A ``pd.Series`` indexed by ``'Series'``, ``'Activity'``,
        ``'Age group'`` and ``'Gender'``. Activity is ``""`` when the first
        segment contains no dash, Gender is ``"MenAndWomen"`` when the title
        has only two comma-separated segments, and all four values are
        ``None`` when the title is not a string or does not have exactly two
        or three comma-separated segments.
    """
    columns = ['Series', 'Activity', 'Age group', 'Gender']

    # Missing titles arrive as NaN (a float), which has no .split().
    if not isinstance(series_title, str):
        return pd.Series([None] * 4, index=columns)

    parts = series_title.split(',')
    if len(parts) not in (2, 3):
        # Unexpected shape: leave every derived column empty.
        return pd.Series([None] * 4, index=columns)

    # Split on the first dash only, so hyphenated activity names
    # (e.g. "work-related") stay intact instead of being discarded.
    series, _, activity = parts[0].partition('-')

    age_group = parts[1].strip()

    # A two-segment title carries no gender, which means both combined.
    gender = parts[2].strip() if len(parts) == 3 else "MenAndWomen"

    return pd.Series([series.strip(), activity.strip(), age_group, gender], index=columns)

def main():
    """Filter the data file to the selected series and export it as CSV.

    Reads the wanted series ids (one per line) from ``tu_select_series.txt``,
    keeps the matching rows of ``tu.data.1.AllData.txt``, attaches each row's
    ``series_title`` from ``tu.series.txt``, replaces that title with the
    columns produced by ``split_series_title`` and writes the result to
    ``tu_processed_data.csv``.
    """
    # Input and output locations
    data_file = 'tu.data.1.AllData.txt'
    selected_series_file = 'tu_select_series.txt'
    output_csv_path = 'tu_processed_data.csv'

    def read_tab_table(path):
        # Load a tab-separated file, trimming whitespace from the column
        # names and from the series_id values so lookups match.
        table = pd.read_csv(path, sep='\t', engine='python')
        table.columns = table.columns.str.strip()
        table['series_id'] = table['series_id'].str.strip()
        return table

    # Series ids to keep, one per line of the selection file
    selected_series = []
    with open(selected_series_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            selected_series.append(raw_line.strip())

    # Observations restricted to the selected series
    observations = read_tab_table(data_file)
    wanted_rows = observations[observations['series_id'].isin(selected_series)]

    # Series metadata (source of the title), restricted the same way
    catalog = read_tab_table('tu.series.txt')
    wanted_titles = catalog[catalog['series_id'].isin(selected_series)]

    # Attach the title to every observation row
    title_lookup = wanted_titles[['series_id', 'series_title']]
    merged_data = wanted_rows.merge(title_lookup, on='series_id', how='left')

    # Break the title into its components, then discard the raw title
    title_columns = ['Series', 'Activity', 'Age group', 'Gender']
    merged_data[title_columns] = merged_data['series_title'].apply(split_series_title)
    merged_data = merged_data.drop(columns=['series_title'])

    # Write the result
    merged_data.to_csv(output_csv_path, index=False)
    print(f"Processed data saved to {output_csv_path}")

# Call main() only when __name__ is "__main__", i.e. when this code is run
# directly rather than imported as a module.
if __name__ == "__main__":
    main()


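# Recorded output: Processed data saved to processed_data.csv
# NOTE(review): the path above differs from output_csv_path ('tu_processed_data.csv') in main(); this output presumably predates the current code. Confirm by re-running.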
