In [None]:
import pandas as pd
from google.colab import drive
import os

# Purpose of this cell: load the "Video_Summary" and "Comment_Details" sheets
# of one workbook, keep only the comments whose video_id appears in the
# summary sheet, and save the surviving comments (sorted by timestamp) to a
# new workbook.

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Define file path
file_path = '/content/drive/My Drive/SOSC314/ICE Hashtag Data/CNN_Data_cleaned.xlsx'

# Check if the file exists
if not os.path.exists(file_path):
    print(f"File not found, please check the path: {file_path}")
else:
    print("Loading data and processing Comment_Details...")

    # 3. Read Excel file
    # Sheet names must match exactly: "Video_Summary" and "Comment_Details".
    # video_id is read as text so pandas does not infer a numeric type for it.
    df_metadata = pd.read_excel(file_path, sheet_name="Video_Summary", dtype={'video_id': str})
    df_comments = pd.read_excel(file_path, sheet_name="Comment_Details", dtype={'video_id': str})

    # 4. Data Preprocessing (ID Cleaning)
    # Strip surrounding whitespace so the same ID compares equal across sheets.
    df_metadata['video_id'] = df_metadata['video_id'].str.strip()
    df_comments['video_id'] = df_comments['video_id'].str.strip()

    # 5. Timestamp Processing (for Comment_Details table)
    # Unparsable values become NaT instead of raising; count them so the
    # coercion is visible rather than silent.
    parsed_timestamps = pd.to_datetime(df_comments['timestamp'], errors='coerce')
    unparsed_count = int((parsed_timestamps.isna() & df_comments['timestamp'].notna()).sum())
    df_comments['timestamp'] = parsed_timestamps
    if unparsed_count:
        print(f"Warning: {unparsed_count} timestamp value(s) could not be parsed and were set to NaT")

    # Excel cannot store timezone-aware datetimes (pandas raises ValueError on
    # export), so convert any tz-aware column to naive UTC before saving.
    if isinstance(df_comments['timestamp'].dtype, pd.DatetimeTZDtype):
        df_comments['timestamp'] = df_comments['timestamp'].dt.tz_convert(None)

    # 6. Sorting
    # Sort by timestamp from oldest to newest (helpful for chronological analysis)
    df_comments = df_comments.sort_values(by='timestamp', ascending=True)

    # 7. Core Cleaning Logic (Filtering)
    # Drop missing IDs first: a NaN in valid_ids would be counted as a video
    # and would make isin() keep comments whose video_id is missing.
    valid_ids = set(df_metadata['video_id'].dropna())
    df_comments_cleaned = df_comments[df_comments['video_id'].isin(valid_ids)].copy()

    # 8. Results Statistics
    print("-" * 30)
    print("Processing successful!")
    print(f"Valid videos in Summary: {len(valid_ids)}")
    print(f"Comments retained: {len(df_comments_cleaned)}")
    print(f"Comments removed: {len(df_comments) - len(df_comments_cleaned)}")
    print("-" * 30)

    # 9. Save the cleaned results
    output_path = '/content/drive/My Drive/SOSC314/ICE Hashtag Data/CNN_Comments_Cleaned.xlsx'

    # DataFrame.to_excel() has no date_format keyword; the cell format for
    # datetime values is configured on the ExcelWriter instead.
    with pd.ExcelWriter(output_path, datetime_format='yyyy-mm-dd hh:mm:ss') as writer:
        df_comments_cleaned.to_excel(writer, index=False)
    print(f"Final cleaned data saved to: {output_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File not found, please check the path: /content/drive/My Drive/SOSC314/ICE Hashtag Data/CNN_Data_cleaned.xlsx


In [None]:
import pandas as pd
from google.colab import drive
import os

# Step 1: make the Drive contents reachable under /content/drive.
drive.mount('/content/drive')

# Step 2: folder that holds the cleaned workbooks and receives the CSV exports.
# Change this if the files live in a different directory.
folder_path = '/content/drive/My Drive/SOSC314/ICE Hashtag Data/'

# Step 3: per dataset, the Excel input and the CSV output, both inside folder_path.
video_input_path, video_output_csv = (
    os.path.join(folder_path, file_name)
    for file_name in ('CNN_Video_cleaned.xlsx', 'CNN_Video_Final.csv')
)
comments_input_path, comments_output_csv = (
    os.path.join(folder_path, file_name)
    for file_name in ('CNN_Comments_Cleaned.xlsx', 'CNN_Comments_Final.csv')
)

def convert_to_csv():
    """Export the cleaned video and comment workbooks as CSV files.

    Reads the workbooks at the module-level ``video_input_path`` and
    ``comments_input_path``, re-parses the comments' ``timestamp`` column
    when that column exists, and writes the two frames to
    ``video_output_csv`` and ``comments_output_csv``.

    Takes no arguments and returns ``None``. Any exception raised along the
    way is caught and reported with ``print`` instead of propagating.
    """

    def _load_workbook(path):
        # No sheet_name is passed, so pandas reads the workbook's first sheet.
        # video_id is forced to text so the IDs are not type-inferred.
        return pd.read_excel(path, dtype={'video_id': str})

    try:
        print("Reading Excel files...")

        video_frame = _load_workbook(video_input_path)
        comments_frame = _load_workbook(comments_input_path)

        # Normalise timestamps when the column is present; values that
        # cannot be parsed become NaT rather than raising.
        has_timestamp = 'timestamp' in comments_frame.columns
        if has_timestamp:
            comments_frame['timestamp'] = pd.to_datetime(
                comments_frame['timestamp'], errors='coerce'
            )

        # The 'utf-8-sig' codec writes a UTF-8 byte-order mark at the start
        # of the file, which helps Excel detect the encoding (emoji, etc.).
        video_frame.to_csv(video_output_csv, index=False, encoding='utf-8-sig')
        comments_frame.to_csv(comments_output_csv, index=False, encoding='utf-8-sig')

        divider = "-" * 30
        print(divider)
        print("Conversion Successful!")
        print(f"Video Data Saved: {video_output_csv} ({len(video_frame)} rows)")
        print(f"Comments Data Saved: {comments_output_csv} ({len(comments_frame)} rows)")
        print(divider)
        print("You can now use these CSV files for RQ1, RQ2, and RQ3 analysis.")

    except Exception as err:
        print(f"An error occurred: {err}")

# Run the conversion once. convert_to_csv() takes no arguments and reports
# its progress (or any caught error) through print().
convert_to_csv()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reading Excel files...
------------------------------
Conversion Successful!
Video Data Saved: /content/drive/My Drive/SOSC314/ICE Hashtag Data/CNN_Video_Final.csv (65 rows)
Comments Data Saved: /content/drive/My Drive/SOSC314/ICE Hashtag Data/CNN_Comments_Final.csv (141371 rows)
------------------------------
You can now use these CSV files for RQ1, RQ2, and RQ3 analysis.
