In [1]:
import pandas as pd
import os

# --- CONFIGURATION ---
# File names are relative paths, so they resolve against the kernel's
# current working directory.
# NOTE: a later cell rebinds `input_file` / `output_file` for the unit
# conversion step. `remove_duplicates()` reads these globals at call time, so
# re-run this cell before re-running the de-duplication step.
input_file = 'hanoi.csv'
output_file = 'weather_Hanoi_deduplicated.csv'

# Columns that together form the unique key used to identify duplicates.
# A given location ('name') at a given timestamp ('datetime') should appear
# only once.
subset_cols = ['name', 'datetime']

def remove_duplicates(input_path=None, output_path=None, subset=None):
    """Drop duplicate rows from a CSV file and save the result as a new CSV.

    Two rows are duplicates when they hold the same values in every key
    column; the first occurrence is kept and later ones are discarded.
    A short before/after report is printed.

    Args:
        input_path: CSV file to read. Defaults to the module-level
            ``input_file``.
        output_path: CSV file to write (UTF-8, no index column). Defaults to
            the module-level ``output_file``.
        subset: Column name, or list of column names, forming the unique key.
            Defaults to the module-level ``subset_cols``.

    Returns:
        None. If the input file does not exist an error is printed and
        nothing is written.

    Raises:
        KeyError: If any key column is absent from the input file.
    """
    # Resolve defaults at call time so the notebook's configuration cell is
    # still honoured, while explicit arguments keep the function independent
    # of whichever cell last rebound the globals.
    src = input_file if input_path is None else input_path
    dst = output_file if output_path is None else output_path
    keys = subset_cols if subset is None else subset
    key_cols = [keys] if isinstance(keys, str) else list(keys)

    if not os.path.exists(src):
        print(f"❌ Error: File {src} not found.")
        return

    print(f"READING: {src}...")
    raw = pd.read_csv(src)

    # Fail early with a message that names the offending columns instead of
    # the bare KeyError pandas would raise from drop_duplicates.
    missing = [col for col in key_cols if col not in raw.columns]
    if missing:
        raise KeyError(f"Duplicate-key column(s) not found in {src}: {missing}")

    # --- STEP 1: DROP DUPLICATES ---
    # keep='first': keep the first occurrence, drop the rest.
    # Non-mutating call, so `raw` is still available for the report.
    deduped = raw.drop_duplicates(subset=key_cols, keep='first')

    original_rows = len(raw)
    new_rows = len(deduped)
    removed_count = original_rows - new_rows

    print("--- REPORT ---")
    print(f"Original rows: {original_rows}")
    print(f"Rows after cleaning: {new_rows}")
    print(f"Duplicated rows removed: {removed_count}")

    deduped.to_csv(dst, index=False, encoding='utf-8')
    print(f"✅ Saved clean file to: {dst}")

# Run the de-duplication step when this code is executed directly (as it is
# when the notebook cell runs) rather than imported as a module.
if __name__ == "__main__":
    remove_duplicates()

READING: hanoi.csv...
--- REPORT ---
Original rows: 51864
Rows after cleaning: 51864
Duplicated rows removed: 0
✅ Saved clean file to: weather_Hanoi_deduplicated.csv


In [2]:
import pandas as pd
import os

# --- CONFIGURATION ---
# These assignments rebind the same global names used by the de-duplication
# cell, so after this cell runs `input_file` / `output_file` refer to the
# unit-conversion step.
# NOTE(review): the recorded output of this cell reports saving to
# 'weather_Hanoi_metric_standardized.csv', which differs from `output_file`
# below — the saved output appears to be from an earlier run; re-run to refresh.
input_file = 'weather_Hanoi_deduplicated.csv' # Output of the de-duplication step
output_file = 'hanoi_final.csv'

def convert_units(input_path=None, output_path=None):
    """Convert pressure (atm -> hPa) and precipitation (inch -> mm) in a CSV.

    Reads the input CSV, converts the ``sealevelpressure`` and ``precip``
    columns when they are present, writes the result to the output CSV
    (UTF-8, no index column) and prints a five-row preview.

    Pressure is only converted when the column's mean looks like atmospheres
    (below 10); otherwise it is assumed to already be in hPa and left as is.
    Missing or non-numeric pressure readings are kept as NaN: filling them
    with 0 would invent a physically impossible reading and pull down the mean
    used for unit detection. Missing precipitation is treated as 0.

    Precipitation is converted unconditionally, so running this function on
    its own output multiplies ``precip`` by 25.4 again.

    Args:
        input_path: CSV file to read. Defaults to the module-level
            ``input_file``.
        output_path: CSV file to write. Defaults to the module-level
            ``output_file``.

    Returns:
        None. If the input file does not exist an error is printed and
        nothing is written.
    """
    ATM_TO_HPA = 1013.25       # 1 atm = 1013.25 hPa
    INCH_TO_MM = 25.4          # 1 inch = 25.4 mm
    ATM_DETECTION_LIMIT = 10   # atm values sit near 1, hPa values near 1000

    # Resolve defaults at call time so the notebook's configuration cell is
    # still honoured, while explicit arguments keep the function independent
    # of whichever cell last rebound the globals.
    src = input_file if input_path is None else input_path
    dst = output_file if output_path is None else output_path

    if not os.path.exists(src):
        print(f"❌ Error: File {src} not found.")
        return

    print(f"READING: {src}...")
    df = pd.read_csv(src)

    # --- STEP 1: CONVERT PRESSURE (atm -> hPa) ---
    if 'sealevelpressure' in df.columns:
        # Coerce to numeric but keep gaps as NaN (see docstring).
        pressure = pd.to_numeric(df['sealevelpressure'], errors='coerce')
        mean_pressure = pressure.mean()  # NaN values are skipped

        # SAFETY CHECK: only convert if values look like atm (e.g. 0.9 - 1.1).
        if pd.isna(mean_pressure):
            # Whole column missing/unparseable: nothing to detect or convert.
            print("⚠️ No valid pressure values found. Skipping conversion.")
        elif mean_pressure < ATM_DETECTION_LIMIT:
            print("⚙️ Detected Pressure in ATM. Converting to hPa...")
            pressure = pressure * ATM_TO_HPA
        else:
            print(f"⚠️ Pressure average is {mean_pressure:.1f}. It seems already in hPa. Skipping conversion.")

        df['sealevelpressure'] = pressure

    # --- STEP 2: CONVERT PRECIPITATION (inch -> mm) ---
    if 'precip' in df.columns:
        print("⚙️ Converting Precipitation from inch to mm...")
        # Missing precipitation is treated as "no rain" (0), as before.
        precip = pd.to_numeric(df['precip'], errors='coerce').fillna(0)
        df['precip'] = precip * INCH_TO_MM

    # Save
    df.to_csv(dst, index=False, encoding='utf-8')
    print(f"✅ Success! Units converted. Saved to: {dst}")

    # Preview only the columns that exist, so a missing column cannot raise
    # KeyError after the file has already been written.
    preview_cols = [
        col for col in ('datetime', 'sealevelpressure', 'precip')
        if col in df.columns
    ]
    if preview_cols:
        print(df[preview_cols].head())

# Run the unit-conversion step when this code is executed directly (as it is
# when the notebook cell runs) rather than imported as a module.
if __name__ == "__main__":
    convert_units()

READING: weather_Hanoi_deduplicated.csv...
⚙️ Detected Pressure in ATM. Converting to hPa...
⚙️ Converting Precipitation from inch to mm...
✅ Success! Units converted. Saved to: weather_Hanoi_metric_standardized.csv
              datetime  sealevelpressure  precip
0  2020-01-01T00:00:00        1024.39575     0.0
1  2020-01-01T01:00:00        1024.39575     0.0
2  2020-01-01T02:00:00        1023.38250     0.0
3  2020-01-01T03:00:00        1023.38250     0.0
4  2020-01-01T04:00:00        1023.38250     0.0
