In [1]:
import pandas as pd

# Convert the merged station CSV into a slim file that holds only the
# timestamp (`ds`), the station name and the EC reading in g/l, plus a
# derived EC column in us/cm. The result is written to `outfile`.

# adjust path/name
infile = "/workspaces/Baswap-app/dataset/merged_all_data.csv"
outfile = "/workspaces/Baswap-app/dataset/merged_all_data_new.csv"

# Exact name of the EC column in the input file (no fuzzy matching is done).
ec_original = "EC[g/l]"

# Multiplier applied to the g/l reading to fill the us/cm column.
# NOTE(review): 2000 was supplied as a requirement and is not derived here —
# confirm it is the intended conversion for this dataset.
EC_GL_TO_USCM = 2000

# read CSV
df = pd.read_csv(infile)

# Validate the required columns BEFORE touching any of them, so a missing
# column produces the descriptive error below instead of a bare pandas KeyError.
keep_cols = ["ds", "station", ec_original]
missing = [c for c in keep_cols if c not in df.columns]
if missing:
    raise KeyError(f"Missing required column(s): {missing}")

# keep only ds, station and EC. Leftover index columns such as "Unnamed: 0"
# are discarded by this selection, so no separate (error-swallowing) drop
# step is needed.
df = df[keep_cols].copy()

# coerce EC column to numeric (non-numeric -> NaN)
df[ec_original] = pd.to_numeric(df[ec_original], errors="coerce")

# `ds` is intentionally left as read from the CSV; parse it downstream with
# pd.to_datetime(df["ds"], errors="coerce") if datetime values are needed.

# rename EC column
df = df.rename(columns={ec_original: "EC Value (g/l)"})

# compute EC Value (us/cm); NaN readings stay NaN
df["EC Value (us/cm)"] = df["EC Value (g/l)"] * EC_GL_TO_USCM

# fix the output column order
df = df[["ds", "station", "EC Value (g/l)", "EC Value (us/cm)"]]

# save without the positional index so no "Unnamed: 0" column is created
# when the file is read back
df.to_csv(outfile, index=False)

print(f"Saved processed file to: {outfile}")


Saved processed file to: /workspaces/Baswap-app/dataset/merged_all_data_new.csv


In [1]:
# Load the combined dataset through the project-local `data` module.
# NOTE(review): this rebinds `df`, replacing whatever an earlier cell stored
# under that name; later cells read this `df`.
from data import combined_data_retrieve

# presumably returns a pandas DataFrame (`.head()` is called on it) — confirm
# against the helper's definition.
df = combined_data_retrieve()
# Bare last expression so the notebook renders a preview of the first rows.
df.head()

2025-11-03 03:59:06.246 
  command:

    streamlit run /home/vscode/.local/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]


Unnamed: 0.1,ds,EC[g/l],station,Unnamed: 0,DO Value,DO Temperature,EC Value (us/cm),EC Temperature,Battery Voltage
0,1996-02-02 03:00:00,0.1,AnDinh,,,,,,
1,1996-02-02 05:00:00,0.1,AnDinh,,,,,,
2,1996-02-02 07:00:00,0.1,AnDinh,,,,,,
3,1996-02-04 17:00:00,0.1,AnDinh,,,,,,
4,1996-02-04 19:00:00,0.1,AnDinh,,,,,,


In [3]:
# Show the (latest, earliest) calendar dates covered by the `ds` column.
# `ds` can arrive as plain strings (dtype object), on which `.date()` does not
# exist, so parse it first; values that are already datetimes pass through
# pd.to_datetime unchanged.
ds_parsed = pd.to_datetime(df["ds"])
ds_parsed.max().date(), ds_parsed.min().date()

(datetime.date(2025, 6, 6), datetime.date(1995, 2, 2))

In [3]:
# Diagnostics for the `ds` column of the freshly loaded frame.
ds_col = df["ds"]

# Series dtype and a sample of the leading values.
print("dtype:", ds_col.dtype)
print("first rows:", ds_col.head(10).tolist())

# Tally of the concrete Python type of every element in the Series.
type_counts = ds_col.map(type).value_counts().to_dict()
print(type_counts)

dtype: object
first rows: ['1996-02-02 03:00:00+07:00', '1996-02-02 05:00:00+07:00', '1996-02-02 07:00:00+07:00', '1996-02-04 17:00:00+07:00', '1996-02-04 19:00:00+07:00', '1996-02-04 21:00:00+07:00', '1996-02-04 23:00:00+07:00', '1996-02-05 05:00:00+07:00', '1996-02-05 07:00:00+07:00', '1996-02-05 09:00:00+07:00']
{<class 'str'>: 863428}
