In [1]:
import pandas as pd
import json

# Load every sheet of the workbook. With sheet_name=None, read_excel returns
# a dict mapping each sheet name to its DataFrame.
file_path = "HSK Vocabulary.xlsx"  # Replace with your actual path if needed
sheets = pd.read_excel(file_path, sheet_name=None)

# Columns a sheet must provide to be exported (hoisted: it never changes
# between sheets).
required_columns = {"Word", "Pinyin", "Meaning", "Unit"}

# Result structure written to JSON:
#   {level name: [{"unit": <int>, "words": [{"character", "pinyin", "meaning"}, ...]}, ...]}
output = {}

# Process each sheet (e.g., HSK1, HSK2, ...)
for sheet_name, df in sheets.items():
    # Skip sheets without the required columns
    if not required_columns.issubset(df.columns):
        continue

    # Drop rows missing required data and cast 'Unit' to int in one chain.
    # assign() returns a new frame, so nothing is written into the result of
    # dropna() (the pattern that triggers SettingWithCopyWarning).
    vocab = (
        df.dropna(subset=sorted(required_columns))
        .assign(Unit=lambda frame: frame["Unit"].astype(int))
    )

    # One entry per unit; groupby yields the groups in ascending unit order.
    units = []
    for unit, group in vocab.groupby("Unit"):
        # Zip the three columns directly rather than building a Series per
        # row with iterrows().
        words = [
            {"character": word, "pinyin": pinyin, "meaning": meaning}
            for word, pinyin, meaning in zip(
                group["Word"], group["Pinyin"], group["Meaning"]
            )
        ]
        # int() guarantees a native Python int, which json.dump can always
        # serialize.
        units.append({"unit": int(unit), "words": words})

    # Save under a standardized HSK level name
    output[sheet_name.replace("HSK", "HSK-")] = units

# Save to JSON file; ensure_ascii=False keeps the Chinese characters and tone
# marks readable instead of \u-escaped.
with open("hsk_vocab.json", "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

# Optional: print part of the JSON
# print(json.dumps(output, ensure_ascii=False, indent=2)[:2000])  # Preview the first part


{
  "HSK-1": [
    {
      "unit": 1,
      "words": [
        {
          "character": "对不起",
          "pinyin": "duìbuqǐ",
          "meaning": "to be sorry"
        },
        {
          "character": "好",
          "pinyin": "hǎo",
          "meaning": "good, fine"
        },
        {
          "character": "没关系",
          "pinyin": "méi guānxi",
          "meaning": "that’s OK, it doesn’t matter"
        },
        {
          "character": "你",
          "pinyin": "nǐ",
          "meaning": "(singular) you"
        }
      ]
    },
    {
      "unit": 2,
      "words": [
        {
          "character": "不客气",
          "pinyin": "bú kèqi",
          "meaning": "you’re welcome, don’t mention it"
        },
        {
          "character": "不",
          "pinyin": "bù",
          "meaning": "no, not"
        },
        {
          "character": "谢谢",
          "pinyin": "xièxie",
          "meaning": "to thank"
        },
        {
          "character": "再见",
          "pinyin":

In [3]:
import pandas as pd

# Load every sheet of the workbook. With sheet_name=None, read_excel returns
# a dict mapping each sheet name to its DataFrame.
file_path = "HSK Vocabulary.xlsx"  # Adjust the path as needed
sheets = pd.read_excel(file_path, sheet_name=None)

# Columns every usable sheet must provide, in the order they are displayed.
vocab_columns = ["Unit", "Word", "Pinyin", "Meaning"]

# One frame per valid sheet, each tagged with the sheet it came from.
all_data = []

# Process each sheet
for sheet_name, df in sheets.items():
    if not set(vocab_columns).issubset(df.columns):
        continue  # Skip invalid sheets

    # assign() adds the level column on a new frame, so the frames stored in
    # `sheets` are left unmodified; then keep only the relevant columns.
    tagged = df.assign(**{"HSK Level": sheet_name})
    all_data.append(tagged[["HSK Level", *vocab_columns]])

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# fail with a message that points at the actual cause instead.
if not all_data:
    raise ValueError(
        f"No sheet in {file_path!r} contains the required columns {vocab_columns}"
    )

# Concatenate all sheets into one DataFrame with a fresh 0..n-1 index
full_df = pd.concat(all_data, ignore_index=True)

# Display the tail of the combined DataFrame
full_df.tail(20)  # Show last 20 rows (you can change this)


Unnamed: 0,HSK Level,Unit,Word,Pinyin,Meaning
1777,HSK5,2,装修,zhuāngxiū,"to decorate (a house, room, etc.)"
1778,HSK5,11,状态,zhuàngtài,"state, status"
1779,HSK5,3,撞,zhuàng,to bump against
1780,HSK5,6,追,zhuī,"to chase, to go after"
1781,HSK5,7,姿势,zīshì,"pose, posture"
1782,HSK5,15,资格,zīgé,qualification
1783,HSK5,9,资料,zīliào,"data, material"
1784,HSK5,1,自杀,zìshā,to commit suicide
1785,HSK5,18,自由,zìyóu,freedom; free
1786,HSK5,8,综合,zōnghé,"to synthesize, to summarize; comprehensive, in..."
