In [2]:
import csv, json
from pathlib import Path
from datetime import datetime

base = Path("/Users/zinebberrekia/Desktop/NLP_finalwork/el-bayan/Data/datasetstructure")

folders = [
  "sentences/correct","sentences/incorrect","sentences/raw_text",
  "annotations","rules","difficulty","feedback","exercises","learners",
  "audio/sentence_audio","audio/word_audio","meta","evaluation",
]

csv_templates = {
  # Sentences
  "sentences/correct/sentences_correct.csv": [
    "sentence_id","sentence","sentence_diacritized","translation","rule_id",
    "difficulty","source","i3rab","syntactic_structure_ref"
  ],
  "sentences/incorrect/sentences_incorrect.csv": [
    "sentence_id","wrong_sentence","correct_sentence","error_type","error_subtype",
    "error_span_start","error_span_end","severity","rule_id","difficulty","learner_level_hint"
  ],

  # Annotations
  "annotations/tokens.csv": ["sentence_id","token_id","token","lemma","pos"],
  "annotations/morphological_features.csv": [
    "sentence_id","token_id","case","gender","number","tense","mood","voice",
    "aspect","person","voice_diacritic","orthographic_variant"
  ],
  "annotations/dependencies.csv": ["sentence_id","token_id","head_token_id","dep_label"],
  "annotations/diacritization.csv": ["sentence_id","token_id","undiacritized","diacritized","confidence"],
  "annotations/i3rab.csv": ["sentence_id","token_id","case_mark","syntactic_role","governor_token_id"],

  # Rules
  "grammar_rules.csv": ["rule_id","rule_name","description","example","pedagogical_goal","prerequisites","common_misconceptions"],
  "rules/rule_to_sentence.csv": ["rule_id","sentence_id"],
  "rules/rule_taxonomy.csv": ["rule_id","parent_rule_id","category","subcategory","notes"],
  "rules/conjugation_patterns.csv": ["verb_lemma","form","tense","person","number","gender","mood","diacritized_form","features_json"],

  # Difficulty
  "difficulty/rule_difficulty.csv": ["rule_id","level","category","evidence_metric","last_updated","source"],
  "difficulty/sentence_difficulty.csv": ["sentence_id","level","category","evidence_metric","last_updated","source"],

  # Feedback
  "feedback/correction_suggestions.csv": ["error_id","sentence_id","wrong_sentence","correct_sentence","suggestion","rule_id"],
  "feedback/explanation_text.csv": ["error_id","rule_id","explanation_ar","explanation_en","reading_level","example_ids"],
  "feedback/adaptive_next_step.csv": ["error_id","next_exercise_type","recommended_topic","repeat_times"],
  "feedback/error_templates.csv": ["error_type","subtype","template_text_ar","template_text_en","rule_id"],
  "feedback/rubrics.csv": ["rubric_id","dimension","levels_json"],
  "feedback/safety_checks.csv": ["check_id","category","status","notes"],

  # Exercises (shared metadata cols included)
  "exercises/fill_in_blank.csv": ["exercise_id","sentence_id","question","answer","level","time_limit_sec","points","hint_refs","audio_ref"],
  "exercises/error_correction.csv": ["exercise_id","sentence_id","incorrect_sentence","correct_sentence","level","time_limit_sec","points","hint_refs","audio_ref"],
  "exercises/mcq_grammar.csv": ["exercise_id","sentence_id","question","option_a","option_b","option_c","option_d","correct_option","level","time_limit_sec","points","hint_refs","audio_ref"],
  "exercises/conjugation_drill.csv": ["exercise_id","verb_lemma","prompt_features_json","correct_form","distractors","rule_id","level","time_limit_sec","points","hint_refs","audio_ref"],
  "exercises/i3rab_analysis.csv": ["exercise_id","sentence_id","target_tokens","correct_labels","distractors","explanation_ref","level","time_limit_sec","points","hint_refs","audio_ref"],
  "exercises/generative_prompt.csv": ["exercise_id","prompt_ar","expected_traits","rubric_ref","level","time_limit_sec","points","hint_refs","audio_ref"],

  # Learners
  "learners/learner_profiles.csv": ["learner_id","name","current_level","goals","preferred_script","access_constraints"],
  "learners/learner_progress.csv": ["learner_id","sentence_id","is_correct","attempts","timestamp"],
  "learners/learner_errors_history.csv": ["learner_id","error_id","repeat_count"],
  "learners/interactions.csv": ["learner_id","session_id","timestamp","sentence_id","action_type","response","latency_ms","device_type","offline_flag"],
  "learners/adaptive_state.csv": ["learner_id","estimated_level","mastery_by_rule_json","last_recommendation","next_due_ts"],

  # Audio
  "audio/alignments.csv": ["sentence_id","token_id","start_ms","end_ms","speaker_id"],

  # Meta
  "meta/sources.csv": ["source_id","type","url_or_ref","license"],
  "meta/splits.csv": ["sentence_id","split","rationale"],
  "meta/models.csv": ["model_id","base_model","version","finetune_data_ref","prompt_template_ref"],
  "meta/prompts.csv": ["prompt_id","template_text","variables_json","purpose"],

  # Evaluation
  "evaluation/metrics.csv": ["metric_id","name","definition"],
  "evaluation/results.csv": ["model_id","eval_set","metric_id","value","notes"],
  "evaluation/human_ratings.csv": ["item_id","rater_id","clarity","correctness","usefulness","comments"],
}

AR_SENTENCES = [
  "الولدُ يقرأُ كتابًا.",
  "ذهبتِ الطالبةُ إلى المدرسةِ مبكرًا.",
  "يكتبُ المعلمُ الدرسَ على السبورةِ.",
  "لمْ يحضرِ الطالبُ الامتحانَ.",
  "هلْ قرأتَ القصةَ الجديدةَ؟"
]
AR_WRONG = [
  "الولد يقرء كتاب.",
  "ذهبت الطالبة الى المدرسة مبكرا.",
  "يكتب المعلم الدرس على السبورة.",
  "لم يحضر الطالب الامتحان.",
  "هل قرت القصة الجديدة؟"
]
AR_CORR = AR_SENTENCES

def now_iso():
    return datetime.utcnow().isoformat(timespec='seconds') + 'Z'

def gen_value(col, i):
    col_l = col.lower()
    if col_l in {"sentence","prompt_ar"}: return AR_SENTENCES[i % 5]
    if col_l == "wrong_sentence": return AR_WRONG[i % 5]
    if col_l == "correct_sentence": return AR_CORR[i % 5]
    if col_l in {"sentence_diacritized","diacritized"}: return AR_SENTENCES[i % 5]
    if col_l in {"undiacritized","token"}: return ["الولد","يقرأ","كتابًا","."][i % 4]
    if col_l == "lemma": return ["ولد","قرأ","كتاب","."][i % 4]
    if col_l == "pos": return ["NOUN","VERB","NOUN","PUNCT"][i % 4]
    if col_l in {"case","case_mark"}: return ["رفع","نصب","جر"][i % 3]
    if col_l == "gender": return ["مذكر","مؤنث"][i % 2]
    if col_l == "number": return ["مفرد","مثنى","جمع"][i % 3]
    if col_l == "tense": return ["ماضٍ","مضارع","أمر"][i % 3]
    if col_l == "mood": return ["رفع","نصب","جزم"][i % 3]
    if col_l in {"voice","voice_diacritic"}: return ["معلوم","مجهول"][i % 2]
    if col_l == "aspect": return ["تام","ناقص"][i % 2]
    if col_l == "person": return ["الأول","الثاني","الثالث"][i % 3]
    if col_l in {"syntactic_role","dep_label"}: return ["فاعل","مفعول به","خبر","مبتدأ"][i % 4]
    if col_l in {"category","error_type","subtype","error_subtype"}: return ["إعراب","اتفاق","تصريف","ترتيب"][i % 4]
    if col_l in {"severity","level"}: return ["سهل","متوسط","صعب"][i % 3]
    if col_l == "reading_level": return ["بسيط","متوسط","متقدم"][i % 3]
    if col_l == "rule_name": return ["رفع الفاعل","نصب المفعول","جر المضاف إليه","تنوين النصب"][i % 4]
    if col_l in {"description","example","template_text_ar","template_text_en","explanation_ar","explanation_en"}: return "مثال/شرح تجريبي"
    if col_l == "suggestion": return "اقترح تصحيح الصيغة وإضافة الحركات المناسبة"
    if col_l.endswith("_json"): return json.dumps({"sample": i})
    if col_l.endswith("_ids") or col_l.endswith("_ref") or col_l.endswith("_refs"): return "ref-" + str(i+1)
    if col_l == "split": return ["train","dev","test"][i % 3]
    if col_l == "type": return ["curriculum","web","book"][i % 3]
    if col_l in {"url_or_ref","license"}: return "N/A"
    if col_l == "status": return ["ok","flagged"][i % 2]
    if col_l in {"notes","rationale","comments"}: return ""
    if col_l in {"points","time_limit_sec","latency_ms","start_ms","end_ms"}: return str((i+1)*10)
    if col_l in {"timestamp","last_updated","next_due_ts"}: return now_iso()
    if col_l in {"is_correct","offline_flag"}: return ["true","false"][i % 2]
    if col_l == "device_type": return ["mobile","web"][i % 2]
    if col_l in {"base_model","version"}: return ["llm-arabic-base","v1"][i % 2]
    if col_l in {"metric_id","name","definition"}: return ["acc","Accuracy","fraction correct"][i % 3]
    if col_l == "value": return str(0.8 + 0.02*i)
    if col_l.endswith("_id") or col_l.endswith("id"): return str(i+1)
    if col_l.endswith("_token_id"): return str((i % 4) + 1)
    if col_l in {"head_token_id","governor_token_id"}: return str((i % 4) + 1)
    if col_l == "form": return ["I","II","III","IV","V","VI","VII","VIII","IX","X"][i % 10]
    return f"val-{i+1}"

# Create folders
for p in folders:
    (base / p).mkdir(parents=True, exist_ok=True)

# Create CSVs with header + 5 example rows
for rel, cols in csv_templates.items():
    path = base / rel
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(cols)
        for i in range(5):
            w.writerow([gen_value(c, i) for c in cols])

# Track empty dirs with .gitkeep
for p in ["sentences/raw_text","audio/sentence_audio","audio/word_audio"]:
    (base / p / ".gitkeep").write_text("", encoding="utf-8")

print("Done: datasetstructure generated with files and 5 example rows.")
PY

Done: datasetstructure generated with files and 5 example rows.


NameError: name 'PY' is not defined