In [None]:
# 0) Mount Google Drive (dependencies are installed in the next cell)
from google.colab import drive
# force_remount=True asks Colab to mount again even if a mount is already present.
# The notebook's DRIVE_BASE path lives under this /content/drive mount point.
drive.mount('/content/drive', force_remount=True)

In [None]:
!pip install -q transformers datasets evaluate accelerate

In [None]:
# 1) Imports & settings
import os, glob, subprocess, gc, warnings
import pandas as pd, numpy as np, torch, torch.nn.functional as F
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, AutoConfig,
    Trainer, TrainingArguments, DataCollatorWithPadding, set_seed
)
import evaluate, transformers
warnings.filterwarnings("ignore")

In [None]:
# ---------- User-editable knobs (make small for free Colab) ----------
# Where everything lives on the mounted Drive, and where the dataset comes from.
DRIVE_BASE = "/content/drive/MyDrive/Colab Notebooks/CodeMix"
REPO_URL = "https://github.com/bharathichezhiyan/DravidianCodeMix-Dataset.git"
CLONE_PATH = os.path.join(DRIVE_BASE, "repo")
FORCE_TRAIN_CSV = None   # optional: set explicit csv path in drive if autodetect fails

# Which language/task CSV to look for, and the global random seed.
LANG = "tamil"
TASK = "offensive"
SEED = 42

# Make sure the working directories exist before anything writes to them.
RESULTS_DIR = os.path.join(DRIVE_BASE, "results")
for _workdir in (DRIVE_BASE, RESULTS_DIR):
    os.makedirs(_workdir, exist_ok=True)

set_seed(SEED)

# Use the GPU when the runtime exposes one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ---------- Fetch the dataset repo unless CLONE_PATH already exists ----------
if os.path.exists(CLONE_PATH):
    print("Repo present:", CLONE_PATH)
else:
    print("Cloning dataset repo...")
    # Shallow clone (--depth 1); check=True raises if git exits non-zero.
    clone_cmd = ["git", "clone", "--depth", "1", REPO_URL, CLONE_PATH]
    subprocess.run(clone_cmd, check=True)

In [None]:
# ---------- Find candidate CSV ----------
def find_candidate_csv(lang, task, root=None):
    """Return CSV paths under ``root`` matching ``lang``/``task``, best match first.

    File-name patterns are tried from most specific (language + task + "train")
    to least specific (language only). The result keeps that priority order, so
    element 0 is the closest match; a path matched by several patterns appears
    once, at the position of its most specific pattern. Matches of the same
    pattern are sorted alphabetically to keep the result deterministic.

    Args:
        lang: substring identifying the language in the file name (e.g. "tamil").
        task: substring identifying the task in the file name (e.g. "offensive").
        root: directory to search recursively; defaults to ``CLONE_PATH``.

    Returns:
        List of matching paths without duplicates; empty if nothing matches.
    """
    if root is None:
        root = CLONE_PATH
    # Escape glob metacharacters ("[", "*", "?") that may occur in the directory itself.
    search_root = glob.escape(root)
    name_patterns = [
        f"*{lang}*{task}*train*.csv",
        f"*{lang}*{task}*.csv",
        f"*{lang}*train*.csv",
        f"*{lang}*full*.csv",
        f"*{lang}*.csv",
    ]
    # A dict keeps first-insertion order, which de-duplicates while preserving priority.
    ordered = {}
    for name_pattern in name_patterns:
        matches = glob.glob(os.path.join(search_root, "**", name_pattern), recursive=True)
        for path in sorted(matches):
            ordered.setdefault(path, None)
    return list(ordered)

# Choose the CSV to load: an explicit override wins, otherwise auto-detect in the cloned repo.
if FORCE_TRAIN_CSV:
    train_csv = FORCE_TRAIN_CSV
else:
    candidates = find_candidate_csv(LANG, TASK)
    if not candidates:
        raise RuntimeError("No dataset CSV found. Set FORCE_TRAIN_CSV to a path in Drive.")
    # Prefer files whose *name* mentions "full" or "train". Only the basename is
    # tested so that a directory on the path (e.g. ".../training/...") cannot
    # make every candidate look preferred.
    preferred = [
        f for f in candidates
        if "full" in os.path.basename(f) or "train" in os.path.basename(f)
    ]
    train_csv = preferred[0] if preferred else candidates[0]
print("Using CSV:", train_csv)

In [None]:
# ---------- Robust CSV read (tries comma, tab, then pipe) ----------
def robust_read_csv(path):
    """Read a delimited text file, trying several separators in turn.

    Each of ",", tab and "|" is tried in that order. The first parse that yields
    at least one row and more than one column is returned. Malformed lines are
    skipped rather than raising. If no separator produces a multi-column frame,
    the file is read once more with the default separator and returned as is.
    """
    shared_kwargs = dict(encoding="utf-8", on_bad_lines="skip", engine="python")
    for delimiter in (",", "\t", "|"):
        try:
            frame = pd.read_csv(path, sep=delimiter, **shared_kwargs)
        except Exception:
            # This separator could not be parsed at all; move on to the next one.
            continue
        n_rows, n_cols = frame.shape
        if n_rows > 0 and n_cols > 1:
            return frame
    # fallback: single-column read
    return pd.read_csv(path, **shared_kwargs)

# Load the selected CSV and show its size plus the first three rows as a sanity check.
df = robust_read_csv(train_csv)
print("Loaded CSV shape:", df.shape)
display(df.head(3))

In [None]:
# Pick the input/label columns by position: first column = input, second = output.
# NOTE(review): this assumes the CSV has at least two columns in (text, label)
# order; confirm against the printed column names.
column_names = df.columns.tolist()
print(column_names)
input_column = column_names[0]
output_column = column_names[1]
print(input_column)
print(output_column)

In [None]:
# List the distinct values found in the label column.
print(df[output_column].unique())

In [None]:
# Show how many rows carry each label value.
print(df[output_column].value_counts())

In [None]:
# Build (text, label) pairs with a binary label set:
#   - rows labelled 'not-Tamil' are dropped,
#   - "Not_offensive" is kept as is,
#   - every label starting with "Offensive" is collapsed to "Offensive",
#   - any other label raises ValueError so unexpected data is noticed.
examples = []
num_missing = 0
# Iterating the two columns directly avoids the per-row Series that iterrows() builds.
for input_value, output_value in zip(df[input_column], df[output_column]):
  # Skipped/padded CSV lines can leave NaN in either column; a NaN label is a float
  # and would crash on .startswith(), and NaN text is not a usable example.
  if pd.isna(input_value) or pd.isna(output_value):
    num_missing += 1
    continue
  output_value = str(output_value)
  if output_value == 'not-Tamil':
    continue
  label = None
  if output_value == "Not_offensive":
    label = output_value
  elif output_value.startswith("Offensive"):
    label = "Offensive"
  else:
    raise ValueError(f"Unexpected label: {output_value}")
  examples.append((input_value, label))
if num_missing:
  print(f"Skipped {num_missing} rows with missing text or label")
print(len(examples))

In [None]:
from collections import Counter

# Tally how many examples carry each label (second element of every pair).
labels2count = Counter(label for _, label in examples)
print(labels2count)

# Target split sizes, both classes combined: Train 3000, Val 800, Test 800

In [None]:
# Separate the examples into one bucket per class and switch to integer labels:
# "Offensive" -> 0, everything else (i.e. "Not_offensive") -> 1.
# Order within each bucket follows the order in `examples`.
not_offensive_examples = [(text, 1) for text, label in examples if label != "Offensive"]
offensive_examples = [(text, 0) for text, label in examples if label == "Offensive"]
print(len(not_offensive_examples))
print(len(offensive_examples))

In [None]:
import random

# Seed Python's RNG with a fixed value before shuffling.
random.seed(123)

# Shuffle each class bucket in place: non-offensive first, then offensive.
for class_bucket in (not_offensive_examples, offensive_examples):
  random.shuffle(class_bucket)

In [None]:
# Balanced train/val/test split: the same number of examples is taken from each
# class for every split, using consecutive, non-overlapping slices of the
# (already shuffled) per-class lists.
num_train_per_class = 1500
num_val_per_class = 400
num_test_per_class = 400

# Slice boundaries shared by both classes:
#   [0, val_start) -> train, [val_start, test_start) -> val, [test_start, test_end) -> test
val_start = num_train_per_class
test_start = val_start + num_val_per_class
test_end = test_start + num_test_per_class

# Slicing past the end of a list silently returns fewer items, which would give
# short or unbalanced splits without any error. Fail loudly instead.
for class_name, class_examples in (("Not_offensive", not_offensive_examples),
                                   ("Offensive", offensive_examples)):
  if len(class_examples) < test_end:
    raise ValueError(
        f"Need {test_end} {class_name} examples for the requested split sizes, "
        f"but only {len(class_examples)} are available."
    )

# Each split is shuffled after concatenation so the two classes are interleaved.
train_examples = not_offensive_examples[:val_start] + offensive_examples[:val_start]
random.shuffle(train_examples)

val_examples = not_offensive_examples[val_start:test_start] + offensive_examples[val_start:test_start]
random.shuffle(val_examples)

test_examples = not_offensive_examples[test_start:test_end] + offensive_examples[test_start:test_end]
random.shuffle(test_examples)

print(len(train_examples), len(val_examples), len(test_examples))


In [None]:
import csv

train_filename = os.path.join(DRIVE_BASE, 'train.csv')

# Write the training split as a two-column CSV ('review', 'label').
# encoding='utf-8' is explicit because the text may contain non-ASCII characters
# (e.g. Tamil script) and open()'s default encoding depends on the platform/locale.
with open(train_filename, 'w', newline='', encoding='utf-8') as file:
  writer = csv.writer(file)
  writer.writerow(['review', 'label'])
  writer.writerows(train_examples)

In [None]:
val_filename = os.path.join(DRIVE_BASE, 'val.csv')

# Write the validation split as a two-column CSV ('review', 'label').
# encoding='utf-8' is explicit because the text may contain non-ASCII characters
# (e.g. Tamil script) and open()'s default encoding depends on the platform/locale.
with open(val_filename, 'w', newline='', encoding='utf-8') as file:
  writer = csv.writer(file)
  writer.writerow(['review', 'label'])
  writer.writerows(val_examples)

In [None]:
test_filename = os.path.join(DRIVE_BASE, 'test.csv')

# Write the test split as a two-column CSV ('review', 'label').
# encoding='utf-8' is explicit because the text may contain non-ASCII characters
# (e.g. Tamil script) and open()'s default encoding depends on the platform/locale.
with open(test_filename, 'w', newline='', encoding='utf-8') as file:
  writer = csv.writer(file)
  writer.writerow(['review', 'label'])
  writer.writerows(test_examples)