In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem; later cells read and write under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# === Load Single Training File ===
import pandas as pd

TRAIN_CSV_PATH = '/content/drive/MyDrive/dataset/ocr_text/train/STask_A_train.csv'  # <-- update path if needed

# Read the CSV and keep only the 'text' and 'label' columns, in that order;
# every other column (e.g. an index column) is dropped.
train_df = pd.read_csv(TRAIN_CSV_PATH)[['text', 'label']]

In [None]:
# === Preprocessing ===
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loaded once and cached."""
    return frozenset(stopwords.words('english'))


def preprocess(text):
    """Normalize one raw text value into a space-joined string of kept tokens.

    Steps, in order: lowercase; strip ``<...>`` tag-like spans; strip tokens
    starting with ``http``; strip digit runs; strip ``@``-prefixed tokens;
    split on ``\\w+``; drop English stopwords and tokens of length <= 2.

    Args:
        text: The raw value. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` for missing input
        or when no token survives filtering.
    """
    # Missing values would otherwise be stringified into the literal tokens
    # "none" / "nan" and fed to the model as if they were real words.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'@\S+', '', text)

    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # Look the stopwords up in a cached set instead of re-reading the corpus
    # list for every single token.
    stop_set = _english_stopwords()
    kept = [w for w in tokens if len(w) > 2 and w not in stop_set]
    return " ".join(kept)

# Clean every training text with preprocess() and overwrite the 'text' column with the result.
cleaned_train_text = train_df['text'].map(preprocess)
train_df['text'] = cleaned_train_text


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# === Train-Test Split ===
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
train_data, val_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
)

# === Train Transformer (ALBERT) ===
!pip install simpletransformers -q

from simpletransformers.classification import ClassificationModel

import torch  # used only to detect whether a CUDA device is available

# Binary ALBERT classifier. Train on the GPU when the runtime has one and fall
# back to CPU otherwise (a hard-coded use_cuda=False forces CPU even on GPU runtimes).
model = ClassificationModel(
    'albert', 'albert-base-v1',
    num_labels=2,
    use_cuda=torch.cuda.is_available(),
    args={
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
        'num_train_epochs': 3,
        'train_batch_size': 16,
        'eval_batch_size': 16,
        # Seed the training run so repeated runs are comparable.
        'manual_seed': 42,
    }
)

# simpletransformers looks up the target by the column name 'labels'; rename
# 'label' for this call so training does not depend on positional column order.
model.train_model(train_data.rename(columns={'label': 'labels'}))

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.3/316.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.3 MB/s[0m eta 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]



  0%|          | 0/6 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/203 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/203 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/203 [00:00<?, ?it/s]

(609, 0.6043123259822332)

In [None]:
# === Evaluate on Validation Set ===
# Predict a label for every validation text and store it next to the true label.
val_texts = val_data['text'].tolist()
preds, _ = model.predict(val_texts)
val_data['pred'] = preds

from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / F1 of the predictions against the true labels.
val_report = classification_report(val_data['label'], val_data['pred'])
print(val_report)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.70      0.65      0.67       410
           1       0.66      0.72      0.69       400

    accuracy                           0.68       810
   macro avg       0.68      0.68      0.68       810
weighted avg       0.68      0.68      0.68       810



In [None]:
# === Predict on Evaluation File ===

EVAL_CSV_PATH = '/content/drive/MyDrive/dataset/ocr_text/eval/STask-A(index,text)val.csv'  # <-- update if needed
SUBMISSION_PATH = '/content/drive/MyDrive/dataset/evaloutput.json'

# Load the evaluation rows and clean their text with the same preprocess() used for training.
eval_df = pd.read_csv(EVAL_CSV_PATH)
eval_df['text'] = eval_df['text'].map(preprocess)

# Predict one label per evaluation text.
eval_texts = eval_df['text'].tolist()
eval_preds, _ = model.predict(eval_texts)
eval_df['label'] = eval_preds

# The CSV already provides an 'index' column; cast it to str for the JSON output.
eval_df['index'] = eval_df['index'].astype(str)

# Submission records carry the keys 'index' and 'prediction'.
submission = eval_df[['index', 'label']].rename(columns={"label": "prediction"})

import json

# Write the records as a JSON array, one object per evaluation row.
with open(SUBMISSION_PATH, 'w') as f:
    records = submission.to_dict(orient='records')
    json.dump(records, f, indent=2)

print("✅ Submission file saved as evaloutput.json")


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

✅ Submission file saved as evaloutput.json
