In [1]:
from google.colab import drive

# Directory inside the Colab VM where Google Drive gets attached; the CSV
# paths used further down all live underneath it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# === Load Single Training File ===
import pandas as pd

# Training CSV on the mounted Drive  <-- update path if needed
TRAIN_CSV_PATH = '/content/drive/MyDrive/meme_classifications/ocr_text/train/STask_A_train.csv'

# Read the file and keep only the two columns used downstream: the model
# input (`text`) and the target (`label`). Any other column (e.g. an index)
# is dropped because it is not used as a feature.
train_df = pd.read_csv(TRAIN_CSV_PATH).loc[:, ['text', 'label']]

In [3]:
# === Preprocessing ===
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Download the NLTK resources this notebook fetches: the stop-word lists and
# the punkt tokenizer data.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Noise patterns removed from the lower-cased text, applied in this order.
_NOISE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'<.*?>',    # HTML-like tags
    r'http\S+',  # URLs
    r'[0-9]+',   # runs of digits
    r'@\S+',     # @-mentions
))

# Filled on the first preprocess() call and reused afterwards, so the
# stop-word corpus is read once instead of once per token.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared (tokenizer, stop-word set) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['tokenizer'] = RegexpTokenizer(r'\w+')
        # A frozenset gives constant-time membership tests; the list returned
        # by stopwords.words() would be scanned linearly for every token.
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_CACHE['tokenizer'], _PREPROCESS_CACHE['stop_words']


def preprocess(text):
    """Normalise one piece of text into a space-separated string of kept tokens.

    Steps: lower-case, strip tags / URLs / digits / @-mentions, split into
    ``\\w+`` tokens, then drop English stop words and tokens of length <= 2.

    Args:
        text: Raw text. Non-string values are converted with ``str()``;
            missing values (``None`` or a float NaN) are treated as empty
            text rather than becoming the literal words "none" / "nan".

    Returns:
        The cleaned tokens joined by single spaces ("" when nothing is kept).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    cleaned = str(text).lower()
    for pattern in _NOISE_PATTERNS:
        cleaned = pattern.sub('', cleaned)

    tokenizer, stop_words = _preprocess_resources()
    kept = [
        token
        for token in tokenizer.tokenize(cleaned)
        if len(token) > 2 and token not in stop_words
    ]
    return " ".join(kept)

# Run preprocess() over every training text; the cleaned strings replace the
# raw ones in the `text` column.
train_df['text'] = train_df['text'].apply(preprocess)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# === Train-Test Split ===
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# the same on every run.
VAL_FRACTION = 0.2
SPLIT_SEED = 42

train_data, val_data = train_test_split(
    train_df,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
)

# === Train Transformer (ALBERT) ===
!pip install simpletransformers -q

from simpletransformers.classification import ClassificationModel

model = ClassificationModel(
    'albert', 'albert-base-v1',
    num_labels=2,
    use_cuda=True,
    args={
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
        'num_train_epochs': 3,
        'train_batch_size': 16,
        'eval_batch_size': 16,
    }
)

model.train_model(train_data)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 3:   0%|          | 0/203 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 3:   0%|          | 0/203 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/203 [00:00<?, ?it/s]

(609, 0.5896460638061924)

In [6]:
# === Evaluate on Validation Set ===
from sklearn.metrics import classification_report, confusion_matrix

# Predict a label for every held-out text and store it next to the true label.
val_texts = val_data['text'].tolist()
preds, _ = model.predict(val_texts)
val_data['pred'] = preds

# Per-class precision / recall / F1 of the predictions against the true labels.
val_report = classification_report(val_data['label'], val_data['pred'])
print(val_report)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

  with amp.autocast():


              precision    recall  f1-score   support

           0       0.70      0.59      0.64       410
           1       0.64      0.74      0.69       400

    accuracy                           0.67       810
   macro avg       0.67      0.67      0.66       810
weighted avg       0.67      0.67      0.66       810



In [8]:
# === Predict on Evaluation File ===
import json

EVAL_CSV_PATH = '/content/drive/MyDrive/meme_classifications/test_ocr_text/STask-A(index,text)test.csv'  # <-- update if needed
SUBMISSION_PATH = '/content/drive/MyDrive/submissions.json'

# Load the evaluation rows and clean their text with the same preprocess()
# used on the training data.
eval_df = pd.read_csv(EVAL_CSV_PATH)
eval_df['text'] = eval_df['text'].map(preprocess)

# Predicted class for each evaluation row.
eval_texts = eval_df['text'].tolist()
eval_preds, _ = model.predict(eval_texts)
eval_df['label'] = eval_preds

# === Cast 'index' to string ===
eval_df['index'] = eval_df['index'].astype(str)

# === Final Submission Format ===
# One record per row, holding `index` and `prediction`.
submission = eval_df[['index', 'label']].rename(columns={"label": "prediction"})

# === Save to submissions.json ===
with open(SUBMISSION_PATH, 'w') as out_file:
    records = submission.to_dict(orient='records')
    json.dump(records, out_file, indent=2)

print("✅ Submission file saved as submissions.json")


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  with amp.autocast():


✅ Submission file saved as submissions.json
