# Hausa Sentiment Analysis: Model Training

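This notebook demonstrates how to fine-tune the HausaBERTa transformer model on the preprocessed Hausa sentiment dataset. The opening cells load the cleaned AfriSenti Twitter splits and fit a TF-IDF + logistic regression baseline.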

In [1]:
# Load Preprocessed Cleaned Data
import pandas as pd
# Read the three cleaned AfriSenti Hausa splits in one pass (train, validation, test).
train_df, val_df, test_df = map(pd.read_csv, (
    'data/afrisenti_twitter_hausa_train_clean.csv',
    'data/afrisenti_twitter_hausa_validation_clean.csv',
    'data/afrisenti_twitter_hausa_test_clean.csv',
))

# Pull the cleaned tweet text and its label out of each frame as plain Python lists.
train_texts, train_labels = (train_df[col].tolist() for col in ('tweet_clean', 'label'))
val_texts, val_labels = (val_df[col].tolist() for col in ('tweet_clean', 'label'))

# Quick sanity check: show the first five training examples and their labels.
print('Sample preprocessed train text:', train_texts[:5], sep='\n')
print('Sample train labels:', train_labels[:5], sep='\n')

Sample preprocessed train text:
['da kudin da arewa babu wani abin azo agani da yayi wa alummah allah ya isa yacucemu wlh yarikitamana kasa yarikitamana kasuwanci harkar ilimi harkar lfy hanyoyi babu lantarki dasuransu komai yalalace ga cinhanci da rashawa a fili ko ina a nigeria jamiyaryar su tabataman mlm ', 'kaga wani adu ar banda wai a haka shi ne shugaban sojoji.... gaskiya buhari kaci amanan mu da kasa wannan mutum ah wajen nan', 'sai haquri fa yan madrid daman kunce champion din ya muku yawa', 'hmmm yanzu kai kasan girman allah daxakace mukuma ga allah kune kukabarshi kuna karyata ayoyinsa kace allah baya karbar adduar talakan nigeria  bayan kunzalunceshi kuma allah ya karbar adduar wanda aka zalunta cikin sauri amma kace wai allah baya karbar addua talakawa', 'wai gwamno nin nigeria suna afa kwayoyi ko ']
Sample train labels:
[2, 2, 2, 2, 2]


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
import joblib
import os
from datetime import datetime
import re
import sys
sys.path.append('.')
from hausa_preprocess import HausaTextPreprocessor

# Train and evaluate a TF-IDF + logistic regression classifier on the cleaned
# Hausa sentiment data, then persist the fitted pipeline to disk.

# Load cleaned Hausa sentiment data. The frames are re-read here so that
# re-running this cell always starts from the on-disk CSVs rather than from
# columns that were already preprocessed by a previous run.
train_df = pd.read_csv('data/afrisenti_twitter_hausa_train_clean.csv')
val_df = pd.read_csv('data/afrisenti_twitter_hausa_validation_clean.csv')

# Use robust Hausa preprocessor.
# read_csv parses empty fields as NaN, and astype(str) on its own would turn
# those into the literal token "nan", which TF-IDF would then treat as a real
# word. Replace missing tweets with '' before casting so they stay empty.
# NOTE(review): assumes preprocessor.preprocess accepts an empty string - confirm.
preprocessor = HausaTextPreprocessor()
train_df['tweet_clean'] = (
    train_df['tweet_clean'].fillna('').astype(str).apply(preprocessor.preprocess)
)
val_df['tweet_clean'] = (
    val_df['tweet_clean'].fillna('').astype(str).apply(preprocessor.preprocess)
)

# Prepare data
X_train = train_df['tweet_clean']
y_train = train_df['label']
X_val = val_df['tweet_clean']
y_val = val_df['label']

# Encode labels if not already numeric. Checking for "numeric" directly
# (instead of `dtype == object`) also covers pandas string/categorical dtypes.
if pd.api.types.is_numeric_dtype(y_train):
    label_encoder = None
else:
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    # transform() raises ValueError if validation contains a label unseen in train.
    y_val = label_encoder.transform(y_val)

# Build pipeline (Logistic Regression as example):
# word uni-/bi-gram TF-IDF features capped at 10k terms, followed by a
# class-weighted logistic regression with a fixed seed for reproducibility.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42))
])

# Train
pipeline.fit(X_train, y_train)

# Evaluate on the validation split (accuracy, macro-averaged F1, per-class report)
val_preds = pipeline.predict(X_val)
val_acc = accuracy_score(y_val, val_preds)
val_f1 = f1_score(y_val, val_preds, average='macro')
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Validation Macro F1: {val_f1:.4f}")
print(classification_report(y_val, val_preds))

# Save model and label encoder. The encoder only exists when labels had to be
# encoded, so the status message reports what was actually written.
os.makedirs('models/hausa_sentiment', exist_ok=True)
joblib.dump(pipeline, 'models/hausa_sentiment/logreg_model.joblib')
if label_encoder is not None:
    joblib.dump(label_encoder, 'models/hausa_sentiment/label_encoder.joblib')
    print('Model and encoder saved.')
else:
    print('Model saved (no label encoder needed: labels are already numeric).')

Validation Accuracy: 0.7340
Validation Macro F1: 0.7361
              precision    recall  f1-score   support

           0       0.87      0.80      0.83       887
           1       0.65      0.74      0.69       896
           2       0.71      0.67      0.69       894

    accuracy                           0.73      2677
   macro avg       0.74      0.73      0.74      2677
weighted avg       0.74      0.73      0.74      2677

Model and encoder saved.
