# Hausa Sentiment Analysis: Model Training

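This notebook trains sentiment classifiers on the preprocessed Hausa sentiment dataset: it starts with classical scikit-learn baselines (TF-IDF features with logistic regression, SVM, Naive Bayes, and a soft-voting ensemble) and demonstrates how to fine-tune the HausaBERTa transformer model.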

In [1]:
# Read the cleaned AfriSenti Hausa train/validation/test CSV splits
import pandas as pd

train_df, val_df, test_df = map(pd.read_csv, (
    'data/afrisenti_twitter_hausa_train_clean.csv',
    'data/afrisenti_twitter_hausa_validation_clean.csv',
    'data/afrisenti_twitter_hausa_test_clean.csv',
))

# Plain Python lists of the cleaned tweet text and the labels (train and validation only)
train_texts, train_labels = (train_df[column].tolist() for column in ('tweet_clean', 'label'))
val_texts, val_labels = (val_df[column].tolist() for column in ('tweet_clean', 'label'))

# Sanity check: show the first five training examples and their labels
print('Sample preprocessed train text:', train_texts[:5], sep='\n')
print('Sample train labels:', train_labels[:5], sep='\n')

Sample preprocessed train text:
['kudin arewa babu abin azo agani alummah allah ya isa yacucemu wlh yarikitamana kasa yarikitamana kasuwanci harkar ilimi harkar lfy hanyoyi babu lantarki dasuransu komai yalalace cinhanci rashawa fili nigeria jamiyaryar tabataman mlm', 'kaga adu ar banda wai haka shugaban sojoji gaskiya buhari kaci amanan kasa mutum ah wajen nan', 'haquri yan madrid daman kunce champion din ya muku yawa', 'hmm kasan girman allah daxakace mukuma allah kune kukabarshi kuna karyata ayoyinsa kace allah baya karbar adduar talakan nigeria kunzalunceshi allah ya karbar adduar aka zalunta sauri kace wai allah baya karbar addua talakawa', 'wai gwamno nin nigeria suna afa kwayoyi']
Sample train labels:
[2, 2, 2, 2, 2]


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
import joblib
import os
from datetime import datetime
import re
import sys
import warnings
warnings.filterwarnings('ignore')

sys.path.append('.')
from hausa_preprocess import HausaTextPreprocessor

class HausaFeatureExtractor(BaseEstimator, TransformerMixin):
    """Stateless sklearn transformer that turns Hausa texts into hand-crafted feature rows.

    Each input item is converted to ``str`` and passed to
    ``HausaTextPreprocessor.preprocess(..., extract_features=True)``; the second
    element of the returned pair (a dict) supplies the feature values for that row.

    NOTE(review): the column order is the dict's iteration order, so this assumes
    the preprocessor returns the same keys in the same order, with numeric values,
    for every text -- confirm against ``hausa_preprocess``.
    """

    def __init__(self):
        # One preprocessor instance is created per extractor and reused for every text.
        self.preprocessor = HausaTextPreprocessor()

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a NumPy array with one row of feature values per item in ``X``."""

        def feature_row(raw_text):
            # The cleaned text (first element of the pair) is not needed here.
            _cleaned, feature_dict = self.preprocessor.preprocess(
                str(raw_text), extract_features=True
            )
            return list(feature_dict.values())

        return np.array([feature_row(raw_text) for raw_text in X])

# Load cleaned Hausa sentiment data
print("Loading training data...")
train_df = pd.read_csv('data/afrisenti_twitter_hausa_train_clean.csv')
val_df = pd.read_csv('data/afrisenti_twitter_hausa_validation_clean.csv')

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# Run the project's Hausa preprocessor over the already-cleaned text.
# fillna('') must come before astype(str): pandas reads empty CSV cells as NaN,
# and astype(str) alone would turn them into the literal token 'nan', which
# would then be learned as a vocabulary item by the TF-IDF vectorizers.
print("Preprocessing text...")
preprocessor = HausaTextPreprocessor()
train_df['tweet_clean'] = (
    train_df['tweet_clean'].fillna('').astype(str).apply(preprocessor.preprocess)
)
val_df['tweet_clean'] = (
    val_df['tweet_clean'].fillna('').astype(str).apply(preprocessor.preprocess)
)

# Features (text) and targets for the train and validation splits
X_train = train_df['tweet_clean']
y_train = train_df['label']
X_val = val_df['tweet_clean']
y_val = val_df['label']

# Show the class balance of the training split
print("\nLabel distribution in training:")
print(y_train.value_counts().sort_index())

# Encode labels unless they are already numeric.
# Checking "not numeric" (instead of "dtype == object") also covers string
# labels stored under pandas' dedicated string or categorical dtypes.
if not pd.api.types.is_numeric_dtype(y_train):
    label_encoder = LabelEncoder()
    # Fit on train only; validation labels are mapped with the same encoding.
    y_train = label_encoder.fit_transform(y_train)
    y_val = label_encoder.transform(y_val)
    print(f"Labels encoded. Classes: {label_encoder.classes_}")
else:
    # Downstream code treats None as "labels were already numeric".
    label_encoder = None

# Candidate model configurations that will be compared on the validation set
print("\nTraining and comparing multiple models...")


def _balanced_logreg(**overrides):
    """Build the class-balanced LogisticRegression shared by several candidates below."""
    return LogisticRegression(
        max_iter=2000,
        class_weight='balanced',
        random_state=42,
        **overrides
    )


models_to_test = {
    # 1. Word-level TF-IDF (unigrams to trigrams) + logistic regression
    'logistic_enhanced': Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=15000,   # cap on vocabulary size
            ngram_range=(1, 3),   # unigrams, bigrams and trigrams
            min_df=2,             # drop terms seen in fewer than 2 documents
            max_df=0.95,          # drop terms seen in more than 95% of documents
            sublinear_tf=True,    # use 1 + log(tf) instead of raw term frequency
        )),
        ('clf', _balanced_logreg(C=1.0)),
    ]),

    # 2. Character n-grams (within word boundaries) to capture sub-word patterns
    'char_ngrams': Pipeline([
        ('tfidf_char', TfidfVectorizer(
            analyzer='char_wb',
            ngram_range=(3, 6),
            max_features=10000,
            min_df=2,
        )),
        ('clf', _balanced_logreg(C=10.0)),
    ]),

    # 3. Word TF-IDF + character TF-IDF + hand-crafted Hausa features, concatenated
    'feature_union': Pipeline([
        ('features', FeatureUnion([
            ('word_tfidf', TfidfVectorizer(
                max_features=10000,
                ngram_range=(1, 2),
                min_df=2,
            )),
            ('char_tfidf', TfidfVectorizer(
                analyzer='char_wb',
                ngram_range=(3, 5),
                max_features=5000,
                min_df=2,
            )),
            ('linguistic', HausaFeatureExtractor()),
        ])),
        # with_mean=False: scale without centering, as required for sparse input
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', _balanced_logreg()),
    ]),

    # 4. RBF-kernel SVM; probability=True enables predict_proba (needed for soft voting)
    'svm_rbf': Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=12000,
            ngram_range=(1, 2),
            min_df=2,
        )),
        ('clf', SVC(
            kernel='rbf',
            C=1.0,
            class_weight='balanced',
            probability=True,
            random_state=42,
        )),
    ]),

    # 5. Complement Naive Bayes on word TF-IDF
    'complement_nb': Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=12000,
            ngram_range=(1, 2),
            min_df=2,
        )),
        ('clf', ComplementNB(alpha=0.1)),
    ]),
}

# Train every candidate pipeline and score it on the validation set.
# `results` maps model name -> fitted pipeline, metrics and validation predictions.
results = {}
best_model = None
# Start below any attainable score so that a model whose macro-F1 is exactly
# 0.0 can still be selected (the comparison below is a strict ">").
best_f1 = float('-inf')

print(f"{'Model':<20} {'Accuracy':<10} {'F1-Macro':<10} {'F1-Weighted':<12}")
print("-" * 55)

for model_name, pipeline in models_to_test.items():
    try:
        # Train model
        pipeline.fit(X_train, y_train)

        # Evaluate on validation set
        val_preds = pipeline.predict(X_val)
        val_acc = accuracy_score(y_val, val_preds)
        val_f1_macro = f1_score(y_val, val_preds, average='macro')
        val_f1_weighted = f1_score(y_val, val_preds, average='weighted')

        # Store results
        results[model_name] = {
            'pipeline': pipeline,
            'accuracy': val_acc,
            'f1_macro': val_f1_macro,
            'f1_weighted': val_f1_weighted,
            'predictions': val_preds
        }

        print(f"{model_name:<20} {val_acc:<10.4f} {val_f1_macro:<10.4f} {val_f1_weighted:<12.4f}")

        # Track best model (selection metric: validation macro-F1)
        if val_f1_macro > best_f1:
            best_f1 = val_f1_macro
            best_model = model_name

    except Exception as exc:
        # Best-effort comparison: one failing candidate must not abort the others.
        print(f"Error training {model_name}: {exc}")

# If every candidate failed, stop here with a clear message instead of letting
# a later `results[best_model]` lookup fail with an opaque KeyError on None.
if best_model is None:
    raise RuntimeError(
        "No model could be trained and evaluated; see the 'Error training ...' "
        "messages above."
    )

print(f"\nBest model: {best_model} (F1-Macro: {best_f1:.4f})")

# Optional grid search on the current best candidate.
# Search spaces are keyed by a substring of the model name; the first entry
# whose key occurs in the name is used, and no match means no tuning.
tuning_grids = (
    ('logistic', {
        'tfidf__max_features': [12000, 15000, 20000],
        'clf__C': [0.1, 1.0, 10.0, 100.0],
        'clf__penalty': ['l1', 'l2'],
        'clf__solver': ['liblinear', 'saga'],
    }),
    ('svm', {
        'tfidf__max_features': [10000, 15000],
        'clf__C': [0.1, 1.0, 10.0],
        'clf__gamma': ['scale', 'auto'],
    }),
)

# The feature_union candidate is deliberately excluded to keep tuning quick
if best_model and best_model != 'feature_union':
    print(f"\nPerforming hyperparameter tuning on {best_model}...")

    base_pipeline = results[best_model]['pipeline']
    param_grid = next(
        (grid for key, grid in tuning_grids if key in best_model),
        None,
    )

    if param_grid:
        # 3-fold stratified CV on the training split, optimising macro-F1
        cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        grid_search = GridSearchCV(
            estimator=base_pipeline,
            param_grid=param_grid,
            cv=cv_strategy,
            scoring='f1_macro',
            n_jobs=-1,
            verbose=1,
        )
        grid_search.fit(X_train, y_train)

        # Score the refitted best estimator on the held-out validation split
        tuned_preds = grid_search.best_estimator_.predict(X_val)
        tuned_acc = accuracy_score(y_val, tuned_preds)
        tuned_f1 = f1_score(y_val, tuned_preds, average='macro')

        print(f"Tuned model results:")
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Validation Accuracy: {tuned_acc:.4f}")
        print(f"Validation F1-Macro: {tuned_f1:.4f}")

        # Register the tuned model only when it beats the untuned best on validation
        if tuned_f1 > best_f1:
            tuned_name = f'{best_model}_tuned'
            results[tuned_name] = {
                'pipeline': grid_search.best_estimator_,
                'accuracy': tuned_acc,
                'f1_macro': tuned_f1,
                'f1_weighted': f1_score(y_val, tuned_preds, average='weighted'),
                'predictions': tuned_preds,
            }
            best_model, best_f1 = tuned_name, tuned_f1

# Soft-voting ensemble built from the three best candidates by validation macro-F1
print(f"\nCreating ensemble from top performing models...")
top_models = sorted(
    results.items(),
    key=lambda item: item[1]['f1_macro'],
    reverse=True,
)[:3]

# Soft voting averages class probabilities, so only pipelines whose final
# classifier exposes predict_proba are eligible.
ensemble_models = [
    (candidate_name, candidate['pipeline'])
    for candidate_name, candidate in top_models
    if hasattr(candidate['pipeline'].named_steps['clf'], 'predict_proba')
]

# An ensemble needs at least two members to be worth building
if len(ensemble_models) >= 2:
    voting_clf = VotingClassifier(
        estimators=ensemble_models,
        voting='soft',
        n_jobs=-1,
    )

    print("Training ensemble...")
    voting_clf.fit(X_train, y_train)

    ensemble_preds = voting_clf.predict(X_val)
    ensemble_acc = accuracy_score(y_val, ensemble_preds)
    ensemble_f1 = f1_score(y_val, ensemble_preds, average='macro')

    results['ensemble'] = dict(
        pipeline=voting_clf,
        accuracy=ensemble_acc,
        f1_macro=ensemble_f1,
        f1_weighted=f1_score(y_val, ensemble_preds, average='weighted'),
        predictions=ensemble_preds,
    )

    print(f"Ensemble Accuracy: {ensemble_acc:.4f}")
    print(f"Ensemble F1-Macro: {ensemble_f1:.4f}")

    # Promote the ensemble only if it beats the current best on validation macro-F1
    if ensemble_f1 > best_f1:
        best_model, best_f1 = 'ensemble', ensemble_f1

# Summary banner for the winning model
print(f"\n{'='*60}")
print("FINAL RESULTS")
print(f"{'='*60}")
print(f"Best model: {best_model}")

# Look the winner up once and reuse it for the metrics and the report
best_result = results[best_model]
print(f"Best validation accuracy: {best_result['accuracy']:.4f}")
print(f"Best validation F1-macro: {best_result['f1_macro']:.4f}")

# Per-class precision/recall/F1 for the winner on the validation split
print(f"\nClassification report for {best_model}:")
best_preds = best_result['predictions']
# Use the original class names when labels were encoded; otherwise let sklearn
# label the rows with the numeric class values.
target_names = label_encoder.classes_ if label_encoder else None
print(classification_report(y_val, best_preds, target_names=target_names))

# Persist the trained artefacts plus a JSON summary of the comparison
import json

print(f"\nSaving models...")
os.makedirs('models/hausa_sentiment', exist_ok=True)

# Winner of the comparison.
# NOTE(review): pipelines that contain HausaFeatureExtractor reference a class
# defined in this notebook; loading the joblib file elsewhere presumably needs
# that class to be importable -- verify before deploying.
best_pipeline = results[best_model]['pipeline']
joblib.dump(best_pipeline, 'models/hausa_sentiment/best_model.joblib')

# A logistic-regression pipeline is always written under logreg_model.joblib
# for compatibility: the enhanced one when it trained successfully, otherwise a
# freshly fitted baseline with the original model structure.
if 'logistic_enhanced' not in results:
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
        ('clf', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42))
    ])
    pipeline.fit(X_train, y_train)
else:
    pipeline = results['logistic_enhanced']['pipeline']

joblib.dump(pipeline, 'models/hausa_sentiment/logreg_model.joblib')

# The encoder exists only when the labels had to be encoded
if label_encoder:
    joblib.dump(label_encoder, 'models/hausa_sentiment/label_encoder.joblib')

# JSON-serialisable summary: scalar metrics only (no pipelines or predictions),
# cast to built-in float so json can write them.
metric_names = ('accuracy', 'f1_macro', 'f1_weighted')
results_summary = {
    'timestamp': datetime.now().isoformat(),
    'best_model': best_model,
    'models_tested': len(results),
    'results': {
        model_key: {metric: float(model_result[metric]) for metric in metric_names}
        for model_key, model_result in results.items()
    },
}

with open('models/hausa_sentiment/training_results.json', 'w') as summary_file:
    json.dump(results_summary, summary_file, indent=2)

print('Models and results saved successfully!')
print(f'Best model saved as: models/hausa_sentiment/best_model.joblib')
print(f'Compatibility model saved as: models/hausa_sentiment/logreg_model.joblib')

Loading training data...
Training set size: 14172
Validation set size: 2677
Preprocessing text...

Label distribution in training:
label
0    4687
1    4912
2    4573
Name: count, dtype: int64

Training and comparing multiple models...
Model                Accuracy   F1-Macro   F1-Weighted 
-------------------------------------------------------

Label distribution in training:
label
0    4687
1    4912
2    4573
Name: count, dtype: int64

Training and comparing multiple models...
Model                Accuracy   F1-Macro   F1-Weighted 
-------------------------------------------------------
logistic_enhanced    0.7396     0.7416     0.7413      
logistic_enhanced    0.7396     0.7416     0.7413      
char_ngrams          0.7561     0.7570     0.7568      
char_ngrams          0.7561     0.7570     0.7568      
feature_union        0.7135     0.7143     0.7141      
feature_union        0.7135     0.7143     0.7141      
svm_rbf              0.7408     0.7425     0.7423      
svm_rbf   