# Hausa Sentiment Analysis: Data Preprocessing

This notebook covers the preprocessing steps for Hausa sentiment analysis using the HausaBERTa transformer model. We will load the pre-split datasets, clean and preprocess the text, and prepare the data for model training.

In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Read the three pre-split Hausa sentiment CSV files (train / validation / test)
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/afrisenti_twitter_hausa_train.csv',
        'data/afrisenti_twitter_hausa_validation.csv',
        'data/afrisenti_twitter_hausa_test.csv',
    )
)

# Report the (rows, columns) shape of each split in a fixed order
for shape_label, frame in (
    ('Train shape:', train_df),
    ('Validation shape:', val_df),
    ('Test shape:', test_df),
):
    print(shape_label, frame.shape)

# Last expression of the cell: preview the first training rows
train_df.head()

Train shape: (14172, 2)
Validation shape: (2677, 2)
Test shape: (5303, 2)


Unnamed: 0,tweet,label
0,@user Da kudin da Arewa babu wani abin azo aga...,2
1,@user Kaga wani Adu ar Banda💔😭 wai a haka Shi ...,2
2,@user Sai haquri fa yan madrid daman kunce cha...,2
3,@user Hmmm yanzu kai kasan girman allah daxaka...,2
4,@user @user Wai gwamno nin Nigeria suna afa kw...,2


In [3]:
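# Hausa Text Preprocessing (Comprehensive Class Version)
# NOTE(review): the commented-out HausaTextPreprocessor class below appears to be an
# earlier inline copy of the class that this cell now imports from hausa_preprocess.
# TODO confirm it matches the module, then delete the commented-out copy.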
import re
import string
from typing import List
import pandas as pd

# class HausaTextPreprocessor:
#     def __init__(self):
#         self.hausa_stopwords = {
#             'da', 'ne', 'ce', 'na', 'ta', 'shi', 'ita', 'su', 'ni', 'ka', 'ki', 'ku', 'mu', 'wa', 'zuwa', 'akan',
#             'amma', 'ko', 'kuma', 'saboda', 'don', 'ba', 'bai', 'bata', 'baiwa', 'bayan', 'cikin', 'ga', 'ina',
#             'yana', 'yake', 'yayi', 'yake', 'yanzu', 'wannan', 'wancan', 'wata', 'wani', 'wasu', 'duk', 'kowa',
#             'me', 'mece', 'mene', 'wace', 'wane', 'wacece', 'wanene', 'inda', 'lokacin', 'idan', 'kamar', 'saboda',
#             'daidai', 'kawai', 'har', 'sai', 'tun', 'daga', 'zuwa', 'kuma', 'ko', 'amma', 'saboda', 'idan', 'ko',
#             'da', 'ba', 'ce', 'ne', 'shi', 'ta', 'su', 'ni', 'ka', 'ki', 'ku', 'mu', 'wa', 'zuwa', 'akan', 'ga',
#             'cikin', 'bayan', 'lokacin', 'inda', 'yanzu', 'kamar', 'saboda', 'kawai', 'har', 'sai', 'tun', 'daga'
#         }
#         self.punctuation = set(string.punctuation)
#         self.hausa_chars = set('abcdefghijklmnopqrstuvwxyz’ʼƙɗɓçäöüÀÁÂÃÈÉÊÌÍÎÒÓÔÕÙÚÛÇÑ')

#     def clean_text(self, text: str) -> str:
#         if pd.isna(text):
#             return ""
#         text = str(text).lower()
#         text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
#         text = re.sub(r'\S+@\S+', '', text)
#         text = re.sub(r'@[\w_]+', '', text)
#         text = re.sub(r'#[\w_]+', '', text)
#         text = re.sub(r'\d+', '', text)
#         # Remove emojis
#         emoji_pattern = re.compile("["
#             u"\U0001F600-\U0001F64F"  # emoticons
#             u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#             u"\U0001F680-\U0001F6FF"  # transport & map symbols
#             u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#             u"\U00002700-\U000027BF"  # Dingbats
#             u"\U000024C2-\U0001F251"  # Enclosed characters
#             "]+", flags=re.UNICODE)
#         text = emoji_pattern.sub(r'', text)
#         text = re.sub(r'\s+', ' ', text)
#         text = text.strip('"\'')
#         return text.strip()

#     def remove_punctuation(self, text: str) -> str:
#         return ''.join(char for char in text if char not in self.punctuation)

#     def keep_hausa_chars(self, text: str) -> str:
#         return ''.join(char for char in text if char in self.hausa_chars or char.isspace())

#     def tokenize(self, text: str) -> List[str]:
#         return text.split()

#     def remove_stopwords(self, tokens: List[str]) -> List[str]:
#         return [token for token in tokens if token not in self.hausa_stopwords]

#     def preprocess(self, text: str, remove_stopwords: bool = True, keep_only_hausa: bool = False) -> str:
#         text = self.clean_text(text)
#         text = self.remove_punctuation(text)
#         if keep_only_hausa:
#             text = self.keep_hausa_chars(text)
#         tokens = self.tokenize(text)
#         if remove_stopwords:
#             tokens = self.remove_stopwords(tokens)
#         return ' '.join(tokens)

# Instantiate the preprocessor (imported from the project-local hausa_preprocess module)
from hausa_preprocess import HausaTextPreprocessor
preprocessor = HausaTextPreprocessor()

# Every split goes through the same clean -> report -> export steps, so the
# (display name, frame, output path) triples are listed once and looped over
# instead of repeating each statement three times.
splits = [
    ('Train', train_df, 'data/afrisenti_twitter_hausa_train_clean.csv'),
    ('Validation', val_df, 'data/afrisenti_twitter_hausa_validation_clean.csv'),
    ('Test', test_df, 'data/afrisenti_twitter_hausa_test_clean.csv'),
]

# Show before/after samples for verification
print('Sample before cleaning:')
print(train_df['tweet'].head(5))

# Apply Hausa preprocessing with the preprocessor's default options.
# The bound method is passed directly; wrapping it in a lambda adds nothing.
# NOTE(review): the captured sample output shows stopwords being removed by default.
# If the stopword list contains negation particles (e.g. 'ba'), removing them may
# change sentiment -- confirm this is intended for a transformer model.
for split_name, split_df, clean_path in splits:
    split_df['tweet_clean'] = split_df['tweet'].apply(preprocessor.preprocess)

print('\nSample after cleaning:')
print(train_df['tweet_clean'].head(5))

# Surface tweets that were reduced to an empty string by cleaning.
# They are still exported (row counts stay unchanged), but pandas.read_csv parses
# empty fields as NaN by default, so downstream code must handle them.
for split_name, split_df, clean_path in splits:
    n_empty = int(split_df['tweet_clean'].eq('').sum())
    if n_empty:
        print(f'\nWarning: {n_empty} {split_name} tweets are empty after cleaning.')

# Show label distribution for verification (skipped for a split without labels)
for split_name, split_df, clean_path in splits:
    if 'label' in split_df.columns:
        print(f'\n{split_name} label distribution:')
        print(split_df['label'].value_counts())

# Export cleaned data for training and evaluation.
# Only columns that are actually present are written, so a split without a
# 'label' column is exported instead of raising KeyError -- consistent with the
# optional-label guards above. Labeled splits produce the same files as before.
export_cols = ['tweet_clean', 'label']
for split_name, split_df, clean_path in splits:
    present_cols = [col for col in export_cols if col in split_df.columns]
    split_df[present_cols].to_csv(clean_path, index=False)
print('\nCleaned data exported for train, validation, and test sets.')

Sample before cleaning:
0    @user Da kudin da Arewa babu wani abin azo aga...
1    @user Kaga wani Adu ar Banda💔😭 wai a haka Shi ...
2    @user Sai haquri fa yan madrid daman kunce cha...
3    @user Hmmm yanzu kai kasan girman allah daxaka...
4    @user @user Wai gwamno nin Nigeria suna afa kw...
Name: tweet, dtype: object

Sample after cleaning:
0    kudin arewa babu abin azo agani alummah allah ...
1    kaga adu ar banda wai haka shugaban sojoji gas...
2    haquri yan madrid daman kunce champion din ya ...
3    hmm kasan girman allah daxakace mukuma allah k...
4              wai gwamno nin nigeria suna afa kwayoyi
Name: tweet_clean, dtype: object

Train label distribution:
label
1    4912
0    4687
2    4573
Name: count, dtype: int64

Validation label distribution:
label
1    896
2    894
0    887
Name: count, dtype: int64

Test label distribution:
label
1    1789
2    1759
0    1755
Name: count, dtype: int64

Cleaned data exported for train, validation, and test sets.

Sample after