In [4]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

In [5]:
!pip install gensim==3.8.1

from gensim.models import Word2Vec



In [8]:
# core system imports
import os

import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import string
import random
from unidecode import unidecode
import itertools
from timeit import timeit

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Dense, 
    Input, 
    LSTM, 
    Embedding, 
    Dropout, 
    Activation, 
    Bidirectional, 
    GlobalMaxPool1D
)
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import (
    initializers, 
    regularizers, 
    constraints, 
    optimizers, 
    layers
)

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    precision_score,
    recall_score,
    accuracy_score,
    precision_recall_fscore_support
)

# Matplotlib config
%matplotlib inline
%alias_magic t timeit


Created `%t` as an alias for `%timeit`.
Created `%%t` as an alias for `%%timeit`.


In [7]:
# Report how many GPU devices TensorFlow can see on this machine
print(
    "Num GPUs Available: ",
    len(tf.config.experimental.list_physical_devices(device_type='GPU')),
)

Num GPUs Available:  0


In [45]:
# Location of the pipeline metadata store
_pipeline_root = './pipeline/'

# Directory of the raw data files
_data_root = './data/hausa'

_data_filepath = os.path.join(_data_root, "data.csv")
_stopwords_filepath = os.path.join(_data_root, "stopwords.txt")

In [46]:
# List the entries of the raw data directory to confirm the input files exist
os.listdir(_data_root)

['data.csv', 'stopwords.txt']

In [47]:
# Load the dataset CSV into a DataFrame and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like an
# index column that was saved into the CSV — confirm whether it should be
# dropped or read as the index.
data = pd.read_csv(_data_filepath)
data.head()

Unnamed: 0,text,author_id,Label
0,- @aishambuhari ta nemi babban sufeton 'yan sa...,7.97e+17,Neutral
1,"""Duk dan Bokon da baida Ilimin Addini Annoba n...",2290470000.0,Neutral
2,"""Duk mutumin da yayi tunanin bawa mutane ilimi...",1071387000.0,Neutral
3,"""Duk wanda ya sabawa dokar kaucewa kamuwa daga...",1.26e+18,Positive
4,"""Duk wanda ya san ya fito daga yankin da ake A...",1039268000.0,Positive


In [48]:
# Read the stop words, one per line, from the stop-word file.
# The encoding is passed explicitly so the result does not depend on the
# platform's default encoding, and blank lines are skipped so that no empty
# string ends up in the list.
with open(_stopwords_filepath, encoding="utf-8") as stopwords_file:
    stopwords_list = [
        word for word in (line.strip() for line in stopwords_file) if word
    ]

In [49]:
# Show every stop word on a single line, separated by spaces
listToStr = ' '.join(map(str, stopwords_list))
print(listToStr)

ta da ya sai ba yi na kuma ma ji cikin in ni wata wani ce tana don za sun amma ga ina ne mai suka wannan a ko lokacin su take shi yake yana ka ban ita tafi


In [75]:
# removing stopwords
def _apply_lowercase(text):
    text = [item for item in text if item not in stopwords_list]
    text = ''.join(text)
    return text

# removing stopwords
def _stopwords_removal(text):
    text = [item for item in text if item not in stopwords_list]
    text = ''.join(text)
    return text

# remove punctuations
def _punctuation_removal(text):
    all_list = [char for char in text if char not in string.punctuation]
    clean_str = ''.join(all_list)
    return clean_str

# Shuffle dataset
def _shuffle_dataset(dataset):
    dataset = shuffle(data)
    dataset = data.reset_index(drop=True)
    return dataset

In [77]:
# Normalise the text column in three steps: lower-case it, strip punctuation
# and special characters, then drop stop words
data['text'] = (
    data['text']
    .apply(str.lower)
    .apply(_punctuation_removal)
    .apply(_stopwords_removal)
)

# Shuffle the dataset to prevent bias
data = _shuffle_dataset(data)

# Preview the first rows of the processed data
data.head()

Unnamed: 0,text,author_id,Label
0,msllci ko bbu nnob tun sli wje ne mi tsri d sn...,1665774000.0,Neutral
1,kdunvskno kd guys yi hkuri zun gid wnke hnnu...,1112804000.0,Positive
2,hydr313 mlm hydr wi in nufin mu ynzu b bund y ...,1.06e+18,Positive
3,lkliehjr sddheeck wnke hnnu ske wnkew,9.29e+17,Positive
4,hfstpki yshersdiq kwi kr hkuri zun gid dink ...,8.14e+17,Positive
