# **Implementation of Machine Learning model for Yoruba Language 'Yo'**

Importing Essential Python libraries such as numpy,pandas for computation, Matplotlib,seaborn for visualization

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re

Upload the train file for 'yo' language

In [2]:
from google.colab import files
uploaded = files.upload()

Saving train.txt to train.txt


In [3]:
train = pd.read_csv('train.txt',sep='\t')

Upload the dev file for 'yo' language

In [4]:
from google.colab import files
uploaded = files.upload()

Saving dev.txt to dev.txt


In [5]:
dev = pd.read_csv('dev.txt',sep='\t')

Upload the test file for 'yo' language 

In [6]:
from google.colab import files
uploaded = files.upload()

Saving test.txt to test.txt


In [7]:
test = pd.read_csv('test.txt',sep='\t')

# **Data Preprocessing**

In [8]:
print(train.shape)
print(dev.shape)
print(test.shape)

(5965, 2)
(852, 2)
(1705, 2)


In [9]:
train.head(10)

Unnamed: 0,text,label
0,Kí ni #Yoruba ń pè ní ìṣẹ̀ṣe? Wọ́n á ní ìṣẹ̀ṣe...,positive
1,You are nobody without your fans. She understo...,neutral
2,"8• Ìsúnkì ọ̀rọ̀ tàbí ìpàrójẹ ni àkàà, yanrìn, ...",neutral
3,"RT @user: Òrùn kò pa mí rí, Òjò kò pa mí rí, M...",neutral
4,"RT @user: 8. Bí a bá ní """"""""""""""""Gbélẹ̀yí fẹ́ a...",neutral
5,"@user *Wọ́n fi ẹ̀tẹ̀ sílẹ̀, wọ́n ń pa làpálàpá.",negative
6,"Àbá 3 - Ìyá, bàbá ló yẹ ó bá ọmọ wí, ká jọ p'ẹ...",positive
7,Àwòkọ́ṣe irun ìbílẹ̀ wa ni èèbó fi ṣe irun atọ...,neutral
8,"Ní 1970, Fẹlá fi America sílẹ̀, ó padà sílé, ó...",neutral
9,Mo ti gba ọ̀nà ìyè yìí ná ... #Yoruba,positive


In [10]:
dev.head(10)

Unnamed: 0,text,label
0,Jọ̀'ọ́ rọra jẹ *Kẹ́míkà*,positive
1,Ta ni Ọmọ Ògún? L'étí omi Ògún ni a ti máa ń ṣ...,neutral
2,Omo egbe agbaboolu Arsenal iyen Gervinho lo gb...,positive
3,"@user: Oooooooooo E ku ojumo """"""""@user: Ọmọ-ká...",positive
4,Omodo agba Pari owe yii.. #yoruba #culture #os...,neutral
5,"Kílọ́ńṣelẹ̀ kẹ̀ ẹ, ẹ̀yin tèmi. Ẹ kú afẹ́rẹ́ o :)",positive
6,Walahi ti iya to bi mi ba toro ninu suya yii n...,negative
7,"À rí ìgbọdọ̀ wí, baálé ilé sú 'ápẹ. #EsinOro🐎 ...",negative
8,"Bí ẹyẹ oko ò mọ bíntín lọ, eré e kó kọ́lé ló m...",positive
9,@user Wọ́n ní kí ẹ wá gbe o🙄,neutral


In [11]:
test.head(10)

Unnamed: 0,text,label
0,"Alakikanju, ọlọ́pọlọ pípé àti olódodo ọ̀dọ́mọd...",positive
1,"12. Àààlò o! Òjò patapàtà, ó d'órí àpáta, ó rá...",neutral
2,Mo sere jade lonii emi naa ba won kopa ninu et...,positive
3,"RT @user: Orúkọ mí ní Olùwashinà, ọmọ ìlú Èko ...",neutral
4,@user wọ́n ni àgbájọ ọwọ́ lá fi nsọ̀yà. Mo ti ...,positive
5,Olukemi Zaynab 👉 Yorùbátv ẸGBẸ ỌMỌ OLÚWA BÍ GB...,neutral
6,Nǹkan ẹ̀rọ̀ mìíràn #idahun #Ibeere #YorubaQnA ...,neutral
7,@user Mo nife ee #yoruba,positive
8,Ọjọ́ Ìsin Imi = Ọjọ́ Ìsinmi (ọjọ́ ti Ọ̀rúnmìlà...,neutral
9,"Yàtọ̀ sí ká jẹran ìgbín, a máa ń fìgbín bọ Òòṣ...",neutral


In [40]:
from sklearn.preprocessing import LabelEncoder
lab = LabelEncoder()
yo_train = lab.fit_transform(train['label'])

In [41]:
train['label_cat'] = yo_train

In [42]:
lab = LabelEncoder()
yo_dev = lab.fit_transform(dev['label'])

In [43]:
dev['label_cat'] = yo_dev

In [44]:
lab = LabelEncoder()
yo_test = lab.fit_transform(test['label'])

In [17]:
test['label_cat'] = yo_test

Hashtags - collection and extraction

In [45]:
def hashtag_extract(x):
    hashtags = []
    
    for i in x:
        ht = re.findall(r"#(\w+)", i)
        hashtags.append(ht)

    return hashtags

In [46]:
import re
HT_negative = hashtag_extract(train['text'][train['label_cat'] == 0])
HT_neutral = hashtag_extract(train['text'][train['label_cat'] == 1])
HT_positive = hashtag_extract(train['text'][train['label_cat'] == 2])

HT_neutral = sum(HT_neutral,[])
HT_negative = sum(HT_negative,[])
HT_positive = sum(HT_positive,[])

# Tokenizing

In [20]:
tokenized_text = train['text'].apply(lambda x: x.split()) 
import gensim
model_w2v = gensim.models.Word2Vec(
            tokenized_text,
            size=200, 
            window=5,
            min_count=2,
            sg = 1,
            hs = 0,
            negative = 10, 
            workers= 2, 
            seed = 34)

model_w2v.train(tokenized_text, total_examples= len(train['text']), epochs=20)



(1853355, 2655720)

In [21]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models.doc2vec import LabeledSentence

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Evaluation

Cleaning training text by stemming

In [25]:
train_corpus = []

for i in range(0, 5965):
  review = re.sub('[^a-zA-Z]', ' ', train['text'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
  review = ' '.join(review)
  train_corpus.append(review)

Cleaning dev text by stemming

In [26]:
dev_corpus = []

for i in range(0, 852):
  review = re.sub('[^a-zA-Z]', ' ', dev['text'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
  review = ' '.join(review)
  dev_corpus.append(review)

Cleaning testing text by stemming

In [27]:
test_corpus = []

for i in range(0,1705):
  review = re.sub('[^a-zA-Z]', ' ', test['text'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
  review = ' '.join(review)
  test_corpus.append(review)

In [28]:
train.head()

Unnamed: 0,text,label,label_cat
0,Kí ni #Yoruba ń pè ní ìṣẹ̀ṣe? Wọ́n á ní ìṣẹ̀ṣe...,positive,2
1,You are nobody without your fans. She understo...,neutral,1
2,"8• Ìsúnkì ọ̀rọ̀ tàbí ìpàrójẹ ni àkàà, yanrìn, ...",neutral,1
3,"RT @user: Òrùn kò pa mí rí, Òjò kò pa mí rí, M...",neutral,1
4,"RT @user: 8. Bí a bá ní """"""""""""""""Gbélẹ̀yí fẹ́ a...",neutral,1


CountVectorizer to convert text to vector

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1000)
x = cv.fit_transform(train_corpus).toarray()
y = train.iloc[:, 2]
print(x.shape)
print(y.shape)

(5965, 1000)
(5965,)


In [30]:
dev.head()

Unnamed: 0,text,label,label_cat
0,Jọ̀'ọ́ rọra jẹ *Kẹ́míkà*,positive,2
1,Ta ni Ọmọ Ògún? L'étí omi Ògún ni a ti máa ń ṣ...,neutral,1
2,Omo egbe agbaboolu Arsenal iyen Gervinho lo gb...,positive,2
3,"@user: Oooooooooo E ku ojumo """"""""@user: Ọmọ-ká...",positive,2
4,Omodo agba Pari owe yii.. #yoruba #culture #os...,neutral,1


In [31]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1000)
x_dev = cv.fit_transform(dev_corpus).toarray()
y_dev = dev.iloc[:, 2]
print(x_dev.shape)

(852, 1000)


In [32]:
test.head()

Unnamed: 0,text,label,label_cat
0,"Alakikanju, ọlọ́pọlọ pípé àti olódodo ọ̀dọ́mọd...",positive,2
1,"12. Àààlò o! Òjò patapàtà, ó d'órí àpáta, ó rá...",neutral,1
2,Mo sere jade lonii emi naa ba won kopa ninu et...,positive,2
3,"RT @user: Orúkọ mí ní Olùwashinà, ọmọ ìlú Èko ...",neutral,1
4,@user wọ́n ni àgbájọ ọwọ́ lá fi nsọ̀yà. Mo ti ...,positive,2


In [33]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1000)
x_test = cv.fit_transform(test_corpus).toarray()
y_test = test.iloc[:, 2]
print(x_test.shape)

(1705, 1000)


Standardization

In [34]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x)
x_dev = sc.transform(x_dev)
x_test = sc.transform(x_test)

Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
model = LogisticRegression(multi_class='multinomial')
model.fit(x_train, y)
y_pred_dev = model.predict(x_dev)
y_pred_test = model.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Checking for Accuracy and F1 Score

In [47]:
print("Train:", model.score(x_train, y))
print("Valid:", model.score(x_dev, y_dev))
print("Test:", model.score(x_test, y_test))

Train: 0.7921207041072925
Valid: 0.2992957746478873
Test: 0.3912023460410557


F1 score for dev set

In [48]:
print("F1 score:", f1_score(y_dev, y_pred_dev,average='weighted'))

F1 score: 0.2902702145197495


Confusion Matrix for dev set

In [49]:
print("Confusion matrix")
print("*****************")
cm_dev = confusion_matrix(y_dev, y_pred_dev)
print(cm_dev)

Confusion matrix
*****************
[[ 83  30 101]
 [148  47 119]
 [138  61 125]]


F1 score for test set

In [50]:
print("F1 score:", f1_score(y_test, y_pred_test,average='weighted'))

F1 score: 0.38812674891138704


Confusion Matrix for test set

In [51]:
print("Confusion matrix")
print("*****************")
cm_test = confusion_matrix(y_test, y_pred_test)
print(cm_test)

Confusion matrix
*****************
[[ 89 122 143]
 [161 193 290]
 [166 156 385]]


Classifiaction Report

In [52]:
print("classification_report")
print("*****************")
print(classification_report(y_test, y_pred_test, labels=[0, 1, 2]))

classification_report
*****************
              precision    recall  f1-score   support

           0       0.21      0.25      0.23       354
           1       0.41      0.30      0.35       644
           2       0.47      0.54      0.50       707

    accuracy                           0.39      1705
   macro avg       0.36      0.37      0.36      1705
weighted avg       0.39      0.39      0.39      1705

