# NLP Homework 2: Part of Speech (POS) Tagging 

Done by: Abdulrahman Hisham Al Muaitah  
University ID: 202110856

## Task 1: Build a Part of Speech Tagger Using Pre-DL Classifiers

#### Installing/importing libraries 

In [1]:
!pip install pyconll



In [1]:
import pandas as pd 
import numpy as np
import csv 
import gensim as gs 
import multiprocessing 
from gensim.models.word2vec import Word2Vec
import pyconll
import seaborn as sns 
import matplotlib.pyplot as plt 

In [2]:
train_PADT = pyconll.load_from_file('UD_Arabic-PADT/ar_padt-ud-train.conllu')
test_PADT = pyconll.load_from_file('UD_Arabic-PADT/ar_padt-ud-test.conllu')
test_PUD = pyconll.load_from_file('UD_Arabic-PUD/ar_pud-ud-test.conllu')

**We will start with the PADT dataset** 

The following algorithm I wrote, will go through each list of PyConll objects, fetch the "form" and "upos" of each word in each sentence. The "form" is the word, and "upos" is the tag.  
The algorithm will output 4 list of lists, each containing a sentence lists of the words and their corresponding tags. 

In [None]:
wordTokenList = [] 
posTokenList = [] 
testwordTokenList = [] 
testposTokenList = [] 

trainsentences = [ i for i in train_PADT] 
for i in range(len(trainsentences) - 1): 
  sentenceList = [] 
  for j in trainsentences[i]: 
    if j.upos == None: 
      continue 
    sentenceList.append(j.form)
  wordTokenList.append(sentenceList)
print("The word sentence list: \n",wordTokenList[:500])

for i in range(len(trainsentences) - 1): 
  sentenceList = [] 
  for j in trainsentences[i]: 
    if j.upos == None: 
      continue 
    sentenceList.append(j.upos)
  posTokenList.append(sentenceList)
print("The POS sentence list: \n",posTokenList[:500])

testsentences = [ i for i in test_PADT] 
for i in range(len(testsentences) - 1): 
  sentenceList = [] 
  for j in testsentences[i]: 
    if j.upos == None: 
      continue 
    sentenceList.append(j.form)
  testwordTokenList.append(sentenceList)
print("The word sentence list: \n",testwordTokenList[:500])

for i in range(len(testsentences) - 1): 
  sentenceList = [] 
  for j in testsentences[i]: 
    if j.upos == None: 
      continue 
    sentenceList.append(j.upos)
  testposTokenList.append(sentenceList)
print("The POS sentence list: \n",testposTokenList[:500])

Concatenating the train and test lists for both the words and the pos tags 

In [4]:
fullWordList = wordTokenList + testwordTokenList 
fullposList = posTokenList + testposTokenList

#set() for distinguishing the unique words 
wordlen = len(set([word.lower() for sentence in fullWordList for word in sentence]))
taglen = len(set([word.lower() for sentence in fullposList for word in sentence]))

In [5]:
print("The length of the word list: ",len(fullWordList)) 
print("The length of the POS list: ",len(fullposList))
print("The number of words in the corpus: ",wordlen) 
print("The number of pos in the corpus: ",taglen)

The length of the word list:  6753
The length of the POS list:  6753
The number of words in the corpus:  23353
The number of pos in the corpus:  17


#### Pre-DL Classifiers 

In [6]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 
from sklearn import preprocessing 

In [7]:
words =fullWordList 
tags = [word.lower() for sentence in fullposList for word in sentence]
print(len(words)) 
print(len(tags)) 
print(tags[:20])

6753
252085
['x', 'verb', 'noun', 'noun', 'adj', 'adp', 'noun', 'noun', 'noun', 'punct', 'x', 'punct', 'adj', 'x', 'num', 'punct', 'num', 'punct', 'x', 'x']


In [8]:
svmW2V = Word2Vec(sentences=fullWordList,size=100,workers=4,window=1,min_count=0) 

In [9]:
X = [] 
for i in range(len(words)): 
  wvlist = [] 
  for k in range(len(words[i])): 
    wvlist.append(svmW2V.wv[words[i][k]]) 
  X.append(wvlist) 
print(len(X))

X = [item for sublist in X for item in sublist] 
print(len(X))

6753
252085


In [10]:
le = preprocessing.LabelEncoder() 
y = le.fit_transform(tags)

In [11]:
y

array([16, 15,  7, ..., 16,  7, 12], dtype=int64)

In [12]:
print(len(X[0]))
print(len(X[1]))


100
100


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=15)

In [14]:
data = np.c_[X_train,y_train]

In [15]:
np.shape(data)

(201668, 101)

In [16]:
colnames = [] 
for i in range(101): 
    colnames.append('column'+str(i)) 
print(colnames)

['column0', 'column1', 'column2', 'column3', 'column4', 'column5', 'column6', 'column7', 'column8', 'column9', 'column10', 'column11', 'column12', 'column13', 'column14', 'column15', 'column16', 'column17', 'column18', 'column19', 'column20', 'column21', 'column22', 'column23', 'column24', 'column25', 'column26', 'column27', 'column28', 'column29', 'column30', 'column31', 'column32', 'column33', 'column34', 'column35', 'column36', 'column37', 'column38', 'column39', 'column40', 'column41', 'column42', 'column43', 'column44', 'column45', 'column46', 'column47', 'column48', 'column49', 'column50', 'column51', 'column52', 'column53', 'column54', 'column55', 'column56', 'column57', 'column58', 'column59', 'column60', 'column61', 'column62', 'column63', 'column64', 'column65', 'column66', 'column67', 'column68', 'column69', 'column70', 'column71', 'column72', 'column73', 'column74', 'column75', 'column76', 'column77', 'column78', 'column79', 'column80', 'column81', 'column82', 'column83', '

In [17]:
df = pd.DataFrame(data,columns=colnames)
df

Unnamed: 0,column0,column1,column2,column3,column4,column5,column6,column7,column8,column9,...,column91,column92,column93,column94,column95,column96,column97,column98,column99,column100
0,-0.060211,0.133235,0.035630,-0.003546,-0.152450,-0.002033,0.128328,-0.019265,-0.010600,0.009889,...,0.091864,-0.047691,0.018412,0.016028,-0.046584,0.034492,-0.030808,0.009225,-0.103478,7.0
1,-0.155224,0.287055,0.093696,-0.029975,-0.304174,0.006146,0.263189,-0.049060,-0.054328,-0.021311,...,0.188742,-0.127478,0.011151,0.034658,-0.072074,0.093484,-0.070370,0.034756,-0.182817,4.0
2,-0.121631,0.308496,0.072621,-0.007128,-0.327200,-0.013204,0.262613,-0.016637,0.003773,0.032442,...,0.203560,-0.100833,0.048860,0.026665,-0.080110,0.052318,-0.076983,0.012771,-0.229666,7.0
3,-0.743744,0.909768,0.388754,-0.189795,-0.945927,0.024326,1.008494,-0.273541,-0.281397,-0.450574,...,0.474782,-0.607040,-0.151203,-0.020935,-0.040873,0.282479,-0.228365,0.022018,-0.489107,3.0
4,-0.182165,0.404983,0.104731,-0.012205,-0.428645,-0.014739,0.397501,-0.073843,-0.023079,0.017354,...,0.286820,-0.164982,0.017077,0.048140,-0.113707,0.101430,-0.083862,0.022204,-0.282102,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201663,-0.368708,0.720986,0.271292,-0.122599,-0.637311,-0.032909,0.823462,-0.489211,0.193820,-0.165287,...,0.533269,-0.549148,-0.220598,0.063703,0.128571,0.176466,0.117814,-0.135907,-0.577726,1.0
201664,-0.162557,0.318183,0.106665,-0.036985,-0.346502,-0.009485,0.326853,-0.066836,-0.046710,-0.014947,...,0.231627,-0.152132,-0.000203,0.034507,-0.081202,0.095048,-0.057100,0.028715,-0.227016,0.0
201665,-0.469673,0.839096,0.315497,-0.069448,-0.826144,-0.026721,0.884715,-0.313299,0.058686,-0.163864,...,0.591226,-0.500957,-0.127662,0.105667,-0.038093,0.216647,-0.077737,-0.014114,-0.568877,1.0
201666,-0.202855,0.392997,0.130035,-0.028124,-0.452715,-0.018643,0.449976,-0.090900,-0.072661,-0.008243,...,0.323002,-0.196769,-0.001865,0.059691,-0.124940,0.130209,-0.073025,0.050157,-0.278449,7.0


In [18]:
df['column100']

0         7.0
1         4.0
2         7.0
3         3.0
4         8.0
         ... 
201663    1.0
201664    0.0
201665    1.0
201666    7.0
201667    4.0
Name: column100, Length: 201668, dtype: float64

In [20]:
print(np.shape(X_train),np.shape(y_train))
print(np.shape(X_test), np.shape(y_test))

(201668, 100) (201668,)
(50417, 100) (50417,)


In [19]:
from pycaret.classification import *

In [20]:
exp_mclf101 = setup(data = df, target = 'column100', session_id=123,use_gpu=True)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,column100
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(201668, 101)"
5,Missing Values,False
6,Numeric Features,100
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [21]:
#Generating a classification table for a few classifier techniques
best = compare_models(include = ['rf', 'dt', 'knn','svm','ridge','nb','qda'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9241,0.3969,0.8618,0.9242,0.9221,0.9077,0.9086,8.116
dt,Decision Tree Classifier,0.9177,0.3845,0.8633,0.9162,0.9166,0.9009,0.9009,5.962
knn,K Neighbors Classifier,0.8851,0.3899,0.8342,0.882,0.8824,0.8609,0.8612,4.488
ridge,Ridge Classifier,0.7133,0.0,0.5524,0.8101,0.6468,0.6233,0.6726,0.175
svm,SVM - Linear Kernel,0.7039,0.0,0.4967,0.7529,0.6305,0.6114,0.6594,1.031
qda,Quadratic Discriminant Analysis,0.3988,0.2966,0.2133,0.4091,0.3749,0.2994,0.3219,1.035
nb,Naive Bayes,0.3813,0.3418,0.5284,0.5785,0.364,0.3471,0.3899,0.425


In [145]:
clf = SVC()

In [146]:
clf.fit(X_train, y_train) 

SVC()

Training the SVM classifier took about 2 hours 

In [147]:
y_pred = clf.predict(X_test)

Using the SVM classifier to make predictions took about 15 mins 

In [148]:
pred_labels = le.inverse_transform(y_pred)

In [149]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


In [150]:
print("Accuracy = {0:.2f}".format(accuracy))
print("Precision = {0:.2f}".format(precision))
print("Recall = {0:.2f}".format(recall))

Accuracy = 0.73
Precision = 0.82
Recall = 0.73


In [169]:
textInput = 'بوتين ذهب الى المانيا في القارة الاوروبية'
tokens = textInput.split() 

In [170]:
test_data = []
for i in range(len(tokens)):
    test_data.append(svmW2V.wv[tokens[i]])

In [171]:
y_pred = clf.predict(test_data)
y_pred

array([7, 7, 1, 7, 1, 7, 7])

In [172]:
le.inverse_transform(y_pred)

array(['noun', 'noun', 'adp', 'noun', 'adp', 'noun', 'noun'], dtype='<U5')

As seen in the SVM Model, it is not able to differentiate between the nouns and the names very well, it treated the name "بوتين" at the beginning of the sentence as a noun where it should have been treated as a name. 