In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the corpus CSV; every column is read as text (dtype=str).
Data = pd.read_csv(
    'updated_bijankhan_corpus.csv',
    engine='python',
    encoding='utf-8-sig',
    dtype=str,
)

In [None]:
# Preview the first five rows to check which columns were loaded.
Data.head(n=5)

Unnamed: 0.1,Unnamed: 0,token,pos_tag,ezafe_tag
0,0,!هنری!,SUBJ,0
1,1,#,DELM,0
2,2,مسعود,N_SING_PR,1
3,3,شجاعی,N_SING_PR,1
4,4,طباطبایی,N_SING_PR,0


In [None]:
# Convert the loaded DataFrame into a 2-D object array so that the following
# cells can slice it positionally.
# The isinstance guard keeps this cell safe to re-run: once `Data` has been
# converted it is an ndarray, which has no `.to_numpy` method.
if isinstance(Data, pd.DataFrame):
    Data = Data.to_numpy(dtype=object)

In [None]:
# Dimensions of the data as (rows, columns); the recorded run shows (2602536, 4).
np.shape(Data)

(2602536, 4)

In [None]:
# Feature columns (everything between the first and the last column) and the
# target column (the last one), in that order.
Data_x, Data_y = Data[:, 1:-1], Data[:, -1]

In [None]:
#here is the main part of the code
#cleaning up numbers and punctuations from the text by deleting every
#character of a token that appears in one of these lists
numbers = ['0','1','2','3','4','5','6','7','8','9']
punctuation = ['`','-','=',',','[',']',';','/','.',',','~','!','@','#','$','%','^','&','*',
               '(',')','_','+','|','{','}',':','"','?','<','>','}','’','“','”' , '\'' , '،', '«' , '»', '؟']

def text_cleaner(text_Data):
    """Return a cleaned copy of a 2-D (token, tag) array.

    For every row, column 0 (the token) is rewritten as follows:
      * characters listed in `numbers` are always deleted;
      * characters listed in `punctuation` are deleted unless the row's tag
        (column 1) equals 'DELM', so delimiter tokens keep their punctuation;
      * leading and trailing whitespace is stripped.

    Parameters
    ----------
    text_Data : numpy.ndarray
        2-D array whose column 0 holds tokens and column 1 holds tags.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape and dtype as `text_Data`. The input
        array is not modified.
    """
    # Work on a copy so the caller's array (and anything it is a view of)
    # keeps its original tokens.
    cleaned = np.array(text_Data)

    # Translation tables built once per call; mapping a character to None
    # deletes it. The lists are read at call time, as before.
    digit_chars = ''.join(numbers)
    drop_digits = str.maketrans('', '', digit_chars)
    drop_digits_and_punct = str.maketrans('', '', digit_chars + ''.join(punctuation))

    for row in cleaned:
        token = row[0]
        if not isinstance(token, str):
            # Non-text cells (e.g. NaN for a missing value) are left as they are.
            continue
        # Explicit form of `digit or (punctuation and tag != 'DELM')`.
        table = drop_digits if row[1] == 'DELM' else drop_digits_and_punct
        row[0] = token.translate(table).strip()

    return cleaned

In [None]:
# Run the cleaner over the feature columns and keep the array it returns.
Data_x = text_cleaner(text_Data=Data_x)

In [None]:
# Label-encode the second feature column (the tag column) into integer codes.
le = preprocessing.LabelEncoder()
tag_column = Data_x[:, 1].astype('str')
Data_x_pos = le.fit_transform(tag_column)

In [None]:
# Split features and labels into train and test parts: the last 100000
# entries are held out for testing, everything before them is for training.
# Each label slice is one element shorter than its feature slice because the
# pairing step that follows turns n feature values into n - 1 rows.
n_test = 100000

X_train, X_test = Data_x_pos[:-n_test], Data_x_pos[-n_test:]

y_train = Data_y[:-(n_test + 1)].astype(np.int32)
y_test = Data_y[-n_test:-1].astype(np.int32)

In [None]:
# Turn each sequence of codes into two-column rows: (code, following code).
train_column = np.reshape(X_train, (-1, 1))
X_train = np.concatenate((train_column[:-1], train_column[1:]), axis=1)

test_column = np.reshape(X_test, (-1, 1))
X_test = np.concatenate((test_column[:-1], test_column[1:]), axis=1)

In [None]:
#the used model in this code is Decision Tree
# random_state is fixed so that re-running the notebook fits the same tree;
# without it scikit-learn permutes the features randomly at every split.
model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Mean accuracy of the fitted tree on the held-out rows.
Accuracy = model.score(X_test, y_test)

In [None]:
# Report the held-out accuracy.
label = 'Accuracy is: '
print(label, Accuracy)

Accuracy is:  0.9409794097940979


In [None]:
# For a sentence typed by the user we want to return the predicted "kasre ezafe".
# Read the sentence, then tokenize it on whitespace.
sentence = input('The Given Sentence: ')
words = str.split(sentence)

In [None]:
# `predictions` holds one 0/1 entry per word of the sentence: 1 means the word
# is predicted to carry a "kasre ezafe", 0 means it is not.
def _first_pos_code(word):
    """Return the encoded tag of the first row whose token equals `word`.

    Falls back to 0 when no row matches.
    NOTE(review): 0 is also a code produced by the label encoder, so an
    unknown word is treated like the tag encoded as 0 -- confirm that this
    fallback is acceptable.
    """
    hits = np.flatnonzero(Data_x[:, 0] == word)
    return Data_x_pos[hits[0]] if hits.size else 0


# Look every word up once (a single scan of the token column per word)
# instead of repeating the scan for each pair the word takes part in.
word_codes = [_first_pos_code(word) for word in words]

predictions = []
if len(words) > 1:
    # One row per adjacent word pair: (code of word i, code of word i + 1).
    samples = np.array(list(zip(word_codes[:-1], word_codes[1:])), dtype=float)
    # Predict all pairs in one call; store plain Python ints.
    predictions = [int(label) for label in model.predict(samples)]
if words:
    # The last word has no successor to pair with, so it is recorded as 0.
    # An empty sentence therefore yields an empty list.
    predictions.append(0)

In [None]:
# Show the per-word 0/1 predictions.
print(str(predictions))