In [48]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

### Data Preprocessing

In [45]:
import pandas as pd
df=pd.read_csv('/Users/youssefeldeeb/Documents/Collage/4th Year/Second Term/Arabization(تعريف لغات الحاسب)/Project/Arabization-Project-2022-4th.CS/Datasets/all_topics.csv',encoding='utf-8')

df.head()

Unnamed: 0,text,label
0,سحبت شركة غوغل في خطوة مفاجئة خدمة يوتيوب الشه...,Tech
1,عاد لاعب باريس سان جيرمان البرازيلي نيمار دا ...,Sports
2,أخبارنا المغربية ـ متابعة في أول رد للبرلمان...,Politics
3,شيرين تتهرب من المؤتمر الصحفي لمهرجان تطوان و ...,Culture
4,منظمو أولمبياد طوكيو 2020 يطالبون بالتبرع باله...,Sports


In [46]:
#preprocessing steps (Normalization of data)

def normalization(txt):
    #Tokenization
    tokenText = txt.split(' ')
    
    #StopWord Removing
    SWList = []
    stopWordList = open("arabic-stop-words.txt").read().splitlines()
    for token in tokenText:
        if token not in stopWordList:
            SWList.append(token)
    SWText = ' '.join(SWList)
    
    # Punctuations Removing
    import unicodedata as ud
    puncText = ''.join(c for c in SWText if not ud.category(c).startswith('P'))
    
    # ISRI Stemmer (Root-based stemmer)
    import nltk
    st = nltk.ISRIStemmer()
    tkText = puncText.split(' ')
    result = ' '.join([st.stem(w) for w in tkText])
    
    return result

In [47]:
df['text'].apply(normalization)
df.head()

Unnamed: 0,text,label
0,سحبت شركة غوغل في خطوة مفاجئة خدمة يوتيوب الشه...,Tech
1,عاد لاعب باريس سان جيرمان البرازيلي نيمار دا ...,Sports
2,أخبارنا المغربية ـ متابعة في أول رد للبرلمان...,Politics
3,شيرين تتهرب من المؤتمر الصحفي لمهرجان تطوان و ...,Culture
4,منظمو أولمبياد طوكيو 2020 يطالبون بالتبرع باله...,Sports


In [54]:
# df.iloc[2]

### Feature Extraction
### Classifing the data 
### Testing the mode

In [55]:
# Features Extraction by TF-IDF
tfidf_vec=TfidfVectorizer()
tfidf_features=tfidf_vec.fit_transform(df['text'])

#split data into train and test
x_train,x_test,y_train,y_test=train_test_split(tfidf_features,df['label'],test_size=0.2)

In [56]:
print(x_train.shape)
print(x_test.shape)

(62742, 466853)
(15686, 466853)


In [57]:
# classifier
tfidf_model=MultinomialNB()
tfidf_model.fit(x_train,y_train)

MultinomialNB()

In [58]:
#Testing data
tfidf_predected=tfidf_model.predict(x_test)

accuracy_score(y_test, tfidf_predected)

0.8730077776361086

In [59]:
cm = confusion_matrix(y_test,tfidf_predected)
print(cm)

[[ 551   15   15  589    4  132   45]
 [   3 1162   12  629    2   16   39]
 [   1    8 2586    4    1    5   37]
 [   1   28    1 2664    1    5    5]
 [   0    1   54  104 1376   16   10]
 [   0    1    1   51    0 3071    1]
 [   0    7  115   26    0    7 2284]]


In [60]:
# #represent the confusion matrix with GUI form
# import matplotlib.pyplot as plt
# import seaborn as sns

# plt.rcParams['figure.figsize'] = (8, 8)
# sns.heatmap(cm, annot = True, annot_kws = {'size':10})
# sns.set_style({'xtick.bottom': True})

In [61]:
# Features Extraction by Binary Encoding
binary_vec=CountVectorizer(binary=True)
binary_features=binary_vec.fit_transform(df['text'])

#split data into train and test
x_train,x_test,y_train,y_test=train_test_split(binary_features,df['label'],test_size=0.2)

In [62]:
# classefier
binary_model=MultinomialNB()
binary_model.fit(x_train,y_train)

MultinomialNB()

In [63]:
# testing data
binary_predicted=binary_model.predict(x_test)
accuracy_score(y_test, binary_predicted)

0.9196098431722555

In [64]:
print(confusion_matrix(binary_predicted,y_test))

[[1076    7    1   23   24   23   10]
 [  46 1497   17  132    9   10   25]
 [  11    2 2485    2   25    6   92]
 [ 175  316    4 2636   34   71   19]
 [   7    0    8    8 1413    3    2]
 [  20   11    4    5    9 3046    3]
 [  16   15   53    1    7    5 2272]]


In [66]:
# Features Extraction by Counting
count_vec=CountVectorizer()
count_features=count_vec.fit_transform(df['text'])

#split data into train and test
x_train,x_test,y_train,y_test=train_test_split(count_features,df['label'],test_size=0.2)

In [67]:
# classefier
count_model=MultinomialNB()
count_model.fit(x_train,y_train)

MultinomialNB()

In [68]:
count_predicted=count_model.predict(x_test)
accuracy_score(y_test, count_predicted)

0.9310850439882697

In [69]:
print(confusion_matrix(count_predicted,y_test))

[[1114   11    3   18   27   16   15]
 [  35 1565   25  119    9   14   23]
 [   5    5 2557    5   23    5   93]
 [ 119  270    5 2578   33   43    8]
 [   9    0   10    6 1423    0    1]
 [  19   14    2    3    3 3115    1]
 [  17   12   43    2    5    5 2253]]
