In [1]:
!pip install nltk



In [2]:
# !conda create -n hazm_env python=3.10
# !conda activate hazm_env
# !pip install hazm

In [43]:
import pandas as pd
import numpy as np
import nltk
import hazm
from hazm import word_tokenize
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix

In [39]:
def save_pickle(obj, filename):
    if not filename.endswith('.pkl'):
        filename += '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)
    print(f"Saved to {filename}")

def load_pickle(filename):
    with open(filename, 'rb') as f:
        obj = pickle.load(f)
    print(f"Loaded from {filename}")
    return obj

In [5]:
data = pd.read_csv('per.csv', encoding='utf8')
data.head()

Unnamed: 0,NewsID,Title,Body,Date,Time,Category,Category2
0,843656,\r\nوزير علوم درجمع استادان نمونه: سن بازنشستگ...,\r\nوزير علوم در جمع استادان نمونه كشور گفت: ا...,\r\n138/5//09,\r\n0:9::18,\r\nآموزشي-,\r\nآموزشي
1,837144,\r\nگردهمايي دانش‌آموختگان موسسه آموزش عالي سو...,\r\nبه گزارش سرويس صنفي آموزشي خبرگزاري دانشجو...,\r\n138/5//09,\r\n1:4::11,\r\nآموزشي-,\r\nآموزشي
2,436862,\r\nنتايج آزمون دوره‌هاي فراگير دانشگاه پيام‌ن...,\r\nنتايج آزمون دوره‌هاي فراگير مقاطع كارشناسي...,\r\n138/3//07,\r\n1:0::03,\r\nآموزشي-,\r\nآموزشي
3,227781,\r\nهمايش يكروزه آسيب شناسي مفهوم روابط عمومي ...,\r\n,\r\n138/2//02,\r\n1:3::42,\r\nاجتماعي-خانواده-,\r\nاجتماعي
4,174187,\r\nوضعيت اقتصادي و ميزان تحصيلات والدين از مه...,\r\nمحمدتقي علوي يزدي، مجري اين طرح پژوهشي در ...,\r\n138/1//08,\r\n1:1::49,\r\nآموزشي-,\r\nآموزشي


In [6]:
data.shape

(10999, 7)

In [7]:
with open('stopwords.txt', encoding='utf8') as stopwords_file:
    stopwords = stopwords_file.readlines()
stopwords = [line.replace('\n', '') for line in stopwords] 
stopwords[:10], len(stopwords)

(['!', '"', '#', '$', '%', '&', "'", '(', ')', '*'], 2797)

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/amirrezajahantab/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
nltk_stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(nltk_stopwords)
len(stopwords)

2995

In [17]:
stemmer = hazm.Stemmer()
stemmer.stem('کتاب ها')

'کتاب '

In [22]:
dataset = pd.DataFrame(columns=('title_body', 'category'))
for index, row in data.iterrows():
    title_body = row['Title'] + ' ' + row['Body']
    title_body = title_body.replace('\r', '')
    title_body_tokenized = word_tokenize(title_body)
    title_body_tokenized_filtered = [w for w in title_body_tokenized if not w in stopwords]
    title_body_tokenized_filtered_stemmed = [stemmer.stem(w) for w in title_body_tokenized_filtered]
    dataset.loc[index] = {
        'title_body': ' '.join(title_body_tokenized_filtered_stemmed),
        'category': row['Category2'].replace('\n', '').replace('\r', '')
    }
dataset.head()

Unnamed: 0,title_body,category
0,وزير علو درجمع استاد نمونه سن بازنشستگي استاد ...,آموزشي
1,گردهمايي دانش‌آموختگ موسسه آموز عالي سوره برگز...,آموزشي
2,نتايج آزمون دوره‌هاي فراگير دانشگاه پيام‌نور ن...,آموزشي
3,هماي يكروزه آسيب مفهو روابط عمومي بابلسر برگزا...,اجتماعي
4,وضعي اقتصادي تحصيل والدين مهمترين عوامل موفقي ...,آموزشي


In [29]:
dataset.shape

(10999, 2)

In [23]:
# Convert a collection of raw documents to a matrix of TF-IDF features.
vectorizer = TfidfVectorizer()
vectorizer.fit(dataset['title_body'])

In [41]:
save_pickle(vectorizer, 'vectorizer')

Saved to vectorizer.pkl


In [26]:
x = vectorizer.transform(dataset['title_body'])
x

<10999x60555 sparse matrix of type '<class 'numpy.float64'>'
	with 1556085 stored elements in Compressed Sparse Row format>

In [35]:
le = LabelEncoder()
y = le.fit_transform(dataset['category'])

In [36]:
le.classes_

array(['آموزشي', 'اجتماعي', 'اقتصادي', 'بهداشتي', 'تاريخي', 'سياسي',
       'علمي', 'فرهنگي', 'فقه و حقوق', 'مذهبي', 'ورزشي'], dtype=object)

In [45]:
y, len(y), np.unique(y)

(array([ 0,  0,  0, ..., 10, 10, 10]),
 10999,
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]))

In [42]:
save_pickle(le, 'label_encoder')

Saved to label_encoder.pkl


In [46]:
X_train, X_test, y_train, y_test = train_test_split(x, y)

In [47]:
svmc = svm.SVC()
svmc.fit(x, y)

In [48]:
save_pickle(svmc, 'svmc')

Saved to svmc.pkl


In [49]:
svmc.score(X_test, y_test)

0.9810909090909091

In [51]:
y_pred = svmc.predict(X_test)

In [52]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98       267
           1       0.98      0.95      0.96       263
           2       0.98      0.97      0.98       251
           3       0.96      1.00      0.98       240
           4       0.99      1.00      1.00       247
           5       0.99      0.95      0.97       231
           6       0.99      0.95      0.97       231
           7       0.99      0.98      0.99       260
           8       0.98      1.00      0.99       250
           9       0.98      1.00      0.99       254
          10       0.99      1.00      0.99       256

    accuracy                           0.98      2750
   macro avg       0.98      0.98      0.98      2750
weighted avg       0.98      0.98      0.98      2750



In [53]:
print(confusion_matrix(y_test, y_pred))

[[265   1   0   0   0   0   1   0   0   0   0]
 [  2 249   2   5   0   2   1   0   1   0   1]
 [  1   4 244   0   0   0   0   0   0   2   0]
 [  0   0   0 240   0   0   0   0   0   0   0]
 [  0   0   0   0 247   0   0   0   0   0   0]
 [  0   1   2   0   1 219   0   2   5   0   1]
 [  6   0   0   5   1   0 219   0   0   0   0]
 [  2   0   0   0   0   0   0 255   0   2   1]
 [  0   0   0   0   0   0   0   0 250   0   0]
 [  0   0   0   0   0   0   0   0   0 254   0]
 [  0   0   0   0   0   0   0   0   0   0 256]]
