In [1]:
import numpy as np
import pandas as pd
import nltk 
import parsivar

In [2]:
# Read the raw news corpus and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='per.csv')
data.head(n=5)

Unnamed: 0,NewsID,Title,Body,Date,Time,Category,Category2
0,843656,\nوزير علوم درجمع استادان نمونه: سن بازنشستگي ...,\nوزير علوم در جمع استادان نمونه كشور گفت: از ...,\n138/5//09,\n0:9::18,\nآموزشي-,\nآموزشي
1,837144,\nگردهمايي دانش‌آموختگان موسسه آموزش عالي سوره...,\nبه گزارش سرويس صنفي آموزشي خبرگزاري دانشجويا...,\n138/5//09,\n1:4::11,\nآموزشي-,\nآموزشي
2,436862,\nنتايج آزمون دوره‌هاي فراگير دانشگاه پيام‌نور...,\nنتايج آزمون دوره‌هاي فراگير مقاطع كارشناسي و...,\n138/3//07,\n1:0::03,\nآموزشي-,\nآموزشي
3,227781,\nهمايش يكروزه آسيب شناسي مفهوم روابط عمومي در...,\n,\n138/2//02,\n1:3::42,\nاجتماعي-خانواده-,\nاجتماعي
4,174187,\nوضعيت اقتصادي و ميزان تحصيلات والدين از مهمت...,\nمحمدتقي علوي يزدي، مجري اين طرح پژوهشي در اي...,\n138/1//08,\n1:1::49,\nآموزشي-,\nآموزشي


In [3]:
# Load the custom stop-word list: one entry per line of the file, with the
# newline characters removed from every entry.
with open('stopwords.txt', mode='r', encoding='utf-8') as stopwords_file:
    stopwords = [entry.replace('\n', '') for entry in stopwords_file]

In [4]:
# Add NLTK's English stop words to the list loaded from stopwords.txt and
# show the combined size.
# NOTE(review): `stopwords` is mutated in place, so re-running this cell
# appends the English words a second time.
nltk_stopwords = nltk.corpus.stopwords.words('english')
stopwords += nltk_stopwords
len(stopwords)

1495

In [5]:
# parsivar helpers shared by the preprocessing cell, listed in the order the
# pipeline applies them: normalize -> tokenize -> stem.
normalizer = parsivar.Normalizer()
tokenizer = parsivar.Tokenizer()
stemmer = parsivar.FindStems()

In [6]:
def _preprocess_text(text, stopword_set):
    """Normalize, tokenize, stop-word filter and stem a single document.

    Parameters
    ----------
    text : str
        Raw document text (here: title and body joined by a space).
    stopword_set : set of str
        Tokens to drop; matched against the tokens *before* stemming.

    Returns
    -------
    str
        The surviving stems joined by single spaces.
    """
    tokens = tokenizer.tokenize_words(normalizer.normalize(text))
    # The stemmer's output may contain '&' (presumably separating alternative
    # stems -- confirm against parsivar); it is replaced by a space so each
    # part becomes its own token for the vectorizer.
    stems = [
        stemmer.convert_to_stem(token).replace('&', ' ')
        for token in tokens
        if token not in stopword_set
    ]
    return ' '.join(stems)


# A set gives constant-time membership tests; scanning the ~1.5k-entry list
# once per token dominated the runtime of this cell.
stopword_set = set(stopwords)

# Collect all rows first and build the frame once: enlarging a DataFrame with
# `.loc[index] = ...` inside a loop reallocates on every insert.
records = [
    {
        'title_body': _preprocess_text(title + ' ' + body, stopword_set),
        'category': category.replace('\n', ''),
    }
    for title, body, category in zip(data['Title'], data['Body'], data['Category2'])
]

# Same columns and same row index as the frame the original loop produced.
dataset = pd.DataFrame(records, columns=['title_body', 'category'], index=data.index)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# TF-IDF features over word uni-, bi- and tri-grams of the cleaned text.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split further down, so test documents contribute to the
# vocabulary and IDF weights.
# NOTE(review): with binary=True every non-zero term count is 1, so
# sublinear_tf (1 + log(tf)) appears to have no effect -- confirm and drop.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    binary=True,
    sublinear_tf=True,
)
vectorizer.fit(dataset['title_body'])

In [9]:
# Project every document onto the fitted vocabulary; the result is a sparse
# matrix with one row per document.
X = vectorizer.transform(dataset.loc[:, 'title_body'])
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6631395 stored elements and shape (10999, 3416661)>

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
# Learn the category -> integer mapping, then encode the label column with it.
le = LabelEncoder().fit(dataset['category'])
y = le.transform(dataset['category'])

In [12]:
# Display the integer-encoded category labels.
y

array([ 0,  0,  0, ..., 10, 10, 10])

In [13]:
# Display the category names known to the encoder; the integer codes in `y`
# index into this array.
le.classes_

array(['آموزشي', 'اجتماعي', 'اقتصادي', 'بهداشتي', 'تاريخي', 'سياسي',
       'علمي', 'فرهنگي', 'فقه و حقوق', 'مذهبي', 'ورزشي'], dtype=object)

In [14]:
from sklearn.model_selection import train_test_split as tts

In [15]:
# Hold out part of the data for evaluation; no test_size is given, so
# train_test_split's default split is used. The seed fixes the shuffle.
# NOTE(review): `X_trian` is a misspelling of `X_train`; the name is kept
# because the model-fitting cell refers to it.
X_trian, X_test, y_train, y_test = tts(
    X,
    y,
    random_state=1441,
)

In [16]:
from sklearn.svm import SVC

In [17]:
# Support-vector classifier with the default kernel and C=10.
# NOTE(review): cache_size is expressed in MB, so 10_000 asks for roughly
# 10 GB of kernel cache -- confirm this is intended.
# NOTE(review): per the scikit-learn docs random_state only matters when
# probability=True, so it is presumably ignored here.
model = SVC(
    C=10,
    cache_size=10_000,
    random_state=1441,
)
model.fit(X_trian, y_train)

In [18]:
# Evaluate the fitted model on the held-out split (for a classifier, `score`
# is the mean accuracy).
model.score(X_test , y_test)

0.854909090909091

In [19]:
from sklearn.metrics import classification_report , confusion_matrix

In [20]:
# Predict the held-out documents and print per-class precision / recall / F1.
# The row labels in the report are the integer codes produced by `le`.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91       254
           1       0.60      0.71      0.65       255
           2       0.82      0.81      0.81       248
           3       0.93      0.94      0.93       247
           4       0.87      0.93      0.90       257
           5       0.81      0.69      0.75       259
           6       0.88      0.84      0.86       249
           7       0.84      0.88      0.86       259
           8       0.87      0.88      0.87       231
           9       0.95      0.89      0.92       234
          10       0.98      0.96      0.97       257

    accuracy                           0.85      2750
   macro avg       0.86      0.86      0.86      2750
weighted avg       0.86      0.85      0.86      2750



In [21]:
# Confusion matrix on the held-out split: rows are true classes, columns are
# predicted classes, both ordered by encoded label.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[225  13   1   2   0   3   9   1   0   0   0]
 [  9 180  19   3   4   7   6  20   3   3   1]
 [  0  25 200   0   1  10   4   2   3   3   0]
 [  0   8   0 231   0   2   5   1   0   0   0]
 [  0   5   1   0 239   4   0   6   1   1   0]
 [  1  15  11   1  21 180   2   7  19   0   2]
 [  7   7   8  10   2   0 208   3   1   0   3]
 [  1  17   0   0   3   3   3 229   1   2   0]
 [  0  14   1   0   2   9   0   1 203   1   0]
 [  0  12   1   1   4   3   0   3   2 208   0]
 [  0   6   1   0   0   0   0   1   1   0 248]]
