In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn import tree
from sklearn.neural_network import MLPClassifier

# 1 - category, 2 - title, 3 - text
data = pd.read_excel("azeri_news.xlsx", encoding='utf-8')

In [2]:
data['corpus'] = data['Title']+data['News_Article'] 
#using both columns at the same time doesn't work the way we need. combination of 2 columns gives slightly better accuracy
y = data["Category"]

In [3]:
tfidf = TfidfVectorizer()
x_tfidf = tfidf.fit_transform(data["corpus"]) 

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.2, random_state=1)

## SVM MACHINE

In [10]:
svc = SVC(kernel = 'linear', gamma='auto', random_state = 1)
svc.fit(x_train, y_train)

# 30 MINUTES

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=1, shrinking=True, tol=0.001,
    verbose=False)

In [11]:
pred_svc = svc.predict(x_test)
print(classification_report(y_test, pred_svc))

              precision    recall  f1-score   support

       Dünya       0.80      0.83      0.82      2676
     Maraqlı       0.82      0.78      0.80      3182
  Mədəniyyət       0.84      0.67      0.75       301
     Siyasət       0.85      0.86      0.86      1300
       İdman       0.90      0.96      0.93      1256
İqtisadiyyat       0.87      0.88      0.88      1285

    accuracy                           0.84     10000
   macro avg       0.85      0.83      0.84     10000
weighted avg       0.84      0.84      0.84     10000



In [12]:
print(confusion_matrix(y_test, pred_svc))

[[2221  335    4   47   35   34]
 [ 399 2489   21   91   90   92]
 [  11   69  203   16    0    2]
 [  80   56    5 1124    3   32]
 [  19   22    0    5 1206    4]
 [  41   49    8   44    6 1137]]


## MULTILAYER PERCEPTRON

In [5]:
mlp = MLPClassifier(max_iter=100, random_state=1, hidden_layer_sizes=(10,10,10))

mlp.fit(x_train, y_train)

# 63 MINUTES



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(10, 10, 10), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=100,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [6]:
pred_mlp = mlp.predict(x_test)
print(classification_report(y_test, pred_mlp))

              precision    recall  f1-score   support

       Dünya       0.76      0.79      0.78      2676
     Maraqlı       0.78      0.76      0.77      3182
  Mədəniyyət       0.67      0.65      0.66       301
     Siyasət       0.80      0.83      0.81      1300
       İdman       0.90      0.88      0.89      1256
İqtisadiyyat       0.87      0.85      0.86      1285

    accuracy                           0.80     10000
   macro avg       0.80      0.79      0.80     10000
weighted avg       0.80      0.80      0.80     10000



In [7]:
print(confusion_matrix(y_test, pred_mlp))

[[2123  410    6   81   23   33]
 [ 473 2414   53   86   82   74]
 [   7   62  197   22    1   12]
 [  87   72   15 1080   10   36]
 [  45   67    1   31 1111    1]
 [  53   58   24   56    5 1089]]


## DECISION TREE

In [25]:
# with just criterion=entropy accuracy is 73
# (criterion='entropy', random_state = 1) accuracy = 73
# (criterion='entropy', random_state = 1, max_features='auto') accuracy = 61
# (criterion='entropy', random_state = 1, max_features='log2') accuracy = 58
# default parameters (criterion = gini), accuracy = 74
# (criterion = gini, random_state = 1), accuracy = 74
# (criterion='gini', random_state = 1, max_features='auto'), accuracy = 63
# therefore best option is 74 with default parameters

tree = tree.DecisionTreeClassifier()
tree.fit(x_train, y_train)

# 4 MINUTES

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [26]:
pred_tree = tree.predict(x_test)
print(classification_report(y_test, pred_tree))

              precision    recall  f1-score   support

       Dünya       0.72      0.74      0.73      2676
     Maraqlı       0.72      0.74      0.73      3182
  Mədəniyyət       0.58      0.50      0.53       301
     Siyasət       0.74      0.74      0.74      1300
       İdman       0.86      0.82      0.84      1256
İqtisadiyyat       0.75      0.74      0.75      1285

    accuracy                           0.74     10000
   macro avg       0.73      0.71      0.72     10000
weighted avg       0.74      0.74      0.74     10000



In [27]:
print(confusion_matrix(y_test, pred_tree))

[[1980  481   19   71   48   77]
 [ 470 2356   42  121   86  107]
 [  33   68  149   33    1   17]
 [  95  130   20  958   14   83]
 [  84  100    7   17 1025   23]
 [  90  118   21   98   12  946]]
