In [130]:
from sklearn.model_selection import cross_val_predict, cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import pandas as pd

In [131]:
data = pd.read_csv("./DATASET/Stemmed/X_data_stemmed.csv", encoding = 'utf-8-sig')
data

Unnamed: 0.1,Unnamed: 0,0
0,0,آرتمیس دازده ساله با کمک محافظ شخص فق ماهر بات...
1,1,کرده بد در مسیر ماجخط ساحل تاالب میراند جاییکه...
2,2,به رابطه دستان که آدامسن دا شر میکرد دامن نزد ...
3,3,بد زن پنجاه پنج کیل اگر چیز را پنجبار لمس می...
4,4,مال چند سال پیشه فل با تیک صب یک از سم را زمین...
...,...,...
1376,1376,نارنیا پاسخ داده خاهد شد انتقا تیسراک بس سخ...
1377,1377,خندیدن کردند البته نم تانستند جل خد را بگیرند ...
1378,1378,تجیز کرد فراه ساز همه گنه سایل راحت که ض کنن ا...
1379,1379,اسـبپـسـرکا زندگ پیشتاز خدا خدا م کردند دباره ...


In [132]:
y_data = pd.read_csv("./DATASET/y_data.csv", encoding = 'utf-8-sig')
y_data

Unnamed: 0.1,Unnamed: 0,Author,Author_ID
0,0,Artemis Fowl,1
1,1,Artemis Fowl,1
2,2,Artemis Fowl,1
3,3,Artemis Fowl,1
4,4,Artemis Fowl,1
...,...,...,...
1376,1376,c.s.lewis,10
1377,1377,c.s.lewis,10
1378,1378,c.s.lewis,10
1379,1379,c.s.lewis,10


In [133]:
texts = list(data['0'].values)  # List of input texts
labels = list((y_data['Author_ID']-1).values)

In [134]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Convert text data into numerical feature vectors using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=100)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [135]:
# Function to perform cross-validation and return performance metrics
def evaluate_model(classifier, X, y):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(classifier, X, y, cv=skf, scoring='accuracy')
    f1_scores = cross_val_score(classifier, X, y, cv=skf, scoring='f1_macro')
    precision_scores = cross_val_score(classifier, X, y, cv=skf, scoring='precision_macro')
    recall_scores = cross_val_score(classifier, X, y, cv=skf, scoring='recall_macro')

    # Calculate mean scores
    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)
    mean_precision = np.mean(precision_scores)
    mean_recall = np.mean(recall_scores)

    return mean_accuracy, mean_f1, mean_precision, mean_recall

# Train the SVM classifier
svm_classifier = SVC(kernel='linear', C=10000000)
mean_accuracy, mean_f1, mean_precision, mean_recall = evaluate_model(svm_classifier, X_train_tfidf, y_train)

# Make predictions using 5-fold cross-validation
y_pred_cv = cross_val_predict(svm_classifier, X_test_tfidf, y_test, cv=5)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_cv)

# Print performance metrics
print("Mean Accuracy:", mean_accuracy)
print("Mean F1 Score:", mean_f1)
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)

# Print confusion matrix
print("\nConfusion Matrix:")
print(conf_matrix)

Mean Accuracy: 0.9356766762649116
Mean F1 Score: 0.920790243481697
Mean Precision: 0.9369052738711726
Mean Recall: 0.9152685429089568

Confusion Matrix:
[[24  0  0  0  0  3  0  0  0  0]
 [ 0 18  0  0  1  2  3  0  0  0]
 [ 0  0 48  0  0  1  0  0  0  0]
 [ 0  0  0 35  0  1  0  0  0  0]
 [ 0  1  2  0 39  0  0  0  0  0]
 [ 1  2  2  2  1 33  0  0  1  0]
 [ 0  3  0  0  0  3 11  0  0  0]
 [ 0  0  1  0  0  2  0 13  0  0]
 [ 0  0  0  0  0  3  0  0  5  0]
 [ 0  1  0  0  0  0  0  0  0 15]]


In [136]:
print(classification_report(y_pred_cv, y_test))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92        25
           1       0.75      0.72      0.73        25
           2       0.98      0.91      0.94        53
           3       0.97      0.95      0.96        37
           4       0.93      0.95      0.94        41
           5       0.79      0.69      0.73        48
           6       0.65      0.79      0.71        14
           7       0.81      1.00      0.90        13
           8       0.62      0.83      0.71         6
           9       0.94      1.00      0.97        15

    accuracy                           0.87       277
   macro avg       0.83      0.88      0.85       277
weighted avg       0.87      0.87      0.87       277

