<a href="https://colab.research.google.com/github/antussa2016/thesis_work/blob/master/model/multilabelClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Generate a synthetic multilabel dataset.
# allow_unlabeled=False guarantees that every sample carries at least one
# label. With the default (True) some rows of `y` are all zeros, and the
# argmax reduction below would silently file those unlabeled samples under
# class 0, mixing them with samples that genuinely belong to class 0.
X, y = make_multilabel_classification(
    n_samples=100,
    n_classes=3,
    n_labels=2,
    allow_unlabeled=False,
    random_state=42,
)

# Reduce the (n_samples, n_classes) indicator matrix to a single class id per
# sample. argmax returns the index of the first maximum, i.e. the lowest-index
# active label; any additional labels of a sample are discarded, so from here
# on this is a multiclass (not multilabel) problem.
y_flattened = y.argmax(axis=1)

# Split the data into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_flattened, test_size=0.2, random_state=42
)

# Tune the regularisation strength C of a linear SVM with 3-fold cross-validation
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear']}
grid_search = GridSearchCV(SVC(), param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Get the best model from the grid search (refit on the whole training set)
best_model = grid_search.best_estimator_

# Make predictions on the held-out test set
predictions = best_model.predict(X_test)

# Calculate the accuracy of the model on the test set
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7


In [None]:
# Show every test-set prediction next to its ground-truth label,
# numbering the instances from 1.
separator = "=" * 40
for instance_no, pair in enumerate(zip(predictions, y_test), start=1):
    predicted, actual = pair
    print(f"Instance {instance_no}:")
    print("Predicted label:", predicted)
    print("True label:", actual)
    print(separator)

Instance 1:
Predicted label: 0
True label: 0
Instance 2:
Predicted label: 1
True label: 1
Instance 3:
Predicted label: 0
True label: 1
Instance 4:
Predicted label: 0
True label: 0
Instance 5:
Predicted label: 2
True label: 1
Instance 6:
Predicted label: 1
True label: 0
Instance 7:
Predicted label: 0
True label: 0
Instance 8:
Predicted label: 0
True label: 1
Instance 9:
Predicted label: 0
True label: 1
Instance 10:
Predicted label: 1
True label: 1
Instance 11:
Predicted label: 0
True label: 0
Instance 12:
Predicted label: 1
True label: 1
Instance 13:
Predicted label: 0
True label: 0
Instance 14:
Predicted label: 1
True label: 1
Instance 15:
Predicted label: 1
True label: 1
Instance 16:
Predicted label: 1
True label: 1
Instance 17:
Predicted label: 1
True label: 0
Instance 18:
Predicted label: 0
True label: 0
Instance 19:
Predicted label: 0
True label: 0
Instance 20:
Predicted label: 0
True label: 0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.metrics import accuracy_score

# Sample dataset: movie titles and their genres (multi-label)
articles = [
    ("Movie A", ["Action", "Thriller"]),
    ("Movie B", ["Comedy"]),
    ("Movie C", ["Drama", "Romance"]),
    ("Movie D", ["Comedy", "Romance"]),
    ("Movie E", ["Action"]),
]

# Separate the texts from their label lists
X = [text for text, _ in articles]
y = [labels for _, labels in articles]

# Convert the label lists into a binary indicator matrix (one column per genre)
mlb = MultiLabelBinarizer()
y_binary = mlb.fit_transform(y)

# Vectorize the text using TF-IDF.
# The default token_pattern only keeps tokens of two or more characters, which
# drops the single-letter part of every title ("A", "B", ...) and leaves all
# five documents with the identical one-token vector for "movie". Allowing
# one-character tokens keeps the documents distinguishable.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
X_tfidf = vectorizer.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y_binary, test_size=0.2, random_state=42
)

# Initialize the classifier: one 1-nearest-neighbour model per label column
classifier = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=1))

# Train the classifier
classifier.fit(X_train, y_train)

# Predict the label indicators for the test samples
y_pred = classifier.predict(X_test)

# Convert predictions and ground truth back from indicators to label tuples
y_pred_labels = mlb.inverse_transform(y_pred)
y_test_labels = mlb.inverse_transform(y_test)




def predict_article_categories(input_article, vectorizer, classifier, mlb):
    """Predict the category labels for a single article.

    The article is wrapped in a one-element list and passed through
    ``vectorizer.transform``, the resulting features go to
    ``classifier.predict``, and the prediction is decoded with
    ``mlb.inverse_transform``.

    Returns the first decoded entry, or an empty list when the decoded
    result is empty.
    """
    features = vectorizer.transform([input_article])
    indicator_rows = classifier.predict(features)
    decoded = mlb.inverse_transform(indicator_rows)

    # Guard against an empty decoding result before indexing into it.
    if not decoded:
        return []
    return decoded[0]

# # Display the results
# for i, article in enumerate(X_test):
#     print(f"Article: {X[i]}")
#     print(f"Predicted Categories: {y_pred_labels[i]}")
#     print(f"Actual Categories: {y_test_labels[i]}")
#     print("=" * 20)


# input_article = "Movie A"
# predicted_categories = predict_article_categories(input_article, vectorizer, classifier, mlb)
# print(f"Input Article: {input_article}")
# print(f"Predicted Categories: {predicted_categories}")

# Score the test predictions. For multilabel indicator input accuracy_score
# reports subset accuracy: a sample counts as correct only when every one of
# its labels matches.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.0
