In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Updated dataset for better balance (adding a few similar items to each category)
data = {
    'Headline': [
        "Local team wins championship",         # Sports
        "Government passes new education reform", # Politics
        "Stock market reaches new high",          # Economics
        "Community gathers for annual festival",   # Social
        "Top player transfers to new team",       # Sports
        "New policies for healthcare announced",   # Politics
        "Unemployment rate drops significantly",   # Economics
        "Social media campaign raises awareness",   # Social
        "Olympics to be held next year",          # Sports
        "Election results bring major changes",    # Politics
        "Economic growth forecasted to rise",      # Economics
        "New trade agreements to boost economy",    # Economics
        "Inflation rates hit record lows",         # Economics
        "Government invests in renewable energy",   # Economics
        "Sports teams adapt to new financial regulations", # Sports
        "Political debates spark public interest",  # Politics
        "Charity event helps local businesses",     # Social
        "Small businesses thrive with new economic policies", # Economics
        "Community rallies for local sports teams", # Sports
        "New healthcare policies affect citizens",  # Politics
        "Major tech investments reshape economic landscape" # Economics
    ],
    'Category': [
        "Sports", "Politics", "Economics", "Social", "Sports", "Politics",
        "Economics", "Social", "Sports", "Politics", "Economics", "Economics",
        "Economics", "Economics", "Sports", "Politics", "Social",
        "Economics", "Sports", "Politics", "Economics"
    ]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Separate features and target
X = df["Headline"]
y = df["Category"]

# Text preprocessing with TF-IDF vectorization
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# SVM hyperparameter tuning using GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(SVC(kernel='linear', class_weight='balanced', random_state=42), param_grid, cv=3)
grid_search.fit(X_tfidf, y_encoded)

# Using the best estimator from GridSearchCV
best_svm = grid_search.best_estimator_

# Stratified K-Fold Cross-Validation setup with 3 splits
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
accuracies = []
all_y_true = []
all_y_pred = []

# Train the SVM model on each fold
for train_index, test_index in skf.split(X_tfidf, y_encoded):
    X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    # Fit the model and make predictions
    best_svm.fit(X_train, y_train)
    y_pred = best_svm.predict(X_test)
    
    # Collect predictions for evaluation
    all_y_true.extend(y_test)
    all_y_pred.extend(y_pred)
    
    # Accuracy for this fold
    accuracies.append(accuracy_score(y_test, y_pred))

# Final Evaluation
print("Cross-Validation Accuracy (average across folds):", sum(accuracies) / len(accuracies))
print("Classification Report:\n", classification_report(all_y_true, all_y_pred, target_names=label_encoder.classes_))


Cross-Validation Accuracy (average across folds): 0.6190476190476191
Classification Report:
               precision    recall  f1-score   support

   Economics       0.62      1.00      0.76         8
    Politics       1.00      0.40      0.57         5
      Social       0.00      0.00      0.00         3
      Sports       0.60      0.60      0.60         5

    accuracy                           0.62        21
   macro avg       0.55      0.50      0.48        21
weighted avg       0.62      0.62      0.57        21



In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Load dataset from the CSV file
df = pd.read_csv('news_articles.csv')

# Separate features and target
X = df["Headline"]
y = df["Category"]

# Text preprocessing with TF-IDF vectorization
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# SVM hyperparameter tuning using GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(SVC(kernel='linear', class_weight='balanced', random_state=42), param_grid, cv=3)
grid_search.fit(X_tfidf, y_encoded)

# Using the best estimator from GridSearchCV
best_svm = grid_search.best_estimator_

# Stratified K-Fold Cross-Validation setup with 3 splits
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
accuracies = []
all_y_true = []
all_y_pred = []

# Train the SVM model on each fold
for train_index, test_index in skf.split(X_tfidf, y_encoded):
    X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    # Fit the model and make predictions
    best_svm.fit(X_train, y_train)
    y_pred = best_svm.predict(X_test)
    
    # Collect predictions for evaluation
    all_y_true.extend(y_test)
    all_y_pred.extend(y_pred)
    
    # Accuracy for this fold
    accuracies.append(accuracy_score(y_test, y_pred))

# Final Evaluation
print("Cross-Validation Accuracy (average across folds):", sum(accuracies) / len(accuracies))
print("Classification Report:\n", classification_report(all_y_true, all_y_pred, target_names=label_encoder.classes_))


Cross-Validation Accuracy (average across folds): 0.6190476190476191
Classification Report:
               precision    recall  f1-score   support

   Economics       0.62      1.00      0.76         8
    Politics       1.00      0.40      0.57         5
      Social       0.00      0.00      0.00         3
      Sports       0.60      0.60      0.60         5

    accuracy                           0.62        21
   macro avg       0.55      0.50      0.48        21
weighted avg       0.62      0.62      0.57        21

