In [5]:
import pandas as pd
import re
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import sklearn_crfsuite
from sklearn_crfsuite import CRF

# Load the dataset
df = pd.read_csv("Articles.csv", encoding="ISO-8859-1")

# Text Preprocessing Function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
    return text

df["Article"] = df["Article"].apply(clean_text)

# Selecting features and labels
X = df["Article"].tolist()  # Convert to list for tokenization
y = df["NewsType"].tolist()

# Data Augmentation: Duplicate and slightly modify some entries
def augment_text(text):
    words = text.split()
    if len(words) > 5:
        idx = random.randint(0, len(words) - 1)
        words[idx] = words[idx][::-1]  # Reverse a random word
    return ' '.join(words)

df_augmented = df.copy()
df_augmented["Article"] = df_augmented["Article"].apply(augment_text)
df = pd.concat([df, df_augmented])

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert text data into features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Convert labels into lists of lists for CRF
y_train_encoded = [[label] for label in y_train_encoded]
y_test_encoded = [[label] for label in y_test_encoded]

# Convert to dictionary format for CRF
X_train_crf = [dict(enumerate(row.toarray()[0])) for row in X_train_tfidf]
X_test_crf = [dict(enumerate(row.toarray()[0])) for row in X_test_tfidf]

# Train Conditional Random Field model
crf = CRF(algorithm='lbfgs', max_iterations=100, all_possible_transitions=True)
crf.fit(X_train_crf, y_train_encoded)

# Predict on test set
y_pred_crf = crf.predict(X_test_crf)
y_pred_crf = [pred[0] for pred in y_pred_crf]  # Flatten list of lists
print(f"CRF Model Accuracy: {accuracy_score(y_test_encoded, y_pred_crf):.4f}")

# Function to predict class of a given text
def predict_news_category_crf(text):
    text = clean_text(text)
    text_tfidf = vectorizer.transform([text])
    text_crf = [dict(enumerate(text_tfidf.toarray()[0]))]
    prediction = crf.predict(text_crf)
    predicted_label = label_encoder.inverse_transform([prediction[0]])
    return predicted_label[0]

# Example prediction
sample_text = "The star player scores a winning goal in the final match."
predicted_class_crf = predict_news_category_crf(sample_text)
print(f"Predicted News Category (CRF): {predicted_class_crf}")

TypeError: expected bytes, numpy.int64 found