In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel

# Read the article table; the steps below rely on it having a 'text' column.
data = pd.read_csv('bbc_text.csv')

# Normalise the raw text in one chained pass:
#   1. swap every HTML tag for a single space,
#   2. squeeze each run of whitespace down to one space,
#   3. lower-case the result.
data['text'] = (
    data['text']
    .str.replace(r'<[^>]+>', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.lower()
)

# Feature Engineering
# TF-IDF Vectorizer: unigrams and bigrams, vocabulary limited to at most 1000
# terms, English stop words removed. It is only configured here, not fitted.
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1,2))

# Sentiment Analysis
# Analyse each document once and read both scores from the same result.
# Building a separate TextBlob per score would run the sentiment analysis
# twice per document for identical values.
sentiments = [TextBlob(text).sentiment for text in data['text']]
data['polarity'] = [sentiment.polarity for sentiment in sentiments]
data['subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]

# Document Length: number of characters in each document's text
data['doc_length'] = data['text'].apply(len)

# Separate the target column from every other column
labels = data['category']
features = data.drop(columns='category')

# Hold out 30% of the rows, then halve that hold-out into development and test
# sets. Both splits use a fixed seed so the partition is repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(
    features, labels, test_size=0.3, random_state=42
)
X_dev, X_test, y_dev, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Pipeline setup
# Route each column group through its own transformer: the 'text' column is
# TF-IDF encoded and the three engineered numeric columns are standardised.
# Columns not listed here are not passed on (ColumnTransformer drops the
# remainder by default).
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', tfidf_vectorizer, 'text'),
        ('num', StandardScaler(), ['polarity', 'subjectivity', 'doc_length'])
    ])

# Feature selection
# Keep the features that an auxiliary random forest rates as important.
# The forest is seeded so the selected feature set does not change between runs.
feature_selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))

# Random Forest Classifier with parameter tuning
# Seeded with the same value used for the data splits; without it the best
# parameters and the reported scores differ from one run to the next.
classifier = RandomForestClassifier(random_state=42)

# Grid keys use the '<step name>__<parameter>' form so they reach the
# 'classifier' step of the pipeline below.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5, 10]
}

# Full model: preprocess -> select features -> classify
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('feature_selector', feature_selector),
                           ('classifier', classifier)])

# Hyperparameter search: 5-fold cross-validated grid search scored by accuracy,
# run on the development split only.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_dev, y_dev)

# Report the winning parameter combination
print("Best parameters found on development set: ", grid_search.best_params_)

# Refit the winning pipeline on the training and development rows combined.
# NOTE: this refits grid_search.best_estimator_ itself, not a copy of it.
final_model = grid_search.best_estimator_
X_train_dev = pd.concat([X_train, X_dev])
y_train_dev = pd.concat([y_train, y_dev])
final_model.fit(X_train_dev, y_train_dev)

# Score the refitted model on the held-out test split
y_pred = final_model.predict(X_test)
test_report = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print("Final Test Set Classification Report:\n", test_report)
print("Final Test Set Accuracy Score:", test_accuracy)

Best parameters found on development set:  {'classifier__max_depth': None, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Final Test Set Classification Report:
                precision    recall  f1-score   support

     business       0.92      0.95      0.93        83
entertainment       1.00      0.92      0.96        52
     politics       0.93      0.96      0.95        54
        sport       0.96      0.94      0.95        79
         tech       0.94      0.95      0.95        66

     accuracy                           0.95       334
    macro avg       0.95      0.95      0.95       334
 weighted avg       0.95      0.95      0.95       334

Final Test Set Accuracy Score: 0.9461077844311377
