# Random Forest for Topic Classification

In this notebook, a Random Forest is trained to perform topic classification on the "GenericMixOfTopic" dataset. The task is multi-label: there are 39 labels in total, each corresponding to a topic that may be present in the text, and a single text can carry several labels at once.

### Import necessary libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, hamming_loss, classification_report
from sklearn.metrics import jaccard_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize

### Dataset loading and splitting

In [3]:
# Load the pickled dataset.
# NOTE(security): unpickling can execute arbitrary code, so this file must
# come from a trusted source.
file_path = "main_dataset_correctedSingleTopics.pkl"
data = pd.read_pickle(file_path)

# Features are the raw text column; targets are every column from position 5
# onward (the topic label columns).
X = data['text']
y = data.iloc[:, 5:]

# First hold out 30% of the data as the test set ...
X_rest_text, X_test_text, y_rest, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# ... then split the remainder 70/30 into training and validation sets.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_rest_text, y_rest, test_size=0.3, random_state=42
)

### Grid search cross validation
To optimize the number of trees and the maximum depth

In [3]:
# Hamming loss is an error (lower is better); greater_is_better=False makes
# the scorer return its negative so that GridSearchCV can maximise it.
scorer = make_scorer(hamming_loss, greater_is_better=False)

# Hyper-parameter candidates, addressed through the pipeline step name `clf`.
param_grid = {
    'clf__n_estimators': [100, 150, 200],
    'clf__max_depth': [30, None]
}

# Vectoriser and classifier are chained in one pipeline, so the TF-IDF
# vocabulary is fitted on the training part of each CV fold only.
tfidf_step = ('tfidf', TfidfVectorizer(max_features=10000, stop_words='english'))
forest_step = ('clf', RandomForestClassifier(random_state=42, n_jobs=-1))
pipeline = Pipeline([tfidf_step, forest_step])

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scorer,
    cv=3,
    verbose=1,
)
grid_search.fit(X_train_text, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [8]:
# The scorer stores the negated Hamming loss, so flip the sign back for display.
best_hamming_loss = -grid_search.best_score_
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score (Hamming Loss):", best_hamming_loss)

# Per-candidate summary; `mean_test_score` is still the negated loss here.
results = pd.DataFrame(grid_search.cv_results_)
summary_columns = [
    'param_clf__n_estimators',
    'param_clf__max_depth',
    'mean_test_score',
    'std_test_score',
]
print(results[summary_columns])

Best parameters found: {'clf__max_depth': None, 'clf__n_estimators': 200}
Best cross-validation score (Hamming Loss): 0.03534752561772133
  param_clf__n_estimators param_clf__max_depth  mean_test_score  \
0                     100                   30        -0.040749   
1                     150                   30        -0.040747   
2                     200                   30        -0.040716   
3                     100                 None        -0.035458   
4                     150                 None        -0.035354   
5                     200                 None        -0.035348   

   std_test_score  
0        0.000076  
1        0.000075  
2        0.000096  
3        0.000039  
4        0.000041  
5        0.000007  


The selected model uses 200 estimators and places no limit on the maximum tree depth (`max_depth=None`).

# Evaluation of the selected model

In [4]:
# Same vectoriser settings as the grid-search pipeline. The vocabulary and IDF
# weights are learned from the training texts only; the validation and test
# texts are merely projected onto them.
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)
X_train = tfidf.fit_transform(X_train_text)
X_val, X_test = (tfidf.transform(texts) for texts in (X_val_text, X_test_text))

In [5]:
# Refit with the hyper-parameters chosen by the grid search:
# 200 trees and unlimited depth (max_depth=None is also the library default).
rf_classifier = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)
rf_classifier.fit(X_train, y_train)

### Model evaluation

In [6]:
# Hard label predictions from the fitted forest on the test features.
# NOTE(review): the metric cells visible in this notebook score
# `selected_labels` rather than `predictions`.
predictions = rf_classifier.predict(X_test)

In [7]:
# For multi-output targets `predict_proba` returns one array per label, each of
# shape (n_samples, n_classes_seen_for_that_label).
prob_predictions = rf_classifier.predict_proba(X_test)

# Build an (n_samples, n_labels) matrix holding P(label == 1).
# The positive-class column is looked up through `classes_` instead of being
# hard-coded as column 1: a label that is constant in the training split has a
# single-column probability array, and `probs[:, 1]` would raise IndexError.
# If the positive class was never seen for a label, its probability is 0.
# Assumes the positive class is encoded as 1 (or True) -- TODO confirm.
prob_matrix = np.column_stack([
    probs[:, list(classes).index(1)] if 1 in classes else np.zeros(probs.shape[0])
    for probs, classes in zip(prob_predictions, rf_classifier.classes_)
])

# Rows are left un-normalised: the threshold used for label selection is
# applied to the raw per-label probabilities.

def select_top_labels(row, threshold=0.1, max_labels=3):
    """Turn one row of label probabilities into a 0/1 indicator vector.

    Parameters
    ----------
    row : sequence of float
        Probability of each label for a single sample.
    threshold : float, default 0.1
        A label is eligible only if its probability is strictly greater
        than this value.
    max_labels : int, default 3
        Upper bound on the number of labels that are switched on.

    Returns
    -------
    numpy.ndarray
        Float array of ``len(row)`` entries: 1 for the (at most
        ``max_labels``) eligible labels with the highest probability,
        0 elsewhere. Ties are resolved in favour of the lower index.
    """
    n_labels = len(row)

    # Indices whose probability clears the threshold, in ascending index order.
    candidates = [idx for idx in range(n_labels) if row[idx] > threshold]

    # Stable descending sort: equal probabilities keep their index order.
    candidates.sort(key=lambda idx: row[idx], reverse=True)

    indicator = np.zeros(n_labels)
    indicator[np.asarray(candidates[:max_labels], dtype=int)] = 1
    return indicator

# Apply the selection rule along axis 1, i.e. to each row of `prob_matrix`,
# producing one 0/1 indicator row per row of probabilities.
selected_labels = np.apply_along_axis(select_top_labels, 1, prob_matrix)

In [8]:
# Distribution of the number of labels selected per sample (row sums of the
# indicator matrix), most frequent count first.
pd.DataFrame(selected_labels).sum(axis=1).value_counts().sort_values(ascending=False)

3.0    50796
2.0     8296
1.0      869
0.0       41
Name: count, dtype: int64

In [9]:
# Names of the label columns: everything from column position 5 onward,
# i.e. the same columns that make up `y`.
col_names = data.columns[5:]
col_names

Index(['Academic_disciplines', 'Business', 'Communication', 'Concepts',
       'Culture', 'Economy', 'Education', 'Energy', 'Engineering',
       'Entertainment', 'Entities', 'Ethics', 'Food_and_drink', 'Geography',
       'Government', 'Health', 'History', 'Human_behavior', 'Humanities',
       'Information', 'Internet', 'Knowledge', 'Language', 'Law', 'Life',
       'Mass_media', 'Mathematics', 'Military', 'Nature', 'People',
       'Philosophy', 'Politics', 'Religion', 'Science', 'Society', 'Sports',
       'Technology', 'Time', 'Universe'],
      dtype='object')

In [10]:
# Hamming loss: fraction of individual (sample, label) decisions that are wrong.
print("Hamming Loss:", hamming_loss(y_test, selected_labels))
print(y.columns)

# Some samples end up with no selected label, which makes sample-averaged
# precision undefined for them. zero_division=0 yields the same numbers as the
# default behaviour (undefined -> 0) but without the UndefinedMetricWarning.
print(
    "Classification Report:\n",
    classification_report(
        y_test,
        selected_labels,
        target_names=y.columns,
        zero_division=0,
    ),
)

Hamming Loss: 0.047985579967847226
Index(['Academic_disciplines', 'Business', 'Communication', 'Concepts',
       'Culture', 'Economy', 'Education', 'Energy', 'Engineering',
       'Entertainment', 'Entities', 'Ethics', 'Food_and_drink', 'Geography',
       'Government', 'Health', 'History', 'Human_behavior', 'Humanities',
       'Information', 'Internet', 'Knowledge', 'Language', 'Law', 'Life',
       'Mass_media', 'Mathematics', 'Military', 'Nature', 'People',
       'Philosophy', 'Politics', 'Religion', 'Science', 'Society', 'Sports',
       'Technology', 'Time', 'Universe'],
      dtype='object')
Classification Report:
                       precision    recall  f1-score   support

Academic_disciplines       0.35      0.31      0.33      1852
            Business       0.50      0.41      0.45      3304
       Communication       0.64      0.05      0.09       153
            Concepts       0.57      0.32      0.41       656
             Culture       0.43      0.42      0.42      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
# Exact Match Ratio (subset accuracy): a sample counts as correct only when
# every one of its labels is predicted correctly.
exact_match = accuracy_score(y_test, selected_labels)

# Complement of the Hamming loss: share of individual label decisions that are right.
hamming_accuracy = 1 - hamming_loss(y_test, selected_labels)

# Per-label score: arithmetic mean of precision and recall.
# NOTE: the "accuracy" name is kept because later cells read
# `label_based_accuracy`, but this is not an accuracy in the strict sense
# (true negatives play no role in precision or recall).
# zero_division=0 keeps the default numeric result for undefined ratios while
# suppressing the UndefinedMetricWarning.
classification_report_dict = classification_report(
    y_test,
    selected_labels,
    target_names=y.columns,
    output_dict=True,
    zero_division=0,
)
label_based_accuracy = {
    label: (info['precision'] + info['recall']) / 2
    for label, info in classification_report_dict.items()
    if label in y.columns  # skip the aggregate rows (micro/macro/weighted/samples avg)
}

# Jaccard index (intersection over union of true and predicted label sets),
# averaged over samples.
jaccard_index = jaccard_score(y_test, selected_labels, average='samples', zero_division=0)

print(f"Exact Match Ratio (Subset Accuracy): {exact_match}")
print(f"Hamming Accuracy: {hamming_accuracy}")
print("Label-based Accuracies:", label_based_accuracy)
print(f"Average Jaccard Index (Partial Match Accuracy): {jaccard_index}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Exact Match Ratio (Subset Accuracy): 0.2074264191193627
Hamming Accuracy: 0.9520144200321528
Label-based Accuracies: {'Academic_disciplines': 0.33039389106935957, 'Business': 0.45712743949185564, 'Communication': 0.3410576351752822, 'Concepts': 0.44787034917936314, 'Culture': 0.4250723882725249, 'Economy': 0.5757525436024422, 'Education': 0.6227938105517808, 'Energy': 0.5858702742562183, 'Engineering': 0.3607795866592848, 'Entertainment': 0.7584137882746262, 'Entities': 0.34996966651692885, 'Ethics': 0.5058139534883721, 'Food_and_drink': 0.5680338817742345, 'Geography': 0.6906531614633986, 'Government': 0.6453585536762706, 'Health': 0.5452142357030332, 'History': 0.5337660978611143, 'Human_behavior': 0.4894740958275996, 'Humanities': 0.5511826539596602, 'Information': 0.30066889632107024, 'Internet': 0.39844155844155843, 'Knowledge': 0.4529939785905442, 'Language': 0.6249022062275074, 'Law': 0.5629314969242237, 'Life': 0.834692547252065, 'Mass_media': 0.7008046492372434, 'Mathematics':

In [24]:
# Rank labels by their label-based score, best first.
sorted_label_accuracies = sorted(
    label_based_accuracy.items(),
    key=lambda item: item[1],
    reverse=True,
)

# The three best and the three worst labels of the ranking.
top_3_accuracies = sorted_label_accuracies[:3]
bottom_3_accuracies = sorted_label_accuracies[-3:]

# Print both groups with the same formatting (4 decimal places).
for heading, entries in (
    ("Top 3 Label-based Accuracies:", top_3_accuracies),
    ("\nBottom 3 Label-based Accuracies:", bottom_3_accuracies),
):
    print(heading)
    for label, accuracy in entries:
        print(f"{label}: {accuracy:.4f}")

Top 3 Label-based Accuracies:
Sports: 0.8945
Life: 0.8347
People: 0.7935

Bottom 3 Label-based Accuracies:
Communication: 0.3411
Academic_disciplines: 0.3304
Information: 0.3007
