In [48]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC

In [49]:
# Load the data form the json file
df = pd.read_json('top_level_RR_reviews.json')

# Instantiate the binarizer
mlb = MultiLabelBinarizer()

# Fit and transform the 'categories' column
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('categories')),
                          columns=mlb.classes_,
                          index=df.index))

print(df.head())


                                                text   
0  Efficient estimation of stereo thresholds: wha...  \
1  Safety Stock: Predicting demand on the supply ...   
2  If you are not prepared to be wrong, then your...   
3  Visual short-term memory for 3D shapes reveals...   
4  Fast and easy disinfection of coronavirus-cont...   

   Biological and Chemical Sciences  Biomedical Sciences  General   
0                                 0                    0        0  \
1                                 0                    0        0   
2                                 0                    0        0   
3                                 0                    0        1   
4                                 0                    0        0   

   Genetics/Genomics/Epigenetics  Humanities/Social Sciences  Immunology   
0                              0                           0           0  \
1                              0                           1           0   
2                   

In [50]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(df['text'], df[df.columns.difference(['text'])], test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2645,) (662,) (2645, 13) (662, 13)


In [52]:
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

AttributeError: 'csr_matrix' object has no attribute 'lower'

In [12]:
# from sklearn.multioutput import MultiOutputClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, hamming_loss

# # Initialize the base classifier
# base_rf = RandomForestClassifier(n_estimators=100, random_state=42)

# # Create the One-vs-Rest classifier
# ovr_rf = MultiOutputClassifier(base_rf, n_jobs=-1)

# # Train the classifier
# ovr_rf.fit(X_train, y_train)

# # Make predictions
# y_pred = ovr_rf.predict(X_test)

In [42]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Initialize the base classifier
base_rf = RandomForestClassifier(random_state=42)

# Create the One-vs-Rest classifier
ovr_rf = MultiOutputClassifier(base_rf, n_jobs=-1)

# Define the parameter grid to search
param_distributions = {
    'estimator__n_estimators': [100, 200, 300],  # Note the 'estimator__' prefix
    'estimator__max_depth': [None, 10, 20, 30],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4],
    'estimator__max_features': ['auto', 'sqrt']
}

# Initialize the randomized search
random_search = RandomizedSearchCV(estimator=ovr_rf,
                                   param_distributions=param_distributions,
                                   n_iter=50,
                                   cv=3,
                                   verbose=2,
                                   random_state=42,
                                   n_jobs=-1)

# Fit the randomized search instance
random_search.fit(X_train, y_train)

# Best parameters
print("Best parameters:", random_search.best_params_)


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END estimator__max_depth=30, estimator__max_features=auto, estimator__min_samples_leaf=2, estimator__min_samples_split=10, estimator__n_estimators=100; total time=   0.0s
[CV] END estimator__max_depth=30, estimator__max_features=auto, estimator__min_samples_leaf=2, estimator__min_samples_split=10, estimator__n_estimators=100; total time=   0.1s
[CV] END estimator__max_depth=30, estimator__max_features=auto, estimator__min_samples_leaf=2, estimator__min_samples_split=10, estimator__n_estimators=100; total time=   0.1s
[CV] END estimator__max_depth=None, estimator__max_features=auto, estimator__min_samples_leaf=2, estimator__min_samples_split=10, estimator__n_estimators=100; total time=   0.1s
[CV] END estimator__max_depth=None, estimator__max_features=auto, estimator__min_samples_leaf=2, estimator__min_samples_split=10, estimator__n_estimators=100; total time=   0.0s
[CV] END estimator__max_depth=20, estimator__max_featu

75 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
37 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/joblib/_parallel_backends.py", line 273, in _wrap_func_call
    return func()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/joblib/parallel.py", line 589, in __call__
    return [func(*args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/joblib/parallel.py", line 589, in <listcomp>
    return [func(*args, **kwargs)
  File "/Library/Framewor

Best parameters: {'estimator__n_estimators': 100, 'estimator__min_samples_split': 5, 'estimator__min_samples_leaf': 1, 'estimator__max_features': 'sqrt', 'estimator__max_depth': None}


In [45]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# Configure RandomForestClassifier with the best parameters
optimized_rf = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=40,
    random_state=42
)

# Wrap the optimized RandomForestClassifier with MultiOutputClassifier
optimized_ovr_rf = MultiOutputClassifier(optimized_rf, n_jobs=-1)

# Train the classifier with the optimized parameters
optimized_ovr_rf.fit(X_train, y_train)

# Make predictions with the optimized model
y_pred_optimized = optimized_ovr_rf.predict(X_test)

In [46]:
from sklearn.metrics import accuracy_score, hamming_loss, f1_score

# Evaluate the optimized model
print("Accuracy:", accuracy_score(y_test, y_pred_optimized))
print("Hamming Loss:", hamming_loss(y_test, y_pred_optimized))
print("F1 Score (Micro):", f1_score(y_test, y_pred_optimized, average='micro'))


Accuracy: 0.6616314199395771
Hamming Loss: 0.037996746455960954
F1 Score (Micro): 0.8407208962493911


In [14]:
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

# Initialize the base XGBoost classifier
base_xgb = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

# Wrap it with MultiOutputClassifier to handle multi-labels
multioutput_xgb = MultiOutputClassifier(base_xgb, n_jobs=-1)

# Train the model
multioutput_xgb.fit(X_train, y_train)

# Make predictions
y_pred = multioutput_xgb.predict(X_test)

In [15]:
from sklearn.metrics import accuracy_score, hamming_loss, f1_score

# Example evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Hamming Loss:", hamming_loss(y_test, y_pred))
print("F1 Score (Micro):", f1_score(y_test, y_pred, average='micro'))


Accuracy: 0.6374622356495468
Hamming Loss: 0.04066930048803161
F1 Score (Micro): 0.8312439729990357


In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier

# Initialize the k-NN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Wrap k-NN classifier with MultiOutputClassifier
multioutput_knn = MultiOutputClassifier(knn_classifier, n_jobs=-1)

# Train the model on the training data
multioutput_knn.fit(X_train, y_train)


In [21]:
from sklearn.metrics import accuracy_score, hamming_loss

# Make predictions
y_pred_knn = multioutput_knn.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Hamming Loss:", hamming_loss(y_test, y_pred_knn))


Accuracy: 0.36706948640483383
Hamming Loss: 0.07483151289797815
