## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import re

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report

np.random.seed(13)
print(sklearn.__version__)
print(np.__version__)
print(pd.__version__)

1.6.1
2.0.2
2.2.3


## Import data

In [2]:
df = pd.read_csv("../data/JFP Credential Data_All_Journals_Final_1.2025.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         5630 non-null   object
 1   Abstract      4046 non-null   object
 2   Authors       5466 non-null   object
 3   documentType  5630 non-null   object
 4   pub_month     5337 non-null   object
 5   pub_year      5630 non-null   int64 
 6   pubtitle      5630 non-null   object
 7   subjectTerms  4067 non-null   object
 8   Database      5630 non-null   object
dtypes: int64(1), object(8)
memory usage: 396.0+ KB


## Extract Credentials

In [4]:
df["Credentials"] = (df
                     .Authors
                     .apply(lambda x: re.findall(r"(CFP|PhD|CFA|JD|CPA|CLU|ChFC|LLM|AIF|CRC)", str(x)))
                     .apply(lambda x: np.nan if len(x) == 0 else x)
                     )

## Group Credentials into target groups

In [5]:
def target_grouping(x):
    if type(x) != list:
        return np.nan
    elif "PhD" in set(x):
        return "Academic"
    else:
        return "Practitioner"

df["target_grouping"] = df.Credentials.apply(target_grouping)

## Drop nulls

In [6]:
df = df.dropna(subset=["target_grouping", "Abstract"]).reset_index(drop=True)

In [7]:
df.head()

Unnamed: 0,Title,Abstract,Authors,documentType,pub_month,pub_year,pubtitle,subjectTerms,Database,Credentials,target_grouping
0,How Do Commodities Fit into Client Portfolios?,The standard stock--bond portfolio mix remains...,"Fink, Jason D, PhD;Fink, Kristin E, PhD",Commentary,Oct,2022,Journal of Financial Planning,"Financial planning , Investment , Bonds , Sto...",ProQuest One Academic,"[PhD, PhD]",Academic
1,The Role of Financial Planners on African Amer...,"* Addressing the issue of ""wealth gap,"" partic...","Chen, Leon, PhD, FRM;Duffy, Sophia, JD, CPA;Hi...",Commentary,Nov,2022,Journal of Financial Planning,"Credit , Small business , Wealth distribution...",ProQuest One Academic,"[PhD, JD, CPA, PhD, CFP]",Academic
2,"How to Effectively Manage Six ""Oh Bleep"" Clien...",Managing difficult client situations is part o...,"Heye, Chris, PhD",Journal Article,May,2022,Journal of Financial Planning,"Interest rates , Financial management , Finan...",ProQuest One Academic,[PhD],Academic
3,Inflation Uncertainty Calls for Portfolio Prot...,"Inflation currently sits at 7.9 percent, havin...","Marotta, Ryann, CFP®, CFA",Journal Article,May,2022,Journal of Financial Planning,"Investors , Investments , Pandemics , Monetar...",ProQuest One Academic,"[CFP, CFA]",Practitioner
4,Financial Planners' Gray Divorce Checklist,Divorce today is less common for adults younge...,"Stephenson, Angie M, CFP®, CPA/PFS",Journal Article,May,2022,Journal of Financial Planning,"Divorce , Marital separation , Marriage , Tax...",ProQuest One Academic,"[CFP, CPA]",Practitioner


In [8]:
X = df.Abstract
y = df.target_grouping

In [9]:
X.shape, y.shape

((2608,), (2608,))

In [10]:
y.value_counts()

target_grouping
Practitioner    1620
Academic         988
Name: count, dtype: int64

In [11]:
y.value_counts().to_csv("../results/binary_target_distribution.csv")

## Encode Target

In [12]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [13]:
encoder.classes_

array(['Academic', 'Practitioner'], dtype=object)

## Create Pipeline

In [14]:
models = [LogisticRegression(class_weight="balanced"), 
          DecisionTreeClassifier(class_weight="balanced"), 
          RandomForestClassifier(class_weight="balanced"),
          HistGradientBoostingClassifier(class_weight="balanced")]

# Define the custom transformer
class DenseTransformer:
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # make the array dense
        X = X.toarray()
        return X

pipelines = [Pipeline([
    ("vectorizer", TfidfVectorizer()),
    ("densify", DenseTransformer()),
    ("classifier", model)
    ]) for model in models]

In [15]:
pipelines

[Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('densify',
                  <__main__.DenseTransformer object at 0x7fc725027cb0>),
                 ('classifier', LogisticRegression(class_weight='balanced'))]),
 Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('densify',
                  <__main__.DenseTransformer object at 0x7fc7223ff320>),
                 ('classifier',
                  DecisionTreeClassifier(class_weight='balanced'))]),
 Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('densify',
                  <__main__.DenseTransformer object at 0x7fc723673110>),
                 ('classifier',
                  RandomForestClassifier(class_weight='balanced'))]),
 Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('densify',
                  <__main__.DenseTransformer object at 0x7fc722409100>),
                 ('classifier',
                  HistGradientBoostingClassifier(class_weight='balan

## Cross Validation

In [16]:
results = [cross_validate(pipeline, X, y, scoring=["precision_weighted", "recall_weighted", "accuracy", "f1_weighted"], cv=5, n_jobs=-1) for pipeline in pipelines]

In [26]:
result_df = pd.DataFrame({
    "model": models,
    "avg_precision_weighted": [result["test_precision_weighted"].mean() for result in results],
    "avg_recall_weighted": [result["test_recall_weighted"].mean() for result in results],
    "avg_f1_weighted": [result["test_f1_weighted"].mean() for result in results],
    "avg_accuracy": [result["test_accuracy"].mean() for result in results]
})

In [28]:
result_df.round(3)

Unnamed: 0,model,avg_precision_weighted,avg_recall_weighted,avg_f1_weighted,avg_accuracy
0,LogisticRegression(class_weight='balanced'),0.769,0.766,0.767,0.766
1,DecisionTreeClassifier(class_weight='balanced'),0.657,0.656,0.656,0.656
2,"(DecisionTreeClassifier(max_features='sqrt', r...",0.772,0.766,0.752,0.766
3,HistGradientBoostingClassifier(class_weight='b...,0.767,0.768,0.767,0.768


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [20]:
print(f"Train set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Train set shape: (1956,)
Test set shape: (652,)


In [21]:
lr_pipeline = pipelines[0]

lr_pipeline.fit(X_train, y_train)

y_pred = lr_pipeline.predict(X_test)

lr_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True, target_names=encoder.classes_))

print("Logistic Regression Classification Report")
lr_report


Logistic Regression Classification Report


Unnamed: 0,Academic,Practitioner,accuracy,macro avg,weighted avg
precision,0.736264,0.878628,0.819018,0.807446,0.824696
recall,0.813765,0.822222,0.819018,0.817994,0.819018
f1-score,0.773077,0.84949,0.819018,0.811283,0.820542
support,247.0,405.0,0.819018,652.0,652.0


In [22]:
dt_pipeline = pipelines[1]

dt_pipeline.fit(X_train, y_train)

y_pred = dt_pipeline.predict(X_test)

dt_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True, target_names=encoder.classes_))

print("Decision Tree Classification Report")
dt_report

Decision Tree Classification Report


Unnamed: 0,Academic,Practitioner,accuracy,macro avg,weighted avg
precision,0.551724,0.736573,0.662577,0.644149,0.666546
recall,0.582996,0.711111,0.662577,0.647054,0.662577
f1-score,0.566929,0.723618,0.662577,0.645274,0.664259
support,247.0,405.0,0.662577,652.0,652.0


In [23]:
rf_pipeline = pipelines[2]

rf_pipeline.fit(X_train, y_train)

y_pred = rf_pipeline.predict(X_test)

rf_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True, target_names=encoder.classes_))

print("Random Forest Classification Report")
rf_report

Random Forest Classification Report


Unnamed: 0,Academic,Practitioner,accuracy,macro avg,weighted avg
precision,0.822857,0.784067,0.794479,0.803462,0.798762
recall,0.582996,0.923457,0.794479,0.753226,0.794479
f1-score,0.682464,0.848073,0.794479,0.765269,0.785335
support,247.0,405.0,0.794479,652.0,652.0


In [24]:
gbt_pipeline = pipelines[3]

gbt_pipeline.fit(X_train, y_train)

y_pred = gbt_pipeline.predict(X_test)

gbt_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True, target_names=encoder.classes_))

print("Gradient Boosted Trees Classification Report")
gbt_report

Gradient Boosted Trees Classification Report


Unnamed: 0,Academic,Practitioner,accuracy,macro avg,weighted avg
precision,0.738197,0.821002,0.791411,0.7796,0.789633
recall,0.696356,0.849383,0.791411,0.772869,0.791411
f1-score,0.716667,0.834951,0.791411,0.775809,0.790141
support,247.0,405.0,0.791411,652.0,652.0


## Save Results

In [25]:
result_df.to_csv("../results/binary_baseline_model_results.csv", index=False)