## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import re

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

np.random.seed(13)
print(sklearn.__version__)
print(np.__version__)
print(pd.__version__)

1.6.1
2.2.0
2.2.3


## Import data

In [3]:
df = pd.read_csv("../data/JFP Credential Data_All_Journals_Final_1.2025.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         5630 non-null   object
 1   Abstract      4046 non-null   object
 2   Authors       5466 non-null   object
 3   documentType  5630 non-null   object
 4   pub_month     5337 non-null   object
 5   pub_year      5630 non-null   int64 
 6   pubtitle      5630 non-null   object
 7   subjectTerms  4067 non-null   object
 8   Database      5630 non-null   object
dtypes: int64(1), object(8)
memory usage: 396.0+ KB


## Extract Credentials

In [5]:
df["Credentials"] = (df
                     .Authors
                     .apply(lambda x: re.findall(r"(CFP|PhD|CFA|JD|CPA|CLU|ChFC|LLM|AIF|CRC)", str(x)))
                     .apply(lambda x: np.nan if len(x) == 0 else x)
                     )

## Group Credentials into target groups

In [6]:
def target_grouping(x):
    if type(x) != list:
        return np.nan
    elif "PhD" in set(x):
        return "Academic"
    else:
        return "Practitioner"

df["target_grouping"] = df.Credentials.apply(target_grouping)

## Drop nulls

In [7]:
df = df.dropna(subset=["target_grouping", "Abstract"]).reset_index(drop=True)

In [8]:
df.head()

Unnamed: 0,Title,Abstract,Authors,documentType,pub_month,pub_year,pubtitle,subjectTerms,Database,Credentials,target_grouping
0,How Do Commodities Fit into Client Portfolios?,The standard stock--bond portfolio mix remains...,"Fink, Jason D, PhD;Fink, Kristin E, PhD",Commentary,Oct,2022,Journal of Financial Planning,"Financial planning , Investment , Bonds , Sto...",ProQuest One Academic,"[PhD, PhD]",Academic
1,The Role of Financial Planners on African Amer...,"* Addressing the issue of ""wealth gap,"" partic...","Chen, Leon, PhD, FRM;Duffy, Sophia, JD, CPA;Hi...",Commentary,Nov,2022,Journal of Financial Planning,"Credit , Small business , Wealth distribution...",ProQuest One Academic,"[PhD, JD, CPA, PhD, CFP]",Academic
2,"How to Effectively Manage Six ""Oh Bleep"" Clien...",Managing difficult client situations is part o...,"Heye, Chris, PhD",Journal Article,May,2022,Journal of Financial Planning,"Interest rates , Financial management , Finan...",ProQuest One Academic,[PhD],Academic
3,Inflation Uncertainty Calls for Portfolio Prot...,"Inflation currently sits at 7.9 percent, havin...","Marotta, Ryann, CFP®, CFA",Journal Article,May,2022,Journal of Financial Planning,"Investors , Investments , Pandemics , Monetar...",ProQuest One Academic,"[CFP, CFA]",Practitioner
4,Financial Planners' Gray Divorce Checklist,Divorce today is less common for adults younge...,"Stephenson, Angie M, CFP®, CPA/PFS",Journal Article,May,2022,Journal of Financial Planning,"Divorce , Marital separation , Marriage , Tax...",ProQuest One Academic,"[CFP, CPA]",Practitioner


In [9]:
X = df.Abstract
y = df.target_grouping

In [10]:
X.shape, y.shape

((2608,), (2608,))

In [11]:
y.value_counts()

target_grouping
Practitioner    1620
Academic         988
Name: count, dtype: int64

In [11]:
y.value_counts().to_csv("../results/binary_target_distribution.csv")

## Encode Target

In [12]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [13]:
encoder.classes_

array(['Academic', 'Practitioner'], dtype=object)

## Create Pipeline

In [14]:
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()),
    #("scaler", StandardScaler(with_mean=False))
])

models = [LogisticRegression(), MultinomialNB(), DecisionTreeClassifier(), RandomForestClassifier()]

pipelines = [Pipeline([
    ("vectorizer", TfidfVectorizer()),
    #("scaler", StandardScaler(with_mean=False)),
    ("classifier", model)
    ]) for model in models]

In [15]:
pipelines

[Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('classifier', LogisticRegression())]),
 Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('classifier', MultinomialNB())]),
 Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('classifier', DecisionTreeClassifier())]),
 Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('classifier', RandomForestClassifier())])]

## Cross Validation

In [16]:
results = [cross_validate(pipeline, X, y, scoring=["precision", "recall", "accuracy"], cv=5, n_jobs=-1) for pipeline in pipelines]

In [17]:
result_df = pd.DataFrame({
    "model": models,
    "avg_precision": [result["test_precision"].mean() for result in results],
    "avg_recall": [result["test_recall"].mean() for result in results],
    "avg_accuracy": [result["test_accuracy"].mean() for result in results]
})

In [18]:
result_df

Unnamed: 0,model,avg_precision,avg_recall,avg_accuracy
0,LogisticRegression(),0.754157,0.910937,0.756641
1,MultinomialNB(),0.671811,0.982722,0.687273
2,DecisionTreeClassifier(),0.73147,0.720749,0.659878
3,RandomForestClassifier(),0.754647,0.913597,0.757058


## Save Results

In [19]:
result_df.to_csv("../results/binary_baseline_model_results.csv", index=False)