## Import Libraries

In [26]:
import numpy as np
import pandas as pd
import re

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

np.random.seed(13)
print(sklearn.__version__)

1.0.2


## Import data

In [2]:
df = pd.read_csv("../data/JFP Credential Data - ProQuest v2.csv")

## Extract Credentials

In [3]:
df["Credentials"] = (df
                     .Authors
                     .apply(lambda x: re.findall(r"(CFP|PhD|CFA|JD|CPA|CLU|ChFC|LLM|AIF|CRC)", str(x)))
                     .apply(lambda x: np.nan if len(x) == 0 else x)
                     )

## Group Credentials into target groups

In [4]:
def target_grouping(x):
    if type(x) != list:
        return np.nan
    elif set(x) == {"PhD"}:
        return "Academic"
    elif "PhD" in x and len(set(x)) == 2:
        return "Both"
    else:
        return "Practitioner"

df["target_grouping"] = df.Credentials.apply(target_grouping)

## Drop nulls

In [5]:
df = df.dropna().reset_index(drop=True)

In [6]:
df.head()

Unnamed: 0,Journal,Title,Abstract,Authors,Tax_Title_Count,Tax_Title_Binary,Tax_Abstract_Count,Tax_Abstract_Binary,Tax_Total_Count,Tax_Total_Binary,...,LLM_Count,LLM_Binary,AIF_Count,AIF_Binary,CRC_Count,CRC_Binary,pubdate,year,Credentials,target_grouping
0,JFP,Tax Considerations for Relatively Wealthy Hous...,Reichenstein talks about households whose inco...,"Reichenstein, William, PhD CFA",1,1,2,1,3,1,...,0,0,0,0,0,0,21-Dec,2021,"[PhD, CFA]",Both
1,JFP,Using Scaffolding Learning Theory as a Framewo...,Financial professionals face client resistance...,"Sterbenz, Elizabeth, LMFT;Ross, Dylan L, CFP® ...",0,0,0,0,0,0,...,0,0,0,0,0,0,21-Dec,2021,"[CFP, PhD, PhD, CFP]",Both
2,JFP,How to Have Successful Client Conversations Ab...,Heye discusses how to have successful client c...,"Heye, Chris, PhD",0,0,0,0,0,0,...,0,0,0,0,0,0,21-Dec,2021,[PhD],Academic
3,JFP,To Reduce the Risk of Retirement Portfolio Exh...,Financial planners have long regarded diversif...,"Walker, Philip;Sacks, Barry H, PhD JD;Sacks, S...",0,0,0,0,0,0,...,0,0,0,0,0,0,21-Dec,2021,"[PhD, JD, PhD]",Both
4,JFP,My Career Path: Featuring TJ Burkett,Burkett shares his career path. He joined the ...,"Burkett, T J, CFP®",0,0,0,0,0,0,...,0,0,0,0,0,0,21-Dec,2021,[CFP],Practitioner


## Split Train and Target

In [11]:
X = df.Abstract
y = df.target_grouping

## Encode Target

In [12]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

## Create Pipeline

In [29]:
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()),
    ("scaler", StandardScaler(with_mean=False)),
    ("classifier", LogisticRegression(multi_class="ovr"))
])

## Cross Validation

In [33]:
results = cross_validate(pipeline, X, y, scoring=["precision_micro", "precision_macro"], cv=5, n_jobs=-1)

In [34]:
results

{'fit_time': array([0.40340686, 0.39232993, 0.32241297, 0.31832409, 0.38730097]),
 'score_time': array([0.02608991, 0.02849293, 0.0234189 , 0.02388883, 0.02189994]),
 'test_precision_micro': array([0.74261603, 0.75949367, 0.75527426, 0.76271186, 0.75847458]),
 'test_precision_macro': array([0.53339589, 0.44444444, 0.42526158, 0.7154797 , 0.55777778])}