# Credential Study Data Prep

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Seed NumPy's legacy global random number generator with a fixed value.
np.random.seed(13)
# Record the scikit-learn version the notebook was executed with.
print(sklearn.__version__)

1.0.2


## Import Data

In [14]:
# Load the raw credential dataset; the relative path resolves against the
# kernel's current working directory.
df = pd.read_csv("JFP Credential Data - ProQuest v2.csv")

In [15]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,Journal,Title,Abstract,Authors,Tax_Title_Count,Tax_Title_Binary,Tax_Abstract_Count,Tax_Abstract_Binary,Tax_Total_Count,Tax_Total_Binary,...,ChFC_Count,ChFC_Binary,LLM_Count,LLM_Binary,AIF_Count,AIF_Binary,CRC_Count,CRC_Binary,pubdate,year
0,JFP,Seeking Nirvana Amidst Chaos,Buddhism teaches that the only constant in lif...,"Cummings, Bridger",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,21-Dec,2021
1,JFP,Tax Considerations for Relatively Wealthy Hous...,Reichenstein talks about households whose inco...,"Reichenstein, William, PhD CFA",1,1,2,1,3,1,...,0,0,0,0,0,0,0,0,21-Dec,2021
2,JFP,Credits,,Anonymous,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,21-Dec,2021
3,JFP,Establishing Financial Self-efficacy Among Afr...,A financial literacy gap exists between Africa...,Anonymous,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,21-Dec,2021
4,JFP,STAT BANK,,Anonymous,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,21-Dec,2021


## Extract Credentials

In [16]:
# Credential abbreviations searched for in the free-text Authors field.
# The \b anchors stop an abbreviation from matching inside a longer token
# (e.g. "AIF" inside "AIFA"); a trailing "®" is a non-word character, so
# "CFP®" still matches.
CREDENTIAL_PATTERN = re.compile(r"\b(?:CFP|PhD|CFA|JD|CPA|CLU|ChFC|LLM|AIF|CRC)\b")


def extract_credentials(authors):
    """Return the credential abbreviations found in an Authors value.

    Parameters
    ----------
    authors : object
        Raw Authors cell. It is coerced with ``str()`` first, so a missing
        value (NaN) becomes the text "nan" and simply yields no matches.

    Returns
    -------
    list of str or float
        Matches in order of appearance, duplicates kept (several co-authors
        may hold the same credential); ``np.nan`` when nothing matches.
    """
    matches = CREDENTIAL_PATTERN.findall(str(authors))
    return matches if matches else np.nan


df["Credentials"] = df.Authors.apply(extract_credentials)

## Group Credentials into Target Groups

In [17]:
def target_grouping(x):
    """Map a list of credential abbreviations to a target group label.

    The list is pooled over every author of an article, so "Both" can mean a
    single author with mixed credentials or co-authors of different kinds.

    Parameters
    ----------
    x : list of str or any
        Credentials extracted for one row; anything that is not a list
        (e.g. NaN for "no credentials found") is treated as missing.

    Returns
    -------
    str or float
        "Academic" when PhD is the only distinct credential, "Both" when PhD
        appears alongside at least one other credential, "Practitioner" when
        no PhD is present, and ``np.nan`` for non-list input.
    """
    if not isinstance(x, list):
        return np.nan

    distinct = set(x)
    if distinct == {"PhD"}:
        return "Academic"
    # Any PhD combined with one OR MORE other credentials is "Both"; the
    # previous `len(set(x)) == 2` check sent e.g. [PhD, CFP, CFA] to
    # "Practitioner".
    if "PhD" in distinct:
        return "Both"
    return "Practitioner"

# Label every row from its extracted credential list (NaN stays NaN).
df["target_grouping"] = df["Credentials"].map(target_grouping)

## Drop Rows with Missing Values

In [18]:
# Keep only fully populated rows: a null in ANY column removes the row.
# NOTE(review): this also drops rows with nulls in columns unrelated to the
# analysis; consider `subset=[...]` if only Abstract/Credentials matter.
df = df.dropna(axis="index", how="any")

In [19]:
# Preview the filtered frame, now including Credentials and target_grouping.
df.head()

Unnamed: 0,Journal,Title,Abstract,Authors,Tax_Title_Count,Tax_Title_Binary,Tax_Abstract_Count,Tax_Abstract_Binary,Tax_Total_Count,Tax_Total_Binary,...,LLM_Count,LLM_Binary,AIF_Count,AIF_Binary,CRC_Count,CRC_Binary,pubdate,year,Credentials,target_grouping
1,JFP,Tax Considerations for Relatively Wealthy Hous...,Reichenstein talks about households whose inco...,"Reichenstein, William, PhD CFA",1,1,2,1,3,1,...,0,0,0,0,0,0,21-Dec,2021,"[PhD, CFA]",Both
5,JFP,Using Scaffolding Learning Theory as a Framewo...,Financial professionals face client resistance...,"Sterbenz, Elizabeth, LMFT;Ross, Dylan L, CFP® ...",0,0,0,0,0,0,...,0,0,0,0,0,0,21-Dec,2021,"[CFP, PhD, PhD, CFP]",Both
8,JFP,How to Have Successful Client Conversations Ab...,Heye discusses how to have successful client c...,"Heye, Chris, PhD",0,0,0,0,0,0,...,0,0,0,0,0,0,21-Dec,2021,[PhD],Academic
9,JFP,To Reduce the Risk of Retirement Portfolio Exh...,Financial planners have long regarded diversif...,"Walker, Philip;Sacks, Barry H, PhD JD;Sacks, S...",0,0,0,0,0,0,...,0,0,0,0,0,0,21-Dec,2021,"[PhD, JD, PhD]",Both
10,JFP,My Career Path: Featuring TJ Burkett,Burkett shares his career path. He joined the ...,"Burkett, T J, CFP®",0,0,0,0,0,0,...,0,0,0,0,0,0,21-Dec,2021,[CFP],Practitioner


## Vectorize Abstracts

In [57]:
# Vocabulary cap for the TF-IDF model.
MAX_FEATURES = 150

# English stop words are removed before the vocabulary is capped.
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, stop_words="english")

# Learn the vocabulary from the abstracts and build the TF-IDF matrix in one pass.
X = vectorizer.fit_transform(df["Abstract"])

In [64]:
# One-column frame of the fitted vocabulary, in the order returned by
# get_feature_names_out() (not ranked by score).
top_150_words = pd.DataFrame({"top_150_words": vectorizer.get_feature_names_out()})

In [66]:
# Write the vocabulary to CSV without the integer row index.
top_150_words.to_csv("abstract_top_150_words.csv", index=False)

## Analyze Target Grouping Counts

In [75]:
# Count rows per target group and expose the counts as a one-column frame
# indexed by group label.
target_distribution = (
    df["target_grouping"]
    .value_counts()
    .to_frame("target_grouping_counts")
)

In [76]:
# Persist the counts; the index (the group labels) is written as the first column.
target_distribution.to_csv("target_distribution.csv")