# Credential Study EDA

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Seed NumPy's global random number generator so reruns start from the same state.
np.random.seed(13)
# Print the installed scikit-learn version alongside the notebook's results.
print(sklearn.__version__)

1.0.2


## Import Data

In [2]:
# Read the journal export into a DataFrame; the relative path is resolved
# against the kernel's current working directory.
df = pd.read_csv("../data/JFP Credential Data_All_Journals_Final.csv")

## Check Nulls

In [3]:
# Show per-column non-null counts and dtypes to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5086 entries, 0 to 5085
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         5086 non-null   object
 1   Abstract      3624 non-null   object
 2   Authors       4941 non-null   object
 3   documentType  5086 non-null   object
 4   pub_month     4816 non-null   object
 5   pub_year      5086 non-null   int64 
 6   pubtitle      5086 non-null   object
 7   subjectTerms  3651 non-null   object
 8   Database      5086 non-null   object
dtypes: int64(1), object(8)
memory usage: 357.7+ KB


## Extract Credentials

In [4]:
# Collect every credential abbreviation that occurs in the author string.
# Missing authors become the text "nan" via str(), which matches nothing.
# An empty match list is falsy, so `or np.nan` stores NaN for rows without
# any recognised credential.
# NOTE(review): the pattern is unanchored, so it also matches inside longer
# tokens -- confirm that is intended.
df["Credentials"] = df["Authors"].apply(
    lambda authors: re.findall(
        r"(CFP|PhD|CFA|JD|CPA|CLU|ChFC|LLM|AIF|CRC)", str(authors)
    )
    or np.nan
)

## Group Credentials into Target Groups

In [5]:
def target_grouping(x):
    """Map the credentials extracted for one row to a target group.

    Parameters
    ----------
    x : list of str or float
        Credential abbreviations found for the row, or NaN when none were found.

    Returns
    -------
    str or float
        "Academic" when PhD is the only distinct credential, "Both" when PhD
        appears together with at least one other credential, "Practitioner"
        when the list holds no PhD, and NaN when ``x`` is not a list.
    """
    if not isinstance(x, list):
        return np.nan

    distinct = set(x)
    if distinct == {"PhD"}:
        return "Academic"
    # Any other credential next to a PhD makes the row mixed. The earlier
    # check required exactly two distinct credentials, so a combination such
    # as PhD + CFP + CFA was wrongly labelled "Practitioner".
    if "PhD" in distinct:
        return "Both"
    return "Practitioner"

# Label each row by running its credential list through target_grouping.
df["target_grouping"] = df["Credentials"].apply(target_grouping)

## Drop Nulls

In [6]:
# Keep only rows that have a value in every column.
# NOTE(review): this also discards rows whose only gap is in a column the
# later cells never read (e.g. pub_month, subjectTerms) -- confirm that is intended.
df = df.dropna(axis="index", how="any")

In [7]:
# Preview the first rows, including the new Credentials and target_grouping columns.
df.head()

Unnamed: 0,Title,Abstract,Authors,documentType,pub_month,pub_year,pubtitle,subjectTerms,Database,Credentials,target_grouping
1,Tax Considerations for Relatively Wealthy Hous...,Reichenstein talks about households whose inco...,"Reichenstein, William, PhD CFA",Commentary,Dec,2021,Journal of Financial Planning,"Households , Income taxes , Medicare",ProQuest One Academic,"[PhD, CFA]",Both
5,Using Scaffolding Learning Theory as a Framewo...,Financial professionals face client resistance...,"Sterbenz, Elizabeth, LMFT;Ross, Dylan L, CFP® ...",Feature,Dec,2021,Journal of Financial Planning,"Financial planning , Theory , Methods",ProQuest One Academic,"[CFP, PhD, PhD, CFP]",Both
8,How to Have Successful Client Conversations Ab...,Heye discusses how to have successful client c...,"Heye, Chris, PhD",Commentary,Dec,2021,Journal of Financial Planning,"Financial planning , Health , Client relation...",ProQuest One Academic,[PhD],Academic
9,To Reduce the Risk of Retirement Portfolio Exh...,Financial planners have long regarded diversif...,"Walker, Philip;Sacks, Barry H, PhD JD;Sacks, S...",Feature,Dec,2021,Journal of Financial Planning,"Equity , Retirement planning , Portfolio mana...",ProQuest One Academic,"[PhD, JD, PhD]",Both
10,My Career Path: Featuring TJ Burkett,Burkett shares his career path. He joined the ...,"Burkett, T J, CFP®",Commentary,Dec,2021,Journal of Financial Planning,"Financial planning , Financial services",ProQuest One Academic,[CFP],Practitioner


## Vectorize Abstracts

In [8]:
# TF-IDF over the abstracts with scikit-learn's built-in English stop-word
# list and a vocabulary capped at 150 terms.
vectorizer = TfidfVectorizer(max_features=150, stop_words="english")

# Learn the vocabulary and produce the document-term matrix in one step.
X = vectorizer.fit_transform(df["Abstract"])

In [9]:
# One-column table holding the vocabulary terms kept by the vectorizer.
top_150_words = pd.DataFrame({"top_150_words": vectorizer.get_feature_names_out()})

In [10]:
# Write the vocabulary table to the results folder without the index column.
top_150_words.to_csv("../results/abstract_top_150_words.csv", index=False)

## Analyze Target Grouping Counts

In [11]:
# Count the rows in each target group and keep the result as a one-column frame.
target_distribution = (
    df["target_grouping"]
    .value_counts()
    .rename("target_grouping_counts")
    .to_frame()
)

In [12]:
# Write the group counts to the results folder; the index (group labels) is kept.
target_distribution.to_csv("../results/multiclass_target_distribution.csv")