## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import re

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

np.random.seed(13)
print(sklearn.__version__)

1.0.2


## Import data

In [2]:
df = pd.read_csv("../data/JFP Credential Data_All_Journals_Final.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5086 entries, 0 to 5085
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         5086 non-null   object
 1   Abstract      3624 non-null   object
 2   Authors       4941 non-null   object
 3   documentType  5086 non-null   object
 4   pub_month     4816 non-null   object
 5   pub_year      5086 non-null   int64 
 6   pubtitle      5086 non-null   object
 7   subjectTerms  3651 non-null   object
 8   Database      5086 non-null   object
dtypes: int64(1), object(8)
memory usage: 357.7+ KB


## Extract Credentials

In [4]:
df["Credentials"] = (df
                     .Authors
                     .apply(lambda x: re.findall(r"(CFP|PhD|CFA|JD|CPA|CLU|ChFC|LLM|AIF|CRC)", str(x)))
                     .apply(lambda x: np.nan if len(x) == 0 else x)
                     )

## Group Credentials into target groups

In [5]:
def target_grouping(x):
    if type(x) != list:
        return np.nan
    elif set(x) == {"PhD"}:
        return "Academic"
    elif "PhD" in x and len(set(x)) == 2:
        return "Both"
    else:
        return "Practitioner"

df["target_grouping"] = df.Credentials.apply(target_grouping)

In [6]:
df.head()

Unnamed: 0,Title,Abstract,Authors,documentType,pub_month,pub_year,pubtitle,subjectTerms,Database,Credentials,target_grouping
0,Seeking Nirvana Amidst Chaos,Buddhism teaches that the only constant in lif...,"Cummings, Bridger",Feature,Dec,2021,Journal of Financial Planning,"Millennials , Financial planners , Financial ...",ProQuest One Academic,,
1,Tax Considerations for Relatively Wealthy Hous...,Reichenstein talks about households whose inco...,"Reichenstein, William, PhD CFA",Commentary,Dec,2021,Journal of Financial Planning,"Households , Income taxes , Medicare",ProQuest One Academic,"[PhD, CFA]",Both
2,Credits,,Anonymous,Credits,Dec,2021,Journal of Financial Planning,,ProQuest One Academic,,
3,Establishing Financial Self-efficacy Among Afr...,A financial literacy gap exists between Africa...,Anonymous,Feature,Dec,2021,Journal of Financial Planning,"African Americans , Financial literacy , Fina...",ProQuest One Academic,,
4,STAT BANK,,Anonymous,General Information,Dec,2021,Journal of Financial Planning,,ProQuest One Academic,,


## Drop nulls

In [7]:
df = df.dropna(subset=["target_grouping", "Abstract"]).reset_index(drop=True)

## Split Train and Target

In [8]:
X = df.Abstract
y = df.target_grouping

In [9]:
X.shape, y.shape

((2408,), (2408,))

In [10]:
y.value_counts()

Practitioner    1713
Academic         387
Both             308
Name: target_grouping, dtype: int64

## Encode Target

In [11]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

## Create Pipeline

In [12]:
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()),
    ("scaler", StandardScaler(with_mean=False))
])

models = [LogisticRegression(multi_class="ovr"), MultinomialNB(), DecisionTreeClassifier(), RandomForestClassifier()]

pipelines = [Pipeline([
    ("vectorizer", TfidfVectorizer()),
    ("scaler", StandardScaler(with_mean=False)),
    ("classifier", model)
    ]) for model in models]

In [13]:
pipelines

[Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('classifier', LogisticRegression(multi_class='ovr'))]),
 Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('classifier', MultinomialNB())]),
 Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('classifier', DecisionTreeClassifier())]),
 Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('classifier', RandomForestClassifier())])]

## Cross Validation

In [14]:
results = [cross_validate(pipeline, X, y, scoring=["precision_weighted", "recall_weighted"], cv=5, n_jobs=-1) for pipeline in pipelines]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
result_df = pd.DataFrame({
    "model": models,
    "avg_precision_weighted": [result["test_precision_weighted"].mean() for result in results],
    "avg_recall_weighted": [result["test_recall_weighted"].mean() for result in results]
})

In [16]:
result_df

Unnamed: 0,model,avg_precision_weighted,avg_recall_weighted
0,LogisticRegression(multi_class='ovr'),0.675067,0.72219
1,MultinomialNB(),0.668272,0.675269
2,DecisionTreeClassifier(),0.607661,0.598836
3,RandomForestClassifier(),0.65327,0.718029


## Save Results

In [17]:
result_df.to_csv("../results/baseline_model_results.csv", index=False)