## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import re

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

np.random.seed(13)
print(sklearn.__version__)
print(np.__version__)
print(pd.__version__)

1.0.2
1.20.3
1.3.3


## Import data

In [2]:
df = pd.read_csv("../data/JFP Credential Data_All_Journals_Final.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5081 entries, 0 to 5080
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         5081 non-null   object
 1   Abstract      3623 non-null   object
 2   Authors       4936 non-null   object
 3   documentType  5081 non-null   object
 4   pub_month     4811 non-null   object
 5   pub_year      5081 non-null   int64 
 6   pubtitle      5081 non-null   object
 7   subjectTerms  3650 non-null   object
 8   Database      5081 non-null   object
dtypes: int64(1), object(8)
memory usage: 357.4+ KB


## Extract Credentials

In [4]:
df["Credentials"] = (df
                     .Authors
                     .apply(lambda x: re.findall(r"(CFP|PhD|CFA|JD|CPA|CLU|ChFC|LLM|AIF|CRC)", str(x)))
                     .apply(lambda x: np.nan if len(x) == 0 else x)
                     )

## Group Credentials into target groups

In [5]:
def target_grouping(x):
    if type(x) != list:
        return np.nan
    elif "PhD" in set(x):
        return "Academic"
    else:
        return "Practitioner"

df["target_grouping"] = df.Credentials.apply(target_grouping)

## Drop nulls

In [7]:
df = df.dropna(subset=["target_grouping", "Abstract"]).reset_index(drop=True)

## Split Train and Target

In [8]:
X = df.Abstract
y = df.target_grouping

In [9]:
X.shape, y.shape

((2408,), (2408,))

In [10]:
y.value_counts()

Practitioner    1504
Academic         904
Name: target_grouping, dtype: int64

In [11]:
y.value_counts().to_csv("../results/binary_target_distribution.csv")

## Encode Target

In [12]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [13]:
encoder.classes_

array(['Academic', 'Practitioner'], dtype=object)

## Create Pipeline

In [14]:
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()),
    #("scaler", StandardScaler(with_mean=False))
])

models = [LogisticRegression(), MultinomialNB(), DecisionTreeClassifier(), RandomForestClassifier()]

pipelines = [Pipeline([
    ("vectorizer", TfidfVectorizer()),
    #("scaler", StandardScaler(with_mean=False)),
    ("classifier", model)
    ]) for model in models]

In [15]:
pipelines

[Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('classifier', LogisticRegression())]),
 Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('classifier', MultinomialNB())]),
 Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('classifier', DecisionTreeClassifier())]),
 Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                 ('classifier', RandomForestClassifier())])]

## Cross Validation

In [16]:
results = [cross_validate(pipeline, X, y, scoring=["precision", "recall", "accuracy"], cv=5, n_jobs=-1) for pipeline in pipelines]

In [17]:
result_df = pd.DataFrame({
    "model": models,
    "avg_precision": [result["test_precision"].mean() for result in results],
    "avg_recall": [result["test_recall"].mean() for result in results],
    "avg_accuracy": [result["test_accuracy"].mean() for result in results]
})

In [18]:
result_df

Unnamed: 0,model,avg_precision,avg_recall,avg_accuracy
0,LogisticRegression(),0.754157,0.910937,0.756641
1,MultinomialNB(),0.671811,0.982722,0.687273
2,DecisionTreeClassifier(),0.73147,0.720749,0.659878
3,RandomForestClassifier(),0.754647,0.913597,0.757058


## Save Results

In [19]:
result_df.to_csv("../results/binary_baseline_model_results.csv", index=False)