In [1]:
import numpy as np
import pandas as pd
import ast


### GPU Acceleration --IGNORE IF NOT USING COLAB--

In [2]:
# Below line allows for GPU acceleration of sci-kit modeling apis
%load_ext cuml.accel

cuML: Accelerator installed.


In [3]:
#checking cuml and IPython versions, some versions are not compatible. cuml 25.06.00 and IPython 7.34.0 are compatible

import IPython
print(f"IPython version: {IPython.__version__}")

try:
    import cuml
    print(f"cuml version: {cuml.__version__}")
except ImportError:
    print("cuml is not installed or could not be imported.")

IPython version: 7.34.0
cuml version: 25.06.00


### If you are mounting the files from drive

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Using CSV

In [None]:
main_combined_df_path = '/content/drive/MyDrive/final combined data/all_features_df.csv'
# main_combined_df_path = 'all_features_df.csv'
df = pd.read_csv(main_combined_df_path)
df['claim_text'] = df['claim_text'].apply(ast.literal_eval)



#### Using Pickle File --RECOMMENDED--

In [None]:
pkl_path = ''
df = pd.read_pickle()

In [None]:
pd.set_option('display.max_columns', None)
print(df.columns)
print(len(df))
df.head()

### Manual Vectorizer Creation --IGNORE--

Vectorization does work if you do it from scratch, however it does not save to the joblib file properly for some reason. This is not the preferred way to make a sparse matrix since Karl has provided a vectorizationn object that does work with claims text

In [None]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# tfidf_features = joblib.load('./tfidf_features.joblib')
# tlidf_vectorizer = joblib.load('./tfidf_vectorizer.joblib')

# suppose texts is a NumPy array (or list) of token lists
# e.g. [['novel', 'catalyst'], ['apparatus', 'for', '...'], ...]

def identity(tokens):
    return tokens

tlidf_vectorizer = TfidfVectorizer(
    analyzer=identity,     # accept pre-tokenized lists
    lowercase=False,       # tokens already normalized
    dtype=float,
    norm="l2"
)

In [None]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

def build_vocabulary(input_df):
    #This builds a vocabulary based on the tokens in the claim_text column
    #it saves the vocabulary to whatever you put in output_vocab_file which should be a .json file

    vocab = set()

    reader = input_df['claim_text']
    print('Creating vocabulary...')
    for token_list in reader:
        vocab.update(token_list)
    print(f"Vocabulary created with {len(vocab):,} unique tokens")

    # Convert set to sorted list for stable saving
    vocab_list = sorted(vocab)

    return vocab_list

def build_claim_vectorizer(vocab):
    """Return a TF-IDF vectorizer with the same settings as fit_tfidf_vectorizer."""
    vectorizer = TfidfVectorizer(
        vocabulary={tok: i for i, tok in enumerate(vocab)},
        analyzer=lambda tokens: tokens,   # accept pre-tokenized lists
        lowercase=False,
        norm="l2",
        dtype=float,
        use_idf=True,

    )
    return vectorizer

# usage
# df['claim_text'] is a column of lists, e.g. ['apparatus', 'comprising', ...]

vocab_list = build_vocabulary(df)
vectorizer = build_claim_vectorizer(vocab_list)
sparse_matrix = vectorizer.fit_transform(df['claim_text'])

Creating vocabulary...
Vocabulary created with 637,906 unique tokens




In [None]:
from scipy import sparse
num_docs = len(df['claim_text'])
token_to_idx = vectorizer.vocabulary_
df_counts = np.zeros(len(token_to_idx), dtype=np.int64)

for tokens in df['claim_text']:
    for tok in set(tokens):
        idx = token_to_idx.get(tok)
        if idx is not None:
            df_counts[idx] += 1


idf = np.log((1 + num_docs) / (1 + df_counts)) + 1.0
vectorizer.idf_ = idf
vectorizer._tfidf._idf_diag = sparse.spdiags(idf, diags=0, m=len(idf), n=len(idf))
joblib.dump(vectorizer, './tfidf_vectorizer_v2.joblib')


PicklingError: Can't pickle <function build_claim_vectorizer.<locals>.<lambda> at 0x9ac9c4fe0>: it's not found as __main__.build_claim_vectorizer.<locals>.<lambda>

### Vectorizer Loading -- DONT IGNORE --

In [None]:
# sparse_matrix = tlidf_vectorizer.transform(df['claim_text'])
tlidf_vectorizer = joblib.load('/content/drive/MyDrive/final combined data/tfidf_vectorizer_for_bryant.joblib')
sparse_matrix = tlidf_vectorizer.fit_transform(df['claim_text'])






### SVD of tfidf matrix, Creation of X data and Y data for train-test-split

In [None]:
from sklearn.decomposition import TruncatedSVD
num_dimensions = 100
svd = TruncatedSVD(
    n_components=num_dimensions,
    n_iter=7,
    random_state=70
)
sparse_matrix.shape


(3909923, 637906)

In [None]:
from scipy.sparse import csr_matrix
from scipy.sparse import hstack

extra_np = (
    df.drop(columns=['claim_text', 'application_number', 'y_approved', 'y_101', 'y_102', 'y_103', 'y_112'])
    .apply(pd.to_numeric, errors="coerce")  # convert strings/bools -> numbers
    .fillna(0)                              # replace NaNs
    .to_numpy(dtype=np.float64)             # ensure a numeric dtype
)

extra_sparse = csr_matrix(extra_np)
print(f"Performing TruncatedSVD with {num_dimensions} components...")
tfidf_svd = svd.fit_transform(sparse_matrix)

X_aug_combined = hstack([tfidf_svd, extra_sparse])
X_aug_no_tfidf = extra_sparse


Performing TruncatedSVD with 100 components...


: 

In [None]:
y_rejected = df['y_approved'] == 0
y_rejected = y_rejected.astype(int)
y_has_rej101 = df['y_101']
y_has_rej102 = df['y_102']
y_has_rej103 = df['y_103']
y_has_rej112 = df['y_112']

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


### Train test split of all data

Below is train test split of data without tfidf sparse matrix

In [None]:
#  getting train and test data for data with no tfidf
from sklearn.model_selection import train_test_split
X_train_rejected_no_tfidf, X_test_rejected_no_tfidf, y_train_rejected_no_tfidf, y_test_rejected_no_tfidf = train_test_split(X_aug_no_tfidf_svd, y_rejected, test_size=0.2, random_state=42)
X_train_has_rej101_no_tfidf, X_test_has_rej101_no_tfidf, y_train_has_rej101_no_tfidf, y_test_has_rej101_no_tfidf = train_test_split(X_aug_no_tfidf_svd, y_has_rej101, test_size=0.2, random_state=42)
X_train_has_rej102_no_tfidf, X_test_has_rej102_no_tfidf, y_train_has_rej102_no_tfidf, y_test_has_rej102_no_tfidf = train_test_split(X_aug_no_tfidf_svd, y_has_rej102, test_size=0.2, random_state=42)
X_train_has_rej103_no_tfidf, X_test_has_rej103_no_tfidf, y_train_has_rej103_no_tfidf, y_test_has_rej103_no_tfidf = train_test_split(X_aug_no_tfidf_svd, y_has_rej103, test_size=0.2, random_state=42)
X_train_has_rej112_no_tfidf, X_test_has_rej112_no_tfidf, y_train_has_rej112_no_tfidf, y_test_has_rej112_no_tfidf = train_test_split(X_aug_no_tfidf_svd, y_has_rej112, test_size=0.2, random_state=42)

NameError: name 'X_aug_no_tfidf_svd' is not defined

In [None]:
#  getting train and test data for data with tfidf
from sklearn.model_selection import train_test_split

X_train_rejected, X_test_rejected, y_train_rejected, y_test_rejected = train_test_split(X_aug_combined_svd, y_rejected, test_size=0.2, random_state=42)
X_train_has_rej101, X_test_has_rej101, y_train_has_rej101, y_test_has_rej101 = train_test_split(X_aug_combined_svd, y_has_rej101, test_size=0.2, random_state=42)
X_train_has_rej102, X_test_has_rej102, y_train_has_rej102, y_test_has_rej102 = train_test_split(X_aug_combined_svd, y_has_rej102, test_size=0.2, random_state=42)
X_train_has_rej103, X_test_has_rej103, y_train_has_rej103, y_test_has_rej103 = train_test_split(X_aug_combined_svd, y_has_rej103, test_size=0.2, random_state=42)
X_train_has_rej112, X_test_has_rej112, y_train_has_rej112, y_test_has_rej112 = train_test_split(X_aug_combined_svd, y_has_rej112, test_size=0.2, random_state=42)

### Modeling

In [None]:
def get_best_random_gb_params(X_train, y_train, scoring = 'f1', n_iter = 20):
    gb = GradientBoostingClassifier(random_state=0)
    gb_space = {
        "n_estimators": np.arange(50, 401),
        "learning_rate": np.logspace(-3, 0, 100),
        "max_depth": np.arange(1, 6),
        "min_samples_split": np.arange(2, 21),
    }
    gb_rand = RandomizedSearchCV(
        gb,
        gb_space,
        n_iter=n_iter,
        scoring=scoring,
        cv=5,
        random_state=0,
        n_jobs=-1,
    )
    gb_rand.fit(X_train, y_train)
    gb_best = gb_rand.best_params_
    return gb_best

def get_best_random_rf_params(X_train, y_train, scoring = 'f1', n_iter = 20):
    rf = RandomForestClassifier(random_state=0)
    rf_space = {
        "n_estimators": np.arange(100, 801),
        "max_depth": [None, *np.arange(5, 31)],
        "min_samples_split": np.arange(2, 21),
        "min_samples_leaf": np.arange(1, 11),
        "max_features": ["sqrt", "log2", 0.5, None],
    }
    rf_rand = RandomizedSearchCV(
        rf,
        rf_space,
        n_iter=n_iter,
        scoring=scoring,
        cv=5,
        random_state=0,
        n_jobs=-1,
    )
    rf_rand.fit(X_train, y_train)
    rf_best = rf_rand.best_params_
    return rf_best

def get_best_grid_search_gb_model(X_train, y_train, gb_best, scoring = 'f1'):
    gb_grid = {
        "n_estimators": [gb_best["n_estimators"] - 50,
                        gb_best["n_estimators"],
                        gb_best["n_estimators"] + 50],
        "learning_rate": [gb_best["learning_rate"] * f for f in [0.5, 1.0, 1.5]],
        "max_depth": [gb_best["max_depth"]],
        "min_samples_split": [gb_best["min_samples_split"] - 1,
                            gb_best["min_samples_split"],
                            gb_best["min_samples_split"] + 1],
    }
    gb_grid = {k: [v] if not isinstance(v, list) else v for k, v in gb_grid.items()}
    gb_grid = {k: [val for val in vals if val is not None and val > 0] for k, vals in gb_grid.items()}
    gb_gridsearch = GridSearchCV(
        GradientBoostingClassifier(random_state=0),
        gb_grid,
        scoring=scoring,
        cv=5,
        n_jobs=-1,
    )
    return gb_gridsearch

def get_best_grid_search_rf_model(X_train, y_train, rf_best, scoring = 'f1'):
    rf_grid = {
        "n_estimators": [rf_best["n_estimators"] - 100,
                        rf_best["n_estimators"],
                        rf_best["n_estimators"] + 100],
        "max_depth": [None if rf_best["max_depth"] is None else d for d in [
            rf_best["max_depth"] - 5,
            rf_best["max_depth"],
            rf_best["max_depth"] + 5,
        ] if d is None or d > 0],
        "min_samples_split": [max(2, rf_best["min_samples_split"] - 1),
                            rf_best["min_samples_split"],
                            rf_best["min_samples_split"] + 1],
        "min_samples_leaf": [max(1, rf_best["min_samples_leaf"] - 1),
                            rf_best["min_samples_leaf"],
                            rf_best["min_samples_leaf"] + 1],
        "max_features": [rf_best["max_features"]],
    }
    rf_grid = {k: [v] if not isinstance(v, list) else v for k, v in rf_grid.items()}
    rf_grid = {k: [val for val in vals if val is not None] for k, vals in rf_grid.items()}
    rf_gridsearch = GridSearchCV(
        RandomForestClassifier(random_state=0),
        rf_grid,
        scoring=scoring,
        cv=5,
        n_jobs=-1,
    )
    return rf_gridsearch

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import time

def create_models(X_train, y_train, rf_params = None, gb_params = None):
    models = {}
    models['LogR'] = LogisticRegression(solver='saga')
    models['LinR'] = LinearRegression()
    if rf_params:
        models['RF'] = RandomForestClassifier(**rf_params)
    else:
        models['RF'] = RandomForestClassifier(n_estimators=100)
    if gb_params:
        models['GB'] = GradientBoostingClassifier(**gb_params)
    else:
        models['GB'] = GradientBoostingClassifier(n_estimators=100)
    print("Fitting Logistic Regression")
    start_time = time.time()
    models['LogR'].fit(X_train, y_train)
    end_time = time.time()
    print(f"Logistic Regression training time: {end_time - start_time} seconds")
    print("Fitting Linear Regression")
    start_time = time.time()
    models['LinR'].fit(X_train, y_train)
    end_time = time.time()
    print(f"Linear Regression training time: {end_time - start_time} seconds")
    print("Fitting Random Forest")
    start_time = time.time()
    models['RF'].fit(X_train, y_train)
    end_time = time.time()
    print(f"Random Forest training time: {end_time - start_time} seconds")
    print("Fitting Gradient Boosting")
    start_time = time.time()
    models['GB'].fit(X_train, y_train)
    end_time = time.time()
    print(f"Gradient Boosting training time: {end_time - start_time} seconds")
    return models

In [None]:
# Fill in below with the right data, currently filled in for overall rejections using X values with no tfidf data
X_train = X_train_rejected_no_tfidf
y_train = y_train_rejected_no_tfidf
X_test = X_test_rejected_no_tfidf
y_test = y_test_rejected_no_tfidf

In [None]:
rf_params_random = get_best_random_rf_params(X_train, y_train)
gb_params_random = get_best_random_gb_params(X_train, y_train)
rf_params_grid = get_best_grid_search_rf_model(X_train, y_train, rf_params_random)
gb_params_grid = get_best_grid_search_gb_model(X_train, y_train, gb_params_random)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix



#fill in rf params and gb params with the best params from randomized search or grid search
rf_params = rf_params_grid
gb_params = gb_params_grid
# get accuracy, F1, precision, recall, and confusion matrix
models = create_models(X_train, y_train)
for model_name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"F1 Score: {f1_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"Confusion Matrix: {confusion_matrix(y_test, y_pred)}")
    print("\n")



Fitting Logistic Regression




Logistic Regression training time: 276.5072569847107 seconds
Fitting Linear Regression
Linear Regression training time: 18.159186840057373 seconds
Fitting Random Forest
