# 02 - Feature Engineering
## Pre-Scorer Features, TF-IDF, and Money Extraction

Deterministic features computed before LLM scoring. Zero cost, runs locally.

**Data source:** Parameterized via `AWESOMEBITS_DB` env var.

In [None]:
import sys
sys.path.insert(0, '.')
from helpers import *
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from price_parser import Price
import re

# Run the helpers' plotting setup and open the database connection.
setup_plotting()
con = connect()

# Load every project row, then add the derived columns one at a time
# (label first, then text) by applying the helper row functions.
df = con.execute('SELECT * FROM projects').df()
for derived_col, row_fn in (('label', label_project), ('text', combined_text)):
    df[derived_col] = df.apply(row_fn, axis=1)

application_total = len(df)
print(f'Loaded {application_total:,} applications')

## Pre-Scorer Feature Replication

Replicate the Ruby `PreScorer` features in Python for analysis.

In [None]:
def compute_features(row):
    """Replicate Ruby PreScorer features.

    Builds deterministic text statistics for one project row from the fields
    listed in ``TEXT_FIELDS``.

    Args:
        row: Record exposing ``.get(field, default)`` (for example a
            DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns:
        pd.Series keyed by feature name: per-field and total word counts,
        field length variance, sentence statistics, punctuation counts,
        URL / money / number / email counts, and the number of empty fields.
    """
    def _field_text(value):
        # Missing values (None, NaN, pd.NA) count as empty text. Without this
        # check NaN is truthy, so it would be rendered as the word 'nan' and
        # the field would not be counted as empty.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return str(value or '')

    features = {}
    texts = {f: _field_text(row.get(f, '')) for f in TEXT_FIELDS}
    all_text = ' '.join(t.strip() for t in texts.values() if t.strip())

    # Word counts
    for f in TEXT_FIELDS:
        features[f'wc_{f}'] = word_count(texts[f])
    features['wc_total'] = sum(features[f'wc_{f}'] for f in TEXT_FIELDS)

    # Field length variance (np.var default: population variance)
    counts = [features[f'wc_{f}'] for f in TEXT_FIELDS]
    features['field_length_variance'] = np.var(counts)

    # Sentence stats: split on runs of '.', '!' or '?' and drop blank pieces
    sentences = [s.strip() for s in re.split(r'[.!?]+', all_text) if s.strip()]
    sent_lens = [len(s.split()) for s in sentences]
    features['sentence_count'] = len(sentences)
    features['avg_sentence_length'] = np.mean(sent_lens) if sent_lens else 0
    features['sentence_length_variance'] = np.var(sent_lens) if sent_lens else 0

    # Punctuation
    features['exclamation_count'] = all_text.count('!')
    features['question_mark_count'] = all_text.count('?')

    # Content signals
    features['url_count'] = len(re.findall(r'https?://\S+', all_text))
    features['money_mention_count'] = money_mention_count(all_text)
    features['number_count'] = len(re.findall(r'\b\d[\d,.]*\b', all_text))
    features['email_count'] = len(re.findall(r'\S+@\S+\.\S+', all_text))

    # Empty fields (whitespace-only text counts as empty)
    features['empty_field_count'] = sum(1 for f in TEXT_FIELDS if not texts[f].strip())

    return pd.Series(features)

# Compute for all projects
print('Computing pre-scorer features...')
feat_df = df.apply(compute_features, axis=1)
# Drop feature columns left over from an earlier run of this cell so that
# re-executing it replaces them instead of appending duplicate column names
# (duplicates make later df[col] lookups return a DataFrame, not a Series).
df = pd.concat([df.drop(columns=feat_df.columns, errors='ignore'), feat_df], axis=1)
print(f'Features computed: {list(feat_df.columns)}')

## Feature Distributions: Funded vs Hidden

In [None]:
# Compare feature distributions
labeled = df[df.label.isin(['funded', 'hidden'])].copy()
feature_cols = [c for c in feat_df.columns if c != 'empty_field_count']

# Size the grid from the number of features. A fixed 3x4 grid would make
# zip() silently skip every feature beyond the twelfth.
n_plot_cols = 4
n_plot_rows = max(1, -(-len(feature_cols) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(
    n_plot_rows, n_plot_cols,
    figsize=(16, 4 * n_plot_rows),  # 4 inches per row, matching the old 3-row / 12-inch layout
    squeeze=False,  # always a 2-D array of axes, whatever the row count
)
for ax, col in zip(axes.flat, feature_cols):
    for label, color in [('funded', 'mediumseagreen'), ('hidden', 'salmon')]:
        data = labeled[labeled.label == label][col]
        # Clip the top 1% so a few extreme values do not flatten the histogram
        q99 = data.quantile(0.99)
        ax.hist(data[data <= q99], bins=30, alpha=0.6, color=color, label=label, density=True)
    ax.set_title(col, fontsize=10)
    ax.legend(fontsize=8)
# Hide any unused axes in the last row
for ax in axes.flat[len(feature_cols):]:
    ax.set_visible(False)
plt.suptitle('Feature Distributions: Funded vs Hidden', y=1.01)
plt.tight_layout()
plt.show()

In [None]:
# Feature means by label
label_means = df.groupby('label')[feature_cols].mean().T
summary = label_means.round(2)
# Take the ratio from the unrounded means: after rounding to 2 decimals, small
# means collapse to 0.0 and the ratio would become NaN or badly distorted.
summary['funded_hidden_ratio'] = (
    label_means['funded'] / label_means['hidden'].replace(0, np.nan)
).round(2)
print(summary[['funded', 'hidden', 'unlabeled', 'funded_hidden_ratio']].to_string())

## Money Extraction Analysis

Regex pattern matching is used to find currency mentions across the full corpus. This feeds the `money_mention_count` pre-scorer feature.

In [None]:
# Money mention distribution
print('Money mention counts by label:')
for label in ['funded', 'hidden', 'unlabeled']:
    subset = df[df.label == label]
    mc = subset.money_mention_count
    print(f'  {label:>10}: mean={mc.mean():.2f}, median={mc.median():.0f}, '
          f'zero={((mc == 0).mean()):.1%}, 3+={((mc >= 3).mean()):.1%}')

# Show examples of extracted money mentions
print('\nSample money extractions (funded apps):')
funded_money_rows = df[(df.label == 'funded') & (df.money_mention_count > 0)]
# Cap the sample size at the rows available: sample(5) raises ValueError when
# fewer than five funded applications mention money.
funded_with_money = funded_money_rows.sample(min(5, len(funded_money_rows)), random_state=42)
for _, row in funded_with_money.iterrows():
    mentions = extract_money_mentions(row.text)
    print(f'  [{row.money_mention_count} mentions] {mentions[:5]}')

## TF-IDF Analysis

Compute corpus-wide IDF values to identify distinctive vocabulary in funded vs hidden applications. The static IDF table can be exported for use in the Ruby pre-scorer.

In [None]:
# Fit a TF-IDF vectorizer (unigrams + bigrams) over every application's text
vectorizer_options = dict(
    max_features=10000,
    stop_words='english',
    min_df=10,
    max_df=0.8,
    ngram_range=(1, 2),
)
tfidf = TfidfVectorizer(**vectorizer_options)
corpus = df['text'].fillna('')
tfidf_matrix = tfidf.fit_transform(corpus)
feature_names = tfidf.get_feature_names_out()
idf_values = tfidf.idf_

vocab_size = len(feature_names)
idf_low, idf_high = idf_values.min(), idf_values.max()
print(f'Vocabulary size: {vocab_size:,}')
print(f'IDF range: {idf_low:.2f} to {idf_high:.2f}')

# Terms ordered by ascending IDF: the head holds the most common terms,
# the tail holds the rarest.
idf_df = pd.DataFrame({'term': feature_names, 'idf': idf_values}).sort_values('idf')
idf_sections = (
    ('\nMost common terms (lowest IDF):', idf_df.head(20)),
    ('\nRarest terms (highest IDF):', idf_df.tail(20)),
)
for heading, rows in idf_sections:
    print(heading)
    print(rows.to_string(index=False))

In [None]:
# Rank terms by the gap between their mean TF-IDF weight in funded
# applications and in hidden applications.
def _mean_tfidf(row_mask):
    """Return the per-term mean TF-IDF weight over the rows selected by row_mask."""
    return np.asarray(tfidf_matrix[row_mask].mean(axis=0)).flatten()


funded_mask = (df['label'] == 'funded').values
hidden_mask = (df['label'] == 'hidden').values

funded_mean = _mean_tfidf(funded_mask)
hidden_mean = _mean_tfidf(hidden_mask)

# Positive diff => heavier in funded; negative diff => heavier in hidden.
diff = funded_mean - hidden_mean
diff_df = pd.DataFrame(
    {'term': feature_names, 'funded_mean': funded_mean, 'hidden_mean': hidden_mean, 'diff': diff}
).sort_values('diff', ascending=False)

shown_cols = ['term', 'funded_mean', 'hidden_mean', 'diff']
print('Terms most associated with FUNDED:')
print(diff_df.head(25)[shown_cols].to_string(index=False))
print('\nTerms most associated with HIDDEN:')
print(diff_df.tail(25)[shown_cols].to_string(index=False))

In [None]:
# Side-by-side horizontal bar charts of the 20 most distinctive terms per label
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 8))

top_funded = diff_df.head(20)
# Reverse the tail so the most hidden-leaning term comes first.
top_hidden = diff_df.tail(20).iloc[::-1]

panels = (
    (ax1, top_funded.term, top_funded['diff'], 'mediumseagreen', 'Top 20 Terms: Funded'),
    # Hidden diffs are negative; plot their magnitudes so the bars extend rightwards.
    (ax2, top_hidden.term, top_hidden['diff'].abs(), 'salmon', 'Top 20 Terms: Hidden'),
)
for axis, terms, bar_widths, bar_color, panel_title in panels:
    axis.barh(terms, bar_widths, color=bar_color)
    axis.set_title(panel_title)
    axis.invert_yaxis()  # put the first (strongest) term at the top

plt.suptitle('TF-IDF: Most Distinctive Terms by Label')
plt.tight_layout()
plt.show()

## Export Static IDF Table

Export the IDF values as JSON for use in the Ruby pre-scorer. This avoids recomputing TF-IDF at scoring time.

In [None]:
import json
from pathlib import Path

# Pair each vocabulary term with its IDF weight rounded to 4 decimal places.
rounded_idf = idf_values.round(4).tolist()
idf_export = {term: weight for term, weight in zip(feature_names.tolist(), rounded_idf)}

# Write the table next to the database file.
out_path = Path(DB_PATH).parent / 'idf_table.json'
with out_path.open('w') as f:
    json.dump(idf_export, f)
exported_total = len(idf_export)
print(f'Exported {exported_total:,} IDF values to {out_path}')

## Feature Correlations

In [None]:
# Pairwise correlations between the pre-scorer features, drawn as an
# annotated heatmap with a diverging colour map centred on zero.
corr = df[feature_cols].corr()
fig, ax = plt.subplots(figsize=(12, 10))
heatmap_style = {'annot': True, 'fmt': '.2f', 'cmap': 'RdBu_r', 'center': 0}
sns.heatmap(corr, ax=ax, **heatmap_style)
ax.set_title('Pre-Scorer Feature Correlations')
plt.tight_layout()
plt.show()