In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tools import add_constant

In [None]:
# Notebook-wide display settings: three-decimal floats in pandas output and
# a presentation-sized seaborn theme for every figure.
pd.set_option("display.float_format", "{:0.3f}".format)
sns.set_theme(context="talk", style="whitegrid")

# Load the spam dataset from its hosted CSV, then preview a reproducible
# random sample of ten rows (fixed seed) as the cell output.
SPAM_CSV_URL = "https://jdgrossman.com/assets/spam.csv"
spam = pd.read_csv(SPAM_CSV_URL)
spam.sample(n=10, random_state=42)

Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,char_semicolon,char_left_paren,char_left_bracket,char_exclamation,char_dollar,char_pound,capital_run_length_average,capital_run_length_longest,capital_run_length_total,is_spam
3683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,3,0
4412,0.71,0.0,0.71,0.0,0.0,0.0,0.0,0.0,0.0,0.71,...,0.0,0.0,0.0,0.0,0.0,0.0,1.032,2,32,0
2584,0.0,0.0,0.91,0.0,0.0,0.0,0.0,0.45,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.32,7,103,0
69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.201,0.0,0.0,0.1,0.0,4.548,59,141,1
1844,0.0,0.0,0.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.188,0.047,0.0,0.0,0.0,1.745,12,89,0
33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.302,0.0,1.7,5,17,1
3085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.314,0.0,0.0,2.473,10,47,0
4034,0.51,0.0,0.51,0.0,1.53,0.51,0.0,0.0,0.0,0.0,...,0.0,0.27,0.0,0.0,0.0,0.0,1.983,24,121,0
4470,0.0,0.0,0.44,0.0,0.0,0.44,0.0,0.0,0.0,0.0,...,0.0,0.074,0.0,0.149,0.0,0.0,1.115,2,29,0
4579,0.27,0.05,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.607,0.064,0.036,0.055,0.0,0.202,3.766,43,1789,0


In [None]:
# Headline facts about the dataset: its size, the share of rows flagged as
# spam, and the single email in which the 'money' feature is largest.
money_idx = spam["money"].idxmax()            # index label of the top row
money_row = spam.loc[money_idx, ["money"]]    # one-element Series keyed by 'money'
num_emails = spam.shape[0]
spam_fraction = spam["is_spam"].mean()        # mean of the is_spam column

print(f"Total emails: {num_emails}")
print(f"Fraction spam: {spam_fraction:.3f}")
print(f"Email with highest 'money' share: row {money_idx} with {money_row['money']:.2f}% of words")

# Show every feature of that email as the cell's output.
spam.loc[money_idx]

Total emails: 4601
Fraction spam: 0.394
Email with highest 'money' share: row 545 with 12.50% of words


Unnamed: 0,545
make,0.0
address,0.0
all,0.0
3d,0.0
our,0.0
over,0.0
remove,0.0
internet,0.0
order,0.0
mail,0.0


In [None]:
# Linear probability model (LPM): ordinary least squares with the `is_spam`
# indicator as the response, so fitted values are read as spam probabilities.
# `sm` and `add_constant` are imported in the notebook's top import cell.
predictors = ['char_dollar', 'credit', 'money', 're']

# OLS does not add an intercept on its own; add_constant supplies the
# `const` column that appears in the coefficient table.
X = add_constant(spam[predictors])
y = spam['is_spam']
lpm = sm.OLS(y, X).fit()

# Leave the summary as the cell's last expression so Jupyter renders it with
# its rich table representation instead of printing plain text.
lpm.summary()

                            OLS Regression Results                            
Dep. Variable:                is_spam   R-squared:                       0.179
Model:                            OLS   Adj. R-squared:                  0.178
Method:                 Least Squares   F-statistic:                     250.3
Date:                Tue, 18 Nov 2025   Prob (F-statistic):          8.51e-195
Time:                        07:52:33   Log-Likelihood:                -2780.3
No. Observations:                4601   AIC:                             5571.
Df Residuals:                    4596   BIC:                             5603.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           0.3346      0.007     45.696      

In [None]:
# Fitted values of the linear probability model on the training design
# matrix. Nothing constrains them to [0, 1], so check the extremes.
pred_probs = lpm.predict(X)
min_prob, max_prob = pred_probs.min(), pred_probs.max()

print(f"Smallest predicted probability: {min_prob:.3f}")
print(f"Largest predicted probability: {max_prob:.3f}")
print("Any predictions < 0?", pred_probs.lt(0).any())
print("Any predictions > 1?", pred_probs.gt(1).any())

# Distribution summary of the fitted values as the cell output.
pred_probs.describe()

Smallest predicted probability: -0.813
Largest predicted probability: 3.849
Any predictions < 0? True
Any predictions > 1? True


Unnamed: 0,0
count,4601.0
mean,0.394
std,0.207
min,-0.813
25%,0.335
50%,0.335
75%,0.395
max,3.849


In [None]:
def prob_to_odds(p):
    """Convert a probability to odds, i.e. ``p / (1 - p)``.

    ``p`` may be a scalar or any array-like; it is passed through
    ``np.asarray`` and the conversion is applied element-wise.
    """
    success = np.asarray(p)
    failure = 1 - success
    return success / failure

def odds_to_prob(o):
    """Convert odds to a probability, i.e. ``o / (1 + o)``.

    ``o`` may be a scalar or any array-like; it is passed through
    ``np.asarray`` and the conversion is applied element-wise.
    """
    odds = np.asarray(o)
    total = 1 + odds
    return odds / total

# Sanity-check the two converters on a matching pair: probability 2/3 and odds 2.
print("Prob 2/3 -> odds:", prob_to_odds(2/3))
print("Odds 2 -> prob:", odds_to_prob(2))

# Worked example: start from a 60% win probability, double the odds, and
# convert back to see the resulting probability.
p_win = 0.60
odds_win = prob_to_odds(p_win)
doubled_odds_prob = odds_to_prob(odds_win * 2)
print(f"Original prob 60% -> odds {odds_win:.3f}; doubling odds gives prob {doubled_odds_prob:.3f}")

Prob 2/3 -> odds: 1.9999999999999998
Odds 2 -> prob: 0.6666666666666666
Original prob 60% -> odds 1.500; doubling odds gives prob 0.750


In [None]:

# Logistic regression on the same four predictors, fitted as a GLM with a
# Binomial family.
logit_predictors = ["char_dollar", "credit", "money", "re"]
y_logit = spam["is_spam"]
X_logit = add_constant(spam[logit_predictors])  # adds the `const` intercept column

logit_model = sm.GLM(endog=y_logit, exog=X_logit, family=sm.families.Binomial())
logit_results = logit_model.fit()

# Summary table as the cell output.
logit_results.summary()

0,1,2,3
Dep. Variable:,is_spam,No. Observations:,4601.0
Model:,GLM,Df Residuals:,4596.0
Model Family:,Binomial,Df Model:,4.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-2213.9
Date:,"Tue, 18 Nov 2025",Deviance:,4427.8
Time:,07:53:51,Pearson chi2:,10000000000.0
No. Iterations:,8,Pseudo R-squ. (CS):,0.3152
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.0666,0.043,-24.680,0.000,-1.151,-0.982
char_dollar,11.8176,0.605,19.549,0.000,10.633,13.002
credit,2.3119,0.343,6.741,0.000,1.640,2.984
money,1.9933,0.248,8.022,0.000,1.506,2.480
re,-0.7755,0.099,-7.805,0.000,-0.970,-0.581


In [None]:
# Pull the two fitted coefficients we want to interpret (log-odds scale).
logit_params = logit_results.params
intercept = logit_params['const']
money_coef = logit_params['money']

# Exponentiating moves from log-odds to odds: the intercept becomes the
# odds when every predictor is zero, and the slope becomes an odds ratio.
intercept_odds = np.exp(intercept)
money_odds_ratio = np.exp(money_coef)

# Probability with all predictors at zero, and again after raising 'money'
# by one unit (which multiplies the odds by the odds ratio).
intercept_prob = odds_to_prob(intercept_odds)
odds_with_money_plus1 = intercept_odds * money_odds_ratio
prob_with_money_plus1 = odds_to_prob(odds_with_money_plus1)
prob_change = prob_with_money_plus1 - intercept_prob

In [None]:
# Report the intercept on three scales, then the 'money' effect as a
# log-odds slope, an odds ratio, a probability change from the all-zero
# baseline, and the coefficient divided by four.
intercept_report = (
    f"Intercept (log-odds): {intercept:.3f}",
    f"Intercept odds: {intercept_odds:.3f}",
    f"Baseline spam probability when predictors are zero: {intercept_prob:.3f}",
)
money_report = (
    f"'money' coefficient (log-odds per 1% point): {money_coef:.3f}",
    f"'money' odds ratio (multiplicative effect): {money_odds_ratio:.3f}",
    f"Approximate probability change for +1 point in 'money': {prob_change:.3f}",
    f"Divide-by-4 heuristic (probability points): {money_coef / 4:.3f}",
)
print(*intercept_report, *money_report, sep="\n")

Intercept (log-odds): -1.067
Intercept odds: 0.344
Baseline spam probability when predictors are zero: 0.256
'money' coefficient (log-odds per 1% point): 1.993
'money' odds ratio (multiplicative effect): 7.340
Approximate probability change for +1 point in 'money': 0.460
Divide-by-4 heuristic (probability points): 0.498
