In [38]:
from ISLP import load_data

# Read the 'College' data set through ISLP's load_data helper
college = load_data('College')

# Preview 1: the leading ten observations
first_ten_rows = college.head(10)
print(first_ten_rows)

# Preview 2: every column label in the frame
column_labels = college.columns
print(column_labels)


  Private  Apps  Accept  Enroll  Top10perc  Top25perc  F.Undergrad  \
0     Yes  1660    1232     721         23         52         2885   
1     Yes  2186    1924     512         16         29         2683   
2     Yes  1428    1097     336         22         50         1036   
3     Yes   417     349     137         60         89          510   
4     Yes   193     146      55         16         44          249   
5     Yes   587     479     158         38         62          678   
6     Yes   353     340     103         17         45          416   
7     Yes  1899    1720     489         37         68         1594   
8     Yes  1038     839     227         30         63          973   
9     Yes   582     498     172         21         44          799   

   P.Undergrad  Outstate  Room.Board  Books  Personal  PhD  Terminal  \
0          537      7440        3300    450      2200   70        78   
1         1227     12280        6450    750      1500   29        30   
2           9

In [54]:
# Describe the dataset and the meaning of the variables

# Size of the already-loaded 'college' frame: (rows, columns)
n_rows, n_cols = college.shape
print(f"The dataset is a collection of data about colleges, containing {n_rows} rows and {n_cols} columns. ")

print("Each row represents a college, and each column represents a specific attribute of the college.\n")

print("The dataset contains both numerical and categorial features.\n")

# One entry per requested variable; insertion order drives both the bullet
# list and the order of the summary tables printed further down.
variable_notes = {
    'Outstate': "The tuition fee for out-of-state students.",
    'Private': "Indicates whether the college is private (`Yes`) or public (`No`).",
    'Room.Board': "The estimated cost of room and board for students.",
    'PhD': "The percentage of faculty members with a PhD degree.",
    'Top10perc': "The percentage of new students who graduated in the top 10% of their high school class.",
}

print("Below is a brief description of the requested variables:\n")
for col_name, col_note in variable_notes.items():
    print(f"- {col_name}: {col_note}")

print("\nFor more detailed information about each variable, please refer the information below:\n")

# Summary statistics for each requested column, in the same order as above
for col_name in variable_notes:
    print(college[col_name].describe())


The dataset is a collection of data about colleges, containing 777 rows and 18 columns. 
Each row represents a college, and each column represents a specific attribute of the college.

The dataset contains both numerical and categorial features.

Below is a brief description of the requested variables:

- Outstate: The tuition fee for out-of-state students.
- Private: Indicates whether the college is private (`Yes`) or public (`No`).
- Room.Board: The estimated cost of room and board for students.
- PhD: The percentage of faculty members with a PhD degree.
- Top10perc: The percentage of new students who graduated in the top 10% of their high school class.

For more detailed information about each variable, please refer the information below:

count      777.000000
mean     10440.669241
std       4023.016484
min       2340.000000
25%       7320.000000
50%       9990.000000
75%      12925.000000
max      21700.000000
Name: Outstate, dtype: float64
count     777
unique      2
top       Ye

In [55]:
import statsmodels.formula.api as smf

# Simple linear regression of out-of-state tuition on Top10perc.
model = smf.ols('Outstate ~ Top10perc', data=college).fit()

# Look the estimates up by term name rather than by position: a positional
# lookup silently returns the wrong number if the formula's terms are ever
# reordered or extended, and name-based access matches how the multiple and
# quadratic regression cells read their coefficients.
intercept = model.params['Intercept']
slope = model.params['Top10perc']
pvalue = model.pvalues['Top10perc']
r2 = model.rsquared

# Report results
print("Estimated coefficients:")
print(f"  Intercept: {intercept:.2f}")
print(f"  Slope (Top10perc): {slope:.2f}")
print(f"P-value for slope: {pvalue:.3e}")
print(f"R-squared: {r2:.3f}\n")

# Interpretation: the slope is the average change in Outstate (dollars) per
# one-point change in Top10perc; significance is judged at the 5% level.
print("Interpretation:")
print("  The slope indicates that, on average, a one percentage-point increase in Top10perc")
print(f"  is associated with a change of about {slope:.2f} dollars in out-of-state tuition.")
if pvalue < 0.05:
    print("  This association is statistically significant at the 5% level.")
else:
    print("  This association is not statistically significant at the 5% level.")

Estimated coefficients:
  Intercept: 6906.46
  Slope (Top10perc): 128.24
P-value for slope: 5.459e-66
R-squared: 0.316

Interpretation:
  The slope indicates that, on average, a one percentage-point increase in Top10perc
  is associated with a change of about 128.24 dollars in out-of-state tuition.
  This association is statistically significant at the 5% level.


In [56]:
# Multiple linear regression of Outstate on Top10perc, Room.Board and PhD.
# Room.Board is wrapped in Q("...") because its name contains a dot and so
# cannot appear in the formula as a bare identifier.
room_term = 'Q("Room.Board")'
model_mult = smf.ols(f'Outstate ~ Top10perc + {room_term} + PhD', data=college).fit()

# Fitted quantities, kept under the same names the rest of the notebook uses
params, pvalues = model_mult.params, model_mult.pvalues
r2_mult = model_mult.rsquared

intercept_mult = params['Intercept']
coef_top10, coef_room, coef_phd = (params[term] for term in ('Top10perc', room_term, 'PhD'))
p_top10, p_room, p_phd = (pvalues[term] for term in ('Top10perc', room_term, 'PhD'))

# One row per predictor: display label, estimate, p-value, and the wording
# of the unit change used in the interpretation sentence.
term_report = (
    ('Top10perc', coef_top10, p_top10,
     "a one percentage-point increase in Top10perc"),
    ('Room.Board', coef_room, p_room,
     "a one-dollar increase in Room.Board"),
    ('PhD', coef_phd, p_phd,
     "a one percentage-point increase in the percentage of faculty with PhDs"),
)

# Coefficient table
print("Multiple linear regression: Outstate ~ Top10perc + Room.Board + PhD\n")
print(f"Intercept: {intercept_mult:.2f}")
for term_label, term_coef, term_p, _ in term_report:
    print(f"Coefficient ({term_label}): {term_coef:.3f}    (p-value: {term_p:.3e})")
print(f"R-squared: {r2_mult:.3f}\n")

# Per-predictor interpretation, with significance judged at the 5% level
print("Interpretation (holding other predictors constant):")
for term_label, term_coef, term_p, unit_change in term_report:
    if term_p < 0.05:
        significance_note = "Statistically significant."
    else:
        significance_note = "Not statistically significant."
    print(f"- {term_label}: {unit_change} is associated with a change of about {term_coef:.2f} dollars in out-of-state tuition.",
          significance_note)

Multiple linear regression: Outstate ~ Top10perc + Room.Board + PhD

Intercept: -430.64
Coefficient (Top10perc): 82.000    (p-value: 2.206e-31)
Coefficient (Room.Board): 1.882    (p-value: 1.354e-68)
Coefficient (PhD): 5.623    (p-value: 4.317e-01)
R-squared: 0.547

Interpretation (holding other predictors constant):
- Top10perc: a one percentage-point increase in Top10perc is associated with a change of about 82.00 dollars in out-of-state tuition. Statistically significant.
- Room.Board: a one-dollar increase in Room.Board is associated with a change of about 1.88 dollars in out-of-state tuition. Statistically significant.
- PhD: a one percentage-point increase in the percentage of faculty with PhDs is associated with a change of about 5.62 dollars in out-of-state tuition. Not statistically significant.


In [64]:
# Quadratic model: the multiple regression plus a squared Top10perc term.
# I(...) makes the formula evaluate Top10perc ** 2 arithmetically.
model_quad = smf.ols('Outstate ~ Top10perc + I(Top10perc ** 2) + Q("Room.Board") + PhD', data=college).fit()

# Extract coefficients, p-values, and R-squared (all looked up by term name)
params_q = model_quad.params
pvals_q = model_quad.pvalues
r2_q = model_quad.rsquared

intercept_q = params_q['Intercept']
coef_top10_q = params_q['Top10perc']
coef_top10_sq = params_q['I(Top10perc ** 2)']
coef_room_q = params_q['Q("Room.Board")']
coef_phd_q = params_q['PhD']

p_top10_q = pvals_q['Top10perc']
p_top10_sq = pvals_q['I(Top10perc ** 2)']
p_room_q = pvals_q['Q("Room.Board")']
p_phd_q = pvals_q['PhD']

# Report results
print("Quadratic model: Outstate ~ Top10perc + Top10perc^2 + Room.Board + PhD\n")
print(f"Intercept: {intercept_q:.3f}")
print(f"Coef Top10perc (linear): {coef_top10_q:.6f}    (p-value: {p_top10_q:.3e})")
print(f"Coef Top10perc^2 (quadratic): {coef_top10_sq:.6f}    (p-value: {p_top10_sq:.3e})")
print(f"Coef Room.Board: {coef_room_q:.6f}    (p-value: {p_room_q:.3e})")
print(f"Coef PhD: {coef_phd_q:.6f}    (p-value: {p_phd_q:.3e})")
print(f"R-squared: {r2_q:.3f}\n")

# Interpretation of the quadratic term.
# With Outstate = ... + b1*Top10perc + b2*Top10perc^2, the slope with respect
# to Top10perc is b1 + 2*b2*Top10perc; it is evaluated here at the sample mean.
mean_top10 = college['Top10perc'].mean()
marginal_at_mean = coef_top10_q + 2 * coef_top10_sq * mean_top10

shape = ("convex (U-shaped)" if coef_top10_sq > 0 else
         "concave (inverted-U)" if coef_top10_sq < 0 else
         "no curvature (quadratic coef = 0)")

quad_is_significant = p_top10_sq < 0.05
signif = "statistically significant" if quad_is_significant else "not statistically significant"

# Choose the plain-language sentence from the same sign/significance checks
# used above, so the narrative cannot contradict the fitted coefficient if the
# data or the specification change. The concave branch keeps the original
# wording verbatim.
# NOTE(review): that wording is causal ("schools benefit"), while the model
# only estimates an association -- consider softening it.
if quad_is_significant and coef_top10_sq < 0:
    curvature_note = "- This means schools benefit from attracting more high-achieving students (raising out-of-state revenue), but each additional increase in top-decile share contributes less than the previous one."
elif quad_is_significant and coef_top10_sq > 0:
    curvature_note = "- This means the association between top-decile share and out-of-state tuition strengthens as Top10perc rises: each additional increase in top-decile share contributes more than the previous one."
else:
    curvature_note = "- There is no statistically significant evidence of curvature, so the association with Top10perc is adequately described as linear."

print("Interpretation of quadratic term:")
print(f"- Quadratic coefficient = {coef_top10_sq:.6f} ({signif}), indicating a {shape} relationship.")
print(curvature_note)
print("- The marginal effect varies with Top10perc as d/dTop10perc = coef_top10 + 2 * coef_top10_sq * Top10perc.")
print(f"- At the mean Top10perc of {mean_top10:.2f}, the marginal effect is approximately {marginal_at_mean:.3f} dollars per percentage point increase in Top10perc.")


Quadratic model: Outstate ~ Top10perc + Top10perc^2 + Room.Board + PhD

Intercept: -1087.276
Coef Top10perc (linear): 138.067460    (p-value: 3.591e-13)
Coef Top10perc^2 (quadratic): -0.685852    (p-value: 1.343e-03)
Coef Room.Board: 1.908079    (p-value: 2.240e-70)
Coef PhD: 1.961703    (p-value: 7.852e-01)
R-squared: 0.553

Interpretation of quadratic term:
- Quadratic coefficient = -0.685852 (statistically significant), indicating a concave (inverted-U) relationship.
- This means schools benefit from attracting more high-achieving students (raising out-of-state revenue), but each additional increase in top-decile share contributes less than the previous one.
- The marginal effect varies with Top10perc as d/dTop10perc = coef_top10 + 2 * coef_top10_sq * Top10perc.
- At the mean Top10perc of 27.56, the marginal effect is approximately 100.265 dollars per percentage point increase in Top10perc.


In [65]:
from statsmodels.stats.anova import anova_lm

# F-test for the nested pair: model_mult (no squared term) is the reduced
# model and model_quad (with the squared term) is the full model.
anova_results = anova_lm(model_mult, model_quad)
print(anova_results)

# The second row of the table carries the full-vs-reduced comparison;
# the first row has no F or p-value.
comparison_row = anova_results.iloc[1]
f_stat, p_val = comparison_row['F'], comparison_row['Pr(>F)']

print(f"\nF-statistic comparing reduced vs full model: {f_stat:.4f}")
print(f"p-value: {p_val:.4e}")

# Conclusion at the 5% significance level
anova_verdict = "significantly improves" if p_val < 0.05 else "does not significantly improve"
print(f"The quadratic term {anova_verdict} the model at the 5% level.")

   df_resid           ssr  df_diff       ss_diff          F    Pr(>F)
0     773.0  5.693394e+09      0.0           NaN        NaN       NaN
1     772.0  5.618020e+09      1.0  7.537388e+07  10.357499  0.001343

F-statistic comparing reduced vs full model: 10.3575
p-value: 1.3434e-03
The quadratic term significantly improves the model at the 5% level.
