In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression


In [67]:
# Load the King County dataset from a tab-separated file into a DataFrame.
data = pd.read_csv("data/king_county.csv", sep="\t")

In [68]:
# Overview of the columns: names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22687 entries, 1 to 27063
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   DocumentDate     22687 non-null  object 
 1   SalePrice        22687 non-null  int64  
 2   PropertyID       22687 non-null  int64  
 3   PropertyType     22687 non-null  object 
 4   ym               22687 non-null  object 
 5   zhvi_px          22687 non-null  int64  
 6   zhvi_idx         22687 non-null  float64
 7   AdjSalePrice     22687 non-null  float64
 8   NbrLivingUnits   22687 non-null  int64  
 9   SqFtLot          22687 non-null  int64  
 10  SqFtTotLiving    22687 non-null  int64  
 11  SqFtFinBasement  22687 non-null  int64  
 12  Bathrooms        22687 non-null  float64
 13  Bedrooms         22687 non-null  int64  
 14  BldgGrade        22687 non-null  int64  
 15  YrBuilt          22687 non-null  int64  
 16  YrRenovated      22687 non-null  int64  
 17  TrafficNoise

In [69]:
# Peek at the first rows of the table.
data.head()

Unnamed: 0,DocumentDate,SalePrice,PropertyID,PropertyType,ym,zhvi_px,zhvi_idx,AdjSalePrice,NbrLivingUnits,SqFtLot,...,Bathrooms,Bedrooms,BldgGrade,YrBuilt,YrRenovated,TrafficNoise,LandVal,ImpsVal,ZipCode,NewConstruction
1,2014-09-16,280000,1000102,Multiplex,2014-09-01,405100,0.930836,300805.0,2,9373,...,3.0,6,7,1991,0,0,70000,229000,98002,False
2,2006-06-16,1000000,1200013,Single Family,2006-06-01,404400,0.929228,1076162.0,1,20156,...,3.75,4,10,2005,0,0,203000,590000,98166,True
3,2007-01-29,745000,1200019,Single Family,2007-01-01,425600,0.977941,761805.0,1,26036,...,1.75,4,8,1947,0,0,183000,275000,98166,False
4,2008-02-25,425000,2800016,Single Family,2008-02-01,418400,0.961397,442065.0,1,8618,...,3.75,5,7,1966,0,0,104000,229000,98168,False
5,2013-03-29,240000,2800024,Single Family,2013-03-01,351600,0.807904,297065.0,1,8620,...,1.75,4,7,1948,0,0,104000,205000,98168,False


In [70]:
# Response variable and the predictor columns used by the baseline model.
outcome = 'AdjSalePrice'
predictors = [
    'SqFtTotLiving',
    'SqFtLot',
    'Bathrooms',
    'Bedrooms',
    'BldgGrade',
]
# Show the first rows of just the predictor columns.
data.loc[:, predictors].head()

Unnamed: 0,SqFtTotLiving,SqFtLot,Bathrooms,Bedrooms,BldgGrade
1,2400,9373,3.0,6,7
2,3764,20156,3.75,4,10
3,2060,26036,1.75,4,8
4,3200,8618,3.75,5,7
5,1720,8620,1.75,4,7


In [71]:
# Baseline model: least-squares fit of the outcome on the predictor columns.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(data[predictors], data[outcome])

# Report the intercept, then one line per predictor coefficient.
print(f'Intercept: {model.intercept_:.3f}', 'Coefficients:', sep='\n')
for name, coef in zip(predictors, model.coef_):
    print(f' {name}: {coef}')

Intercept: -521871.368
Coefficients:
 SqFtTotLiving: 228.8306036024076
 SqFtLot: -0.06046682065307258
 Bathrooms: -19442.840398320994
 Bedrooms: -47769.955185214334
 BldgGrade: 106106.96307898096


In [72]:
# Rank zip codes by the median residual of the baseline model, then bucket them
# into five groups via quantiles of the cumulative sale count.
#
# Named aggregation replaces groupby.apply(): apply() does not reliably pass the
# grouping column to the callback across pandas versions, so reading the zip
# code back with x.iloc[0, 0] could silently pick up a residual value instead.
zip_groups = (
    pd.DataFrame({
        'ZipCode': data['ZipCode'],
        'residual': data[outcome] - model.predict(data[predictors]),
    })
    .groupby('ZipCode')
    .agg(count=('residual', 'size'), median_residual=('residual', 'median'))
    .reset_index()
    .sort_values('median_residual')
)
# Running number of sales in median-residual order; cutting this (rather than
# the zip codes themselves) balances the groups by number of sales.
zip_groups['cum_count'] = np.cumsum(zip_groups['count'])
zip_groups['ZipGroup'] = pd.qcut(zip_groups['cum_count'], 5, labels=False, retbins=False)
to_join = zip_groups[['ZipCode', 'ZipGroup']].set_index('ZipCode')
# Drop a ZipGroup column left over from a previous execution so that re-running
# this cell does not fail on overlapping column names in join().
data = data.drop(columns='ZipGroup', errors='ignore').join(to_join, on='ZipCode')
data['ZipGroup'] = data['ZipGroup'].astype('category')

In [73]:
# Extended model: the baseline predictors plus property type and zip group.
outcome = 'AdjSalePrice'
new_predictors = [
    'SqFtTotLiving',
    'SqFtLot',
    'Bathrooms',
    'Bedrooms',
    'BldgGrade',
    'PropertyType',
    'ZipGroup',
]
# One-hot encode the non-numeric columns; drop_first removes one level per
# factor so the dummies are not collinear with the intercept.
X = pd.get_dummies(data.loc[:, new_predictors], drop_first=True)
confounding_model = LinearRegression().fit(X, data[outcome])

# Report the intercept, then one line per design-matrix column.
print(f'Intercept: {confounding_model.intercept_:.3f}', 'Coefficients:', sep='\n')
for name, coef in zip(X.columns, confounding_model.coef_):
    print(f' {name}: {coef}')

Intercept: -666637.469
Coefficients:
 SqFtTotLiving: 210.61266005580183
 SqFtLot: 0.4549871385465901
 Bathrooms: 5928.425640001864
 Bedrooms: -41682.871840744745
 BldgGrade: 98541.18352725943
 PropertyType_Single Family: 19323.625287919334
 PropertyType_Townhouse: -78198.72092762386
 ZipGroup_1: 53317.173306597986
 ZipGroup_2: 116251.58883563547
 ZipGroup_3: 178360.53178793367
 ZipGroup_4: 338408.60185652017


In [74]:
# Load the lung dataset from a tab-separated text file into a DataFrame.
lung = pd.read_csv("data/lung.txt", sep="\t")

In [75]:
# Overview of the columns: names, non-null counts and dtypes.
lung.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 725 entries, 0 to 724
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   LungCap    725 non-null    float64
 1   Age        725 non-null    int64  
 2   Height     725 non-null    float64
 3   Smoke      725 non-null    object 
 4   Gender     725 non-null    object 
 5   Caesarean  725 non-null    object 
dtypes: float64(2), int64(1), object(3)
memory usage: 34.1+ KB


In [76]:
# Peek at the first rows of the table.
lung.head()

Unnamed: 0,LungCap,Age,Height,Smoke,Gender,Caesarean
0,6.475,6,62.1,no,male,no
1,10.125,18,74.7,yes,female,no
2,9.55,16,69.7,no,female,yes
3,11.125,14,71.0,no,male,no
4,4.8,5,56.9,no,male,no


In [79]:
import plotly.express as px
# Box plot of Age, one box per value of the Smoke column.
fig = px.box(lung, x="Smoke", y="Age")
fig.show()

In [80]:
# Scatter plot of LungCap against Age.
# NOTE(review): the y-axis is relabelled "FEV" although the plotted column is
# LungCap — confirm this label is intended.
fig = px.scatter(lung, x= "Age", y= "LungCap", labels={"LungCap": "FEV"})
fig.show()

In [83]:
# Correlation between Age and LungCap (Series.corr with its default method).
lung['Age'].corr(lung['LungCap'])

0.8196748974989415

In [110]:
import statsmodels.api as sm

# Regress LungCap on Age plus an indicator for smoking.
# NOTE: `vars` shadows the builtin of the same name; the name is kept so that
# any other cell referring to it keeps working.
vars = ['Age', 'Smoke']
# dtype=float: recent pandas versions emit boolean dummy columns, and a frame
# mixing booleans with integer columns converts to an object array that
# statsmodels refuses to fit.
X = pd.get_dummies(lung[vars], drop_first=True, dtype=float)
# sm.OLS does not add an intercept by itself, so append a constant column.
model_adj = sm.OLS(lung['LungCap'], X.assign(const=1))
res = model_adj.fit()
res.summary()

0,1,2,3
Dep. Variable:,LungCap,R-squared:,0.677
Model:,OLS,Adj. R-squared:,0.676
Method:,Least Squares,F-statistic:,757.5
Date:,"Tue, 01 Nov 2022",Prob (F-statistic):,4.97e-178
Time:,08:08:02,Log-Likelihood:,-1328.1
No. Observations:,725,AIC:,2662.0
Df Residuals:,722,BIC:,2676.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Age,0.5554,0.014,38.628,0.000,0.527,0.584
Smoke_yes,-0.6486,0.187,-3.473,0.001,-1.015,-0.282
const,1.0857,0.183,5.933,0.000,0.726,1.445

0,1,2,3
Omnibus:,0.325,Durbin-Watson:,1.808
Prob(Omnibus):,0.85,Jarque-Bera (JB):,0.411
Skew:,-0.039,Prob(JB):,0.814
Kurtosis:,2.912,Cond. No.,44.8


In [112]:
# Regress LungCap on the smoking indicator alone (no other covariate).
# NOTE: `vars` shadows the builtin of the same name; the name is kept so that
# any other cell referring to it keeps working.
vars = ['Smoke']
# dtype=float: recent pandas versions emit boolean dummy columns, and a frame
# mixing booleans with the integer constant converts to an object array that
# statsmodels refuses to fit.
X = pd.get_dummies(lung[vars], drop_first=True, dtype=float)
# sm.OLS does not add an intercept by itself, so append a constant column.
model_adj = sm.OLS(lung['LungCap'], X.assign(const=1))
res = model_adj.fit()
res.summary()

0,1,2,3
Dep. Variable:,LungCap,R-squared:,0.01
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,7.507
Date:,"Tue, 01 Nov 2022",Prob (F-statistic):,0.0063
Time:,08:08:07,Log-Likelihood:,-1734.3
No. Observations:,725,AIC:,3473.0
Df Residuals:,723,BIC:,3482.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Smoke_yes,0.8753,0.319,2.740,0.006,0.248,1.502
const,7.7702,0.104,74.637,0.000,7.566,7.975

0,1,2,3
Omnibus:,7.728,Durbin-Watson:,1.932
Prob(Omnibus):,0.021,Jarque-Bera (JB):,6.927
Skew:,-0.181,Prob(JB):,0.0313
Kurtosis:,2.686,Cond. No.,3.29
