In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
from sklearn import linear_model
import matplotlib.pyplot as plt
import statsmodels.api as sm
from hw3_helper_function import *
# Show up to 1000 columns when a DataFrame is displayed.
pd.set_option('display.max_columns',1000)
# Show at most 50 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 50)
# Allow up to 1000 characters per cell before the displayed value is truncated.
pd.set_option('max_colwidth',1000)
# Let DataFrame.info() print per-column details for frames of up to 300 columns.
pd.set_option('max_info_columns',300)
%matplotlib inline

In [None]:
# Load the communities data set and show its (rows, columns) shape.
# NOTE(review): the path is absolute and specific to the author's machine;
# it has to be adapted before the notebook can run elsewhere.
df = pd.read_csv("/Users/zed/VSCode/regana/finnal-project/communities.csv")
df.shape 

In [None]:
# Column names, dtypes and non-null counts of the frame as loaded.
df.info()

In [None]:
import missingno as msno

# Select the style first so it is active when the missingness figure is created.
plt.style.use('ggplot')
# msno.matrix opens a figure of its own when no axes are passed in, so no figure
# is pre-created here (a preceding plt.subplots() only left a blank extra figure).
# Columns from position 5 onward are inspected.
axe = msno.matrix(df.iloc[:,5:])
# The matrix figure is the current pyplot figure at this point.
plt.savefig("missing.png")

In [None]:
# Absolute pairwise correlations between the numeric columns.
# Non-numeric columns are excluded explicitly: older pandas dropped them
# silently inside corr(), newer pandas raises on them instead.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr().abs()

# Keep only the strict upper triangle so that each column pair appears once.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Columns whose absolute correlation with some earlier column exceeds 0.95.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Copy of the data without those columns; `df` itself is left unchanged.
df_drop = df.drop(to_drop, axis=1)

In [None]:
# Render the correlation matrix as a LaTeX table string (displayed as the cell output).
corr_matrix.to_latex()

In [None]:
# to_latex writes LaTeX source text, not a rendered PDF, so the file gets a .tex extension.
corr_matrix.to_latex('corr.tex')

In [None]:
# Shape of the frame after removing highly correlated columns.
# Requires `df_drop` to have been defined by the correlation-filter cell.
df_drop.shape 

In [None]:
# Non-null counts per column, inspected before sparse columns are removed.
df.info()

In [None]:
# Keep only the columns that have at least 40% non-missing values
# (`thresh` is the minimum number of non-NA entries a column needs to survive).
# NOTE(review): this overwrites `df`, so cells that ran earlier saw a wider frame.
df = df.dropna(thresh=int(0.4*df.shape[0]),axis = 1)

In [None]:
# Non-null counts per column after the sparse columns were removed.
df.info()

In [None]:
# fig = sns.pairplot(df)    

In [None]:
# Predictors: every column from position 5 up to (not including) the last four.
# Response: the last column.
# Remaining gaps are forward-filled; .ffill() is the supported spelling of the
# deprecated fillna(method='pad') and produces the same values.
X = df.iloc[:,5:-4].ffill()
y = df.iloc[:,-1].ffill()


In [None]:
# Ordinary least squares of y on all predictors; the intercept column is added explicitly.
model = sm.OLS(y,sm.add_constant(X)).fit()

In [None]:
# Regression summary of the full model.
model.summary()

In [None]:
# Write the regression summary of `model` to temp.tex as LaTeX source.
tbl = model.summary()
with open('temp.tex', 'w') as fh:
    print(tbl.as_latex(), end='', file=fh)

In [None]:
def backward_stepwise(y, X, remaining_features):
    """Backward stepwise selection driven by the residual sum of squares.

    Starting from all candidate features, every round refits an OLS model
    (with intercept) once per remaining feature with that feature left out,
    and permanently removes the feature whose removal yields the smallest RSS.
    The statistics of that winning reduced model are recorded for the round.

    Parameters
    ----------
    y : response passed to ``sm.OLS``.
    X : DataFrame containing every name listed in ``remaining_features``.
    remaining_features : list of column names to start from. It is copied,
        so the caller's list is not modified.

    Returns
    -------
    pandas.DataFrame indexed 1..k (k = number of starting features) in
    elimination order, with columns ``features`` (names still in the model),
    ``RSS``, ``R_squared``, ``AIC``, ``BIC``, ``adj_R_squared`` and
    ``numb_features``. Row ``j`` describes the model with ``k - j`` features;
    the last row is the intercept-only model. The full model with all ``k``
    features is not part of the table.

    Raises
    ------
    ValueError
        If in some round no candidate model has a finite RSS (for example
        because the data contain NaN), since no feature can then be chosen.
    """
    current = list(remaining_features)   # private copy: never mutate the caller's list
    columns = ['features', 'RSS', 'R_squared', 'AIC', 'BIC',
               'adj_R_squared', 'numb_features']
    records = []

    while current:
        best_RSS = np.inf                # smallest RSS seen in this round
        best_feature, best_fit = None, None

        # Try dropping each remaining feature in turn.
        for candidate in current:
            kept = current.copy()
            kept.remove(candidate)
            # sm.OLS does not add an intercept on its own.
            fit = sm.OLS(y, sm.add_constant(X[kept])).fit()
            if fit.ssr < best_RSS:
                best_RSS, best_feature, best_fit = fit.ssr, candidate, fit

        if best_fit is None:
            # Without this guard a stale choice from the previous round would be reused.
            raise ValueError("backward_stepwise: no candidate model produced a finite RSS")

        # Drop the feature whose removal hurts the fit the least.
        current.remove(best_feature)

        records.append({
            'features': current.copy(),
            'RSS': best_RSS,
            'R_squared': best_fit.rsquared,
            'AIC': best_fit.aic,
            'BIC': best_fit.bic,
            'adj_R_squared': best_fit.rsquared_adj,
            # True size of the recorded model (previously reported one too high).
            'numb_features': len(current),
        })

    df_results = pd.DataFrame(records, index=range(1, len(records) + 1), columns=columns)
    return df_results
result3 = backward_stepwise(y,X ,list(X.columns))

In [None]:
# Forward stepwise selection over all predictor columns.
# forward_stepwise is not defined in this notebook; presumably it comes from the
# star-import of hw3_helper_function — verify.
fstp = forward_stepwise(y,X,list(X.columns.values))

In [None]:
def plot_selection(df_results, standards):
    """Plot each selection criterion against model size and mark the best model.

    Parameters
    ----------
    df_results : DataFrame with a ``numb_features`` column, a ``features``
        column (list of names per row) and one column per entry of ``standards``.
    standards : list of criterion column names. ``'adj_R_squared'`` is
        maximised; every other criterion is minimised.

    For every criterion one panel is drawn, the best row is marked with a red
    cross and the corresponding feature list is printed. Returns None.
    """
    fig = plt.figure(figsize=(20, 6))

    for i, v in enumerate(standards):
        ax = fig.add_subplot(1, len(standards), i+1)
        ax.plot(df_results['numb_features'], df_results[v], color='lightblue')
        ax.scatter(df_results['numb_features'],
                   df_results[v], color='darkblue')

        if v == 'adj_R_squared':
            best = df_results[v].idxmax()
            print("According to adj_R_squared select features:",df_results.features[best],",total numbers:",str(len(df_results.features[best])))
            print("\n")
        else:
            best = df_results[v].idxmin()
            print("According to ",v," select featuers:",df_results.features[best],",total numbers:",str(len(df_results.features[best])))
            print('\n')

        # Read the marker position from the same columns that define the curve,
        # so the cross always sits on the plotted point of the best row,
        # whatever the index labels of df_results happen to be.
        ax.plot(df_results.loc[best, 'numb_features'], df_results.loc[best, v],
                marker='x', markersize=20, color='r')

        ax.set_xlabel('Number of predictors')
        ax.set_ylabel(v)

    fig.suptitle('Subset selection using ' + ", ".join(standards), fontsize=16)
    plt.show()
plot_selection(result3,['RSS','AIC','BIC','adj_R_squared'])

In [None]:
# Criterion curves for the forward stepwise results.
plot_selection(fstp,['RSS','AIC','BIC','adj_R_squared'])

In [None]:
# Backward-selected model
## AIC
# Hard-coded feature list, presumably copied from the printed output of the
# backward selection under the AIC criterion — it is not recomputed here.
features = ['racepctblack', 'racePctHisp', 'agePct12t29', 'pctUrban', 'medIncome', 'pctWWage', 'pctWFarmSelf', 'pctWInvInc', 'pctWRetire', 'medFamInc', 'whitePerCap', 'indianPerCap', 'AsianPerCap', 'OtherPerCap', 'PctPopUnderPov', 'PctLess9thGrade', 'PctEmploy', 'PctEmplManu', 'PctOccupManu', 'PctOccupMgmtProf', 'MalePctDivorce', 'MalePctNevMarr', 'TotalPctDiv', 'PctKids2Par', 'PctWorkMom', 'NumIlleg', 'PctIlleg', 'NumImmig', 'PctNotSpeakEnglWell', 'PctLargHouseOccup', 'PersPerOccupHous', 'PersPerRentOccHous', 'PctPersOwnOccup', 'PctPersDenseHous', 'HousVacant', 'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos', 'OwnOccLowQuart', 'OwnOccMedVal', 'RentLowQ', 'MedRent', 'MedOwnCostPctIncNoMtg', 'NumInShelters', 'NumStreet', 'PctForeignBorn', 'PctSameCity85']
X_b = X[features]
# OLS with intercept on the selected columns.
model_b_aic = sm.OLS(y,sm.add_constant(X_b)).fit()
print(model_b_aic.summary())

In [None]:
# Second hard-coded backward-selected feature list (the name suggests the
# adjusted R^2 criterion — presumably copied from the selection output; verify).
features_b_r = ['racepctblack', 'racePctHisp', 'agePct12t29', 'pctUrban', 'medIncome', 'pctWWage', 'pctWFarmSelf', 'pctWInvInc', 'pctWSocSec', 'pctWRetire', 'medFamInc', 'whitePerCap', 'blackPerCap', 'indianPerCap', 'AsianPerCap', 'OtherPerCap', 'HispPerCap', 'PctPopUnderPov', 'PctLess9thGrade', 'PctEmploy', 'PctEmplManu', 'PctOccupManu', 'PctOccupMgmtProf', 'MalePctDivorce', 'MalePctNevMarr', 'TotalPctDiv', 'PctKids2Par', 'PctWorkMomYoungKids', 'PctWorkMom', 'NumIlleg', 'PctIlleg', 'NumImmig', 'PctNotSpeakEnglWell', 'PctLargHouseOccup', 'PersPerOccupHous', 'PersPerRentOccHous', 'PctPersOwnOccup', 'PctPersDenseHous', 'PctHousLess3BR', 'MedNumBR', 'HousVacant', 'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos', 'OwnOccLowQuart', 'OwnOccMedVal', 'RentLowQ', 'RentHighQ', 'MedRent', 'MedRentPctHousInc', 'MedOwnCostPctInc', 'MedOwnCostPctIncNoMtg', 'NumInShelters', 'NumStreet', 'PctForeignBorn', 'PctSameCity85']
X_b_r = X[features_b_r]
# NOTE(review): the fit is stored under the name `model_b_aic` again, which
# replaces the model fitted on the AIC feature list — confirm this is intended.
model_b_aic = sm.OLS(y,sm.add_constant(X_b_r)).fit()
print(model_b_aic.summary())

In [None]:
def plot_selection(df_results, standards):
    """Plot each selection criterion against model size and mark the best model.

    This cell (re)defines ``plot_selection``; it replaces any earlier
    definition of the same name in the kernel.

    Parameters
    ----------
    df_results : DataFrame with a ``numb_features`` column, a ``features``
        column (list of names per row) and one column per entry of ``standards``.
    standards : list of criterion column names. ``'adj_R_squared'`` is
        maximised; every other criterion is minimised.

    For every criterion one panel is drawn, the best row is marked with a red
    cross and the corresponding feature list is printed. Returns None.
    """
    fig = plt.figure(figsize=(20, 6))

    for i, v in enumerate(standards):
        ax = fig.add_subplot(1, len(standards), i+1)
        ax.plot(df_results['numb_features'], df_results[v], color='lightblue')
        ax.scatter(df_results['numb_features'],
                   df_results[v], color='darkblue')

        if v == 'adj_R_squared':
            best = df_results[v].idxmax()
            print("According to adj_R_squared select features:",df_results.features[best],",total numbers:",str(len(df_results.features[best])))
            print("\n")
        else:
            best = df_results[v].idxmin()
            print("According to ",v," select featuers:",df_results.features[best],",total numbers:",str(len(df_results.features[best])))
            print('\n')

        # Use the row's numb_features value as x position instead of the index
        # label, so the cross lies on the curve even when the two differ.
        ax.plot(df_results.loc[best, 'numb_features'], df_results.loc[best, v],
                marker='x', markersize=20, color='r')

        ax.set_xlabel('Number of predictors')
        ax.set_ylabel(v)

    fig.suptitle('Subset selection using ' + ", ".join(standards), fontsize=16)
    plt.show()
plot_selection(fstp,['RSS','AIC','BIC','adj_R_squared'])

In [None]:
# Hard-coded forward-selected feature list (the name suggests the AIC
# criterion — presumably copied from the selection output; verify).
features_f_a=['PctKids2Par', 'racePctWhite', 'HousVacant', 'pctUrban', 'PctWorkMom', 'NumStreet', 'MalePctDivorce', 'PctIlleg', 'numbUrban', 'PctPersDenseHous', 'racepctblack', 'agePct12t29', 'MedOwnCostPctIncNoMtg', 'OtherPerCap', 'pctWRetire', 'PctPopUnderPov', 'pctWWage', 'PctVacantBoarded', 'MedRentPctHousInc', 'RentLowQ', 'MedRent', 'whitePerCap', 'MalePctNevMarr', 'PctEmploy', 'PctVacMore6Mos', 'PctHousOccup', 'agePct16t24', 'AsianPerCap', 'pctWFarmSelf', 'indianPerCap', 'pctWInvInc', 'TotalPctDiv', 'PctBSorMore', 'MedOwnCostPctInc', 'PctLargHouseFam', 'PersPerOccupHous', 'PctLess9thGrade', 'PctHousLess3BR', 'pctWSocSec', 'PctPersOwnOccup', 'racePctHisp', 'HispPerCap', 'NumIlleg', 'NumInShelters', 'NumImmig', 'RentHighQ', 'PctForeignBorn', 'PctNotSpeakEnglWell', 'PersPerRentOccHous', 'PctHousOwnOcc']
X_fa = X[features_f_a]
# OLS with intercept on the selected columns.
model_fa = sm.OLS(y,sm.add_constant(X_fa)).fit()
model_fa.summary()

In [None]:
# Second hard-coded forward-selected feature list (the name suggests the
# adjusted R^2 criterion — presumably copied from the selection output; verify).
features_fr = ['PctKids2Par', 'racePctWhite', 'HousVacant', 'pctUrban', 'PctWorkMom', 'NumStreet', 'MalePctDivorce', 'PctIlleg', 'numbUrban', 'PctPersDenseHous', 'racepctblack', 'agePct12t29', 'MedOwnCostPctIncNoMtg', 'OtherPerCap', 'pctWRetire', 'PctPopUnderPov', 'pctWWage', 'PctVacantBoarded', 'MedRentPctHousInc', 'RentLowQ', 'MedRent', 'whitePerCap', 'MalePctNevMarr', 'PctEmploy', 'PctVacMore6Mos', 'PctHousOccup', 'agePct16t24', 'AsianPerCap', 'pctWFarmSelf', 'indianPerCap', 'pctWInvInc', 'TotalPctDiv', 'PctBSorMore', 'MedOwnCostPctInc', 'PctLargHouseFam', 'PersPerOccupHous', 'PctLess9thGrade', 'PctHousLess3BR', 'pctWSocSec', 'PctPersOwnOccup', 'racePctHisp', 'HispPerCap', 'NumIlleg', 'NumInShelters', 'NumImmig', 'RentHighQ', 'PctForeignBorn', 'PctNotSpeakEnglWell', 'PersPerRentOccHous', 'PctHousOwnOcc', 'blackPerCap', 'medFamInc', 'medIncome', 'PctWorkMomYoungKids', 'OwnOccLowQuart', 'OwnOccMedVal', 'PctSameCity85', 'MedNumBR', 'PctEmplManu']
X_fr = X[features_fr]
# NOTE(review): the fit is stored under the name `model_fa` again, which
# replaces the model fitted on `features_f_a` — confirm this is intended.
model_fa = sm.OLS(y,sm.add_constant(X_fr)).fit()
model_fa.summary()

In [None]:
from sklearn.model_selection import GridSearchCV
# NOTE(review): `model` is rebound here from the OLS fit to an unfitted Lasso estimator.
model = linear_model.Lasso()
# 100 candidate penalties, log-spaced between 1e-10 and 1e10.
alphas = np.logspace(-10, 10, 100)

tuned_parameters = [{'alpha': alphas}]
n_folds = 10
# 10-fold cross-validation; only the scores are needed, so the best model is not refit.
clf = GridSearchCV(model, tuned_parameters, cv=n_folds, refit=False, scoring='neg_mean_squared_error')
clf.fit(X, y)
# With this scoring the values are negated MSEs: larger (closer to 0) is better.
scores = clf.cv_results_['mean_test_score']
scores_std = clf.cv_results_['std_test_score']

plt.figure().set_size_inches(8, 6)
plt.semilogx(alphas, scores)
# NOTE(review): the plotted quantity is the negated MSE although the label says 'test MSE'.
plt.ylabel('test MSE')
plt.xlabel('alpha')
# Dashed reference line at the best (maximal) score.
plt.axhline(np.max(scores), linestyle='--', color='.5')
plt.xlim([alphas[0], alphas[-1]])
plt.show()

print(clf.best_params_)
alpha = clf.best_params_["alpha"]
# Rescale alpha by 2 * n_samples before reporting it as lambda.
# Raw string: "\l" is not a valid escape sequence in a normal string literal;
# the printed text is unchanged.
print(r"best \lambda:",2*alpha*y.shape[0])


In [None]:
from sklearn.linear_model import LassoCV
# 10 log-spaced candidate penalties, 10-fold CV.
# max_iter must be an integer: recent scikit-learn versions reject the float 1e5.
lassocv = LassoCV(alphas=np.logspace(-10,10,10),cv = 10,max_iter=100000)
lassocv.fit(X, y)
lassocv.alpha_  # 0.1 (presumably the value the author observed; not displayed mid-cell)

from sklearn.feature_selection import SelectFromModel
# Reuse the already fitted estimator instead of refitting it inside the selector.
selection = SelectFromModel(lassocv,prefit = True)

# Boolean mask over the columns of X: True for the columns the selector keeps.
selection.get_support()

In [None]:
# Container for the names of the selected columns (reset again in the next cell).
sigx = []

In [None]:
# Names of the predictor columns kept by the Lasso-based selector, in column order.
support_mask = selection.get_support()
sigx = [X.columns[pos] for pos, kept in enumerate(support_mask) if kept]
    

In [None]:
# One-column frame listing the selected feature names (the column label includes the colon).
df1 = pd.DataFrame({'features:':sigx})

In [None]:
# LaTeX table source for the list of selected features.
df1.to_latex()

In [None]:
# Reduce X to the columns kept by the selector.
X_slt = selection.transform(X)

In [None]:
# Wrap the reduced design matrix in a DataFrame (a fresh default index is created).
sigxdf = pd.DataFrame(X_slt)

In [None]:
# Label the reduced columns with the selected feature names.
sigxdf.columns = sigx

In [None]:
# Display the reduced design matrix.
sigxdf

In [None]:
# (rows, selected columns) of the reduced design matrix.
X_slt.shape

In [None]:
# OLS with intercept on the Lasso-selected columns.
# NOTE(review): rebinds the global name `model`.
model=sm.OLS(y,sm.add_constant(sigxdf)).fit()
model.summary()

In [None]:
from sklearn.model_selection import GridSearchCV
# max_iter must be an integer: recent scikit-learn versions reject the float 1e5.
model = linear_model.Lasso(max_iter=100000)
# 10 candidate penalties, log-spaced between 1e-10 and 1e10.
alphas = np.logspace(-10, 10, 10)

tuned_parameters = [{'alpha': alphas}]
n_folds = 10

# 10-fold cross-validation; only the scores are needed, so the best model is not refit.
clf = GridSearchCV(model, tuned_parameters, cv=n_folds, refit=False,scoring='neg_mean_squared_error')
clf.fit(X, y)
# With this scoring the values are negated MSEs: larger (closer to 0) is better.
scores = clf.cv_results_['mean_test_score']
scores_std = clf.cv_results_['std_test_score']

print(clf.best_params_)
alpha = clf.best_params_["alpha"]
# Rescale alpha by 2 * n_samples before reporting it as lambda.
# Raw string: "\l" is not a valid escape sequence in a normal string literal;
# the printed text is unchanged.
print(r"best \lambda:",2*alpha*y.shape[0])


In [None]:
# Cross-validated Lasso scores against alpha (log axis) with one-std error bars,
# saved to lasso.png.
plt.style.use('ggplot')
lasso_fig, lasso_ax = plt.subplots()
lasso_fig.set_size_inches(6, 4)
lasso_ax.set_xscale('log')
lasso_ax.errorbar(
    x=alphas,
    y=scores,
    yerr=scores_std,
    fmt='o',
    ecolor='r',
    color='darkblue',
    elinewidth=1,
    capsize=5,
)
lasso_ax.plot(alphas, scores, c='lightblue')
lasso_ax.set_ylabel('test MSE')
lasso_ax.set_xlabel('alpha')
# Dashed reference line at the best (maximal) score.
lasso_ax.axhline(np.max(scores), linestyle='--', color='.5')
lasso_ax.set_title('test MSE-alpha')
lasso_fig.savefig('lasso.png', dpi=150)

In [None]:
# Display the predictor frame.
X

In [None]:
# Text of the form "s(0)+s(1)+...+" with one entry per predictor column
# (the trailing "+" is kept).
form = "".join("s(" + str(col_idx) + ")+" for col_idx in range(X.shape[1]))

In [None]:
# Show the generated term string.
form 

In [None]:
from pygam import LinearGAM, s, f

# One spline term per predictor column, built from the width of X instead of a
# hand-written s(0)+...+s(96) expression; identical when X has 97 columns.
gam_terms = s(0)
for col_idx in range(1, X.shape[1]):
    gam_terms = gam_terms + s(col_idx)

gam = LinearGAM(gam_terms).fit(X, y)

In [None]:
# Print the summary of the full GAM.
gam.summary()

In [None]:
# Random search over the smoothing penalties: 97 candidate points (rows), each
# with one penalty per spline term of `gam` (97 columns).
# A locally seeded generator makes the candidate grid, and therefore the
# selected model, reproducible across kernel restarts.
# NOTE(review): the exponent spans [-3, 97), i.e. penalties up to about e^97 — confirm this range is intended.
lam_rng = np.random.default_rng(0)
lams = np.exp(lam_rng.random((97, 97))*100 -3)
gam.gridsearch(X, y, lam=lams)

In [None]:
# Term expression for the 16 selected predictor columns
# (repaired: a doubled "+" and an unclosed "s(70" made the cell a syntax error).
s(0)+s(2)+s(10)+s(28)+s(33)+s(39)+s(44)+s(45)+s(48)+s(50)+s(51)+s(68)+s(70)+s(73)+s(74)+s(90)

In [None]:
# Reduced GAM: spline terms for 16 selected predictor columns only.
gam2 = LinearGAM(s(0)+s(2)+s(10)+s(28)+s(33)+s(39)+s(44)+s(45)+s(48)+s(50)+s(51)+s(68)+s(70)+s(73)+s(74)+s(90)).fit(X,y)

# Random search over the smoothing penalties. Every row is one candidate point
# and must hold one penalty per spline term, so the grid needs 16 columns to
# match gam2 (a 97-column grid fits the 97-term model, not this one).
lams = np.exp(np.random.rand(97, 16)*100 -3)
gam2.gridsearch(X, y, lam=lams)

In [None]:
# Grid search over gam2 with pygam's default search grid; the result is kept as gam2s.
gam2s = gam2.gridsearch(X, y)

In [None]:
# Print the summary of the tuned reduced GAM.
gam2s.summary()

In [None]:
# Term expression for the 16 selected predictor columns
# (repaired: a doubled "+" and an unclosed "s(70" made the cell a syntax error).
s(0)+s(2)+s(10)+s(28)+s(33)+s(39)+s(44)+s(45)+s(48)+s(50)+s(51)+s(68)+s(70)+s(73)+s(74)+s(90)

In [None]:
# Column positions of the 16 predictors used in the reduced GAM.
# A missing comma had fused 68 and 70 into the single entry 6870.
ll = [0, 2, 10, 28, 33, 39, 44, 45, 48, 50, 51, 68, 70, 73, 74, 90]

In [None]:
# Number of selected column positions.
len(ll)

In [None]:
# Column positions used as spline terms in the reduced GAM, in term order,
# and the matching column names for the panel titles.
selected = [0,2,10,28,33,39,44,45,48,50,51,68,70,73,74,90]
titles = [X.columns[i] for i in selected]

# 16 terms need 16 panels; a 3x5 grid left the last term without a plot.
fig, axs = plt.subplots(4,4,figsize = (30,20),dpi =140)
# Newer matplotlib versions only know the seaborn styles under the
# 'seaborn-v0_8-*' names; fall back to the old name where that is all there is.
paper_style = 'seaborn-v0_8-paper' if 'seaborn-v0_8-paper' in plt.style.available else 'seaborn-paper'
plt.style.use(paper_style)
plt.subplots_adjust(wspace =0.0, hspace =0.2)  # spacing between the subplots

# Pair every non-intercept term of the tuned reduced model with its term index.
spline_terms = [(t, term) for t, term in enumerate(gam2s.terms) if not term.isintercept]

for (t, term), ax, title in zip(spline_terms, axs.flat, titles):
    # Grid and partial dependence come from the same model, and the x values
    # are read from the column of the feature this term acts on (term.feature),
    # not from the column whose position happens to equal the term index.
    XX = gam2s.generate_X_grid(term=t)
    pdep, confi = gam2s.partial_dependence(term=t, X=XX, width=.95)
    ax.plot(XX[:, term.feature], pdep)
    ax.plot(XX[:, term.feature], confi, c='b', ls='--')
    ax.set_title(title)

plt.savefig("gam2s.png")

In [None]:
# Columns from position 5 up to (not including) the last one, used for the normality checks below.
x = df.iloc[:,5:-1]

In [None]:
from scipy import stats

In [None]:
# The original call had no function argument and did not parse.
# NOTE(review): presumably the Shapiro-Wilk p-value per column was intended,
# matching the cells that follow — confirm.
x.apply(lambda col: stats.shapiro(col)[1], axis=0)

In [None]:
# Shapiro-Wilk p-value (second element of the result) for the column at position 3.
stats.shapiro(x.iloc[:,3])[1]

In [None]:
# Shapiro-Wilk test for every column of x: print "True" and record 1 when the
# p-value exceeds 0.05, otherwise print "False" and record 0.
res = []
for _col_name, col_values in x.items():
    looks_normal = stats.shapiro(col_values)[1] > 0.05
    print("True" if looks_normal else "False")
    res.append(1 if looks_normal else 0)

In [None]:
# Number of columns whose Shapiro-Wilk p-value exceeded 0.05.
(np.array(res) == 1).sum()