In [None]:
import matplotlib
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.linalg as LA
import sklearn.linear_model
import sklearn.metrics
import xgboost as xgb
from mpl_toolkits.mplot3d import Axes3D
from PIL import Image
# NOTE(review): scipy.misc.imread is absent from recent SciPy releases -- confirm
# the installed version still provides it before relying on this import.
from scipy.misc import imread
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import Ridge, LassoCV
from sklearn.model_selection import cross_val_score

Problem 1: Linear Discriminant Analysis
    Part 1:

In [None]:
# Fix the RNG seed so the sampled point clouds -- and every number derived
# from them in later cells -- are identical on each Restart & Run All.
SEED = 0
np.random.seed(SEED)

# Two 3-D Gaussian classes with 20 samples each. The class means differ only
# in the third coordinate; each covariance has unit variances and a single
# shared off-diagonal value (0.9 for class 1, 0.8 for class 2).
mean1 = [0, 0, 0]
cov1 = [[1, 0.9, 0.9], [0.9, 1, 0.9], [0.9, 0.9, 1]]
label1 = np.random.multivariate_normal(mean1, cov1, 20)

mean2 = [0, 0, 1]
cov2 = [[1, 0.8, 0.8], [0.8, 1, 0.8], [0.8, 0.8, 1]]
label2 = np.random.multivariate_normal(mean2, cov2, 20)

# 3-D scatter of both classes: class 1 in red, class 2 in blue.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(label1[:, 0], label1[:, 1], label1[:, 2], c='r', label='Labels 1')
ax.scatter(label2[:, 0], label2[:, 1], label2[:, 2], c='b', label='Labels 2')

ax.set_xlabel('X Axis')
ax.set_ylabel('Y Axis')
ax.set_zlabel('Z Axis')

ax.legend()
plt.show()

Experiment: projecting the points onto the X-Y plane (i.e. dropping the Z coordinate)

In [None]:
# 2-D view of the two classes: only the first two coordinates of every sample
# are plotted, the third one is ignored.
fig = plt.figure()
ax = fig.add_subplot(111)

for points, colour, legend_name in ((label1, 'r', 'Labels 1'),
                                    (label2, 'b', 'Labels 2')):
    ax.scatter(points.T[0], points.T[1], c=colour, label=legend_name)

ax.set_xlabel('X Axis')
ax.set_ylabel('Y Axis')

plt.legend()
plt.show()

Linear Discriminant Analysis

In [None]:
# Fisher's linear discriminant for the two sampled classes.

# Class means estimated from the samples themselves, so the computation does
# not depend on knowing the parameters the data were generated with.
mu1 = label1.mean(axis=0)
mu2 = label2.mean(axis=0)

# Within-class scatter: sum over both classes of the outer products of the
# deviations from the class mean.
dev1 = label1 - mu1
dev2 = label2 - mu2
Sw = np.dot(dev1.T, dev1) + np.dot(dev2.T, dev2)

print(Sw)

# Direction maximising the separation of the projected classes:
#   w = Sw^{-1} (mu2 - mu1)
# solve() yields the same vector without forming the explicit inverse.
w = np.linalg.solve(Sw, mu2 - mu1)

print("vector of max weights {}".format(w))

# Projection of both classes onto w, drawn on a single horizontal line.
# Colours match the scatter plots above (class 1 red, class 2 blue).
plt.plot(np.dot(label1, w), [0]*len(label1), "ro", label="label1")
plt.plot(np.dot(label2, w), [0]*len(label2), "bo", label="label2")
plt.legend()

plt.show()

Part 3

In [None]:
# NOTE(review): the scikit-learn cross-check below is disabled and would not
# work if simply uncommented:
#   * fit() takes (X, y) -- the stacked samples plus a vector of class labels --
#     not the two per-class sample arrays.
#   * label1/label2 hold 3-D points, but the query given to predict() has only
#     two features.
# clf = LinearDiscriminantAnalysis()
# clf.fit(label1, label2)


# print(clf.predict([[-0.8, -1]]))

Problem 3:

In [None]:
# Load the training and test tables from CSV files in the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Stack the feature columns (MSSubClass .. SaleCondition) of both tables,
# training rows first, so every transformation below hits both identically.
feature_span = slice('MSSubClass', 'SaleCondition')
all_data = pd.concat((train.loc[:, feature_span], test.loc[:, feature_span]))

matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)

# Raw and log(1 + x) target side by side; only the disabled histogram uses it.
raw_price = train["SalePrice"]
log_price = np.log1p(raw_price)
prices = pd.DataFrame({"price":raw_price, "log(price + 1)":log_price})
#prices.hist()

# From here on the target column holds log(1 + SalePrice).
train["SalePrice"] = log_price

# Names of all feature columns whose dtype is not "object".
column_dtypes = all_data.dtypes
numeric_feats = column_dtypes[column_dtypes != "object"].index

# Skewness of each numeric feature over the training rows (NaNs dropped);
# features above the 0.75 threshold get a log(1 + x) transform.
skewness = train[numeric_feats].apply(lambda column: skew(column.dropna()))
skewed_feats = skewness[skewness > 0.75].index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# One-hot encode the remaining categorical columns.
all_data = pd.get_dummies(all_data)

# Replace every remaining NaN by the mean of its column.
all_data = all_data.fillna(all_data.mean())

# Split the stacked frame back into its training and test parts.
n_train = train.shape[0]
X_train = all_data[:n_train]
X_test = all_data[n_train:]
y = train.SalePrice

# Helper that fits and exports a Ridge model for a given alpha.
def ridgeRegTest(a):
    """Fit Ridge(alpha=a) on X_train/y, then export and plot its test predictions.

    Predictions for X_test are mapped back with expm1, written together with
    test.Id to "results.csv" (overwriting any previous file) and shown as a
    line plot of SalePrice against Id.
    """
    ridge = Ridge(alpha=a).fit(X_train, y)
    sale_price = np.expm1(ridge.predict(X_test))

    submission = pd.DataFrame({"Id":test.Id, "SalePrice":sale_price})
    submission.to_csv("results.csv", index = False)

    submission.plot(x='Id', y='SalePrice', kind='line')
    plt.show()


ridgeRegTest(.1)

# Candidate ridge penalties for the RMSE sweep: 200 values, i*0.01 + 9 for
# i = 0 .. 199.
alphas = [i*.01+9 for i in range(200)]
    
def rmse_cv(model):
    """Return the per-fold RMSE of `model` under 5-fold CV on X_train / y.

    The scorer used here reports negated mean squared error, so the sign is
    flipped before the square root is taken.
    """
    neg_mse = cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv = 5)
    return np.sqrt(-neg_mse)
#model_ridge = Ridge()

#alphas = [.001, .01, .05, .1, .5, 1, 5, 10, 50]
#first try small range then determine min from graph in order to save running time

# Mean cross-validated RMSE of a Ridge model for every candidate penalty,
# indexed by the penalty itself.
ridge_scores = [rmse_cv(Ridge(alpha = alpha)).mean() for alpha in alphas]
cv_ridge = pd.Series(ridge_scores, index = alphas)
# cv_ridge.plot(title = "Validation - Just Do It")
# plt.xlabel("alpha")
# plt.ylabel("rmse")
# plt.show()

# Smallest mean RMSE found in the sweep. Written as a single-argument call so
# the line is valid under both Python 2 and Python 3.
print(cv_ridge.min())

# Grid of 10,000 penalties i*0.0001 for i = 0 .. 9999; only the disabled
# sweep further down refers to it.
alphas2 = [i*.0001 for i in range(10000)]

#model_lasso = LassoCV(alphas = alphas).fit(X_train, y)
model_lasso = LassoCV()

#cv_lasso = [rmse_cv(LassoCV(alphas = alpha)).mean()
#            for alpha in alphas2]

#cv_lasso = pd.Series(cv_lasso, index = alphas2)
#cv_lasso.plot(title = "Validation - Just Do It2")
#plt.xlabel("alpha")
#plt.ylabel("rmse")
#plt.show()
#print alphas2

# Cross-validate once and reuse the fold scores: every rmse_cv() call refits
# the model on each fold, so calling it again per statistic only repeats work.
lasso_rmse = rmse_cv(model_lasso)
print('lasso mean:')
print(lasso_rmse.mean())
print('lasso min:')
print(lasso_rmse.min())


# Same evaluation with an explicit, much smaller list of candidate penalties.
model_lasso = LassoCV(alphas = [1, 0.5, 0.1, 0.001, 0.0005]).fit(X_train, y)
real_lasso_rmse = rmse_cv(model_lasso)
print(real_lasso_rmse)
print('real lasso mean:')
print(real_lasso_rmse.mean())
print('real lasso min:')
print(real_lasso_rmse.min())

Incorporating XGBoost (Part 5 onwards)

In [None]:
# Wrap the training features (with target) and the test features in DMatrix
# containers for the native XGBoost API.
dtrain = xgb.DMatrix(X_train, label = y)
dtest = xgb.DMatrix(X_test)

# Cross-validate a depth-2 booster with eta 0.1: at most 500 boosting rounds,
# with an early-stopping window of 100 rounds.
params = {"max_depth":2, "eta":0.1}
model = xgb.cv(
    params,
    dtrain,
    num_boost_round=500,
    early_stopping_rounds=100,
)

# Mean test/train RMSE per boosting round, from row label 30 onwards.
rmse_columns = ["test-rmse-mean", "train-rmse-mean"]
model.loc[30:, rmse_columns].plot()
plt.show()

In [None]:
# Fit XGBoost's scikit-learn style regressor on the log-transformed target.
# The wrapper class is XGBRegressor; the xgboost package has no attribute
# named "XGBRegression", so the original line failed with AttributeError.
model_xgb = xgb.XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1)
model_xgb.fit(X_train, y)

# Undo the log1p applied to the target to get predictions on the price scale.
xgb_preds = np.expm1(model_xgb.predict(X_test))

# Export the predictions next to their test Ids and plot them.
results = pd.DataFrame({"Id":test.Id, "SalePrice":xgb_preds})
results.to_csv("XGBresults.csv", index = False)

results.plot(x='Id', y='SalePrice', kind='line')
plt.show()

The initial XGBoost submission scored .13427, which is worse than the earlier cross-validated Ridge submission.

In [None]:
# Re-read both tables so this cell starts again from the untransformed data.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Feature columns (MSSubClass .. SaleCondition) of both tables, train rows first.
feature_frames = [frame.loc[:,'MSSubClass':'SaleCondition'] for frame in (train, test)]
all_data = pd.concat(feature_frames)

# The target becomes log(1 + SalePrice).
train["SalePrice"] = np.log1p(train["SalePrice"])

# Feature columns whose dtype is not "object".
is_numeric = all_data.dtypes != "object"
numeric_feats = all_data.dtypes[is_numeric].index

# Skewness per numeric feature over the training rows, NaNs dropped; features
# above 0.75 are log(1 + x)-transformed in the stacked frame.
train_skewness = train[numeric_feats].apply(lambda column: skew(column.dropna()))
skewed_feats = train_skewness[train_skewness > 0.75].index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# One-hot encode categoricals, then fill remaining NaNs with column means.
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())

# Split back into the training and test design matrices.
train_rows = train.shape[0]
X_train, X_test = all_data[:train_rows], all_data[train_rows:]
y = train.SalePrice

# Ridge regression with built-in cross-validation of the penalty, scored on
# the training data it was fitted to.
# NOTE(review): the `normalize` argument is not accepted by recent
# scikit-learn releases -- confirm against the installed version.
solver = sklearn.linear_model.RidgeCV(normalize=True)
solver.fit(X_train, y)
print("RidgeCV Score: ", solver.score(X_train, y))
# np.sqrt instead of a bare sqrt(): no name `sqrt` is imported in this notebook.
error = np.sqrt(sklearn.metrics.mean_squared_error(y, solver.predict(X_train)))
print("RMSE of Training Predictions (RRCV): ", error)
print()

# 100 log-spaced candidate penalties between 1e-100 and 1.
alphas = np.logspace(-100, 0, num = 100)

# Cross-validated lasso over that grid, again scored on the training data.
solver = sklearn.linear_model.LassoCV(tol=0.1, normalize=True, alphas=alphas)
solver.fit(X_train, y)
print("LassoCV Score: ", solver.score(X_train, y))
error = np.sqrt(sklearn.metrics.mean_squared_error(y, solver.predict(X_train)))
print("RMSE of Training Predictions (LCV): ", error)
print()

# Lasso coefficients for every penalty in `alphas`.
alpha_path, coef_path, _ = sklearn.linear_model.lasso_path(X_train, y, alphas=alphas)

# Iterating over coef_path.T visits one coefficient vector at a time.
# Training predictions are formed once per vector and shared by both metrics.
path_preds = [X_train.dot(coef) for coef in coef_path.T]

# Number of non-zero coefficients, R^2 and mean squared error per vector.
l0_coeffs = [np.linalg.norm(coef, ord=0) for coef in coef_path.T]
scores = [sklearn.metrics.r2_score(y, pred) for pred in path_preds]
errors = [sklearn.metrics.mean_squared_error(y, pred) for pred in path_preds]
idx = np.argmax(scores)

print("Max Lasso Path Score: ", scores[idx])
# `errors` holds mean squared errors; take the root so the printed number
# really is the RMSE the label announces.
print("RMSE of Training Predictions (LPCV): ", np.sqrt(errors[idx]))
print()

#df = pd.DataFrame({"id":test.Id, "SalePrice":np.expm1(X_test.dot(coef_path.T[idx]))})
#df.to_csv('Results.csv', index = False)

# Training R^2 against the penalty.
plt.scatter(alpha_path, scores)
plt.xlabel('Alpha')
plt.ylabel('Accuracy (R^2 Score)')
plt.show()


# Count of non-zero coefficients against the penalty.
plt.scatter(alpha_path, l0_coeffs)
plt.xlabel('Alpha')
plt.ylabel('L0 Norm of Coefficients')
plt.show()

# Training features plus one extra column: the predictions obtained from the
# last coefficient vector in coef_path. assign() returns a new frame, so
# X_train -- a slice of all_data -- is never written to (no
# SettingWithCopyWarning) and needs no clean-up drop afterwards.
X_train_aug = X_train.assign(Results=X_train.dot(coef_path.T[-1]))

solver = sklearn.linear_model.RidgeCV(normalize=True)
solver.fit(X_train_aug, y)
print("Ridge Score w/ outputs added to data: ", solver.score(X_train_aug, y))
# np.sqrt instead of a bare sqrt(): no name `sqrt` is imported in this notebook.
error = np.sqrt(sklearn.metrics.mean_squared_error(y, solver.predict(X_train_aug)))
print("RMSE of Training Predictions (RR+CV): ", error)
print()

# Native-API XGBoost run on the raw numpy arrays behind the design matrices.
dtrain = xgb.DMatrix(X_train.values, y.values)
dtest = xgb.DMatrix(X_test.values)
# specify parameters via map
# NOTE(review): newer XGBoost releases appear to use `verbosity` in place of
# `silent` -- confirm against the installed version.
param = {'max_depth':2, 'eta':0.1, 'silent':1, 'subsample':0.5}
num_round = 1000
bst = xgb.train(param, dtrain, num_round)
# make prediction
preds = bst.predict(dtrain)
# np.sqrt instead of a bare sqrt(): no name `sqrt` is imported in this notebook.
error = np.sqrt(sklearn.metrics.mean_squared_error(y, preds))

# Test-set predictions, still on the log1p scale of the target.
submission = bst.predict(dtest)
#df = pd.DataFrame({"id":test.Id, "SalePrice":np.expm1(submission)})
#df.to_csv('Results.csv', index = False)

# Both figures are computed on the training data itself.
print("XGB Score: ", sklearn.metrics.r2_score(y, preds))
print("RMSE of Training Predictions (XGB): ", error)