In [24]:
#importing important libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

%matplotlib inline

In [25]:
# Load the training set; the path is relative to the notebook's working directory.
train = pd.read_csv('./datasets/train_clean.csv')

In [26]:
# Load the test set (the rows to be predicted for submission); path is relative
# to the notebook's working directory.
test = pd.read_csv('./datasets/test_clean.csv')

In [27]:
# Remove the stray 'Unnamed: 0' column (presumably a row index that was saved
# into the CSV -- confirm against the cleaning notebook).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
train = train.drop(columns='Unnamed: 0', errors='ignore')

In [28]:
# Remove the stray 'Unnamed: 0' column (presumably a row index that was saved
# into the CSV -- confirm against the cleaning notebook).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
test = test.drop(columns='Unnamed: 0', errors='ignore')

### Attempt 1 with Linear Regression

In [29]:
# Predictor columns for the first attempt, kept in the order the model will see them.
attempt_1_features = [
    'lot_frontage', 'lot_area',
    'overall_qual', 'overall_cond',
    'year_built',
    'exter_qual', 'exter_cond',
    'bsmt_qual',
    'year_remod/add',
    'bsmt_cond',
    'heating_qc', 'central_air', 'electrical',
    'totrms_abvgrd',
    'kitchen_qual', 'functional',
    'fireplaces', 'fireplace_qu',
    'garage_qual', 'garage_area', 'garage_cars',
    'pool_area', 'misc_val',
    'total_baths', 'total_bsmt_sf',
    '1st_flr_sf', '2nd_flr_sf',
]

# Feature matrix and regression target.
X = train[attempt_1_features]
y = train['saleprice']

In [30]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# NOTE(review): the scaler is only instantiated here -- no visible cell fits or
# applies it, so every model below trains on unscaled features.
ss = StandardScaler()

# Ordinary least-squares baseline fit on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [31]:
# R^2 of the linear model on the rows it was trained on.
lr.score(X_train, y_train)

0.8309613241675486

In [32]:
# R^2 of the linear model on the held-out 20% split.
lr.score(X_test, y_test)

0.8619024706796021

In [33]:
# Select the test columns from the frame `lr` was fit on instead of re-typing
# the list: the column set and order are then guaranteed to match training,
# and editing the feature list in one place cannot silently desync the two.
preds = lr.predict(test[X_train.columns])

In [34]:
# Frame that will hold the first Kaggle submission.
to_submit = pd.DataFrame()

In [35]:
# Submission layout: each test row's id next to its predicted sale price.
# The frame takes its index from test['id']; preds is matched by position.
to_submit = pd.DataFrame({'Id': test['id'], 'SalePrice': preds})
to_submit.head()

Unnamed: 0,Id,SalePrice
0,2658,157247.393665
1,2718,187394.608833
2,2414,221535.736486
3,1989,120573.429632
4,625,189823.618047


In [36]:
# index=False: write only the Id and SalePrice columns, not the row index.
to_submit.to_csv('./datasets/kaggle_submit_1.csv', index=False)

### Attempt 2 with Lasso

In [37]:
# Lasso at scikit-learn's default penalty strength (alpha=1.0, spelled out here),
# trained on the same unscaled training split as the linear model.
lasso = Lasso(alpha=1.0)
lasso.fit(X_train, y_train)

Lasso()

In [38]:
# R^2 of the Lasso model on the training split.
# NOTE(review): the recorded value agrees with the plain linear regression's
# training score to about seven digits, so the default penalty appears to have
# almost no effect on these features.
lasso.score(X_train, y_train)

0.8309613086048078

In [39]:
# R^2 of the Lasso model on the held-out 20% split.
lasso.score(X_test, y_test)

0.8619207285925826

In [40]:
# Select the test columns from the frame `lasso` was fit on instead of
# re-typing the list: the column set and order are then guaranteed to match
# training, and the feature list lives in exactly one place.
lasso_preds = lasso.predict(test[X_train.columns])

In [41]:
# Frame that will hold the second Kaggle submission (Lasso predictions).
second_submit = pd.DataFrame()

In [42]:
# Submission layout: each test row's id next to its Lasso-predicted sale price.
# The frame takes its index from test['id']; lasso_preds is matched by position.
second_submit = pd.DataFrame({'Id': test['id'], 'SalePrice': lasso_preds})
second_submit.head()

Unnamed: 0,Id,SalePrice
0,2658,157252.977228
1,2718,187399.868346
2,2414,221541.544172
3,1989,120557.622931
4,625,189811.059364


In [43]:
# index=False: write only the Id and SalePrice columns, not the row index.
second_submit.to_csv('./datasets/kaggle_submit_2.csv', index=False)

### Attempt 3 with Linear Regression After Feature Engineering

In [44]:
# Predictor columns for the third attempt, kept in the order the model will see them.
# NOTE(review): names such as 'total_sf*total_qual' look like interaction terms
# built in an earlier notebook -- confirm how they were derived.
attempt_3_features = [
    'house_qual',
    'total_sf',
    'total_sf*total_qual',
    'total_baths',
    'lot_frontage', 'lot_area',
    'heating_qc', 'central_air',
    'totrms_abvgrd',
    'garage_area', 'garage_cars', 'garage_finish',
    'gr_liv_area',
    'utilities',
    'fireplace*fireplace_qu',
    'electrical',
]

# Feature matrix and regression target.
X_3 = train[attempt_3_features]
y_3 = train['saleprice']

In [45]:
# Hold out 33% of the rows this time (attempt 1 used 20%); same seed for repeatability.
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(
    X_3,
    y_3,
    test_size=0.33,
    random_state=42,
)

# Ordinary least squares on the attempt-3 feature set.
lm = LinearRegression()
lm.fit(X_train_3, y_train_3)

LinearRegression()

In [46]:
# R^2 of the attempt-3 model on the rows it was trained on.
lm.score(X_train_3, y_train_3)

0.8067141895714535

In [47]:
# R^2 of the attempt-3 model on the held-out 33% split.
lm.score(X_test_3, y_test_3)

0.8656621521453193

In [48]:
# Select the test columns from the frame `lm` was fit on instead of re-typing
# the list: the column set and order are then guaranteed to match training,
# and the feature list lives in exactly one place.
new_preds = lm.predict(test[X_train_3.columns])

In [49]:
# Frame that will hold the third Kaggle submission.
submit_3 = pd.DataFrame()

In [50]:
# Submission layout: each test row's id next to its attempt-3 predicted sale price.
# The frame takes its index from test['id']; new_preds is matched by position.
submit_3 = pd.DataFrame({'Id': test['id'], 'SalePrice': new_preds})
submit_3.head()

Unnamed: 0,Id,SalePrice
0,2658,129202.775063
1,2718,175461.695419
2,2414,239929.50524
3,1989,121894.894198
4,625,197804.368884


In [51]:
# index=False: write only the Id and SalePrice columns, not the row index.
submit_3.to_csv('./datasets/kaggle_submit_3.csv', index=False)