In [None]:
import pandas as pd
from matplotlib import pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*"
# and newer releases no longer accept the old names. Prefer the new name when
# it is available and fall back to the legacy one on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
# Load the training data from the local dataset folder.
TRAIN_CSV = './dataset/train.csv'
df = pd.read_csv(TRAIN_CSV)

### Explore

In [None]:
# List every column name in the training frame to see which features exist.
df.columns

In [None]:
# Most common values of each feature.
# value_counts() already returns counts in descending order, so no extra sort
# is needed. head() limits the output to the top entries of each column rather
# than printing every distinct value.
TOP_N = 10
for col in df.columns:
    print(df[col].value_counts().head(TOP_N))

In [None]:
# Count missing values per column, then show the 20 columns with the most.
missing_counts = df.isna().sum()
missing_counts.sort_values(ascending=False).head(20)

The dataset does not have weird NaN values; the attributes with many NaN values are all intentional (e.g. a house without a pool has NaN for PoolQC).

#### Useful
Logically, what might correlate a lot with SalePrice:

Category Type: MSSubClass, Neighborhood, OverallCond

Number: GrLivArea, YrSold, LotArea, YearBuilt, YearRemodAdd, PoolArea

In [None]:
# The 30 rows with the largest above-ground living area, next to their price.
area_and_price = df[["GrLivArea", "SalePrice"]]
area_and_price.sort_values('GrLivArea', ascending=False).head(30)

In [None]:
# The last 30 rows of the descending sort, i.e. the smallest living areas.
living_area_desc = df[["GrLivArea", "SalePrice"]].sort_values(by='GrLivArea', ascending=False)
living_area_desc.tail(30)

In [None]:
# Median sale price within each neighborhood.
price_by_neighborhood = df.groupby('Neighborhood')['SalePrice']
price_by_neighborhood.median()

In [None]:
# Summary statistics of sale price within each neighborhood.
neighborhood_sale_prices = df.groupby('Neighborhood')['SalePrice']
neighborhood_sale_prices.describe()

In [None]:
# Two stacked panels: living area vs. sale price (top) and the median sale
# price per neighborhood (bottom).
fig1, (ax1, ax2) = plt.subplots(2, 1)

ax1_x = df.loc[:, 'GrLivArea']
ax1_y = df.loc[:, 'SalePrice']
ax1.scatter(ax1_x, ax1_y, color='lightblue', linewidth=3, label="Above Ground Living Area v Price")
ax1.set(xlabel="Living Area", ylabel="Sale Price")
ax1.legend()

# Aggregate once and reuse the result for both the bar positions and heights.
median_price_by_neighborhood = df.groupby('Neighborhood')['SalePrice'].median()
ax2_x = median_price_by_neighborhood.index
ax2_y = median_price_by_neighborhood.values
ax2.bar(ax2_x, ax2_y)
# Label the bar panel so the figure can be read on its own.
ax2.set(xlabel="Neighborhood", ylabel="Median Sale Price")
ax2.tick_params(axis='x', rotation=80)

plt.tight_layout()



#### Observation from graphs

Neighborhood is not a great predictor, but there is a trend with above-ground living area.

In [None]:
# Keep only the single feature and the target used by the regression below.
# No sort is performed here: sort_index() returns a new frame rather than
# sorting in place, so calling it without using the result has no effect.
dfT = df[['GrLivArea', 'SalePrice']]
dfT

In [None]:
# Feature and target as single-column 2-D arrays of shape (n_samples, 1).
X_all = dfT['GrLivArea'].to_numpy().reshape(-1, 1)
y_all = dfT['SalePrice'].to_numpy().reshape(-1, 1)


### Regression Fitting

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics 
import numpy as np


# Hold out 20% of the rows to estimate accuracy locally. The fixed
# random_state makes the split, and every metric derived from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42
)

# Fit a linear model on the training part and predict the held-out part.
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Show the fitted intercept as the cell output.
regressor.intercept_

In [None]:
# Predicted vs. true values on the hold-out split.
plt.scatter(y_test, y_pred)
plt.xlabel('Test')
plt.ylabel('Predict')

# Draw the y = x reference line across the actual range of the plotted values
# so it always covers every point, whatever the split contains.
ref_low = min(np.min(y_test), np.min(y_pred))
ref_high = max(np.max(y_test), np.max(y_pred))
x_ref = np.linspace(ref_low, ref_high, 2)
y_ref = x_ref
plt.plot(x_ref, y_ref, color="red")
plt.show()


The red line helps show how far off each prediction is from the true value.

In [None]:
# Root-mean-squared error of the hold-out predictions.
holdout_mse = metrics.mean_squared_error(y_test, y_pred)
np.sqrt(holdout_mse)

Use the model on the assigned test data

In [None]:
# Read test.csv and keep only the row id and the one feature the model uses.
testing_df = pd.read_csv('./dataset/test.csv')[['Id', 'GrLivArea']]
testing_df


In [None]:
# NOTE: this rebinds X_test and y_pred, which earlier held the hold-out split
# and its predictions. After this cell runs, re-running the evaluation cells
# above would pair these predictions with the old y_test.
X_test = testing_df['GrLivArea'].to_numpy().reshape(-1, 1)
y_pred = regressor.predict(X_test)
y_pred

In [None]:
# The model was fit on a column-shaped (n, 1) target, so its predictions come
# back 2-D as well. Flatten them to 1-D before using them as a column.
y_pred_flat = y_pred.reshape(-1)

submission_columns = {
    'Id': testing_df['Id'].to_numpy(),
    'SalePrice': y_pred_flat,
}
my_ans = pd.DataFrame(submission_columns)
my_ans

In [None]:
# my_ans.to_csv('my_pred.csv',index=False)