In [2]:
import blurbs

import numpy as np
import matplotlib.pyplot as plt
import csv
from functools import reduce
import re 
from collections import defaultdict

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

import pandas as pd

import math

In [3]:
# create the dataframes

DIR = '/home/bryce/Projects/Data_Science/Apt_Prices/csvs/'

address_file =  DIR + 'pd_address_info.csv' 
apt_file = DIR + 'pd_apt_info.csv'

addr_df = pd.read_csv(address_file, sep=';')
apt_df = pd.read_csv(apt_file, sep=';')

In [4]:
# Let's go ahead and clean up the data so we don't have to do that later. Later on in this project, we can salvage data with missing info.
print("apt_df shape before: ", apt_df.shape)
print('addr_df shape before: ', addr_df.shape)

drop_apt_row = apt_df.isna().any(axis=1)
print("dropping this many rows: ", drop_apt_row.sum())
apt_df = apt_df.loc[~drop_apt_row]
apt_df.reset_index(inplace=True)

drop_addr_row = addr_df.isna().any(axis=1)
print("dropping this many addr rows: ", drop_addr_row.sum())
addr_df = addr_df.loc[~drop_addr_row]
addr_df.drop_duplicates(subset=['address'], inplace=True)
addr_df.reset_index(inplace=True)


print("apt_df shape after: ", apt_df.shape)
print('addr_df shape after: ', addr_df.shape)

apt_df shape before:  (1887, 8)
addr_df shape before:  (305, 4)
dropping this many rows:  14
dropping this many addr rows:  0
apt_df shape after:  (1873, 9)
addr_df shape after:  (293, 5)


Let's create functions to generate all the features we might want to use. What features could we want?

- 1) dummy variable for zip code (DONE)
- 2) features representing the description blurb of each property (e.g. TF-IDF matrix, or a reduced dimension version of that) (DONE)
- 3) squared terms and cross terms, e.g. zip code x square footage, zip code x num. beds, etc.
- 4) driving distance from apt to...the battery?
- 5) 'starbucks metric' - number of starbucks within X distance
- 6) school district score

In [5]:
# 1) this has been tested - looks good!

from feature_generation import ZipcodeEncoder


### Example usage:
# zip_encoder = ZipcodeEncoder()
# zip_encoder.generate_zipcode_dummy_vars(apt_df)
# zips_df = zip_encoder.dummy_zips_df

In [6]:
zip_encoder = ZipcodeEncoder()
zip_encoder.generate_zipcode_dummy_vars(apt_df)
zips_df = zip_encoder.dummy_zips_df
print(zips_df.shape)
zips_df.loc[1848]
apt_df.loc[1849]

AttributeError: 'ZipcodeEncoder' object has no attribute 'generate_zipcode_dummy_vars'

In [209]:
# 2)

from blurbs import BlurbFeatures


In [198]:
# import imp
# imp.reload(blurbs)
# from blurbs import BlurbFeatures
# blurb_features = BlurbFeatures()
# blurb_features.compute_training_tfidf_matrix(addr_df.blurb)
# blurb_features.compute_svd_matrix()

# svd_transformer = blurb_features.svd_transformer
# svd = blurb_features.Svd_matrix
svd_transformer


NameError: name 'svd_transformer' is not defined

In [10]:

from feature_generation import CrossTermComputer




Here's a little sample of how to do the linear regression:

We have 1, 2, and 3. Lets put all the features in a big dataframe, and then go for it!

Little complication. It seems that sometimes, apartments.com has multiple results for the same property. So in addr_df, there will be duplicate addresses, but with different blurbs (written differently by whoever posted them).

In [11]:
def get_all_features(apt_df, addr_df):
    y = apt_df[['price']]
    X = apt_df.drop(columns=['price'])
    # print("1848 index: ", X.loc[1848])
    X.reset_index(inplace=True)
    y.reset_index(inplace=True)
    y.drop(columns=['index'], inplace=True)
    print("1848 index: ", X.loc[1872])


    # 1) get dummy zips
    zip_encoder = ZipcodeEncoder()
    zips_df = zip_encoder.generate_training_zipcode_dummy_vars(apt_df)


    # 2) get blurb SVD features
    blurb_features = BlurbFeatures()
    blurb_features.compute_training_tfidf_matrix(addr_df.blurb)
    tfidf_mat = blurb_features.get_training_tfidf_matrix()
    blurb_features.compute_training_svd_df()
    svd_df = blurb_features.traning_svd_df
    

    addrs_and_blurb_svd_feats = pd.concat((addr_df, svd_df), axis = 1)
    addrs_and_blurb_svd_feats.drop(columns=['blurb', 'url', 'latlng'], inplace=True)
    addrs_and_blurb_svd_feats.drop_duplicates(subset=['address'], inplace=True)
    #print("well well well", addrs_and_blurb_svd_feats.loc[304])
    #print("And the addr_df? ", addr_df.loc[304])
    #print('And the sv_df?', svd_df.loc[304])


    addresses_from_apt_df = apt_df[['address']]
    X_blurb_feats = pd.merge(addrs_and_blurb_svd_feats, addresses_from_apt_df, on='address')

    X_blurb_feats.drop(columns=['address'], inplace=True)
    print("blurb feat shape: ", X_blurb_feats.shape)

    # clean up the expensive temporary variables
    del(addrs_and_blurb_svd_feats, addresses_from_apt_df, svd_df, blurb_features)




    X = pd.concat((X, zips_df, X_blurb_feats), axis=1)
    print("now X at loc 1872 ", X.loc[1872])


    # 3) get cross-terms
    first_columns = list(zips_df.columns)
    second_columns = ['beds', 'baths', 'sq_ft', 'units_in_building']
    column_pairs = [('sq_ft', 'sq_ft'), ('beds', 'beds'), ('baths', 'baths')]
    cross_term_computer = CrossTermComputer(first_columns=first_columns, second_columns=second_columns, column_pairs=column_pairs)
    cross_term_df = cross_term_computer.compute_cross_terms(X)


    X = pd.concat((X, cross_term_df), axis=1)
    del(cross_term_computer, cross_term_df)
    return (X, y)  


In [34]:
import imp
imp.reload(blurbs)
imp.reload(feature_generation)
from blurbs import BlurbFeatures

import feature_generation
from feature_generation import FeatureGenerator

fg = FeatureGenerator()
X_train, y_train = fg.get_training_features(apt_df, addr_df)


X, y = get_all_features(apt_df, addr_df)

print(X_train.columns)


dim training blurbs, before cleaning (293,)
dim cleaned_blurbs  (293,)
(293, 980)
training svd matrix shape  (293, 30)
about to return the training_svd_df, with shape (293, 30)
Index(['index', 'svd_1', 'svd_2', 'svd_3', 'svd_4', 'svd_5', 'svd_6', 'svd_7',
       'svd_8', 'svd_9', 'svd_10', 'svd_11', 'svd_12', 'svd_13', 'svd_14',
       'svd_15', 'svd_16', 'svd_17', 'svd_18', 'svd_19', 'svd_20', 'svd_21',
       'svd_22', 'svd_23', 'svd_24', 'svd_25', 'svd_26', 'svd_27', 'svd_28',
       'svd_29', 'svd_30'],
      dtype='object')
columns in dataframe:  Index(['index', 'address', 'beds', 'baths', 'sq_ft', 'location', 'zip',
       'units_in_building', 'zip_29401', 'zip_29403', 'zip_29405', 'zip_29406',
       'zip_29407', 'zip_29410', 'zip_29412', 'zip_29414', 'zip_29418',
       'zip_29455', 'zip_29464', 'zip_29492', 'index', 'svd_1', 'svd_2',
       'svd_3', 'svd_4', 'svd_5', 'svd_6', 'svd_7', 'svd_8', 'svd_9', 'svd_10',
       'svd_11', 'svd_12', 'svd_13', 'svd_14', 'svd_15', 'svd_16'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.drop(columns=['index'], inplace=True)


In [31]:
def select_column_names(all_column_names, units_in_building = False, zip_feats = False, blurb_feats = False, cross_term_res = []):
    returned_cols = ['beds', 'baths', 'sq_ft']  # we will always include these features
    if units_in_building:
        returned_cols.append('units_in_building')
    
    if zip_feats:
        zip_re = r'^zip_2[0-9]{4}$'
        for col in all_column_names:
            m = re.match(zip_re, col)
            if m:
                returned_cols.append(col)
    
    if blurb_feats:
        blurb_re = r'^svd_[0-9]+$'
        for col in all_column_names:
            m = re.match(blurb_re, col)
            if m:
                returned_cols.append(col)
    
    if cross_term_res is None:
        return returned_cols
        
    cross_term_cols = [col for col in all_column_names if col.find(' x ') != -1]
    returned_cross_term_cols = set()
    for col in cross_term_cols:
        for regexp in cross_term_res:
            m = re.match(regexp, col)
            if m:
                returned_cross_term_cols.add(col)
    returned_cols.extend(returned_cross_term_cols)
    return returned_cols

In [42]:

all_cols = list(X_train.columns)
print(list(all_cols))


c1 = select_column_names(all_cols)
c2 = select_column_names(all_cols, units_in_building=True)
c3 = select_column_names(all_cols, zip_feats=True)
c4 = select_column_names(all_cols, blurb_feats=True)
c5 = select_column_names(all_cols, zip_feats=True, cross_term_res=[r'.*sq_ft.*zip.*'])
c6 = select_column_names(all_cols, zip_feats=True, blurb_feats=True)
c7 = select_column_names(all_cols, units_in_building = True, blurb_feats=True)
c8 = select_column_names(all_cols, units_in_building=True, zip_feats=True, blurb_feats=True, cross_term_res=[r'.*'])
c9 = select_column_names(all_cols, units_in_building=True, zip_feats=True, blurb_feats=True)

col_lists = [c1, c2, c3, c4, c5, c6, c7, c8, c9]
print(col_lists)

def calculate_root_mean_squared_error(reg, X, y):
    y_hat = reg.predict(X)
    diff = y.to_numpy() - y_hat
    diff = diff.flatten()
    sum_squared_error = np.dot(diff, diff).sum()
    n = y.shape[0]
    rmse = math.sqrt(sum_squared_error / n)
    return rmse 

def perform_linear_regression(X, col_names, y):
    print("Running linear regression using features: ", col_names)
    X = X[col_names]
    drop_rows = X.isna().any(axis=1)
    print("rows to drop: " , drop_rows.index[drop_rows == True])
    X = X.loc[~drop_rows]
    y = y.loc[~drop_rows]
    reg = LinearRegression().fit(X, y)
    print("R^2: ", reg.score(X, y))
    rmse = calculate_root_mean_squared_error(reg, X, y)

    print("Root mean squared error: ", rmse)
    print("\n\n")
    return reg, X, y



i = 1
for col_names in col_lists:
    print(i)
    i += 1
    _, _, _ = perform_linear_regression(X_train, col_names, y_train)

['index', 'address', 'beds', 'baths', 'sq_ft', 'location', 'zip', 'units_in_building', 'zip_29401', 'zip_29403', 'zip_29405', 'zip_29406', 'zip_29407', 'zip_29410', 'zip_29412', 'zip_29414', 'zip_29418', 'zip_29455', 'zip_29464', 'zip_29492', 'index', 'svd_1', 'svd_2', 'svd_3', 'svd_4', 'svd_5', 'svd_6', 'svd_7', 'svd_8', 'svd_9', 'svd_10', 'svd_11', 'svd_12', 'svd_13', 'svd_14', 'svd_15', 'svd_16', 'svd_17', 'svd_18', 'svd_19', 'svd_20', 'svd_21', 'svd_22', 'svd_23', 'svd_24', 'svd_25', 'svd_26', 'svd_27', 'svd_28', 'svd_29', 'svd_30', 'baths x baths', 'baths x zip_29401', 'baths x zip_29403', 'baths x zip_29405', 'baths x zip_29406', 'baths x zip_29407', 'baths x zip_29410', 'baths x zip_29412', 'baths x zip_29414', 'baths x zip_29418', 'baths x zip_29455', 'baths x zip_29464', 'baths x zip_29492', 'beds x beds', 'beds x zip_29401', 'beds x zip_29403', 'beds x zip_29405', 'beds x zip_29406', 'beds x zip_29407', 'beds x zip_29410', 'beds x zip_29412', 'beds x zip_29414', 'beds x zip_2

Next steps:

- Look at the coefficients to see what our model thinks is important
- Look at examples where our model vastly over and underestimates price


To do all this better:

- write functions to divide data into training, CV, and test sets, then pick the set of features that minimizes CV error
- try with regularized regression
- try with neural nets
- try hand-picked word features - luxery, pool, etc.
- other features - distance to battery, etc.

Another line:
- understand bias more with graphs and maps

It seems like right now our model has fairly high bias. Hopefully neural nets help with that.

In [36]:
reg, X, y = perform_linear_regression(X_train, c8, y_train)
y_hat = pd.DataFrame(reg.predict(X), columns=['y_hat'])
diff = pd.DataFrame(y.price - y_hat.y_hat, columns=['price - y_hat'])
prices = pd.concat((y, y_hat, diff), axis=1)
prices.sort_values(by=['price - y_hat'], inplace=True)
print(prices.head())


print(X.tail())


Running linear regression using features:  ['beds', 'baths', 'sq_ft', 'units_in_building', 'zip_29401', 'zip_29403', 'zip_29405', 'zip_29406', 'zip_29407', 'zip_29410', 'zip_29412', 'zip_29414', 'zip_29418', 'zip_29455', 'zip_29464', 'zip_29492', 'svd_1', 'svd_2', 'svd_3', 'svd_4', 'svd_5', 'svd_6', 'svd_7', 'svd_8', 'svd_9', 'svd_10', 'svd_11', 'svd_12', 'svd_13', 'svd_14', 'svd_15', 'svd_16', 'svd_17', 'svd_18', 'svd_19', 'svd_20', 'svd_21', 'svd_22', 'svd_23', 'svd_24', 'svd_25', 'svd_26', 'svd_27', 'svd_28', 'svd_29', 'svd_30', 'units_in_building x zip_29403', 'baths x zip_29407', 'units_in_building x zip_29405', 'beds x zip_29455', 'units_in_building x zip_29418', 'beds x zip_29412', 'beds x zip_29406', 'beds x zip_29401', 'baths x zip_29403', 'units_in_building x zip_29410', 'units_in_building x zip_29455', 'baths x zip_29414', 'units_in_building x zip_29414', 'baths x zip_29401', 'baths x zip_29492', 'beds x zip_29403', 'beds x zip_29464', 'units_in_building x zip_29492', 'sq_ft

In [None]:
apt_df.loc[1885]
addr_df.loc[addr_df.address == '1562 Burnswick Dr, Charleston, SC 29455']

Unnamed: 0,address,latlng,url,blurb
303,"1562 Burnswick Dr, Charleston, SC 29455","(32.734599679549696, -80.07348447475918)",https://www.apartments.com/1562-burnswick-dr-...,Available March 6th! Nothing beats the ...


In [None]:
# Let's just look at basic info for the apartments that were way over or under predicted

X = X[['beds', 'baths', 'sq_ft', 'units_in_building']]
addresses = apt_df[['address']]
X = X.join(addresses, how='inner')
X_with_prices = X.join(prices, how='inner').sort_values(by=['price - y_hat'])

X_with_prices_and_blurbs = X_with_prices.merge(addr_df, on='address').sort_values(by=['price - y_hat'])
print(X_with_prices_and_blurbs.head())

outfile = DIR + 'linear_regression_price_vs_prediction.csv'

X_with_prices_and_blurbs['price'] = X_with_prices_and_blurbs.price.map(lambda p : '$' + str(p) + '  ')
X_with_prices_and_blurbs['y_hat'] = X_with_prices_and_blurbs.y_hat.map(lambda p : '$' + str(p)  + '  ')
X_with_prices_and_blurbs['price - y_hat'] = X_with_prices_and_blurbs['price - y_hat'].map(lambda p : '$' + str(p)  + '  ')
X_with_prices_and_blurbs['beds'] = X_with_prices_and_blurbs.beds.map(lambda b : str(b) + ' beds  ')
X_with_prices_and_blurbs['baths'] = X_with_prices_and_blurbs.baths.map(lambda b : str(b) + ' baths  ')
X_with_prices_and_blurbs['sq_ft'] = X_with_prices_and_blurbs.sq_ft.map(lambda  sf: str(sf) + ' sq_ft  ')
X_with_prices_and_blurbs['units_in_building'] = X_with_prices_and_blurbs.units_in_building.map(lambda  u: str(u) + ' units in building  ')




X_with_prices_and_blurbs.to_csv(path_or_buf=outfile, sep=';', index=False, columns=['price', 'y_hat', 'price - y_hat', 'beds', 'baths', 'sq_ft', 'units_in_building', 'address', 'url', 'blurb'])

Let's look at the feature weights, to see what clues that could give us

In [235]:
coeffs = reg.coef_.flatten()
coef_df = pd.DataFrame({"w": coeffs, "feat_name": c8})
print(coef_df.head())
weights_file = DIR + 'analyzing_regression/weights.csv'
coef_df.to_csv(weights_file, sep=';', index=False, columns=['feat_name', 'w'])

             w          feat_name
0 -1750.068266               beds
1  1174.007633              baths
2     1.176499              sq_ft
3   628.178942  units_in_building
4  1094.212515          zip_29401
