In [45]:
# All the imports in one place!

import numpy as np
import matplotlib.pyplot as plt
import csv
from functools import reduce
import re 
from collections import defaultdict

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

import pandas as pd

For some apartments, we saved square footage like "1,240 sq ft" as two columns: "1;; 240 sq ft;;"

Let's go through the results, and for any rows with 6 columns instead of 5, we'll check if the second to last column is just a 1 or 2 digit number.

If it is, we'll smush it to the square footage column

In [23]:
results_file = '/home/bryce/Projects/Data_Science/Apt_Prices/all-results.csv'
#print(data.shape)

adjusted_lines = []

with open(results_file, 'r') as f:
    for line in f.readlines():
        columns = line.split(';;')
        if len(columns) > 5:
            extra_cols = columns[4:]
            new_col = reduce(lambda s1, s2: s1.strip() + s2.strip(), extra_cols)
            new_cols = columns[:4] + [new_col]
            new_line = ';; '.join(new_cols)
            adjusted_lines.append(new_line + '\n')
            continue
        elif len(columns) <= 2:
            # these are either blank lines or the couple lines where an address wasn't found
            continue
        adjusted_lines.append(line)

adjusted_results_file = '/home/bryce/Projects/Data_Science/Apt_Prices/cleaned-results.csv'
with open(adjusted_results_file, 'w') as outfile:
    outfile.writelines(adjusted_lines)

OK, so the number of rows is consistent!

But numpy still can't read '$2345' and whatnot.

So let's clean up the file further.

old file columns:

123 Main St, Charleston, SC - Downtown;; $2131;; 2 bd;; 1 ba;; 1,200 sq ft

the new file columns:

123 Main St, Charleston, SC; Downtown; 2; 1; 1200; 2131

In [54]:
SKIP = 'SKIP'

# TODO: for prices with a range, if the range is within some dollar threshold, just return the average instead of skipping
def clean_price(pr_str):
    pr_arr = pr_str.split()
    if len(pr_arr) > 1:
        print("Multi word price: ", pr_str)
        return SKIP
    price = re.sub(r'\$|,', '', pr_arr[0]).strip()
    return price

def get_first_word_as_number(string, expected_num_words):
    arr = string.split()
    if len(arr) != expected_num_words:
        print("Expected " + str(expected_num_words) + " words in this string: ", string)
        return SKIP
    result = re.sub(r',', '', arr[0].strip())
    return result


def split_address_and_part_of_town(addr_str):
    arr = addr_str.split('–')
    if len(arr) == 1:
        print("No part of town for this address string: ", addr_str)
        return addr_str, ''
    if len(arr) != 2:
        print("Expected only one '–' in the address string: ", addr_str)
    [addr, part_of_town] = map(lambda s: s.strip(), arr)
    return addr, part_of_town


new_lines = ['address;part_of_town;beds;baths;square_footage;rent\n']
cleaned_results_file = '/home/bryce/Projects/Data_Science/Apt_Prices/cleaned-results.csv'
with open(cleaned_results_file, 'r') as f:
    for line in f.readlines():
        # Let's check if any single semicolons are in the file, so we can use just a single one as the delimiter

        single_semicolons = re.findall(r'[^;];[^;]', line)
        if single_semicolons:
            print("Found a stand-alone semicolon in this line: ", line)
        columns = line.split(';;')
        addr, part_of_town = split_address_and_part_of_town(columns[0])
        price = clean_price(columns[1])
        beds = get_first_word_as_number(columns[2], expected_num_words=2)
        baths = get_first_word_as_number(columns[3], expected_num_words=2)
        sq_ft = get_first_word_as_number(columns[4], expected_num_words=3)
        if SKIP in [price, beds, baths, sq_ft]:
            continue
        new_line = ';'.join([addr, part_of_town, beds, baths, sq_ft, price]) + '\n'
        new_lines.append(new_line)

formatted_results_file = '/home/bryce/Projects/Data_Science/Apt_Prices/formatted-results.csv'
with open(formatted_results_file, 'w') as outfile:
    outfile.writelines(new_lines)


Multi word price:   $1,583 – $1,689
Multi word price:    $1,758 – $2,050
Multi word price:    $2,282 – $2,457
Multi word price:   $1,579 – $1,758
Multi word price:    $2,091 – $2,278
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word price:   $Call for Rent
Multi word p

Let's actually change this code to make it a pandas dataframe. That seems easier to manipulate data with.

In [2]:
formatted_results_file = '/home/bryce/Projects/Data_Science/Apt_Prices/formatted-results.csv'


dataset = pd.read_csv(formatted_results_file, delimiter=';')

print(dataset.columns)

addresses = dataset['address'].tolist()
print(len(addresses))
zips = [s.split()[-1] for s in addresses]

unique_zips = tuple(set(zips))
dataset['zips'] = zips

zip_df = pd.DataFrame(unique_zips, columns=['Zips'])
label_encoder = LabelEncoder()
zip_df['Zip_Cats'] = label_encoder.fit_transform(zip_df['Zips'])

# 'data' will be what we call the dataset as a numpy array
data = dataset.to_records()
print(data.shape)
print(data['rent'].dtype)

Index(['address', 'part_of_town', 'beds', 'baths', 'square_footage', 'rent'], dtype='object')
1879
(1879,)
int64


In [8]:
addr = data['address'][1]
get_zip = lambda s : s.split()[-1]


addr_list = list(data['address'])
zips = map(get_zip, addr_list)

zip_count = defaultdict(int)
for z in zips:
    zip_count[z] += 1

sorted(zip_count.items(), key=lambda i : -i[1])

[('29403', 537),
 ('29414', 321),
 ('29407', 270),
 ('29492', 227),
 ('29406', 145),
 ('29412', 118),
 ('29464', 63),
 ('29455', 58),
 ('29405', 49),
 ('29418', 37),
 ('29410', 30),
 ('29401', 20),
 ('29466', 3),
 ('29420', 1)]

To start, we're going to do linear regression using just square footage, beds, and baths.

The below code shows the R^2 with these three features is 0.25. This isn't great. Let's see if we can increase it with more variables!

Later we'll add more variables to see how that improves accuracy.

In [72]:
# Once we make all our data numerical, we can just read the file in as X, instead of having to have column names

m = data.shape[0]
X = np.transpose(np.array((data['beds'], data['baths'], data['square_footage'] / 1000.)))
# bed_bath_ratio = X[:, 0] / X[: , 1]
# bed_bath_ratio = bed_bath_ratio.reshape(m, 1)
# print("Min beds before: ", np.min(X[:, 0]))
beds_without_studios = np.copy(X[:, 0])
beds_without_studios[beds_without_studios == 0] = 1
# print("Min beds after: ", np.min(X[:, 0]))


# sqft_per_bed = (X[:,2] / beds_without_studios).reshape(m, 1)
# X = np.append(X, bed_bath_ratio, axis = 1)
# X = np.append(X, sqft_per_bed, axis = 1)

y = data['rent'].astype('float64')

scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)

reg = LinearRegression().fit(rescaledX, y)
print(reg.score(rescaledX, y))

0.2939317419945209


Let's add variables one at a time. First, we'll add dummy variables for zip codes.

In [55]:
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output = False, min_frequency=50)
Dummy_Zips = encoder.fit_transform(data['zips'].reshape(m, 1))
print(Dummy_Zips.shape)
print(Dummy_Zips[0, :])

X_new = np.append(rescaledX, Dummy_Zips, axis = 1)
reg = LinearRegression().fit(X_new, y)
print(reg.score(X_new, y))


(1879, 9)
[0. 0. 0. 0. 0. 0. 0. 1. 0.]
0.5947082267026051


Cool! With zip codes as dummy variables, the R^2 goes up to 0.59! Next we can add a variable for listings with 3 or more apartments in the same address. This will capture the difference between single houses and duplexes, and (usually fancier) larger apartment complexes

In [69]:
address_count = dataset['address'].value_counts()
print(address_count.values)

count_per_unit = dataset['address'].map(lambda a : address_count[a])
count_per_unit_np = count_per_unit.to_numpy().reshape(m, 1)
really_high_indices = count_per_unit > 43

more_than_two = count_per_unit.map(lambda c : int(c > 2)).to_numpy().reshape(m, 1)
more_than_ten = count_per_unit.map(lambda c : int(c > 10)).to_numpy().reshape(m, 1)


X = np.append(X, count_per_unit, axis = 1)
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)

X = np.append(rescaledX, Dummy_Zips, axis = 1)

cost_in_thousands = y / 1000.

reg = LinearRegression().fit(X , cost_in_thousands)
print(reg.score(X , cost_in_thousands))
print(reg.coef_)



[294  42  40  34  33  31  31  26  26  26  26  25  25  25  24  24  23  23
  23  21  21  21  20  20  20  20  20  20  20  19  19  19  19  18  18  17
  17  17  16  16  16  16  15  14  14  14  13  13  13  13  12  11  11  11
  10  10  10  10  10  10  10  10  10   9   9   9   9   8   8   8   7   7
   6   6   6   5   4   4   4   4   4   3   3   3   3   3   3   3   2   2
   2   2   2   2   2   2   2   1   1   1   1   1   1   1   1   1   1   1
   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
   1   1   1   1   1   1   1   1   1   1   1   1   

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

Damn, those are some crazy high coefficients. Let's use the same variables but with L2 regularization.

That's cool, with regularization the coefficients go way down, but the R^2 stays about the same.

In [76]:
X1 = np.append(X, Dummy_Zips, axis = 1)
X2 = np.append(X1, count_per_unit_np, axis = 1)

# what if we got rid of the Morison drive apts with a zillion units
X3 = X2[really_high_indices == False, :]
print(X3[0, :])
print(X3.shape)

dropped_cost = cost_in_thousands[really_high_indices == False]
reg = Ridge(alpha=1)
reg.fit(X3 , dropped_cost)
print(reg.score(X3 , dropped_cost))
print(reg.coef_)


[ 1.    1.    0.78  0.    0.    0.    0.    0.    0.    0.    1.    0.
 16.  ]
(1585, 13)
0.5816264235664939
[-0.22473432  0.29433694  1.5191954   0.56297038 -0.46320126 -0.08182816
  0.11268605 -0.25742259 -0.03191911  0.05620479  0.17562026 -0.07311034
 -0.00249948]
