In [69]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from scipy.stats import pearsonr
from scipy.spatial.distance import euclidean

In [32]:
# Grab Apartment Data
apartmentData = pd.read_csv('boston_apartment_listings.csv')

apartmentRent = apartmentData['Rent']
minRent = min(apartmentRent)
maxRent = max(apartmentRent)
apartmentRent = ((apartmentData['Rent'] - minRent) / (maxRent - minRent))

# FIX LATER
apartmentOverallCrime = [0 if np.isnan(val) else val for val in apartmentData['Overall CrimeRate']]
apartmentViolentCrime = [0 if np.isnan(val) else val for val in apartmentData['Violent CrimeRate']]

apartmentFeatures = list(zip(apartmentRent, apartmentData['Bed'], apartmentData['Bath'], apartmentOverallCrime, apartmentViolentCrime))

apartmentVectors = np.array(apartmentFeatures)

input = np.array([[((2950 - minRent) / (maxRent - minRent)), 2.0, 1.0, 1.86, 0.2866]])
cosineSimilarity = cosine_similarity(input, apartmentVectors)
apartmentData['similarity'] = cosineSimilarity.flatten()

recommend = apartmentData.sort_values(by='similarity', ascending=False)

print("Recommended Apartments for the User:")
i = 0
for index, row in recommend.iterrows():
    print(f"Address  : {row['Address']}, Similarity Score: {row['similarity']:.4f}")
    print(row['Rent'], ((row['Rent'] - minRent) / (maxRent - minRent)), row['Bed'], row['Bath'], row['Overall CrimeRate'], row['Violent CrimeRate'])
    i += 1

    if i == 10:
        break

Recommended Apartments for the User:
Address  : 165 Hemenway Unit 5, Similarity Score: 1.0000
2950 0.17045454545454544 2.0 1.0 1.86 0.2866
Address  : 315 Huntington Unit 2B, Similarity Score: 0.9999
3100 0.20454545454545456 2.0 1.0 1.86 0.2866
Address  : 238 Hemenway Unit b5, Similarity Score: 0.9998
3195 0.22613636363636364 2.0 1.0 1.86 0.2866
Address  : 132 Hemenway Unit 10, Similarity Score: 0.9998
3200 0.22727272727272727 2.0 1.0 1.86 0.2866
Address  : 463 Park Unit 15, Similarity Score: 0.9998
3200 0.22727272727272727 2.0 1.0 1.86 0.2866
Address  : 149 Park dr Unit 22, Similarity Score: 0.9997
3250 0.23863636363636365 2.0 1.0 1.86 0.2866
Address  : 115 St Stephen Unit 51, Similarity Score: 0.9996
3300 0.25 2.0 1.0 1.86 0.2866
Address  : 97 St Stephen Unit 9, Similarity Score: 0.9996
3300 0.25 2.0 1.0 1.86 0.2866
Address  : 409 Huntington Unit 28, Similarity Score: 0.9996
3300 0.25 2.0 1.0 1.86 0.2866
Address  : 238 Hemenway Unit B8, Similarity Score: 0.9994
3395 0.2715909090909091

In [126]:
def scale(data, input):
    dataMin = np.min(data)
    dataMax = np.max(data)

    # Apply MinMax scaling to the single value
    scaledData = (data - dataMin) / (dataMax - dataMin)
    scaledInput = (input - dataMin) / (dataMax - dataMin)

    return scaledData, scaledInput

def recommend(data, location, rent, violent, overall, bed, bath, transitDistance):
    
    # Calculate Rent/Distance Tradeoff
    dataRent = data['Rent']
    minRent = min(dataRent)
    maxRent = max(dataRent)
    rentTradeoff = ((dataRent - rent) / np.average(dataRent))
    rentTradeoff = np.exp(rentTradeoff)

    match location:
        case "Northeastern University":
            dataTransitDistance = data['Northeastern University_transit']
            dataTransitCost = data['Northeastern University_transit_cost']
            dataDriveDistance = data['Northeastern University_driving']
        case "Boston University":
            dataTransitDistance = data['Boston University_transit']
            dataTransitCost = data['Boston University_transit_cost']
            dataDriveDistance = data['Boston University_driving']
        case "Boston College":
            dataTransitDistance = data['Boston College_transit']
            dataTransitCost = data['Boston College_transit_cost']
            dataDriveDistance = data['Boston College_driving']

    minTransit = min(dataTransitDistance)
    maxTransit = max(dataTransitDistance)
    distanceTradeoff = ((dataTransitDistance - transitDistance) / np.average(dataTransitDistance))
    distanceTradeoff = np.exp(distanceTradeoff)

    dataTradeoff = rentTradeoff + distanceTradeoff
    # print(dataTradeoff)

    # Calculate Base Tradeoff (Might be able to hard code as 2)
    tradeoff = np.exp((rent - rent) / np.average(dataRent)) + np.exp((transitDistance - transitDistance) / np.average(dataTransitDistance))

    # Scale Crime Data
    dataViolentCrime, violentScaled = scale(data['Violent CrimeRate'], violent)
    dataOverallCrime, overallScaled = scale(data['Overall CrimeRate'], overall)

    # Data for Bed and Bath
    dataBed, bedScaled = scale(data['Bed'], bed)
    dataBath, bathScaled = scale(data['Bath'], bath)

    # Create Feature Vectors
    # apartmentFeatures = list(zip(dataViolentCrime, dataOverallCrime, dataBed, dataBath, dataTransitDistance, dataTradeoff))
    # apartmentFeatures = list(zip(dataTradeoff, dataRent))
    # apartmentFeatures = list(zip(dataTransitDistance, dataRent))
    # apartmentVectors = np.array(apartmentFeatures)

    scaledDataRent, scaledRent = scale(dataRent, rent)
    scaledDataTransit, scaledTransit = scale(dataTransitDistance, transitDistance)
    apartmentFeatures = list(zip(scaledDataTransit, scaledDataRent, dataTradeoff, dataViolentCrime, dataOverallCrime, dataBed, dataBath))
    apartmentVectors = np.array(apartmentFeatures)


    # Original User Apartment Comparison
    # input = np.array([violentScaled, overallScaled, bedScaled, bathScaled, transitDistance, tradeoff])
    # input = np.array([tradeoff, rent])
    input = np.array([scaledTransit, scaledRent, tradeoff, violentScaled, overallScaled, bedScaled, bathScaled])
    # cosineSimilarity = cosine_similarity(input, apartmentVectors)
    # data['similarity'] = cosineSimilarity.flatten()
    correlations = []

    # Loop through each vector in the array and compute the Pearson correlation
    for vector in apartmentVectors:
        # print(vector, input)
        distance = euclidean(input, vector)
        correlation = 1 / (1 + distance)
        correlations.append((correlation, distance))

    data['similarity'] = correlations
    data['dataTradeoff'] = dataTradeoff

    recommend = data.sort_values(by='similarity', ascending=False)

    return recommend

In [127]:
allData = pd.read_csv('updated_crime_rates.csv')
# print(allData.head())

# allData.loc[allData["Area Name"] == "Fenway", ["Violent CrimeRate", "Overall CrimeRate"]] = [0.2866, 1.86]
# allData.loc[allData["Area Name"] == "Jamaica Plain", ["Violent CrimeRate", "Overall CrimeRate"]] = [0.03, 0.28]
# allData.loc[allData["Area Name"] == "Brookline", ["Violent CrimeRate", "Overall CrimeRate"]] = [0.01, 0.25]
# allData.loc[allData["Area Name"] == "Roxbury", ["Violent CrimeRate", "Overall CrimeRate"]] = [0.38, 0.07]
# allData.loc[allData["Area Name"] == "Dorchester", "Violent CrimeRate"] = 0.2840

cleanedData = allData.dropna()

scaler = MinMaxScaler()
# data, location, rent, violent, overall, bed, bath, transitDistance
# 3300,0.38,0.07,3.0,1.0,72500.0,63246,75914.0,69459,
# temp = recommend(cleanedData, "Northeastern University", 2950, 0.2866, 1.86, 2.0, 1.0, 196.0)
temp = recommend(cleanedData, "Northeastern University", 3300, 0.38, 0.07, 3.0, 1.0, 72500.0)


# print("Northeastern University", 2950, 0.2866, 1.86, 2.0, 1.0, 196.0)
print("Northeastern University", 3300, 0.38, 0.07, 3.0, 1.0, 72500.0)

print("Recommended Apartments for the User:")
i = 0
for index, row in temp.iterrows():
    print(f"Address  : {row['Address'], row['Area Name']}, Similarity Score: {row['similarity']}")
    print(row['Rent'], row['Violent CrimeRate'], row['Overall CrimeRate'], row['Bed'], row['Bath'], row['Northeastern University_transit'], row['dataTradeoff'])
    i += 1

    if i == 100:
        break

Northeastern University 3300 0.38 0.07 3.0 1.0 72500.0
Recommended Apartments for the User:
Address  : ('190 Highland St Unit 1', 'Roxbury'), Similarity Score: (1.0, 0.0)
3300 0.38 0.07 3.0 1.0 72500.0 2.0
Address  : ('1 Newcomb St.,Unit 501,Boston,MA,02118,Roxbury', 'Roxbury'), Similarity Score: (0.7749710892796532, 0.29037071683475885)
5500 0.38 0.07 3.0 2.0 2694.0 1.7788582492889744
Address  : ('48 Guild St.,Unit 3,Boston,MA,02119,Roxbury', 'Roxbury'), Similarity Score: (0.7570774260045655, 0.3208688644666701)
6400 0.38 0.07 4.0 1.0 2071.0 2.2514912456899365
Address  : ('48 Guild St.,Unit 2,Boston,MA,02119,Roxbury', 'Roxbury'), Similarity Score: (0.7570774260045655, 0.3208688644666701)
6400 0.38 0.07 4.0 1.0 2071.0 2.2514912456899365
Address  : ('68 Hammond,Boston,MA,02120,Roxbury', 'Roxbury'), Similarity Score: (0.7569414016075274, 0.32110622813904155)
6400 0.38 0.07 4.0 1.0 497.0 2.2514892639707753
Address  : ('68 Hammond St.,Boston,MA,02120,Roxbury', 'Roxbury'), Similarity Score:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['similarity'] = correlations
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['dataTradeoff'] = dataTradeoff


In [None]:


data = pd.read_csv('apartments_with_transit_cost.csv')
dataRent = data['Rent']
minRent = min(dataRent)
maxRent = max(dataRent)
dataRent = ((dataRent - 3000) / (maxRent - minRent))
dataRent = np.exp(dataRent)
print(isinstance(dataRent, pd.Series))

scaler = MinMaxScaler()
dataViolent = scaler.fit_transform(allData['Violent CrimeRate'].to_numpy().reshape(-1, 1))
temp = pd.Series(dataViolent.flatten())
print(isinstance(temp, pd.Series))
print(temp)

True
True
0      0.754211
1      0.754211
2      0.754211
3      0.754211
4      0.754211
         ...   
418    0.000000
419    0.000000
420    0.000000
421    0.000000
422    0.000000
Length: 423, dtype: float64
