In [1]:
import os

import pandas as pd
from yelpapi import YelpAPI

# 1. Process columns


In [4]:
# Load the raw scraped listings.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
df = pd.read_csv("C:/Users/Anton/Documents/Anton_Gollbo/Skolarbete/projects/Hemnet_Housing/data/stockholm_housing_df_RAW.csv")

# NOTE: str.rstrip / str.lstrip take a *set of characters*, not a literal suffix/prefix.
# The calls below rely on the fact that digits are in none of the stripped sets,
# so stripping stops as soon as the numeric part of the value is reached.
df['fee'] = df['fee'].str.rstrip(' kr/mån')

#Drop 'location' column, as it is redundant
df = df.drop(columns=["location"])

#Split size of apartment and number of rooms into separate columns.
#`n` must be passed by keyword: positional `n` is deprecated in pandas 1.x and a TypeError in pandas 2.x.
df[['size', 'rooms']] = df['size:rooms'].str.split("  ", n=1, expand=True)
df = df.drop(columns=["size:rooms"])
#Clean up the columns size and rooms, removing "m²" and "rum"
df['size'] = df['size'].str.rstrip('m²')
df["rooms"] = df["rooms"].str.rstrip(' rum')

#Split address and floor of the apartment into separate columns (split on the first comma only)
df[['adress', 'floor']] = df['adress'].str.split(",", n=1, expand=True)

#Clean up sold_date, sale_price, value_dev, ppsqm
df["sold_date"] = df["sold_date"].str.lstrip('såld')
df["sale_price"] = df["sale_price"].str.lstrip('slutpris ')
df["sale_price"] = df["sale_price"].str.rstrip(' kr')
df["ppsqm"] = df["ppsqm"].str.rstrip(" kr/m²")
df["value_dev"] = df["value_dev"].str.rstrip(" %")
df["value_dev"] = df["value_dev"].str.lstrip("+")

# Encode features as integer codes: balkong = 1, hiss = 2, balkong&hiss = 3, uteplats = 4.
# Values that are not a key of the dict become NaN.
# NOTE(review): the 'NaN' key is the literal string "NaN"; real missing values do not match it
# and stay NaN after the map. Confirm whether code 5 was meant for listings without features.
df["features"] = df.features.map( {'balkong':1 , 'hiss':2, 'balkong&hiss':3, 'uteplats': 4, 'NaN':5} )

#Remove everything except the first run of digits from floor (raw string avoids the invalid '\d' escape)
df['floor'] = df['floor'].str.extract(r'(\d+)', expand=False)

#Bug where value_dev ends up in the wrong column, due to old postings: drop rows whose ppsqm contains "%".
#Rows with a missing ppsqm are dropped here as well, because str.find yields NaN for them and NaN != -1 is True.
bad_index = df[df["ppsqm"].str.find("%") != -1].index
df = df.drop(bad_index, axis=0)

# Columns that are converted to integers in the next step
column_list = ["ppsqm", "size", "rooms", "sale_price","fee"]

def fix_numeric(column_list, df):
    """Return a copy of ``df`` in which the given string columns are integers.

    For each column name in ``column_list``, in order:
    spaces are removed from the string values, the result is parsed with
    ``pd.to_numeric(..., errors='coerce')``, rows where parsing produced NaN
    are dropped, and the column is cast to ``int``.

    Parameters
    ----------
    column_list : iterable of column names; each column must support ``.str``.
    df : pandas.DataFrame; it is copied first and never modified.

    Returns
    -------
    pandas.DataFrame with the unparseable rows removed. The index is not
    reset, so it keeps the labels of the surviving rows.
    """
    result = df.copy()
    for column in column_list:
        without_spaces = result[column].str.replace(" ", "")
        result[column] = pd.to_numeric(without_spaces, errors='coerce')
        # Drop the rows that could not be parsed before casting; NaN cannot be cast to int.
        result = result.dropna(subset=[column])
        result[column] = result[column].astype('int')
    return result

# Convert the numeric columns, then renumber the rows 0..n-1 because
# unparseable rows were removed along the way.
cleaned_housing_df = (
    fix_numeric(column_list, df.copy())
    .reset_index(drop=True)
)
#cleaned_housing_df.to_csv("stockholm_housing_df_CLEANED.csv",index=False)


In [5]:
# Display the cleaned frame; the rendered output is truncated to the first and last rows.
cleaned_housing_df

Unnamed: 0,adress,fee,features,sale_price,sold_date,value_dev,ppsqm,district,size,rooms,floor
0,frejgatan 50,2391,1.0,6450000,15 januari 2022,,153571,vasastan,42,2,
1,rådmansgatan 86,2021,4.0,7200000,15 januari 2022,13,122034,vasastan,59,2,
2,sankt eriksgatan 109,436,2.0,3150000,14 januari 2022,19,126000,vasastan,25,1,
3,torsgatan 58,1733,2.0,3435000,14 januari 2022,15,110806,vasastan,31,1,1
4,torsplan 8,3950,3.0,10000000,14 januari 2022,11,153846,vasastan,65,3,9
...,...,...,...,...,...,...,...,...,...,...,...
10471,kammakargatan 70,1169,,2800000,19 mars 2013,13,75676,norrmalm,37,2,3
10472,olofsgatan 18,917,,2360000,28 februari 2013,8,73750,norrmalm,32,1,3
10473,regeringsgatan 70 d,3231,,3600000,17 februari 2013,3,52174,norrmalm,69,3,
10474,drottninggatan 80,2647,,3250000,22 januari 2013,5,73864,norrmalm,44,2,3


# Yelp API calls

In [52]:
# Load the partially enriched listings so the Yelp lookups can be resumed.
# NOTE(review): presumably this file is written by the commented-out to_csv call further down; confirm.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
yelp_df = pd.read_csv("C:/Users/Anton/Documents/Anton_Gollbo/Skolarbete/projects/Hemnet_Housing/data/UNFINISHED_yelp_df_CLEANED.csv")


In [35]:
# The Yelp API key is read from the YELP_API_KEY environment variable so that the
# secret is never stored in the notebook. A missing variable raises KeyError here.
# NOTE(review): the key that used to be hardcoded on this line has been exposed and should be revoked.
yelp_api = YelpAPI(os.environ["YELP_API_KEY"])

#A 'suggested search area' of 500 is used. However, the docs tell us the following:
#"This field is used as a suggestion to the search. The actual search radius may be lower
#than the suggested radius in dense urban areas, and higher in regions of less business density."
#Hopefully, it still gives a quantifiable measure of how close an apartment is to points of interest.

def get_POI(adress, radius=500, limit=1):
    """Run a Yelp search around a street address and return the raw response.

    Parameters
    ----------
    adress : value passed to the search as ``location``.
    radius : suggested search radius forwarded as ``radius``. Defaults to 500,
        the value that was previously hard-coded.
    limit : maximum number of businesses requested, forwarded as ``limit``.
        Defaults to 1, the value that was previously hard-coded; the caller in
        this notebook only reads ``response['region']`` and ``response['total']``.

    Returns
    -------
    Whatever ``yelp_api.search_query`` returns, unchanged.
    """
    return yelp_api.search_query(location=adress, radius=radius, limit=limit)

def get_yelp_values(df, start=4992, stop_on_error=True):
    """Fill ``Latitude``, ``Longitude`` and ``NearbyPOIs`` on ``df`` from Yelp.

    For every row from ``start`` to the end of the frame, the row's ``adress``
    is looked up with ``get_POI`` and three values from the response are written
    back: the region centre latitude/longitude and the ``total`` hit count.

    Row positions produced by ``range`` are used as index labels, so ``df`` is
    expected to have the default 0..n-1 integer index.

    Parameters
    ----------
    df : pandas.DataFrame with an ``adress`` column; it is modified in place.
    start : first row to process. Defaults to 4992, the resume point that was
        previously hard-coded.
    stop_on_error : when True (default) the run ends at the first failing row,
        as before. When False the failing row is skipped and the run continues.

    Returns
    -------
    The same ``df`` object, always, including when a lookup failed. Previously
    a failure made the function return None.
    """
    for i in range(start, len(df)):
        try:
            adress = df.at[i, "adress"]
            response = get_POI(adress)
            print("Getting values from adress:", adress, "from row: ", i)

            # Read everything from the response before writing, so a malformed
            # response leaves the row untouched instead of half-filled.
            lat = response['region']['center']['latitude']
            long = response['region']['center']['longitude']
            POI = response['total']
        except Exception as e:
            print(e)
            print("\n", "error")
            if stop_on_error:
                # Tell the caller where to resume from on the next run.
                print("Stopped at row:", i)
                break
            continue

        df.at[i, 'Latitude'] = lat
        df.at[i, 'Longitude'] = long
        df.at[i, 'NearbyPOIs'] = POI

    return df
# Fill in Latitude / Longitude / NearbyPOIs on yelp_df; the frame is written to in place.
get_yelp_values(yelp_df)

# Progress note: rows up to and including 4991 are done; the next run should resume from row 4992.

# Uncomment to persist the partially filled frame between runs:
#yelp_df.to_csv("UNFINISHED_yelp_df_CLEANED.csv",index=False)

Getting values from adress: linnégatan 51 from row:  4500
Getting values from adress: östermalmsgatan 81 from row:  4501
Getting values from adress: surbrunnsgatan 15 from row:  4502
Getting values from adress: valhallavägen 67 from row:  4503
Getting values from adress: karlavägen 54 from row:  4504
Getting values from adress: birger jarlsgatan 104 f from row:  4505
Getting values from adress: danderydsgatan 28a from row:  4506
Getting values from adress: valhallavägen 160 from row:  4507
Getting values from adress: östermalmsgatan 34 a from row:  4508
Getting values from adress: lützengatan 5 a from row:  4509
Getting values from adress: banérgatan 23 from row:  4510
Getting values from adress: valhallavägen 65 from row:  4511
Getting values from adress: valhallavägen 160 from row:  4512
Getting values from adress: birger jarlsgatan 120 from row:  4513
Getting values from adress: nybrogatan 54 from row:  4514
Getting values from adress: brahegatan 37 from row:  4515
Getting values fr

Getting values from adress: östermalmsgatan 68b from row:  4634
Getting values from adress: narvavägen 33 a from row:  4635
Getting values from adress: skeppargatan 25 a from row:  4636
Getting values from adress: kommendörsgatan 24 from row:  4637
Getting values from adress: linnégatan 77a from row:  4638
Getting values from adress: karlavägen 4 from row:  4639
Getting values from adress: styrmansgatan 39 from row:  4640
Getting values from adress: grevgatan 42 from row:  4641
Getting values from adress: grevgatan 31 from row:  4642
Getting values from adress: sibyllegatan 26 from row:  4643
Getting values from adress: gumshornsgatan 6 from row:  4644
Getting values from adress: artillerigatan 29 from row:  4645
Getting values from adress: karlavägen 52 from row:  4646
Getting values from adress: karlavägen 52 from row:  4647
Getting values from adress: östermalmsgatan 67 from row:  4648
Getting values from adress: artillerigatan 63 from row:  4649
Getting values from adress: nybergsg

Getting values from adress: runebergsgatan 9 from row:  4770
Getting values from adress: grev magnigatan 8 from row:  4771
Getting values from adress: karlavägen 20 from row:  4772
Getting values from adress: karlavägen 75b from row:  4773
Getting values from adress: skeppargatan 28 from row:  4774
Getting values from adress: artillerigatan 65 from row:  4775
Getting values from adress: grevgatan 12 from row:  4776
Getting values from adress: valhallavägen 12a from row:  4777
Getting values from adress: karlavägen 82 from row:  4778
Getting values from adress: artillerigatan 75 from row:  4779
Getting values from adress: narvavägen 8 from row:  4780
Getting values from adress: östermalmsgatan 62 from row:  4781
Getting values from adress: valhallavägen 34 from row:  4782
Getting values from adress: ruddammsbacken 28 from row:  4783
Getting values from adress: wittstocksgatan 3c from row:  4784
Getting values from adress: skeppargatan 6a from row:  4785
Getting values from adress: grevg

Getting values from adress: valhallavägen 50 from row:  4906
Getting values from adress: frejgatan 4 from row:  4907
Getting values from adress: skeppargatan 39a from row:  4908
Getting values from adress: riddargatan 76 from row:  4909
Getting values from adress: rådmansgatan 3 from row:  4910
Getting values from adress: linnégatan 19 from row:  4911
Getting values from adress: styrmansgatan 51 a from row:  4912
Getting values from adress: lill-jans plan 6 ö.g from row:  4913
Getting values from adress: grevgatan 47 from row:  4914
Getting values from adress: surbrunnsgatan 9 from row:  4915
Getting values from adress: valhallavägen 158 from row:  4916
Getting values from adress: banérgatan 31 a from row:  4917
Getting values from adress: linnégatan 17 from row:  4918
Getting values from adress: styrmansgatan 50 from row:  4919
Getting values from adress: frejgatan 3 from row:  4920
Getting values from adress: grevgatan 26 from row:  4921
Getting values from adress: birger jarlsgatan 

In [53]:
# Display the frame with the Yelp columns; the last rows shown have not been processed and are still NaN there.
yelp_df

Unnamed: 0,adress,fee,features,sale_price,sold_date,value_dev,ppsqm,district,size,rooms,floor,Latitude,Longitude,NearbyPOIs
0,frejgatan 50,2391,1.0,6450000,15 januari 2022,,153571,vasastan,42,2,,59.344004,18.043671,82.0
1,rådmansgatan 86,2021,4.0,7200000,15 januari 2022,13,122034,vasastan,59,2,,59.338402,18.049850,86.0
2,sankt eriksgatan 109,436,2.0,3150000,14 januari 2022,19,126000,vasastan,25,1,,59.345404,18.039551,60.0
3,torsgatan 58,1733,2.0,3435000,14 januari 2022,15,110806,vasastan,31,1,1.0,59.343304,18.036118,110.0
4,torsplan 8,3950,3.0,10000000,14 januari 2022,11,153846,vasastan,65,3,9.0,59.345754,18.034058,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10471,kammakargatan 70,1169,,2800000,19 mars 2013,13,75676,norrmalm,37,2,3.0,,,
10472,olofsgatan 18,917,,2360000,28 februari 2013,8,73750,norrmalm,32,1,3.0,,,
10473,regeringsgatan 70 d,3231,,3600000,17 februari 2013,3,52174,norrmalm,69,3,,,,
10474,drottninggatan 80,2647,,3250000,22 januari 2013,5,73864,norrmalm,44,2,3.0,,,
