# 1. RightMove WebScraping and Data Cleaning

## 1.1 Import packages

In [None]:
#import packages
import pandas as pd
import numpy as np
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import csv
import datetime
import time
from glob import glob

import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('ggplot')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

## 1.2 Gathering the Data from RightMove Website using BeautifulSoup
### Note: RightMove doesn't allow web scraping; this was done only as a proof of concept


In [None]:
def generate_url(property_type, business, region, bed_number, price_range):
    '''Download RightMove search-result pages and collect their property cards.

    One search URL is built for every combination of region, bedroom band,
    price band and result offset. Each page is downloaded and parsed, and
    every ``div`` with class ``propertyCard-wrapper`` is collected.

    Parameters
    ----------
    property_type : str
        'flat' or 'house'.
    business : str
        'sale' or 'rent'.
    region : dict
        Values are RightMove ``locationIdentifier`` strings (keys are ignored).
    bed_number : dict
        Values are ``[min_bedrooms, max_bedrooms]`` pairs (keys are ignored).
    price_range : dict
        Values are ``[min_price, max_price]`` pairs (keys are ignored).

    Returns
    -------
    list
        The property-card elements found, in download order. The list is
        empty when ``property_type``/``business`` is not a supported
        combination; in that case 'Error' is printed once and no request
        is made.
    '''
    # query-string fragment selecting the kind of property
    type_queries = {
        'flat': '&propertyTypes=flat&primaryDisplayPropertyType=flats',
        'house': '&propertyTypes=bungalow%2Cdetached%2Csemi-detached%2Cterraced',
    }
    # business -> (search path, value of the dontShow filter)
    business_settings = {
        'sale': ('property-for-sale', 'sharedOwnership%2Cretirement'),
        'rent': ('property-to-rent', 'houseShare%2Cstudent%2Cretirement'),
    }

    results = []

    #validate once up front instead of on every page of every combination
    if property_type not in type_queries or business not in business_settings:
        print('Error')
        return results

    search_path, dont_show = business_settings[business]
    url_tail = (type_queries[property_type]
                + '&mustHave=&dontShow=' + dont_show
                + '&furnishTypes=&keywords=')

    #iterates through the different regions
    for value_reg in region.values():
        #iterates through the different numbers of beds
        for value_bed in bed_number.values():
            #iterates through the different price ranges
            for value_range in price_range.values():
                #everything before the page offset is the same for all pages
                url_head = ('https://www.rightmove.co.uk/' + search_path
                            + '/find.html?locationIdentifier=' + str(value_reg)
                            + '&maxBedrooms=' + str(value_bed[1])
                            + '&minBedrooms=' + str(value_bed[0])
                            + '&maxPrice=' + str(value_range[1])
                            + '&minPrice=' + str(value_range[0])
                            + '&index=')
                #offsets advance in steps of 24 (presumably one results page each)
                for page in tqdm(range(0, 1008, 24)):
                    url = url_head + str(page) + url_tail
                    try:
                        #a timeout keeps one stalled connection from hanging the whole run
                        response = requests.get(url, timeout=30)
                    except requests.RequestException:
                        print('page not found')
                        continue

                    soup = BeautifulSoup(response.text, 'html.parser')
                    title_tag = soup.find('title')
                    if title_tag is None:
                        #no <title> at all: not a usable results page
                        print('page not found')
                        continue

                    #avoid to be blocked: pause and give up on this combination
                    if title_tag.text == 'hCaptcha solve page':
                        time.sleep(5)
                        print(title_tag.text)
                        break

                    results.extend(soup.find_all('div', class_='propertyCard-wrapper'))
    return results
    

In [None]:
#functions that each extract one field (title, address, description, price, date, seller, image) from the scraped property cards
def extract_titles_from_result(result):
    '''Return the title of every property card in ``result``.

    For each card the ``h2`` element with class ``propertyCard-title`` is
    looked up and its text, stripped of surrounding whitespace, is collected.
    A card without such an element contributes ``np.nan`` so the output keeps
    one entry per input card.
    '''
    titles = []
    for card in result:
        try:
            heading = card.find('h2', attrs={'class': 'propertyCard-title'})
            titles.append(heading.text.strip())
        except (AttributeError, TypeError):
            #element missing (find returned None) or the card is not a tag
            titles.append(np.nan)
    return titles
    
def extract_address_from_result(result):
    '''Return the street address of every property card in ``result``.

    The address is read from the ``content`` attribute of the card's ``meta``
    element whose ``itemprop`` is ``streetAddress``. A card where that element
    or attribute is missing contributes ``np.nan`` so the output keeps one
    entry per input card.
    '''
    address = []
    for card in result:
        try:
            meta = card.find('meta', attrs={'itemprop': 'streetAddress'})
            address.append(meta['content'])
        except (AttributeError, TypeError, KeyError):
            #element missing (None is not subscriptable) or it has no content attribute
            address.append(np.nan)
    return address
            
def extract_descriptions_from_result(result):
    '''Return the description text of every property card in ``result``.

    The text is taken, unmodified, from the card's ``span`` element whose
    ``data-test`` attribute is ``property-description``. A card without such
    an element contributes ``np.nan`` so the output keeps one entry per
    input card.
    '''
    descriptions = []
    for card in result:
        try:
            summary = card.find('span', attrs={'data-test': 'property-description'})
            descriptions.append(summary.text)
        except (AttributeError, TypeError):
            #element missing (find returned None) or the card is not a tag
            descriptions.append(np.nan)
    return descriptions
            
def extract_prices_from_result(result, business):
    '''Return the displayed price text of every property card in ``result``.

    The price is the stripped text of the element with class
    ``propertyCard-priceValue``: a ``div`` when ``business`` is 'sale' and a
    ``span`` when it is 'rent'.

    The output always has one entry per input card. ``np.nan`` is used when
    the element is missing and also when ``business`` is neither 'sale' nor
    'rent', so the list can safely be used as a DataFrame column next to the
    other extracted fields.
    '''
    #the price sits in a different kind of element on sale and rent pages
    price_elements = {'sale': 'div', 'rent': 'span'}
    element_name = price_elements.get(business)

    prices = []
    for card in result:
        if element_name is None:
            #unknown business: keep the list aligned with the cards
            prices.append(np.nan)
            continue
        try:
            price_tag = card.find(element_name, attrs={'class': 'propertyCard-priceValue'})
            prices.append(price_tag.text.strip())
        except (AttributeError, TypeError):
            #element missing (find returned None) or the card is not a tag
            prices.append(np.nan)
    return prices

def extract_dates_from_result(result):
    '''Return the added/reduced date token of every property card in ``result``.

    The text of the ``span`` with class
    ``propertyCard-branchSummary-addedOrReduced`` is split on single spaces
    and its last piece is collected. A card without such an element
    contributes ``np.nan`` so the output keeps one entry per input card.
    '''
    dates = []
    for card in result:
        try:
            added_or_reduced = card.find(
                'span', attrs={'class': 'propertyCard-branchSummary-addedOrReduced'})
            #only the trailing word of the label is kept
            dates.append(added_or_reduced.text.split(' ')[-1])
        except (AttributeError, TypeError):
            #element missing (find returned None) or the card is not a tag
            dates.append(np.nan)
    return dates

def extract_sellers_from_result(result):
    '''Return the seller (branch) name of every property card in ``result``.

    The text of the ``span`` with class
    ``propertyCard-branchSummary-branchName`` is cut after its first 'by'
    and stripped of surrounding whitespace. A card without such an element
    contributes ``np.nan`` so the output keeps one entry per input card.
    '''
    sellers = []
    for card in result:
        try:
            branch = card.find(
                'span', attrs={'class': 'propertyCard-branchSummary-branchName'})
            #split only once: a later 'by' belongs to the name itself
            #(e.g. 'by Kirby Estates'); assumes the label starts with a
            #'... by ' prefix -- TODO confirm against the live markup
            sellers.append(branch.text.split('by', 1)[-1].strip())
        except (AttributeError, TypeError):
            #element missing (find returned None) or the card is not a tag
            sellers.append(np.nan)
    return sellers

def extract_images_from_result(result):
    '''Return the image URL of every property card in ``result``.

    The URL is read from the ``src`` attribute of the card's ``img`` element
    whose ``itemprop`` is ``image``. A card where that element or attribute
    is missing contributes ``np.nan`` so the output keeps one entry per
    input card.
    '''
    images = []
    for card in result:
        try:
            picture = card.find('img', attrs={'itemprop': 'image'})
            images.append(picture['src'])
        except (AttributeError, TypeError, KeyError):
            #element missing (None is not subscriptable) or it has no src attribute
            images.append(np.nan)
    return images

In [None]:
def to_df(results, business):
    '''Build a DataFrame with one row per scraped property card.

    Each column is produced by the matching ``extract_*_from_result``
    function; ``business`` is forwarded to the price extractor only.
    Rows whose title is exactly 'Property' are dropped before returning.
    '''
    columns = {
        'titles': extract_titles_from_result(results),
        'address': extract_address_from_result(results),
        'description': extract_descriptions_from_result(results),
        'prices': extract_prices_from_result(results, business),
        'dates': extract_dates_from_result(results),
        'sellers': extract_sellers_from_result(results),
        'images': extract_images_from_result(results),
    }
    frame = pd.DataFrame(columns)

    #keep every row except the ones titled 'Property'
    has_real_title = frame['titles'] != 'Property'
    return frame[has_real_title]

In [None]:
#search bands used to split the scrape into smaller queries;
#every value is a [minimum, maximum] pair of strings placed into the search URL

#sale price bands: consecutive bands share a boundary so that no price is
#left out (the 900000-1000000 band used to be missing)
price_range_sale = {
    '1': ['50000', '350000'],
    '2': ['350000', '425000'],
    '3': ['425000', '500000'],
    '4': ['500000', '600000'],
    '5': ['600000', '700000'],
    '6': ['700000', '800000'],
    '7': ['800000', '900000'],
    '8': ['900000', '1000000'],
    '9': ['1000000', '20000000'],
}

#rent price bands
price_range_rent = {
    '1': ['100', '1300'],
    '2': ['1300', '1500'],
    '3': ['1500', '1750'],
    '4': ['1750', '2000'],
    '5': ['2000', '2500'],
    '6': ['2500', '3000'],
    '7': ['3000', '6000'],
    '8': ['6000', '40000'],
}

#bedroom bands as [minimum bedrooms, maximum bedrooms]
bed_numbers = {
    '0': ['1', '1'],
    '1': ['2', '2'],
    '2': ['3', '3'],
    '3': ['4', '10'],
}

#London regions mapped to the value used for the locationIdentifier URL parameter
regions = {
    'Central_London': 'REGION%5E92824',
    'East_London': 'REGION%5E92825',
    'North_London': 'REGION%5E92826',
    'West_London': 'REGION%5E92830',
    'South_London': 'REGION%5E92051',
    'North_West London': 'REGION%5E92827',
    'NorthEast_London': 'REGION%5E93926',
    'SouthWest_London': 'REGION%5E92829',
    'SouthEast_London': 'REGION%5E92828',
}

In [None]:
#generate the csv file with information about flats for sale in London
#(a sale search has to be split by the sale price bands, not the rent ones)
flats_sale = generate_url('flat', 'sale', regions, bed_numbers, price_range_sale)
df_flats_sale = to_df(flats_sale, 'sale')
df_flats_sale.to_csv('df_flats_sale.csv')

In [None]:
#generate the csv file with information about houses for sale in London
#(a sale search has to be split by the sale price bands, not the rent ones)
houses_sale = generate_url('house', 'sale', regions, bed_numbers, price_range_sale)
df_houses_sale = to_df(houses_sale, 'sale')
df_houses_sale.to_csv('df_houses_sale.csv')

In [None]:
#scrape the flats to rent in London and store them in a csv file
flats_rent = generate_url(
    property_type='flat',
    business='rent',
    region=regions,
    bed_number=bed_numbers,
    price_range=price_range_rent,
)
df_flats_rent = to_df(results=flats_rent, business='rent')
df_flats_rent.to_csv('df_flats_rent.csv')

In [None]:
#generate the csv file with information about houses to rent in London
houses_rent = generate_url('house', 'rent', regions, bed_numbers, price_range_rent)
#build the table from the rent results (not from the houses-for-sale results)
df_houses_rent = to_df(houses_rent, 'rent')
df_houses_rent.to_csv('df_houses_rent.csv')