# Web Scraping

In [1]:
import pickle
import warnings
import time
import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

# to ignore all warnings
warnings.filterwarnings("ignore")

In [2]:
class LinkCollection:
    """Collect anchor links from every page of a paginated URL.

    The page number is taken to be the part of the URL between the last
    ``-`` and the next ``.`` (``foo-1.html`` -> ``foo-2.html``). Pages are
    requested in order, starting at 1, until a page contains no anchor with
    the expected CSS class.
    """

    # CSS class of the anchors whose href is collected.
    _LINK_CSS_CLASS = "_7ac32433"
    # Text put in front of every collected href. It carries no scheme, so
    # consumers have to prepend one before requesting the link.
    _LINK_PREFIX = "zameen.com"
    # Seconds to wait for the server; without a timeout a single stalled
    # connection would block the whole crawl indefinitely.
    _REQUEST_TIMEOUT = 30

    def __init__(self, url:str) -> None:
        """Remember the URL whose page number will be rewritten while crawling."""
        self.__url_link = url
        self.__collected_links = None

    @staticmethod
    def _page_url(url: str, page: int) -> str:
        """Return ``url`` with its page-number segment replaced by ``page``."""
        head, dash, tail = url.rpartition("-")
        pieces = tail.split(".")
        pieces[0] = str(page)
        return head + dash + ".".join(pieces)

    def collect_links(self):
        """Fetch page 1, 2, 3, ... and store the prefixed href of every match.

        Stops, and prints the elapsed time, at the first page that has no
        matching anchor. Results are read back with ``return_links``.

        Raises:
            requests.RequestException: on a network failure or timeout. Links
                gathered before the failure stay available via ``return_links``.
        """
        start = time.time()
        self.__collected_links = []
        page = 1
        while True:
            page_url = self._page_url(self.__url_link, page)
            html_data = requests.get(page_url, timeout=self._REQUEST_TIMEOUT).text
            soup = BeautifulSoup(html_data, 'lxml')
            # href=True guarantees every anchor below really has an href, so
            # the lookup cannot raise KeyError; the page is scanned only once.
            anchors = soup.find_all("a", class_=self._LINK_CSS_CLASS, href=True)
            if len(anchors) == 0:
                end = time.time()
                print(f"Total Time: {end-start} sec")
                return
            self.__collected_links.extend(
                "".join([self._LINK_PREFIX, anchor['href']]) for anchor in anchors
            )
            page += 1

    def return_links(self):
        """Return the collected links, or ``None`` if nothing was collected yet."""
        return self.__collected_links

In [3]:
# Fetch the all-locations index page and record, for every anchor inside a
# "line-list" <ul>, its text (used later as the location label) and its href.
url = "https://www.zameen.com/all_locations/Islamabad-3-1-9.html"
# A timeout keeps the cell from hanging forever on an unresponsive server.
html_data = requests.get(url, timeout=30).text
soup = BeautifulSoup(html_data, 'lxml')
location_lists = soup.find_all("ul", class_="line-list")
urls_list = []
urls_list_city = []
for location_list in location_lists:
    # href=True skips anchors that carry no href, which would otherwise
    # raise KeyError; both lists are appended together so they stay aligned.
    for link in location_list.find_all("a", href=True):
        urls_list_city.append(link.text)
        urls_list.append(link['href'])

In [None]:
# Crawl every location URL and file its links under the location label,
# printing the per-location count and the running total after each one.
all_urls = {}
counter = 0
for url, url_city in zip(urls_list, urls_list_city):
    lc = LinkCollection(url)
    print(f"Links for the city: {url_city}")
    lc.collect_links()
    city_links = lc.return_links()
    all_urls[url_city] = city_links
    print(f"Total collected links for {url_city}: {len(city_links)}")
    counter = counter + len(city_links)
    print(f"Total collected links: {counter}")
    print()

In [None]:
# Persist the label -> links mapping with pickle.
# NOTE: despite the ".json" extension the file is a binary pickle, not JSON.
with open("isb_house_buy.json", mode="wb") as pickle_file:
    pickle.dump(all_urls, pickle_file)

In [None]:
# Reload the pickled mapping (a binary pickle despite the ".json" file name).
# pickle.load can run arbitrary code, so only open files produced locally.
isb_urls = {}
with open("isb_house_buy.json", mode="rb") as pickle_file:
    isb_urls = pickle.load(pickle_file)

In [None]:
# Flatten the per-label link lists into one list, keeping the dict order.
all_urls = [link for links in isb_urls.values() for link in links]

In [None]:
# Total number of links in the flattened list.
len(all_urls)

## Collecting Data

In [None]:
# urls_list = pd.read_csv(filepath_or_buffer="Lahore_House_Buy_Propery_Links.csv")
# urls_list = urls_list["0"].to_list()

In [None]:
def collect_data(url:str, timeout:float=30):
    """Download one page and extract its fields into a dict.

    The returned dict holds:
      * ``'property_name'`` - text of the ``h1`` title element;
      * one entry per label/value ``span`` pair, paired by position;
      * ``'description'`` - text of the description ``span``;
      * one entry per amenity ``li``: heading text -> list of item texts.

    Headings and labels are stored verbatim, so any surrounding whitespace
    in the page ends up in the key.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; prevents
            a stalled connection from blocking a long scraping run.

    Returns:
        dict mapping field names to strings (or lists of strings for amenities).

    Raises:
        AttributeError: if the title, the description or an amenity heading
            is missing from the page (``find`` returned ``None``).
        requests.RequestException: on a network failure or timeout.
    """
    data = {}
    html_data = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html_data, 'html.parser')

    # find() yields None when the element is absent; the attribute access
    # then raises AttributeError, which is how a page without it is reported.
    data['property_name'] = soup.find("h1", class_ = "_64bb5b3b").get_text()

    labels = soup.find_all("span", class_="_3af7fa95")
    values = soup.find_all("span", class_="_812aa185")
    for label, value in zip(labels, values):
        data[label.get_text()] = value.get_text()

    data['description'] = soup.find("span", class_="_2a806e1e").get_text()

    for amenity in soup.find_all("li", class_="_579bdb8a"):
        heading = amenity.find("div", class_="a152770c").text
        data[heading] = [
            item.text for item in amenity.find_all("span", class_="_17984a2c")
        ]
    return data

In [None]:
# Scrape one batch of the collected links. Change the bounds to pick the
# slice of all_urls handled by this run.
BATCH_START, BATCH_END = 200, 300

useless_links = []  # URLs whose page lacked an expected element
whole_data = {}     # URL -> dict returned by collect_data
counter = 0
start = time.time()
for url in all_urls[BATCH_START:BATCH_END]:
    # No leading space in the scheme: the URL is also used as the dict key
    # (and later as the index column), so it must be a clean address.
    url = "https://" + url
    try:
        url_data = collect_data(url)
    except AttributeError:
        # collect_data raises AttributeError when an element is missing.
        useless_links.append(url)
        continue
    whole_data[url] = url_data
    counter += 1
    if counter % 100 == 0:
        end = time.time()
        print(f"Data of {counter} links has been scrapped.")
        print(f"No. of useless links: {len(useless_links)}")
        print(f"Total Time: {end-start}")

In [None]:
# Outer dict keys (URLs) start as columns; transposing makes each URL a row
# and each scraped field a column.
df = pd.DataFrame(whole_data).transpose()

In [None]:
# Turn the current index into a regular column and renumber the rows.
# Rebinding instead of inplace=True keeps the statement chainable.
df = df.reset_index()

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# df1 = pd.read_csv("Islamabad_House_Buy.csv")

In [None]:
# index=False is the documented way to omit the row index from the file.
df.to_csv("Islamabad_House_Buy.csv", index=False)

In [None]:
# Append the rows of df1 below df and renumber the index from 0.
# NOTE(review): no active cell above this one defines df1 (the only earlier
# assignment is commented out), so this fails on a fresh top-to-bottom run -
# confirm where df1 is supposed to come from.
# df is both input and output here, so re-running the cell appends df1 again.
df = pd.concat([df, df1], axis=0, ignore_index=True)

In [10]:
# Export the frame as both Excel and CSV; index=False is the documented way
# to leave the row index out of the files.
df.to_excel("Data/Rawalpindi_House_Buy.xlsx", index=False)
df.to_csv("Data/Rawalpindi_House_Buy.csv", index=False)

In [None]:
# (rows, columns) of the current frame.
df.shape

In [None]:
# Reload the CSV from disk, replacing whatever df currently holds.
df = pd.read_csv(filepath_or_buffer="Islamabad_House_Buy.csv")

In [4]:
# Load the three Rwp_House_Buy CSV parts, in order, into df, df1 and df2.
df, df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Rwp_House_Buy.csv",
        "Rwp_House_Buy_1.csv",
        "Rwp_House_Buy_2.csv",
    )
)

In [5]:
# Stack the three frames row-wise (the default axis) under a fresh 0..n-1
# index. df is both input and output, so re-running appends df1/df2 again.
df = pd.concat((df, df1, df2), ignore_index=True)

In [6]:
# Count the rows that exactly repeat an earlier row.
df.duplicated().sum()

26

In [9]:
# Show five random rows; the fixed seed makes the preview reproducible.
df.sample(5, random_state=42)

Unnamed: 0,index,property_name,Type,Price,Location,Bath(s),Area,Purpose,Bedroom(s),Added,...,Rooms,Community Features,Nearby Locations and Other Facilities,Other Facilities\r\n,Business and Communication,Healthcare Recreational,Plot Features,Initial Amount,Monthly Installment,Remaining Installments
1953,https://zameen.com/Property/bahria_town_rawal...,Brand New Designer House For Sale Bharia Town ...,House,PKR5.5 Crore,"Bahria Town Rawalpindi, Rawalpindi, Punjab",6,10 Marla,For Sale,5,2 weeks ago,...,"['Bedrooms: 5', 'Bathrooms: 6', 'Servant Quart...","['Community Lawn or Garden', 'Community Swimmi...","['Nearby Schools', 'Nearby Hospitals', 'Nearby...","['Maintenance Staff', 'Security Staff', 'Facil...","['Broadband Internet Access', 'Satellite or Ca...","['Sauna', 'Jacuzzi']",,,,
9514,https://zameen.com/Property/rawalpindi_railwa...,House Is Available For Sale,House,PKR3.5 Crore,"Railway Scheme 7, Rawalpindi, Punjab",6,8 Marla,For Sale,8,3 years ago,...,,,,,,,,,,
343,https://zameen.com/Property/bahria_town_rawal...,Beautiful Designer House Single unit For Sale ...,House,PKR10.75 Crore,"Bahria Town Rawalpindi, Rawalpindi, Punjab",6,1 Kanal,For Sale,5,7 hours ago,...,"['Bedrooms: 5', 'Bathrooms: 6', 'Servant Quart...","['Community Lawn or Garden', 'Community Swimmi...","['Nearby Schools', 'Nearby Hospitals', 'Nearby...","['Maintenance Staff', 'Security Staff', 'Facil...","['Broadband Internet Access', 'Satellite or Ca...",['Jacuzzi'],,,,
1318,https://zameen.com/Property/bahria_town_rawal...,14 Marla Corner House Brand New Designer Avail...,House,PKR7.5 Crore,"Bahria Town Rawalpindi, Rawalpindi, Punjab",6,14 Marla,For Sale,5,2 days ago,...,"['Bedrooms: 5', 'Bathrooms: 6', 'Servant Quart...",,"['Nearby Schools', 'Nearby Hospitals', 'Nearby...",,,,,,,
4431,https://zameen.com/Property/chaklala_scheme_c...,Beautiful Brand New Single Story 4.5 Marla Hou...,House,PKR1.3 Crore,"Chaklala Scheme, Rawalpindi, Punjab",2,4 Marla,For Sale,2,2 weeks ago,...,"['Bedrooms', 'Bathrooms', 'Servant Quarters', ...","['Community Lawn or Garden', 'Community Swimmi...","['Nearby Schools', 'Nearby Hospitals', 'Nearby...","['Maintenance Staff', 'Security Staff', 'Facil...","['Broadband Internet Access', 'Satellite or Ca...","['Lawn or Garden', 'Swimming Pool', 'Sauna', '...",,,,
