# CIAN parser by Elsakova Anna

Loading libraries

In [25]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import math
import time

In [8]:
def html_stripper(text):
    """Return ``str(text)`` with every ``<...>`` tag-like span removed.

    The argument is converted with ``str`` first, so non-string objects
    (including ``None``, which becomes ``'None'``) are accepted.
    """
    tag_pattern = re.compile('<[^<]+?>')
    return tag_pattern.sub('', str(text))

In [9]:
#page counter; the link-loading loop further down reuses this name as its loop variable
page = 1

## URL template for the search-result pages

In [10]:
#URL template of a search-result page; the {} placeholder is filled with the page number
all_flats = (
    'http://www.cian.ru/cat.php'
    '?deal_type=sale&engine_version=2&offer_type=flat'
    '&p={}'
    '&region=1'
    '&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1&room7=1&room9=1'
)

## Loading flat links 

In [11]:
#WARNING! Only result pages 1..29 are fetched here; the site may have more pages
links = []
for page in range(1,30):
    #download one search-result page and parse it
    listing_html = requests.get(all_flats.format(page)).content
    listing_soup = BeautifulSoup(listing_html, 'lxml')

    #every offer sits in a div carrying this exact ng-class attribute
    offer_blocks = listing_soup.findAll('div', attrs = {'ng-class':"{'serp-item_removed': offer.remove.state, 'serp-item_popup-opened': isPopupOpen}"})

    #cut the stringified divs around the flat url so that the numeric flat id
    #ends up as a piece of its own, then keep only the purely numeric pieces
    pieces = re.split('http://www.cian.ru/sale/flat/|/" ng-class="', str(offer_blocks))
    links.extend(piece for piece in pieces if piece.isdigit())

In [12]:
#how many flat ids were collected (the notebook shows the value as the cell output)
len(links)

812

## Parsing begins!

### Flat URL number and ordinal number

In [13]:
#one record per flat: its url number ('Flat URL') and its position in links ('N')
flats_info = [{'Flat URL': flat_id, 'N': position} for position, flat_id in enumerate(links)]
#show the first three records
flats_info[:3]

[{'Flat URL': '150197606', 'N': 0},
 {'Flat URL': '150514886', 'N': 1},
 {'Flat URL': '150072433', 'N': 2}]

Below are the helper functions that we will need for parsing the flat pages.

In [34]:
#this function extracts a number (or "no information") from a string
def parse_quantitative_info( string ):
    """Return the first number found in *string* as a float, or None.

    A token is a run of digits, commas and en dashes; a comma is read as
    the decimal separator ('45,5' -> 45.5).

    Returns None when:
      * the string contains no such token,
      * the first token is a lone '–' (the "missing value" placeholder),
      * no token can be read as a number (e.g. the range '5–7').

    Tokens that are not valid numbers (a stray ',' from punctuation, for
    instance) are skipped instead of raising ValueError, so one oddly
    formatted cell cannot abort a whole scraping run.
    """
    for match in re.finditer(r'[–0-9,]+', string):
        token = match.group(0).replace(',', '.')
        if token == '–': #explicit "missing info" marker
            return None
        try:
            return float(token)
        except ValueError:
            #not a number after all (',' or '5–7'); look at the next token
            continue
    return None
    
#this function tells whether the pattern we need occurs in the string
def parse_str_info( string, substring ):
    """Return 1 if the regular expression *substring* matches anywhere in *string*, else 0."""
    if re.search(substring, string) is None:
        return 0
    return 1
    
#this function is used for loading data about one flat - the flat whose url is
#http://www.cian.ru/sale/flat/%page_number%/
def go_to_page( page_number ):
    """Download one flat page and return it parsed.

    page_number -- the flat's url number (str or int); it is inserted into
                   'http://www.cian.ru/sale/flat/<page_number>/'.

    Returns the BeautifulSoup tree (lxml parser) built from the response body.
    The HTTP status is not checked.

    The url is built from the argument itself; the previous version ignored
    the argument and read the globals ``links`` and ``i`` instead.
    """
    flat_url = 'http://www.cian.ru/sale/flat/' + str(page_number) + '/'
    response = requests.get(flat_url)
    return BeautifulSoup(response.content, 'lxml')

#this function loads the table with general information from a flat page
def load_table(page=None):
    """Return the text of the 'object_descr_props' table split into lines.

    page -- parsed flat page to read from. When omitted, the global
            ``flat_page`` is used, so existing ``load_table()`` calls keep
            working unchanged.

    The table is passed through ``html_stripper`` (tags removed) and the
    remaining text is split on newlines. If the page has no such table,
    ``find`` yields None, which ``html_stripper`` stringifies, so the result
    is ``['None']``.
    """
    if page is None:
        page = flat_page
    table = page.find('table', attrs = {'class':'object_descr_props'})
    return html_stripper(table).split('\n')

def count_distance_to_center(lattitude, longitude):
    """Return the distance in kilometres from a point to the Moscow Kremlin.

    lattitude, longitude -- coordinates of the point in decimal degrees.

    The distance is the great-circle (haversine) distance on a sphere with
    the mean Earth radius. The previous version multiplied the plain
    Euclidean distance in degrees by one factor (63.995 km per degree) for
    both axes. A degree of latitude is about 111 km, so north-south
    distances came out roughly 40% too short.
    """
    kremlin_lattitude = 55.752121
    kremlin_longitude = 37.617664
    earth_radius_km = 6371.0088 #mean Earth radius

    kremlin_lat_rad = math.radians(kremlin_lattitude)
    point_lat_rad = math.radians(lattitude)
    half_dlat = (point_lat_rad - kremlin_lat_rad) / 2
    half_dlon = math.radians(longitude - kremlin_longitude) / 2

    haversine = (math.sin(half_dlat) ** 2
                 + math.cos(kremlin_lat_rad) * math.cos(point_lat_rad) * math.sin(half_dlon) ** 2)
    #clamp against rounding so asin never sees a value above 1
    central_angle = 2 * math.asin(math.sqrt(min(1.0, haversine)))
    return earth_radius_km * central_angle

Now we parse every collected CIAN Moscow flat page.

In [42]:
#helper for the loop below: turn a scraped token into an int, or None when
#the token is not a plain integer (for example the '–' placeholder)
def _int_or_none(token):
    try:
        return int(token)
    except ValueError:
        return None

#We need to look at EVERY collected flat page and fill in its flats_info record.
#Anything that cannot be found or parsed on a page is stored as None, the same
#way a missing metro block is, instead of aborting the whole run.
for i in range(len(links)):
    #Go to page i
    flat_page = go_to_page(str(links[i]))

    #Number of rooms is taken from an inline script (IF 9 THEN STUDIYA).
    #Reset it for every flat: otherwise a page without such a script would
    #silently inherit the value of the previous flat (or raise NameError on
    #the very first one). The total area is read from the table further down,
    #so it is no longer parsed from the script here.
    rooms = None
    for script in flat_page.findAll('script'):
        res = re.search(r'rooms_count: (.*),', str(script.string))
        if res:
            rooms = int(res.groups()[0])
    flats_info[i]['Rooms'] = rooms

    #Price
    price = flat_page.find('meta', attrs = {"itemprop":"price"})
    flats_info[i]['Price'] = parse_quantitative_info(str(price))

    #All information about metro: first number in the comment and whether
    #the comment mentions walking ('пешком')
    metro = flat_page.find('span', attrs = {'class':'object_item_metro_comment'})
    metro_dist = None
    metro_way = None
    if metro is not None:
        metro_text = str(metro)
        first_number = re.search(r'[0-9]+', metro_text)
        if first_number:
            metro_dist = int(first_number.group(0))
        metro_way = 1 if re.findall(r'пешком', metro_text) else 0
    flats_info[i]['Metrdist'] = metro_dist
    flats_info[i]['Walk'] = metro_way

    #Coordinates: the second child of the map button block holds a url with
    #'center=<lat>%2C<lon>'; split around it and keep the pieces that start
    #with a digit
    distance_to_center = None
    map_block = flat_page.find('div', attrs={'class':'map_info_button_extend'})
    if map_block is not None and len(map_block.contents) > 1:
        coords = re.split('&amp|center=|%2C', str(map_block.contents[1]))
        #item[:1] instead of item[0]: the split can produce empty pieces
        coords_list = [item for item in coords if item[:1].isdigit()]
        if len(coords_list) >= 2:
            lattitude = float(coords_list[0])
            longitude = float(coords_list[1])
            distance_to_center = count_distance_to_center(lattitude, longitude)
    flats_info[i]['Dist'] = distance_to_center

    #table with general information about flat
    general_info = load_table()
    #pad the list so the j+1..j+3 look-aheads below stay in range even when
    #the table ends right after a label; a padded cell is an empty string
    cells = general_info + [''] * 3

    #loading information we needed from general information
    total_space = None
    live_space = None
    kitchen_space = None
    floor_number = None
    total_floors = None
    balcony = None
    new_house = None
    brick_house = None
    telephone = None
    for j in range(len(general_info)):
        label = cells[j]
        if label == 'Общая площадь:': #total area
            total_space = parse_quantitative_info(cells[j+2])
        if label == 'Жилая площадь:': #living area
            live_space = parse_quantitative_info(cells[j+2])
        if label == 'Площадь кухни:': #kitchen area
            kitchen_space = parse_quantitative_info(cells[j+2])
        if label == 'Этаж:': #floor, optionally followed by the number of floors
            floors_info = re.findall(r'[–0-9,]+', cells[j+2])
            if floors_info:
                floor_number = _int_or_none(floors_info[0])
            if len(floors_info) > 1:
                total_floors = _int_or_none(floors_info[1])
        if label == 'Балкон:': #balcony
            balcony = parse_quantitative_info(cells[j+1])
        if label == 'Тип дома:': #house type: new building / brick-like material
            new_house = parse_str_info(cells[j+2],'новостройка')
            brick_house = parse_str_info(cells[j+3],'кирпичный|монолитный|кирпично-монолитный|жб|железобетонный')
        if label == 'Телефон:': #telephone
            telephone = parse_str_info(cells[j+1], 'да')

    flats_info[i]['Totsp'] = total_space
    flats_info[i]['Livesp'] = live_space
    flats_info[i]['Kitsp'] = kitchen_space
    flats_info[i]['Floor'] = floor_number
    flats_info[i]['Nfloor'] = total_floors
    flats_info[i]['Bal'] = balcony
    flats_info[i]['New'] = new_house
    flats_info[i]['Brick'] = brick_house
    flats_info[i]['Tel'] = telephone

In [43]:
#put all flat records into one table and save it as a CSV file
Moscow_flats = pd.DataFrame(data=flats_info)
Moscow_flats.to_csv(path_or_buf='Moscow_flats.csv')