<a href="https://colab.research.google.com/github/agustinperalta/webscrapping_argenprop/blob/main/web_scrapping_argenprop_ap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
import requests
from unidecode import unidecode
from bs4 import BeautifulSoup

In [123]:
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'})

In [124]:
'''
This function take an url and call to the API that returns the information of Argentina Provinces, the function return a list only with its name.
'''
def listado_provincias(url):
  try:
    response = session.get(url, timeout=10)
    response.raise_for_status()
    json_data = response.json()
    return [unidecode(prov['nombre'].lower().replace('ciudad autonoma de buenos aires', 'capital federal')) for prov in json_data['provincias'][:-1]]
  except (requests.exceptions.RequestException, ValueError):
    return []

In [125]:
''' 
This function take the list of Argentine Province & return a list of links to each Province.
'''
def armar_links(provincias):
  URL = "https://www.argenprop.com/departamento-alquiler"
  links = set()
  for prov in provincias:
    link = f"{URL}-provincia-{prov.replace(' ', '-')}{'-arg' if prov not in ('capital federal', 'buenos aires') else ''}"
  links.add(link)
  return list(links)
  


In [127]:
''' 
This function takes the paginator element, if is None return -pagina-1 else return a list of strings where each string is Pagina-n, Pagina-n+1 
'''
def range_pages(paginator):
  if paginator:
    pages = paginator.find_all('li', class_='pagination__page')
    pages = [page for page in pages if not 'pagination__page-prev' in page['class'] and not 'pagination__page--disable' in page['class'] and (not 'pagination__page-next' in page['class'])]
    links = [f"-Pagina-{i}" for i in range(int(pages[0].get_text().replace('\n', '').strip()), int(pages[-1].get_text().replace('\n', '').strip())+1)]
  else:
    links = ['-Pagina-1']
  return links

In [126]:
''' 
This function takes the href url and creates the url to access to the property page.
'''
def build_link_property_page(property_url_link):
  MAIN_PATH =  "https://www.argenprop.com"
  return f"{MAIN_PATH}{property_url_link}"

In [133]:
''' 
This function takes the property url and if type is bd (building details) then search & return the list with id=section_9 that has the building installations
else if the type is ud (location details) so return the map element that contains the coordinates
'''
def property_page(property_url_link , type):
  page_property = session.get(property_url_link,timeout=10)
  soup_property = BeautifulSoup(page_property.content, "html.parser")
  if type == 'bd':
    building_details = soup_property.find('ul',{'id': 'section_9'}).find_all('li') if soup_property.find('ul',{'id': 'section_9'}) else None
    return building_details if building_details else None
  elif type == 'ud':
    location_details = soup_property.find('div', {'class': 'leaflet-container'}) if soup_property.find('div', {'class': 'leaflet-container'}) else None
    return location_details if location_details else None

In [38]:
''' 
This function takes the list of building installations, iterate it and return an array of bools that indicates if the building has or not the ammenitie
'''
def extract_building_details(building_details):
  parr = False
  gym = False
  pool = False
  sol = False
  play = False
  sal = False
  asc = False
  if len(building_details) == 0:
    return [parr,gym,pool,sol,play,sal,asc]
  else:
    for element in building_details:
      if element.get_text().replace('\n','').strip() == 'Parrilla':
        parr = True
      elif element.get_text().replace('\n','').strip() == 'Gimnasio':
        gym = True
      elif element.get_text().replace('\n','').strip() == 'Pileta':
        pool = True
      elif element.get_text().replace('\n','').strip() == 'Solarium':
        sol = True
      elif element.get_text().replace('\n','').strip() == 'Juegos para chicos':
        play = True
      elif element.get_text().replace('\n','').strip() == 'Salón de fiestas':
        sal = True
      elif element.get_text().replace('\n','').strip() == 'Ascensor':
        asc = True
    return [parr,gym,pool,sol,play,sal,asc]


In [129]:
'''
This function takes the location details and if the location details is not null so call to API Geolocation Data to get the Province and department name. Return an array with 3 elements 
'''
def extract_location_details(location_details):
  provincia = None
  departamento = None
  municipio = None
  if location_details:
    latitude = float(location_details['data-latitude'].replace(',','.'))
    longitude = float(location_details['data-longitude'].replace(',','.'))
    data = session.get(f'https://apis.datos.gob.ar/georef/api/ubicacion?lat={latitude}&lon={longitude}',timeout=10)
    if data.status_code == 200:
      data = data.json()
      provincia = data['ubicacion']['provincia']['nombre']
      departamento = data['ubicacion']['departamento']['nombre']
      municipio = data['ubicacion']['municipio']['nombre']
      return  [departamento,provincia, municipio]
    else:
      return  [departamento,provincia, municipio]
  else:
    return [departamento,provincia, municipio]

In [40]:
'''
This function takes the element ul with the class card__main-features that contains the information about the department such as surface, number of rooms, etc.
And returns a list with the value of each feature.
'''
def extract_property_features(property_features):
  li_elements = property_features.find_all('li')
  sup=''
  dorm=''
  ant=''
  ban = ''
  amb = ''
  coch = ''
  to = ''
  est = ''
  dis = ''
  for li in li_elements:
    i_element = li.find('i')
    if i_element:
      if i_element.attrs.get('class')[0] == 'icono-superficie_cubierta':
        sup = li.text.strip()
      elif i_element.attrs.get('class')[0] == 'icono-cantidad_dormitorios':
        dorm = li.text.strip()
      elif i_element.attrs.get('class')[0] == 'icono-antiguedad':
        ant = li.text.strip()
      elif i_element.attrs.get('class')[0] == 'icono-cantidad_banos':
        ban = li.text.strip()
      elif i_element.attrs.get('class')[0] == 'icono-cantidad_ambientes':
        amb = li.text.strip()
      elif i_element.attrs.get('class')[0] == 'icono-ambiente_cochera':
        coch = li.text.strip()
      elif i_element.attrs.get('class')[0] == 'icono-cantidad_toilettes':
        to = li.text.strip()
      elif i_element.attrs.get('class')[0] == 'icono-estado_propiedad':
        est = li.text.strip()
      elif i_element.attrs.get('class')[0] == 'icono-disposicion':
        dis = li.text.strip()
  properties = [sup,dorm,ant,ban,amb,coch,to,est,dis]
  return properties

In [41]:
'''
This function takes the element div with class card__monetary-values that contains the main information about the department. 
Return this information as an array.
'''
def extract_main_details(main_details):
  text = main_details.find('p', attrs={"class":"card__price"}).text.replace("\n","").strip()
  span = main_details.find('span', attrs={"class":"card__currency"}).text.replace("\n","").strip()
  span_1 = main_details.find('span', attrs={"class":"card__expenses"}).text.replace("\n","").strip() if main_details.find('span', attrs={"class":"card__expenses"}) else ""
  price = text[len(span):len(span_1)].strip() if span_1 else text[len(span):].strip()
  currency = span
  expenses = span_1
  address = main_details.find("h2", class_="card__address").text.replace("\n","").strip() if main_details.find("h2", class_="card__address") else 'Consultar Dirección'
  title = main_details.find("p", class_="card__title--primary hide-mobile").text.replace("\n","").strip()
  location = main_details.find("p", class_="card__title--primary show-mobile").text.replace("\n","").strip()
  info = [price,currency,expenses,address,title,location]
  return info

In [42]:
'''
This function takes the link of each province and the list of string that contains the pages for each one.
Return an array with the links for all pages of each provinces. e.g: 'https://www.argenprop.com/departamento-alquiler-provincia-misiones-arg-Pagina-1..n'
'''
def generate_all_pages(link,pages):
  all_pages = []
  for pag in pages:
    all_pages.append(link+pag)
  return all_pages

In [131]:
'''
This function takes the array of links for each province and for each link search the card department (element div with class 'listening__item'). 
Call to explore_cards function to take information of cards and return an array with the all department information of each provinces.
'''
def explore_page(all_pages):
  properties_prov = []
  for pagina in all_pages:
    page = session.get(pagina,timeout=10)
    soup = BeautifulSoup(page.content, "html.parser")
    cards = soup.find_all("div",class_="listing__item")
    if cards:
      page_info = explore_cards(cards)
      properties_prov.extend(page_info)
  return properties_prov

In [44]:
'''
This function takes a list of cards and return an array where each element contains information of one card.
'''
def explore_cards(cards):
  page_infos = []
  for card in cards:
    main_details = card.find("div",class_='card__monetary-values')
    property_features = card.find('ul',class_='card__main-features')
    if card.find('a',class_='card'):
      url_link_property = build_link_property_page(card.find('a',class_='card')['href'])
      building_details_list = property_page(url_link_property,type='bd')
      location_details = property_page(url_link_property,type='ud')
      url_property = [build_link_property_page(card.find('a',class_='card')['href'])]
    else:
      continue
    if main_details.find("span",class_="card__noprice"):
      continue
    else:
      details=extract_main_details(main_details)
      properties=extract_property_features(property_features)
      building_details = extract_building_details(building_details_list if building_details_list else [])
      location_details = extract_location_details(location_details if location_details else [])
      info= details + properties + building_details + location_details + url_property if url_property else []
      page_infos.append(info)
  print(page_infos)
  return page_infos


In [45]:
provincias = listado_provincias('https://infra.datos.gob.ar/catalog/modernizacion/dataset/7/distribution/7.2/download/provincias.json')

In [194]:
#If you want to run the script with all province uncomment the follow line
#links = armar_links(provincias)
#You will run the script for Cordoba Province in order to try the script
links = armar_links(['cordoba'])

In [195]:
links

['https://www.argenprop.com/departamento-alquiler-provincia-cordoba-arg']

In [196]:
properties_all = []
for link in links:
  page = session.get(link,timeout=10)
  soup = BeautifulSoup(page.content, "html.parser")
  paginator = soup.find('ul',class_='pagination pagination--links')
  pages = range_pages(paginator)
  all_pages = generate_all_pages(link,pages)
  print(all_pages)
  properties = explore_page(all_pages)
  properties_all.extend(properties)

['https://www.argenprop.com/departamento-alquiler-provincia-cordoba-arg-Pagina-1', 'https://www.argenprop.com/departamento-alquiler-provincia-cordoba-arg-Pagina-2', 'https://www.argenprop.com/departamento-alquiler-provincia-cordoba-arg-Pagina-3', 'https://www.argenprop.com/departamento-alquiler-provincia-cordoba-arg-Pagina-4', 'https://www.argenprop.com/departamento-alquiler-provincia-cordoba-arg-Pagina-5', 'https://www.argenprop.com/departamento-alquiler-provincia-cordoba-arg-Pagina-6']
[['75.000', '$', '&plus $9.600expensas', 'Duarte Quiros 2500, Piso 6', 'Departamento en Alquiler en Alto Alberdi, Cordoba', 'Alto Alberdi, Cordoba', '52  m² cubie.', '1 dorm.', '5 años', '1 baño', '2 ambientes', '', '', 'Muy Bueno', 'Frente', False, False, False, False, False, False, False, 'Capital', 'Córdoba', 'Córdoba', 'https://www.argenprop.com/departamento-en-alquiler-en-alto-alberdi-2-ambientes--6262241'], ['130.000', '$', '', 'Buenos Aires 400, Piso 5', 'Departamento en Alquiler en Cordoba, Cor

In [197]:
import pandas as pd
df = pd.DataFrame(properties_all,columns=[['price','currency','expenses','address','title','location','superficie','dormitorio','antiguedad','banos','ambientes','cocheras','toilett','estado','disposicion','parr','gym','pool','solarium','juegos','salon','ascensor','departamento','provincia','municipio','url_link']])

In [198]:
df

Unnamed: 0,price,currency,expenses,address,title,location,superficie,dormitorio,antiguedad,banos,...,gym,pool,solarium,juegos,salon,ascensor,departamento,provincia,municipio,url_link
0,75.000,$,&plus $9.600expensas,"Duarte Quiros 2500, Piso 6","Departamento en Alquiler en Alto Alberdi, Cordoba","Alto Alberdi, Cordoba",52 m² cubie.,1 dorm.,5 años,1 baño,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
1,130.000,$,,"Buenos Aires 400, Piso 5","Departamento en Alquiler en Cordoba, Cordoba C...","Cordoba, Cordoba Capital",,2 dorm.,25 años,2 baños,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
2,140.000,$,&plus $21.200expensas,"Bv Chacabuco 1200, Piso 1","Departamento en Alquiler en Nueva Cordoba, Cor...","Nueva Cordoba, Cordoba",66 m² cubie.,2 dorm.,30 años,1 baño,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
3,60.000,$,,Soldado Ruiz 1000,"Departamento en Alquiler en San Martin, Cordoba","San Martin, Cordoba",45 m² cubie.,1 dorm.,,1 baño,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
4,55.000,$,,"Salta 100, Piso 1","Departamento en Alquiler en Cordoba, Cordoba C...","Cordoba, Cordoba Capital",30 m² cubie.,1 dorm.,18 años,1 baño,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,88.000,$,&plus $8.500expensas,Santiago Temple 100,"Departamento en Alquiler en Nueva Cordoba, Cor...","Nueva Cordoba, Cordoba",38 m² cubie.,1 dorm.,20 años,1 baño,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
99,45.000,$,,Urrutia 500,"Departamento en Alquiler en Alberdi, Cordoba","Alberdi, Cordoba",25 m² cubie.,Monoam.,30 años,,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
100,75.000,$,,Hipolito Vieytes 100,"Departamento en Alquiler en Alberdi, Cordoba","Alberdi, Cordoba",40 m² cubie.,1 dorm.,20 años,1 baño,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
101,115.000,$,&plus $13.596expensas,Poeta Lugones y Derqui,"Departamento en Alquiler en Nueva Cordoba, Cor...","Nueva Cordoba, Cordoba",44 m² cubie.,1 dorm.,16 años,1 baño,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...


In [199]:
df.columns = ['price','currency','expenses','address','title','location','superficie','dormitorio','antiguedad','banos','ambientes','cocheras','toilett','estado','disposicion','parr','gym','pool','solarium','juegos','salon','ascensor','departamento','provincia','municipio','url_link']

In [200]:
def change_to_type(df,colum_name,type):
  df[colum_name]=df[colum_name].str.replace('.', '')
  df[colum_name] = df[colum_name].astype(type)

In [201]:
def change_to_numeric(df,colum_name):
  df[df[colum_name]!=''][colum_name] = pd.to_numeric(df[df[colum_name]!=''][colum_name], errors='coerce')
  df[colum_name] = pd.to_numeric(df[colum_name], errors='coerce')

In [202]:
def extract_number(text):
  import re
  if isinstance(text, str):
    if '.' in text:
      text = text.replace('.', '')
    match = re.search(r'\d+', text)
    if match:
        return int(match.group(0))
  return text

In [203]:
def process_column(df, column_name, reference_value=None):
    if column_name in df.columns:
      df[column_name] = df[column_name].apply(lambda x: 0 if x == reference_value else extract_number(x))
    else:
      print("La columna especificada no existe en el DataFrame")
        


In [204]:
pd.set_option('display.float_format', lambda x: '%.0f' % x)

In [205]:
df.dormitorio.unique()

array(['1 dorm.', '2 dorm.', '3 dorm.', '5 dorm.', 'Monoam.', '4 dorm.'],
      dtype=object)

In [206]:
df.antiguedad.unique()

array(['5 años', '25 años', '30 años', '', '18 años', '50 años',
       '16 años', 'A Estrenar', '8 años', '20 años', '3 años', '10 años',
       '47 años', '1 año', '4 años', '7 años', '12 años', '15 años',
       '11 años', '33 años', '2 años', '6 años', '51 años'], dtype=object)

In [207]:
process_column(df, 'dormitorio', 'Monoam.')
process_column(df, 'antiguedad', 'A Estrenar')
process_column(df, 'expenses')
process_column(df, 'superficie')
process_column(df, 'banos')
process_column(df, 'ambientes')
process_column(df, 'cocheras')
process_column(df, 'toilett')

In [208]:
df

Unnamed: 0,price,currency,expenses,address,title,location,superficie,dormitorio,antiguedad,banos,...,gym,pool,solarium,juegos,salon,ascensor,departamento,provincia,municipio,url_link
0,75.000,$,9600,"Duarte Quiros 2500, Piso 6","Departamento en Alquiler en Alto Alberdi, Cordoba","Alto Alberdi, Cordoba",52,1,5,1,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
1,130.000,$,,"Buenos Aires 400, Piso 5","Departamento en Alquiler en Cordoba, Cordoba C...","Cordoba, Cordoba Capital",,2,25,2,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
2,140.000,$,21200,"Bv Chacabuco 1200, Piso 1","Departamento en Alquiler en Nueva Cordoba, Cor...","Nueva Cordoba, Cordoba",66,2,30,1,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
3,60.000,$,,Soldado Ruiz 1000,"Departamento en Alquiler en San Martin, Cordoba","San Martin, Cordoba",45,1,,1,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
4,55.000,$,,"Salta 100, Piso 1","Departamento en Alquiler en Cordoba, Cordoba C...","Cordoba, Cordoba Capital",30,1,18,1,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,88.000,$,8500,Santiago Temple 100,"Departamento en Alquiler en Nueva Cordoba, Cor...","Nueva Cordoba, Cordoba",38,1,20,1,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
99,45.000,$,,Urrutia 500,"Departamento en Alquiler en Alberdi, Cordoba","Alberdi, Cordoba",25,0,30,,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
100,75.000,$,,Hipolito Vieytes 100,"Departamento en Alquiler en Alberdi, Cordoba","Alberdi, Cordoba",40,1,20,1,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...
101,115.000,$,13596,Poeta Lugones y Derqui,"Departamento en Alquiler en Nueva Cordoba, Cor...","Nueva Cordoba, Cordoba",44,1,16,1,...,False,False,False,False,False,False,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...


In [209]:
df[['localidad_barrio','provincia_region']] = df['location'].str.split(',',expand=True).apply(lambda x: x.str.strip())

In [210]:
df['parr']=df['parr'].apply(lambda x: 1 if x == True else 0)

In [211]:
df['gym']=df['gym'].apply(lambda x: 1 if x == True else 0)

In [212]:
df['pool'] = df['pool'].apply(lambda x: 1 if x == True else 0)

In [213]:
df['solarium']=df['solarium'].apply(lambda x: 1 if x == True else 0)

In [214]:
df['juegos']=df['juegos'].apply(lambda x: 1 if x == True else 0)

In [215]:
df['salon']=df['salon'].apply(lambda x: 1 if x == True else 0)

In [216]:
df['ascensor']=df['ascensor'].apply(lambda x: 1 if x == True else 0)

In [217]:
df

Unnamed: 0,price,currency,expenses,address,title,location,superficie,dormitorio,antiguedad,banos,...,solarium,juegos,salon,ascensor,departamento,provincia,municipio,url_link,localidad_barrio,provincia_region
0,75.000,$,9600,"Duarte Quiros 2500, Piso 6","Departamento en Alquiler en Alto Alberdi, Cordoba","Alto Alberdi, Cordoba",52,1,5,1,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Alto Alberdi,Cordoba
1,130.000,$,,"Buenos Aires 400, Piso 5","Departamento en Alquiler en Cordoba, Cordoba C...","Cordoba, Cordoba Capital",,2,25,2,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Cordoba,Cordoba Capital
2,140.000,$,21200,"Bv Chacabuco 1200, Piso 1","Departamento en Alquiler en Nueva Cordoba, Cor...","Nueva Cordoba, Cordoba",66,2,30,1,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Nueva Cordoba,Cordoba
3,60.000,$,,Soldado Ruiz 1000,"Departamento en Alquiler en San Martin, Cordoba","San Martin, Cordoba",45,1,,1,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,San Martin,Cordoba
4,55.000,$,,"Salta 100, Piso 1","Departamento en Alquiler en Cordoba, Cordoba C...","Cordoba, Cordoba Capital",30,1,18,1,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Cordoba,Cordoba Capital
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,88.000,$,8500,Santiago Temple 100,"Departamento en Alquiler en Nueva Cordoba, Cor...","Nueva Cordoba, Cordoba",38,1,20,1,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Nueva Cordoba,Cordoba
99,45.000,$,,Urrutia 500,"Departamento en Alquiler en Alberdi, Cordoba","Alberdi, Cordoba",25,0,30,,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Alberdi,Cordoba
100,75.000,$,,Hipolito Vieytes 100,"Departamento en Alquiler en Alberdi, Cordoba","Alberdi, Cordoba",40,1,20,1,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Alberdi,Cordoba
101,115.000,$,13596,Poeta Lugones y Derqui,"Departamento en Alquiler en Nueva Cordoba, Cor...","Nueva Cordoba, Cordoba",44,1,16,1,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Nueva Cordoba,Cordoba


In [218]:
df.rename(columns={'price': 'precio', 'currency': 'moneda', 'address': 'direccion','title':'titulo','location':'ubicacion','parr':'parrilla','gym':'gimnasio','pool':'pileta','juegos':'playroom','salon':'quincho'}, inplace=True)


In [219]:
change_to_type(df,'precio',float)

  df[colum_name]=df[colum_name].str.replace('.', '')


In [220]:
change_to_numeric(df,'expenses')
change_to_numeric(df,'superficie')
change_to_numeric(df,'dormitorio')
change_to_numeric(df,'antiguedad')
change_to_numeric(df,'banos')
change_to_numeric(df,'ambientes')
change_to_numeric(df,'cocheras')
change_to_numeric(df,'toilett')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df[colum_name]!=''][colum_name] = pd.to_numeric(df[df[colum_name]!=''][colum_name], errors='coerce')


In [221]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   precio            103 non-null    float64
 1   moneda            103 non-null    object 
 2   expenses          49 non-null     float64
 3   direccion         103 non-null    object 
 4   titulo            103 non-null    object 
 5   ubicacion         103 non-null    object 
 6   superficie        87 non-null     float64
 7   dormitorio        103 non-null    int64  
 8   antiguedad        76 non-null     float64
 9   banos             100 non-null    float64
 10  ambientes         95 non-null     float64
 11  cocheras          23 non-null     float64
 12  toilett           3 non-null      float64
 13  estado            103 non-null    object 
 14  disposicion       103 non-null    object 
 15  parrilla          103 non-null    int64  
 16  gimnasio          103 non-null    int64  
 1

In [222]:
df.replace(to_replace='', value=pd.NaT, inplace=True)


In [223]:
df.municipio

0      Córdoba
1      Córdoba
2      Córdoba
3      Córdoba
4      Córdoba
        ...   
98     Córdoba
99     Córdoba
100    Córdoba
101    Córdoba
102    Córdoba
Name: municipio, Length: 103, dtype: object

In [224]:
df.describe()

Unnamed: 0,precio,expenses,superficie,dormitorio,antiguedad,banos,ambientes,cocheras,toilett,parrilla,gimnasio,pileta,solarium,playroom,quincho,ascensor
count,103,49,87,103,76,100,95,23,3,103,103,103,103,103,103,103
mean,97344,17512,68,2,10,1,3,1,1,0,0,0,0,0,0,0
std,77226,19853,51,1,12,1,1,0,1,0,0,0,0,0,0,0
min,1200,1,22,0,0,1,1,1,1,0,0,0,0,0,0,0
25%,45000,8000,42,1,0,1,2,1,1,0,0,0,0,0,0,0
50%,80000,12000,49,1,6,1,2,1,1,0,0,0,0,0,0,0
75%,135000,20000,78,2,18,1,3,1,2,0,0,0,0,0,0,0
max,420000,120000,380,5,51,5,7,2,2,1,1,1,1,1,0,0


In [225]:
df

Unnamed: 0,precio,moneda,expenses,direccion,titulo,ubicacion,superficie,dormitorio,antiguedad,banos,...,solarium,playroom,quincho,ascensor,departamento,provincia,municipio,url_link,localidad_barrio,provincia_region
0,75000,$,9600,"Duarte Quiros 2500, Piso 6","Departamento en Alquiler en Alto Alberdi, Cordoba","Alto Alberdi, Cordoba",52,1,5,1,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Alto Alberdi,Cordoba
1,130000,$,,"Buenos Aires 400, Piso 5","Departamento en Alquiler en Cordoba, Cordoba C...","Cordoba, Cordoba Capital",,2,25,2,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Cordoba,Cordoba Capital
2,140000,$,21200,"Bv Chacabuco 1200, Piso 1","Departamento en Alquiler en Nueva Cordoba, Cor...","Nueva Cordoba, Cordoba",66,2,30,1,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Nueva Cordoba,Cordoba
3,60000,$,,Soldado Ruiz 1000,"Departamento en Alquiler en San Martin, Cordoba","San Martin, Cordoba",45,1,,1,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,San Martin,Cordoba
4,55000,$,,"Salta 100, Piso 1","Departamento en Alquiler en Cordoba, Cordoba C...","Cordoba, Cordoba Capital",30,1,18,1,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Cordoba,Cordoba Capital
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,88000,$,8500,Santiago Temple 100,"Departamento en Alquiler en Nueva Cordoba, Cor...","Nueva Cordoba, Cordoba",38,1,20,1,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Nueva Cordoba,Cordoba
99,45000,$,,Urrutia 500,"Departamento en Alquiler en Alberdi, Cordoba","Alberdi, Cordoba",25,0,30,,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Alberdi,Cordoba
100,75000,$,,Hipolito Vieytes 100,"Departamento en Alquiler en Alberdi, Cordoba","Alberdi, Cordoba",40,1,20,1,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Alberdi,Cordoba
101,115000,$,13596,Poeta Lugones y Derqui,"Departamento en Alquiler en Nueva Cordoba, Cor...","Nueva Cordoba, Cordoba",44,1,16,1,...,0,0,0,0,Capital,Córdoba,Córdoba,https://www.argenprop.com/departamento-en-alqu...,Nueva Cordoba,Cordoba


In [226]:
def extract_dolar_price(url):
  request = requests.get(url)
  if request.status_code == 200:
    json_data = request.json()
  return json_data['blue']['value_buy']  

In [227]:
dolar_price = extract_dolar_price('https://api.bluelytics.com.ar/v2/latest')

In [228]:
df.loc[df['moneda']=='USD',['precio']] = df.loc[df['moneda']=='USD',['precio']] * dolar_price

In [229]:
df.loc[df['moneda']=='USD',['moneda']] = '$'

In [230]:
df[df.duplicated(subset=['precio','direccion','titulo','ubicacion'])].sort_values(by='titulo')

Unnamed: 0,precio,moneda,expenses,direccion,titulo,ubicacion,superficie,dormitorio,antiguedad,banos,...,solarium,playroom,quincho,ascensor,departamento,provincia,municipio,url_link,localidad_barrio,provincia_region


In [231]:
df.drop_duplicates(subset=['precio','direccion','titulo','ubicacion'],inplace=True)

In [232]:
df.to_csv('web_scrapping_all_result.csv',index=False)