# Scraping SeeClickFix Reviews

SeeClickFix is a site that, according to their own description, is a "tool your neighborhood needs to fix that broken sidewalk and the pothole on the bus route, giving your kids a safer trip to school and improving the quality of life where you live". 

Since SeeClickFix is a site to voice discontent about urban infrastructure it is the optimal place to collect data for urban infrastructure related comments. The following notebook details the scraping and (partly) cleaning of comments collected from SeeClickFix. The whole scraping process is done with the help of Selenium. 


Credit: The code was partly adapted from my fellow research intern Abhay Mahajan, who is also working on the same project. The original code can be found on his [Github](https://github.com/mahajan-abhay/Concordia_University_MITACS).

In [1]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

In [2]:
import pandas as pd
import numpy as np
import time

In [218]:
browser = webdriver.Chrome(executable_path="/Users/andreamock/Documents/chromedriver")

In [4]:
def gatherLinks(driver,url, numPages):
    '''Given the url to a page and the number of pages to scroll through to retrieve complaints, iteratives 
    over all of the pages of complaints and saves the links to each individual issue in a list. Finally the list is 
    returned '''
    
    hrefs = []
    for i in range(numPages):
        driver.get(url + str(i)) # get a certain page of comments
        # extract all of the issues on one page 
        issues = driver.find_elements_by_class_name('riverTitle') 
        for issue in issues:
            element = issue.find_elements_by_tag_name("a") # gather link for a particular issues
            link = element[0].get_attribute("href")
            hrefs.append(link) # add link to 
    return hrefs # return all of the 
    

In [6]:
allLinks = gatherLinks(browser,"https://seeclickfix.com/watchers/23686?page=", 119)

In [7]:
len(allLinks) # number of total links collected

1426

In [8]:
allLinks[:3] # subset of links 

['https://seeclickfix.com/issues/1089017-roads-and-sidewalks-rues-et-trottoirs',
 'https://seeclickfix.com/issues/3343034-roads-and-sidewalks-rues-et-trottoirs',
 'https://seeclickfix.com/issues/1105853-municipal-buildings-batiments-municipaux']

In [100]:
def gatherCommentInfo(driver,link):
    driver.get(link) # pull up page 
    
    commentDict = {}
    try: 
        title = driver.find_element_by_xpath("//span[@itemprop = 'name']").text
        location = driver.find_element_by_xpath("//div[@class = 'tagline']").text
        locationClean = location.split('•')[0].strip() # get rid of unnecessary text in location info
    
        description = driver.find_element_by_xpath("//div[@itemprop = 'articleBody']").text
        descriptionClean = description.strip('DESCRIPTION').strip()
        infoBox = driver.find_elements_by_class_name("txt-block")
        infoItems = ['Issue ID', 'Submitted To', 'Category', 'Viewed', 'Neighborhood', 'Reported via']
        for i in range(len(infoBox)):
            lineInfo = infoBox[i].text.split(':')

            category = lineInfo[0].strip()
            infoClean = lineInfo[-1].strip()
            if category in infoItems: 
                commentDict[category] = infoClean
        
        for item in infoItems:
            if item not in commentDict:
                commentDict[item] = None
        
        dateEl = driver.find_element_by_xpath("//div[@class = 'txt-block']/time")
        commentDict['Date'] = dateEl.get_attribute("datetime")
        commentDict['Title'] = title
        commentDict['Location'] = locationClean
        commentDict['Description'] = descriptionClean
        
    except:
        print('Error collecting information for ' + str(link))
    
    return commentDict    
    

In [102]:
# example of a data collected for one complaint
comment = gatherCommentInfo(browser,allLinks[5])
comment

{'Issue ID': '3605026',
 'Submitted To': 'Côte Saint Luc',
 'Category': 'Roads and sidewalks • Rues et trottoirs',
 'Viewed': '228 times',
 'Neighborhood': 'Côte-Saint-Luc',
 'Reported via': 'mobile application',
 'Date': '2017-08-02 13:26:54',
 'Title': 'Roads and sidewalks • Rues et trottoirs',
 'Location': '5581 Rosedale H4V 2J3',
 'Description': 'Two people have already tripped and injured themselves at night on these broken sidewalks. Thanks in advance!'}

In [103]:
# all of the data items that will be collected
comment.keys()

dict_keys(['Issue ID', 'Submitted To', 'Category', 'Viewed', 'Neighborhood', 'Reported via', 'Date', 'Title', 'Location', 'Description'])

In [88]:
import csv

In [104]:
# assign header columns
headerList = ['Issue ID', 'Submitted To', 'Category', 'Viewed', 'Neighborhood', 
              'Reported via', 'Date', 'Title', 'Location', 'Description']
  
# open CSV file and assign header
with open("seeClickFixData.csv", 'w') as file:
    dw = csv.DictWriter(file, delimiter=',', 
                        fieldnames=headerList)
    dw.writeheader()

In [95]:
def append_dict_as_row(file_name, elem_dict):
    # Open file in append mode
    with open(file_name, 'a+', newline='') as write_obj:
        # Create a writer object from csv module
        w = csv.DictWriter(write_obj, elem_dict.keys())
        w.writerow(elem_dict) # 2

In [96]:
def gatherAllComments(driver,listOfLinks): 
    for commentLink in listOfLinks:
        commentInfo = gatherCommentInfo(driver, commentLink)
        append_dict_as_row('seeClickFixData.csv', commentInfo)

In [140]:
gatherAllComments(browser,allLinks[1100:])

In [128]:
df = pd.read_csv('seeClickFixData.csv')
df.shape

(1425, 10)

In [129]:
df.head()

Unnamed: 0,Issue ID,Submitted To,Category,Viewed,Neighborhood,Reported via,Date,Title,Location,Description
0,1089017,Côte Saint Luc,Roads and sidewalks • Rues et trottoirs,462 times,Côte-Saint-Luc,//www.cotesaintluc.org,2014-05-22 17:10:20,Roads and sidewalks • Rues et trottoirs,"6826 The Avenue Côte Saint-Luc, Quebec",The sidewalk/curb that extends from bike path ...
1,3343034,Côte Saint Luc,Roads and sidewalks • Rues et trottoirs,1360 times,Côte-Saint-Luc,//www.cotesaintluc.org,2017-04-16 22:02:52,Roads and sidewalks • Rues et trottoirs,"Ch Emerson Côte Saint-Luc, Québec","Today is day 43, I am beginning to fear my com..."
2,1105853,Côte Saint Luc,Municipal buildings • Bâtiments municipaux,700 times,Côte-Saint-Luc,mobile application,2014-06-03 23:42:45,Municipal buildings • Bâtiments municipaux,"7500 Chemin Mackle Côte Saint-Luc, QC H4W 1A6,...",Chairs too close to railing. Kids could climb ...
3,1515685,Côte Saint Luc,Waste • Matières résiduelles,2016 times,Côte-Saint-Luc,,2015-03-06 15:21:10,Waste • Matières résiduelles,"5752 Av :Lockwood Cote Saint Luc , Quebec",THE COLOR OF A BIN SHOULD NOT MATTER. THURSDAY...
4,1118476,Côte Saint Luc,Other • Autre,404 times,Côte-Saint-Luc,//www.cotesaintluc.org,2014-06-11 15:30:06,BBQ ON BALCONIES,"7030 Kildare Rd Côte Saint-Luc, Quebec",Is the by-law forbidding BBQing on balconies s...


In [130]:
dfClean = df.copy() # create a copy that will contained clean data

In [131]:
# identify entries where no description is provided 
noDescription = df['Description'].apply(lambda x: 'No description provided' in x) 

In [135]:
# substitute all entries that have value no description with None

dfClean.loc[noDescription,'Description'] = None

In [142]:
# gcheck out the different categories 
dfClean['Category'].unique()

array(['Roads and sidewalks • Rues et trottoirs',
       'Municipal buildings • Bâtiments municipaux',
       'Waste • Matières résiduelles', 'Other • Autre',
       'Traffic lights and signs • Feux de circulation & signalisation',
       'Snow Removal • Déneigement', 'Parks • Parcs',
       'Streetlight • Lampadaire', 'Trees and grass • Arbres et gazon',
       'Graffiti', 'Grass • Gazon', 'None', 'Trees • Arbres'],
      dtype=object)

In [144]:
topic = dfClean['Category'].apply(lambda x: x.split('•')[0].strip())
topic[:5] # cleaned topics including only English names 

0    Roads and sidewalks
1    Roads and sidewalks
2    Municipal buildings
3                  Waste
4                  Other
Name: Category, dtype: object

In [145]:
# create a cleaned topic column
dfClean['Topic'] = topic
dfClean = dfClean.drop(columns=['Category'])

In [146]:
dfClean.head()

Unnamed: 0,Issue ID,Submitted To,Viewed,Neighborhood,Reported via,Date,Title,Location,Description,Topic
0,1089017,Côte Saint Luc,462 times,Côte-Saint-Luc,//www.cotesaintluc.org,2014-05-22 17:10:20,Roads and sidewalks • Rues et trottoirs,"6826 The Avenue Côte Saint-Luc, Quebec",The sidewalk/curb that extends from bike path ...,Roads and sidewalks
1,3343034,Côte Saint Luc,1360 times,Côte-Saint-Luc,//www.cotesaintluc.org,2017-04-16 22:02:52,Roads and sidewalks • Rues et trottoirs,"Ch Emerson Côte Saint-Luc, Québec","Today is day 43, I am beginning to fear my com...",Roads and sidewalks
2,1105853,Côte Saint Luc,700 times,Côte-Saint-Luc,mobile application,2014-06-03 23:42:45,Municipal buildings • Bâtiments municipaux,"7500 Chemin Mackle Côte Saint-Luc, QC H4W 1A6,...",Chairs too close to railing. Kids could climb ...,Municipal buildings
3,1515685,Côte Saint Luc,2016 times,Côte-Saint-Luc,,2015-03-06 15:21:10,Waste • Matières résiduelles,"5752 Av :Lockwood Cote Saint Luc , Quebec",THE COLOR OF A BIN SHOULD NOT MATTER. THURSDAY...,Waste
4,1118476,Côte Saint Luc,404 times,Côte-Saint-Luc,//www.cotesaintluc.org,2014-06-11 15:30:06,BBQ ON BALCONIES,"7030 Kildare Rd Côte Saint-Luc, Quebec",Is the by-law forbidding BBQing on balconies s...,Other


In [151]:
def cleanReportedVia(entry):
    if type(entry) != float: # only strip from strings
        return entry.strip('//')
    else:
        return entry

In [153]:
cleanedReported = dfClean['Reported via'].apply(lambda x: cleanReportedVia(x))
cleanedReported[:5]

0    www.cotesaintluc.org
1    www.cotesaintluc.org
2      mobile application
3                     NaN
4    www.cotesaintluc.org
Name: Reported via, dtype: object

In [154]:
# clean the reported via column
dfClean = dfClean.drop(columns=['Reported via'])
dfClean['Reported via'] = cleanedReported

In [155]:
dfClean.head() # newly cleaned dataframe 

Unnamed: 0,Issue ID,Submitted To,Viewed,Neighborhood,Date,Title,Location,Description,Topic,Reported via
0,1089017,Côte Saint Luc,462 times,Côte-Saint-Luc,2014-05-22 17:10:20,Roads and sidewalks • Rues et trottoirs,"6826 The Avenue Côte Saint-Luc, Quebec",The sidewalk/curb that extends from bike path ...,Roads and sidewalks,www.cotesaintluc.org
1,3343034,Côte Saint Luc,1360 times,Côte-Saint-Luc,2017-04-16 22:02:52,Roads and sidewalks • Rues et trottoirs,"Ch Emerson Côte Saint-Luc, Québec","Today is day 43, I am beginning to fear my com...",Roads and sidewalks,www.cotesaintluc.org
2,1105853,Côte Saint Luc,700 times,Côte-Saint-Luc,2014-06-03 23:42:45,Municipal buildings • Bâtiments municipaux,"7500 Chemin Mackle Côte Saint-Luc, QC H4W 1A6,...",Chairs too close to railing. Kids could climb ...,Municipal buildings,mobile application
3,1515685,Côte Saint Luc,2016 times,Côte-Saint-Luc,2015-03-06 15:21:10,Waste • Matières résiduelles,"5752 Av :Lockwood Cote Saint Luc , Quebec",THE COLOR OF A BIN SHOULD NOT MATTER. THURSDAY...,Waste,
4,1118476,Côte Saint Luc,404 times,Côte-Saint-Luc,2014-06-11 15:30:06,BBQ ON BALCONIES,"7030 Kildare Rd Côte Saint-Luc, Quebec",Is the by-law forbidding BBQing on balconies s...,Other,www.cotesaintluc.org


In [157]:
dfClean.to_csv('seeClickFixDataClean.csv') # save cleaned dataframe to csv

## Collecting more SeeClickFix data
In addition to the data collected above. 

In [220]:
def gatherMoreLinks(driver): 
    '''Clicks through multiple pages'''
    links = []
    continueClicking = True
    
    pages = driver.find_element_by_xpath("//div[@class = 'scf-flex scf-flex-justify-between ember-view']").text
    commentsTotal = pages.split('\n')[0].split('of') # grab only number of items showing and total number of items
    print('totcom', commentsTotal)
    totalNumComments = commentsTotal[-1].strip() # total number of comments
    
    while continueClicking:
        time.sleep(5) # sleep time to allow for loading of page 
        hrefs = driver.find_elements_by_xpath(
            "//a[@class = 'ember-view scf-flex scf-flex-align-center scf-color-primary-dark scf-no-underline']")
        for href in hrefs: # gather individual links on one page and add them to a list
            links.append(href.get_attribute("href"))
        
        next_button = driver.find_element_by_xpath("//button[@class = 'scf-pagination--button paginate-next']")
        next_button.click()
        
        # gather the number of page one is at
        currentPage = driver.find_element_by_xpath("//div[@class = 'scf-flex scf-flex-justify-between ember-view']").text
        currentComments = currentPage.split('\n')[0].split('of') # grab only number of items showing
        currentCommentCount = currentComments[0].split()[-1] # number of comment currently showing
        
        #print('totalnum', totalNumComments, 'curr', currentCommentCount) 
        if  totalNumComments == currentCommentCount:
            continueClicking = False
    return links 

In [219]:
# get the link where more reviews are listed

browser.get('https://seeclickfix.com/web_portal/2k1ssvSpae6TaFxzH6eMTSUr/issues/map?lat=45.466188626180674&lng=-73.71551555581392&max_lat=45.794339630460705&max_lng=-72.93548583984376&min_lat=45.13555516012536&min_lng=-74.49554443359376&status=open%2Cacknowledged%2Cclosed%2Carchived&zoom=10')

In [237]:
# gather the links for each comment
moreLinks = gatherMoreLinks(browser)

In [222]:
moreLinks[:3]

['https://seeclickfix.com/web_portal/2k1ssvSpae6TaFxzH6eMTSUr/issues/map/10112914?lat=45.466188626180674&lng=-73.71551555581392&max_lat=45.70905627558719&max_lng=-73.16619873046876&min_lat=45.222677199620094&min_lng=-74.26483154296876&status=open%2Cacknowledged%2Cclosed%2Carchived&zoom=10',
 'https://seeclickfix.com/web_portal/2k1ssvSpae6TaFxzH6eMTSUr/issues/map/9733691?lat=45.466188626180674&lng=-73.71551555581392&max_lat=45.70905627558719&max_lng=-73.16619873046876&min_lat=45.222677199620094&min_lng=-74.26483154296876&status=open%2Cacknowledged%2Cclosed%2Carchived&zoom=10',
 'https://seeclickfix.com/web_portal/2k1ssvSpae6TaFxzH6eMTSUr/issues/map/8625095?lat=45.466188626180674&lng=-73.71551555581392&max_lat=45.70905627558719&max_lng=-73.16619873046876&min_lat=45.222677199620094&min_lng=-74.26483154296876&status=open%2Cacknowledged%2Cclosed%2Carchived&zoom=10']

In [223]:
with open("links.txt", 'w') as f:
    for l in moreLinks:
        f.write(str(l) + '\n')

In [244]:
def gatherComplaintInfo(driver, link): 
    
    complaintDict= dict()
    
    driver.get(link)
    time.sleep(5) # allow page to load
    itemList = ['Title', 'Location', 'Description', 'Date']
    elementList = [
        "//a[@class = 'ember-view scf-flex scf-flex-align-center scf-color-primary-dark scf-no-underline']",
        "//p[@class = 'scf-c-issue-header__truncated-item scf-flex-item scf-mb-none scf-color-mono-dark']",
        "//p[@class = 'scf-mb-base scf-add-line-breaks']","//time[@class = 'scf-capitalize scf-inline-block']"
    ]
    for i in range(len(itemList)):
        try: 
            if itemList[i] == 'Location': 
                location = driver.find_element_by_xpath(elementList[i]).text
                info = location.split('\n')[1]
            elif itemList[i] != 'Date': 
                info = driver.find_element_by_xpath(elementList[i]).text 
            else:
                info = driver.find_element_by_xpath(elementList[i]).get_attribute("datetime")
            complaintDict[itemList[i]] = info
        except:
            complaintDict[itemList[i]] = None
    
    return complaintDict
    

In [235]:
gatherComplaintInfo(browser, moreLinks[0])

{'Title': 'Needs full-depth repair.',
 'Location': '4539-4551 4e Rue Laval, Québec, H7W 2K1, CAN',
 'Description': None,
 'Date': '2021-06-11T11:07:40-04:00'}

In [236]:
def gatherMoreComments(driver,listOfLinks): 
    for commentLink in listOfLinks:
        commentInfo = gatherComplaintInfo(driver, commentLink)
        append_dict_as_row('moreSeeClickFixData.csv', commentInfo)

In [238]:
# assign header columns
headerList2 = ['Title', 'Location', 'Description', 'Date']
  
# open CSV file and assign header
with open("moreSeeClickFixData.csv", 'w') as file:
    dw = csv.DictWriter(file, delimiter=',', 
                        fieldnames=headerList2)
    dw.writeheader()

In [None]:
gatherMoreComments(browser,moreLinks[700:1000])

In [239]:
len(moreLinks)

2108