#### Importing required packages

In [1]:
# We will using Scrapy to scrape the data from the web
import scrapy
import re
from scrapy.crawler import CrawlerProcess

#### Spider class to scrape info about cities

In [2]:
class CitiesSpider(scrapy.Spider):   # our class inherits from scrapy.Spider
    name = "cities"
    
    def start_requests(self):
        urls = ["https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population"] # list to enter our urls

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)  # we will explain the callback soon
    
    def parse(self, response):
        
        city_rows = response.css('table:nth-of-type(5) > tbody > tr')
        
        for city_row in city_rows:
            city_row = city_row.css('td')
            
            if city_row:
                city_details = city_row.css('*:not(sup):not([href^="#cite"]):not(.geo-dec):not(.geo):not(.fn):not(.latitude):not(.longitude)::text').extract()
            
                if city_details:
                    # Latitude and Longitude
                    coordinates = city_row.css('span.geo::text').extract_first()
                    latitude = str(coordinates).split(';')[0]
                    longitude = coordinates.split(';')[1]
                    city_details.append(latitude)
                    city_details.append(longitude)

                    # State Capital and Largest City
                    background_color = city_row.css('::attr(style)').re_first('background-color:#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})')
                    is_state_capital = 'N'
                    is_state_largest_city = 'N'

                    if background_color == 'ccff99':
                        is_state_capital = 'Y'
                        is_state_largest_city = 'Y'

                    if background_color == 'ffff99':
                        is_state_capital = 'Y'

                    if background_color == 'cfecec':
                        is_state_largest_city = 'Y'

                    city_details.append(is_state_capital)
                    city_details.append(is_state_largest_city)
                    
                    # There may be cities with the same name in different states
                    # Creating the key for city using city name followed by its latitude and longitude
                    # 1st field extracted is city name
                    city = city_details[1] + "_" + latitude + "_" + longitude
                     
                    cities[city] = city_details
                    
                    href = city_row.css('a[href^="/wiki/"]::attr(href)').extract_first().strip()
                    
                    res = response.follow(url=href, callback=self.parse_individual_page)
                    res.meta['city'] = city
                    yield res
    
    def parse_individual_page(self, response):
        city = response.meta.get('city')
        time_zone = response.css('table.infobox > tbody > tr > td > a[title^="UTC"]::text').extract_first()
        cities[city].append(time_zone)

#### Function to clean the scraped data

In [3]:
def clean_data(cities):
    # Cleaning Data
    for city, city_details in cities.items():
        # Replacing non-breaking space with a space
        city_details = [city_detail.replace(u'\xa0', u' ') for city_detail in city_details]
                    
        # Replacing unicode minus sign \u2212 with ascii minus sign
        city_details = [city_detail.replace('\u2212', '-') for city_detail in city_details]
                    
        # Removing unnecessary characters like comma (,)
        city_details = [re.sub(r'[^a-zA-Z0-9\s.\+\-]', '', city_detail) for city_detail in city_details]
        
        city_details = [city_detail.strip() for city_detail in city_details]
        city_details = [city_detail for city_detail in city_details if city_detail]
        
        # Removing units from the data fields
        units = ['km', 'mi', 'sq mi']
        regexp = re.compile(r'(\d+)(.+)(\d+)\s*(%s)\b' % '|'.join(units))
        city_details = [re.sub(regexp, r'\1\2\3', city_detail) for city_detail in city_details]
        
        cities[city] = city_details

#### Function to write a csv file using the cleaned data

In [4]:
def create_csv(cities):
    #Creating CSV file
    filename = 'cities.csv'

    header = ['Rank', 'City', 'State', '2018_Population_Estimate', '2010_Population_Census', 'Change_Population', '2016_Land_Area_sq_mi', '2016_Land_Area_sq_km', '2016_Population_Density_per_sq_mi', '2016_Population_Density_per_sq_km', 'Latitude', 'Longitude', 'Is_State_Capital', 'Is_State_Largest', 'Time Zone']

    with open(filename, 'w') as f:
        # Header
        for header_field in header:
            f.write(header_field + ",")
        f.write("\n")

        # Data
        for city_details in cities.values():
            for city_detail in city_details:
                f.write(city_detail + ",")
            f.write("\n")

#### Running the spider, cleaning the scraped data and creating csv

In [5]:
# Dictonary that will hold the list of details corresponding to a city ('City Name' is the key)
cities = dict()

process = CrawlerProcess()
process.crawl(CitiesSpider)
process.start()

# Cleaning the data that has been scraped
clean_data(cities)
# Creating csv file with the cleaned data
create_csv(cities)

2019-05-29 15:16:33 [scrapy.utils.log] INFO: Scrapy 1.5.2 started (bot: scrapybot)
2019-05-29 15:16:33 [scrapy.utils.log] INFO: Versions: lxml 4.3.2.0, libxml2 2.9.9, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.2.0, Python 3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1b  26 Feb 2019), cryptography 2.6.1, Platform Windows-10-10.0.17134-SP0
2019-05-29 15:16:33 [scrapy.crawler] INFO: Overridden settings: {}
2019-05-29 15:16:33 [scrapy.extensions.telnet] INFO: Telnet Password: 5c67f014c61aef7d
2019-05-29 15:16:33 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2019-05-29 15:16:33 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defa

2019-05-29 15:16:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Fort_Collins,_Colorado> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Shreveport,_Louisiana> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Cary,_North_Carolina> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Jackson,_Mississippi> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Springfield,_Missouri> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cit

2019-05-29 15:16:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Stamford,_Connecticut> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/New_Haven,_Connecticut> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Columbia,_South_Carolina> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Elizabeth,_New_Jersey> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Topeka,_Kansas> (referer: https://en.wikipedia.org/wiki/List_of_United_States_citie

2019-05-29 15:16:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/League_City,_Texas> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Meridian,_Idaho> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Tyler,_Texas> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Rialto,_California> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/West_Covina,_California> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)

2019-05-29 15:16:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Costa_Mesa,_California> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Lowell,_Massachusetts> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Murrieta,_California> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/El_Monte,_California> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Carlsbad,_California> (referer: https://en.wikipedia.org/wiki/List_of_United_States_citi

2019-05-29 15:16:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Hayward,_California> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Elk_Grove,_California> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Salem,_Oregon> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Alexandria,_Virginia> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Oceanside,_California> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_pop

2019-05-29 15:16:42 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Boise,_Idaho> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:42 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Fremont,_California> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:42 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Irving,_Texas> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:42 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Garland,_Texas> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:42 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Chesapeake,_Virginia> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05

2019-05-29 15:16:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Anaheim,_California> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:44 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Bakersfield,_California> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:44 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Arlington,_Texas> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:44 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Cleveland> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
2019-05-29 15:16:44 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/Wichita,_Kansas> (referer: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
201

2019-05-29 15:16:46 [scrapy.core.engine] INFO: Spider closed (finished)
