In [1]:
#Interactive visualization of semiconductor manufacturing plants in the world

import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from geopy.geocoders import Bing


####### Functions

# Function to manipulate text
def text_manipulation(text):
    """Return the text cut at its first '[' with surrounding whitespace removed.

    Everything from the first '[' onwards (for example a reference marker
    such as ``[12]``) is discarded, then leading and trailing whitespace,
    including new lines, is stripped from what remains.
    """
    before_reference, _, _ = text.partition('[')
    return before_reference.strip()

# Function to download contents of a webpage via BeautifulSoup and requests
def download_webpage(url, timeout=30):
    """Returns HTML parsed contents of a webpage

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on an unresponsive server.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in 'html.parser'.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, cannot connect, or the server answers
        with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on HTTP errors instead of parsing an error page and failing
    # later with a confusing "NoneType has no attribute ..." message.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

# Function to prepend "https://en.wikipedia.org" to relative wiki page links
def weblink_manipulation(link):
    """Returns full webpage wikipedia address if link is a wiki page

    Relative links (e.g. ``/wiki/Intel``) are resolved against
    ``https://en.wikipedia.org``. Links that already carry a scheme are
    returned unchanged, and protocol-relative links (``//host/path``) keep
    their own host and only gain the ``https:`` scheme.

    Parameters
    ----------
    link : str
        The ``href`` value taken from an anchor tag.

    Returns
    -------
    str
        An absolute URL.
    """
    from urllib.parse import urljoin

    # urljoin inspects the URL structure rather than searching for the
    # substring 'http', so a relative path such as '/wiki/http_server' is
    # still resolved against the wikipedia domain.
    return urljoin("https://en.wikipedia.org", link)
    
# Function to get geographical co-ordinates for semiconductor plant locations from geopy library
def geo_coordinates(semi_plant_location, api_key=None):
    """Returns latitude and longitude of the provided semiconductor plant location.

    Parameters
    ----------
    semi_plant_location : str
        Free-text location passed to the Bing geocoder.
    api_key : str, optional
        Bing Maps API key. When omitted, the ``BING_MAPS_API_KEY`` environment
        variable is used; if that is not set either, the placeholder string
        below is used, so the key can still be entered directly in the code.

    Returns
    -------
    list or None
        ``[latitude, longitude]`` when the geocoder finds the location,
        ``None`` when nothing is found or the geocoding service times out.
    """
    import os
    # The name must be in scope for the ``except`` clause below; without this
    # import any exception in the try block surfaces as a NameError instead.
    from geopy.exc import GeocoderTimedOut

    if api_key is None:
        api_key = os.environ.get('BING_MAPS_API_KEY', 'ENTER YOUR API KEY HERE') #### <----- Enter bing maps api key
    geolocator = Bing(api_key=api_key)
    try:
        location = geolocator.geocode(semi_plant_location, timeout=10)
    except GeocoderTimedOut:  # To avoid service timeout error breaking the execution
        return None
    if location is None:
        return None
    return [location.latitude, location.longitude]

# Function to get geographical border co-ordinates of the world from webscraped JSON file
#### Below code adapted from https://teamtreehouse.com/library/plotting-the-world with minor modification
def world_border_coordinates(features):
    """Collect border outlines, one entry per shape, from GeoJSON-style features.

    Each feature is read through ``feature['geometry']['coordinates']`` and
    ``feature['properties']['name']``. When the coordinates are nested three
    lists deep the feature contributes a single border taken from
    ``coordinates[0]``; otherwise every element of ``coordinates`` is treated
    as a separate shape and its first element is used.

    Returns
    -------
    list
        ``[country_id, longitudes, latitudes]`` - three parallel lists. For
        each border, ``country_id`` holds the feature name while
        ``longitudes`` / ``latitudes`` hold column 0 / column 1 of the border
        points as single-precision numpy arrays.
    """
    def nesting_depth(obj):
        # Anything that is not a list counts as depth 0; a list is one level
        # deeper than its most deeply nested item.
        if not isinstance(obj, list):
            return 0
        return max(nesting_depth(item) for item in obj) + 1

    country_id, longitudes, latitudes = [], [], []

    for feature in features:
        coordinates = feature['geometry']['coordinates']
        # Depth 3 means one border: wrap it so both cases share the loop below.
        if nesting_depth(coordinates) == 3:
            shapes = [coordinates]
        else:
            shapes = coordinates
        for shape in shapes:
            country_id.append(feature['properties']['name'])
            border = np.array(shape[0], dtype='f')
            longitudes.append(border[:, 0])
            latitudes.append(border[:, 1])

    return [country_id, longitudes, latitudes]
#### Above code adapted from https://teamtreehouse.com/library/plotting-the-world with minor modification

In [2]:
################ Data Wrangling ###############

# Fetch and parse the Wikipedia list of semiconductor fabrication plants
page_cont = download_webpage(
    url="https://en.wikipedia.org/wiki/List_of_semiconductor_fabrication_plants"
)


In [3]:
######### Getting contents from webscraped beautifulsoup object ###########

# Column names for the dataframe: a leading "webpage" column (filled later from
# the link in each row's first cell) followed by the header cells of the first
# table on the page.
# Header text goes through text_manipulation so that trailing new lines or
# reference markers do not end up in the column names; later cells look columns
# up by exact name (e.g. df['Plant location']).
col_names = ["webpage"] + [
    text_manipulation(header.text) for header in page_cont.table.select('tr th')
]



In [4]:
# Get contents of the webscraped table into a dictionary:
# row index -> [webpage link, cleaned text of cell 1, cleaned text of cell 2, ...]
table_rows = page_cont.table.find_all('tr')[1:]  # skip the header row
test_dict = {}

for index, row in enumerate(table_rows):
    cells = row.find_all('td')
    # The link is only taken when it is a direct child of the row's first cell.
    # Rows without any <td>, cells without such a link, and links without an
    # href all fall back to "NA" instead of raising.
    first_link = cells[0].find('a', recursive=False) if cells else None
    href = first_link.get('href') if first_link is not None else None
    webpage = weblink_manipulation(href) if href is not None else "NA"
    # copy all the contents of the table row into the dictionary
    test_dict[index] = [webpage] + [text_manipulation(cell.text) for cell in cells]

In [5]:
# Build the dataframe (one row per dictionary key) and replace missing cells
# with empty strings for better analysis
df = pd.DataFrame.from_dict(test_dict, orient='index').fillna('')
df.columns = col_names

In [6]:
# Look up the geo co-ordinates of every distinct, non-empty plant location
co_ord = {}  # dictionary: plant location -> result of geo_coordinates
plant_locations = [location for location in df['Plant location'].unique() if location]
for plant_location in plant_locations:
    co_ord[plant_location] = geo_coordinates(plant_location)

TypeError: __init__() missing 1 required positional argument: 'api_key'

In [103]:
# Create new co-ordinate columns in the dataframe and populate them from co_ord.
# np.nan is used because the np.NAN alias was removed in NumPy 2.0.
df = df.assign(Latitude = np.nan, Longitude = np.nan)

for plant_location, coordinates in co_ord.items():
    # Skip empty location names and locations the geocoder could not resolve
    if not plant_location or coordinates is None:
        continue
    latitude, longitude = coordinates
    same_location = df['Plant location'] == plant_location
    df.loc[same_location, 'Latitude'] = latitude
    df.loc[same_location, 'Longitude'] = longitude

In [104]:
# Download world border coordinates JSON file
# The file is fetched from raw.githubusercontent.com because the rawgit.com
# service that originally served it has been retired.

import requests
countries_response = requests.get(
    'https://raw.githubusercontent.com/johan/world.geo.json/master/countries.geo.json',
    timeout=30,
)
countries_response.raise_for_status()  # fail here rather than on a non-JSON error page
countries = countries_response.json()
country_coord = world_border_coordinates(countries['features'])

In [105]:
# Data visualization via bokeh

# NOTE: TapTool, row and TextInput are imported but not used in this cell.
from bokeh.plotting import figure, output_file, show
from bokeh.models import Range1d
from bokeh.models import HoverTool, ColumnDataSource, TapTool
from bokeh.layouts import row
from bokeh.models.widgets import TextInput

output_file("world-semi-manufacture-companies.html")

p = figure(width = 1200, height=650, title='Semiconductor fabrication companies in the world', background_fill_color = '#9ecae1')

# Display the world map: country_coord[1] / country_coord[2] hold one array of
# longitudes / latitudes per border, so all borders are drawn with one call.
p.patches(xs = country_coord[1], ys = country_coord[2], line_color = 'grey')

# Set the range of longitude and latitude
p.x_range = Range1d(start = -180, end = 180)
p.y_range = Range1d(start = -70, end = 85)

# Assigning source for plotting

source = ColumnDataSource(data={'Longitude' : df['Longitude'], 'Latitude' : df['Latitude'], 'Company': df['Company'],
'Plant name': df['Plant name']})

# scatter() with the circle marker replaces circle(size=...), which recent
# Bokeh releases deprecate.
plant_markers = p.scatter(x = 'Longitude', y = 'Latitude', marker = 'circle', fill_color = '#feb24c', size = 10, source = source)

# Attach the hover tool to the plant markers only; otherwise it also applies to
# the country outlines, which have no Company / Plant name columns to display.
p.add_tools(HoverTool(renderers=[plant_markers], tooltips=[('Company','@Company'),('Plant name', '@{Plant name}')], mode='mouse'))

show(p)

