In [1]:
import pandas as pd
import numpy as np
from bokeh.io import output_notebook, output_file
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models.tools import HoverTool

In [2]:
#Load the dataset
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
# Consider reading from a configurable data directory instead.
df_load = pd.read_csv(r"C:\Users\bpraf_000\Downloads\database.csv")
#Preview the data
df_load.head()

Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,...,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status
0,01/02/1965,13:44:18,19.246,145.616,Earthquake,131.6,,,6.0,MW,...,,,,,,ISCGEM860706,ISCGEM,ISCGEM,ISCGEM,Automatic
1,01/04/1965,11:29:49,1.863,127.352,Earthquake,80.0,,,5.8,MW,...,,,,,,ISCGEM860737,ISCGEM,ISCGEM,ISCGEM,Automatic
2,01/05/1965,18:05:58,-20.579,-173.972,Earthquake,20.0,,,6.2,MW,...,,,,,,ISCGEM860762,ISCGEM,ISCGEM,ISCGEM,Automatic
3,01/08/1965,18:49:43,-59.076,-23.557,Earthquake,15.0,,,5.8,MW,...,,,,,,ISCGEM860856,ISCGEM,ISCGEM,ISCGEM,Automatic
4,01/09/1965,13:32:50,11.938,126.427,Earthquake,15.0,,,5.8,MW,...,,,,,,ISCGEM860890,ISCGEM,ISCGEM,ISCGEM,Automatic


In [3]:
#Create a years field and add it to the dataframe
# Three rows are removed by index label before the year is parsed
# (presumably rows whose Date is not in MM/DD/YYYY form; TODO confirm).
# errors='ignore' keeps the cell re-runnable: on a second run the labels are
# already gone and a plain drop() would raise KeyError.
df_load = df_load.drop([3378, 7512, 20650], errors='ignore')
# Date is the first data column (what iloc[:, 0] selected); the year is the
# third '/'-separated field of each date string.
df_load['Year'] = df_load['Date'].str.split('/').str[2].astype('int64')
df_load.head()

Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,...,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status,Year
0,01/02/1965,13:44:18,19.246,145.616,Earthquake,131.6,,,6.0,MW,...,,,,,ISCGEM860706,ISCGEM,ISCGEM,ISCGEM,Automatic,1965
1,01/04/1965,11:29:49,1.863,127.352,Earthquake,80.0,,,5.8,MW,...,,,,,ISCGEM860737,ISCGEM,ISCGEM,ISCGEM,Automatic,1965
2,01/05/1965,18:05:58,-20.579,-173.972,Earthquake,20.0,,,6.2,MW,...,,,,,ISCGEM860762,ISCGEM,ISCGEM,ISCGEM,Automatic,1965
3,01/08/1965,18:49:43,-59.076,-23.557,Earthquake,15.0,,,5.8,MW,...,,,,,ISCGEM860856,ISCGEM,ISCGEM,ISCGEM,Automatic,1965
4,01/09/1965,13:32:50,11.938,126.427,Earthquake,15.0,,,5.8,MW,...,,,,,ISCGEM860890,ISCGEM,ISCGEM,ISCGEM,Automatic,1965


In [4]:
#Create a list of year values
# One entry per distinct value of the Year column
lst_years = list(df_load['Year'].unique())
# Starts empty; filled with one record count per year by the counting cell below
count_years = []
#Preview list of years
lst_years

[1965,
 1966,
 1967,
 1968,
 1969,
 1970,
 1971,
 1972,
 1973,
 1974,
 1975,
 1976,
 1977,
 1978,
 1979,
 1980,
 1981,
 1982,
 1983,
 1984,
 1985,
 1986,
 1987,
 1988,
 1989,
 1990,
 1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016]

In [5]:
#Count the number of records in the dataframe for each year in the list of years
# The list is rebuilt from scratch instead of appended to, so re-running this
# cell cannot accumulate duplicate counts (which would make count_years longer
# than lst_years and break the frequency dataframe).
count_years = [int((df_load['Year'] == year).sum()) for year in lst_years]

#Preview count_years
count_years

[339,
 234,
 255,
 305,
 323,
 345,
 386,
 388,
 401,
 361,
 411,
 457,
 425,
 410,
 356,
 348,
 321,
 346,
 453,
 482,
 475,
 485,
 505,
 489,
 480,
 528,
 429,
 533,
 466,
 508,
 591,
 541,
 456,
 388,
 446,
 553,
 443,
 444,
 485,
 571,
 533,
 508,
 608,
 508,
 517,
 560,
 712,
 445,
 461,
 480,
 446,
 469]

In [13]:
#Assemble the per-year frequency table: one row per year with its record count
df_quake_freq = pd.DataFrame(dict(Years=lst_years, Counts=count_years))
#Show the resulting table
df_quake_freq

Unnamed: 0,Years,Counts
0,1965,339
1,1966,234
2,1967,255
3,1968,305
4,1969,323
5,1970,345
6,1971,386
7,1972,388
8,1973,401
9,1974,361


In [14]:
#Convert dataframe into ColumnDataSource for Bokeh
source_freq = ColumnDataSource(df_quake_freq)

#Create lists from columndatasource
# Plain Python lists of the Years and Counts columns; the plot functions below read these
years_list = source_freq.data['Years'].tolist()
counts_list = source_freq.data['Counts'].tolist()

# Prints only the object's repr (see output), not the underlying data
print(source_freq)

ColumnDataSource(id='1116', ...)


In [26]:
#Define the style of our plots using a custom style function
def style(p, legend_location='top_left'):
    """Apply the notebook's shared look to a figure and return it.

    The figure is modified in place: centred 20pt serif title, bold 14pt axis
    labels, 12pt tick labels, and the legend moved to ``legend_location``.

    Parameters
    ----------
    p : object exposing ``title``, ``xaxis``, ``yaxis`` and ``legend``
        The figure to style.
    legend_location : str, optional
        Value assigned to ``p.legend.location``. Defaults to ``'top_left'``,
        which is what this function always used before the parameter existed.

    Returns
    -------
    The same object ``p`` that was passed in.
    """
    #Title
    p.title.align = 'center'
    p.title.text_font_size = '20pt'
    p.title.text_font = 'serif'

    #Axis titles
    p.xaxis.axis_label_text_font_size = '14pt'
    p.xaxis.axis_label_text_font_style = 'bold'
    p.yaxis.axis_label_text_font_size = '14pt'
    p.yaxis.axis_label_text_font_style = 'bold'

    #Tick labels
    p.xaxis.major_label_text_font_size = '12pt'
    p.yaxis.major_label_text_font_size = '12pt'

    #Legend
    p.legend.location = legend_location

    return p

In [45]:
#Create the Barchart
def plotBar():
    """Build the yearly earthquake-count bar chart and return the figure.

    Nothing is displayed here; the caller decides how to show the figure.
    Reads the notebook-level ``years_list``, ``counts_list`` and
    ``df_quake_freq`` and finishes with the shared ``style`` helper.
    """
    #One row per year: the year and the number of records in it
    bar_source = ColumnDataSource(data={'yrs': years_list, 'numQuakes': counts_list})

    #Hover text
    hover_rows = [
        ("Year", " @yrs"),
        ("Number of earthquakes", " @numQuakes")
    ]

    #Leave 100 counts of headroom above the tallest bar
    y_top = df_quake_freq['Counts'].max() + 100

    fig_options = dict(
        title='Frequency of Earthquakes by Year',
        plot_height=400,
        plot_width=1000,
        x_axis_label='Years',
        y_axis_label='Number of Occurances',
        x_minor_ticks=2,
        y_range=(0, y_top),
        toolbar_location=None,
        tooltips=hover_rows,
    )
    bars = figure(**fig_options)

    #One bar per year, rising from zero to that year's count
    bars.vbar(x='yrs', top='numQuakes', bottom=0,
              width=0.75, color='#009999',
              legend='Year', source=bar_source)

    #Apply the shared styling and hand the figure back
    return style(bars)

#plotBar()



In [46]:
#Create the line chart
def plotLine():
    """Build the yearly earthquake-count trend line and return the figure.

    Nothing is displayed here. Reads the notebook-level ``years_list``,
    ``counts_list`` and ``df_quake_freq`` and finishes with ``style``.
    """
    #One row per year: the year and the number of records in it
    trend_source = ColumnDataSource(data={'yrs': years_list, 'numQuakes': counts_list})

    #Hover text
    hover_rows = [
        ("Year", " @yrs"),
        ("Number of earthquakes", " @numQuakes")
    ]

    #Y axis runs from zero to 100 above the busiest year
    y_top = df_quake_freq['Counts'].max() + 100

    fig_options = dict(
        title='Earthquakes Trend by Year',
        plot_width=800,
        plot_height=400,
        x_axis_label='Years',
        y_axis_label='Number of Occurances',
        x_minor_ticks=2,
        y_range=(0, y_top),
        toolbar_location=None,
        tooltips=hover_rows,
    )
    trend = figure(**fig_options)

    #The line and its per-year markers share one colour
    teal = '#009999'
    trend.line(x='yrs', y='numQuakes', color=teal, line_width=2,
               legend='Yearly Trend', source=trend_source)
    trend.circle(x='yrs', y='numQuakes', color=teal, size=8,
                 fill_color='white', source=trend_source)

    return style(trend)

#plotLine()
   

In [27]:
#Define the style of our donut using a custom style function
def style2(p):
    """Style a figure like ``style`` does, but with the legend at the top right.

    Every title, axis-label and tick-label setting is identical to ``style``,
    so this delegates to it instead of repeating those assignments and then
    overrides only the legend corner.

    Parameters
    ----------
    p : figure to style; modified in place.

    Returns
    -------
    The same figure ``p``.
    """
    p = style(p)

    #Legend: the only setting that differs from style()
    p.legend.location = 'top_right'

    return p

In [47]:
import math
from math import pi
from bokeh.palettes import Category20c
from bokeh.transform import cumsum

#Create donut chart
def plotDonut():
    """Build a donut chart of record counts per event ``Type`` and return the figure.

    Nothing is displayed here. Reads the notebook-level ``df_load`` and
    finishes with the ``style2`` helper.
    """
    #grab all the types of occurrences and their count
    x = dict(df_load['Type'].value_counts())

    #convert the dict to a dataframe with one row per type
    pie_data = pd.Series(x).reset_index(name='value').rename(columns={'index':'type'})

    #add angles: each type's share of the full circle
    pie_data['angle'] = pie_data['value']/pie_data['value'].sum() * 2*pi

    #add colors: Category20c has no palette for fewer than 3 entries, so
    #Category20c[len(x)] raised KeyError for 1 or 2 types. Take the smallest
    #palette that is large enough and keep only as many colors as there are types.
    n_types = len(x)
    pie_data['color'] = list(Category20c[max(n_types, 3)][:n_types])

    #Create figure
    p = figure(title='Types of Earthquakes (1965-2016)',
              plot_height=400,
              toolbar_location=None,
              tools='hover',
              tooltips='@type: @value',
              x_range=(-0.5, 1.0))

    #add the donut chart: each wedge starts where the previous one ended
    p.annular_wedge(x=0, y=1, inner_radius=0.2, outer_radius=0.35,
                   start_angle=cumsum('angle', include_zero=True),
                   end_angle=cumsum('angle'), line_color='white',
                   fill_color='color', legend='type', source=pie_data)

    #a donut has no meaningful axes or grid
    p.axis.axis_label=None
    p.axis.visible=False
    p.grid.grid_line_color=None

    p = style2(p)

    return p
    
    
#plotDonut()


In [48]:
#Create a magnitude plot
def plotMagnitude():
    """Plot the maximum and average magnitude per year and return the figure.

    Nothing is displayed here. Side effect: adds (or overwrites) the
    ``Magnitude``, ``Depth`` and ``Max_Magnitude`` columns of the
    notebook-level ``df_quake_freq`` with the per-year mean magnitude, mean
    depth and maximum magnitude computed from ``df_load``.
    """
    #Per-year aggregates. groupby returns them sorted by year, so each result
    #is reindexed to the row order of df_quake_freq before it is assigned;
    #a positional assignment would silently mislabel years if that order
    #ever differed from sorted order. pandas' mean() skips missing values.
    year_order = df_quake_freq['Years'].values
    by_year = df_load.groupby('Year')

    #average earthquake magnitude for each year
    df_quake_freq['Magnitude'] = by_year['Magnitude'].mean().reindex(year_order).values

    #average depth for each year
    df_quake_freq['Depth'] = by_year['Depth'].mean().reindex(year_order).values

    #maximum earthquake magnitude for each year
    df_quake_freq['Max_Magnitude'] = by_year['Magnitude'].max().reindex(year_order).values

    #load datasource
    #(the original line read `cds = ColumnDataSource=dict(...)`, a chained
    #assignment that shadowed the class with a dict instead of calling it)
    cds = ColumnDataSource(data=dict(
        yrs = years_list,
        avg_mag = df_quake_freq['Magnitude'].values.tolist(),
        max_mag = df_quake_freq['Max_Magnitude'].values.tolist()
    ))

    #create tooltips
    TOOLTIPS = [
        ("Year", " @yrs"),
        ("Average Magnitude", " @avg_mag"),
        ("Maximum Magnitude", " @max_mag")
    ]

    #create figure; the y axis starts at 5 and ends one unit above the largest yearly maximum
    mp = figure(title='Maximum and Average Magnitude by Year',
               plot_width=800,
               plot_height=500,
               x_axis_label='Years',
               y_axis_label='Magnitude',
               x_minor_ticks=2,
               y_range=(5, df_quake_freq['Max_Magnitude'].max() + 1),
               toolbar_location=None,
               tooltips=TOOLTIPS)

    #create max magnitude line and circles at year marks
    mp.line(x='yrs', y='max_mag', color='#009999', line_width=2, legend='Max Magnitude', source=cds)
    mp.circle(x='yrs', y='max_mag', color='#009999', size=8, fill_color='#009999', source=cds)

    #create average mag line and circles
    mp.line(x='yrs', y='avg_mag', color='orange', line_width=2, legend='Avg Magnitude', source=cds)
    mp.circle(x='yrs', y='avg_mag', color='orange', size=8, fill_color='orange', source=cds)

    #style the graph
    mp = style(mp)

    return mp

#plotMagnitude()


In [49]:
from bokeh.tile_providers import CARTODBPOSITRON, STAMEN_TERRAIN
from bokeh.themes import built_in_themes
from bokeh.io import curdoc


#Create Geo Map Plot
def _to_web_mercator(lon_deg, lat_deg):
    """Project one longitude/latitude pair in degrees to Mercator x/y for the tile map."""
    r_major = 6378137.000
    #Constant x-scale per degree. The original derived it as x / lon, which
    #raised ZeroDivisionError for any record at longitude 0.
    scale = r_major * math.pi / 180.0
    x = scale * lon_deg
    y = 180.0/math.pi * math.log(math.tan(math.pi/4.0 +
        lat_deg * (math.pi/180.0)/2.0)) * scale
    return x, y


def plotMap():
    """Plot every record of ``df_load`` on a tile map and return the figure.

    Nothing is displayed here. Side effect: adds (or overwrites) the
    ``coords_x`` and ``coords_y`` columns of the notebook-level ``df_load``
    with the projected coordinates.
    """
    #pull lats and lons out of dataset
    lat = df_load['Latitude'].values.tolist()
    lon = df_load['Longitude'].values.tolist()

    #convert lat and lons into Mercator projection for the Bokeh library
    projected = [_to_web_mercator(lon_deg, lat_deg) for lon_deg, lat_deg in zip(lon, lat)]

    df_load['coords_x'] = [x for x, _ in projected]
    df_load['coords_y'] = [y for _, y in projected]

    longs = df_load['coords_x'].tolist()
    lats = df_load['coords_y'].tolist()
    mags = df_load['Magnitude'].tolist()

    #Create datasource
    cds = ColumnDataSource(data=dict(
        lat=lats,
        lon=longs,
        mag=mags
    ))

    #Tooltip
    TOOLTIPS = [
        ("Magnitude", " @mag")
    ]

    #Create figure
    p = figure(title='Earthquake Map',
              plot_width=1000,
              plot_height=500,
              x_range=(-2000000, 6000000),
              y_range=(-1000000, 7000000),
              tooltips=TOOLTIPS)

    #background map tiles
    p.add_tile(CARTODBPOSITRON)

    #one marker per record
    p.circle(x='lon', y='lat', fill_color='#009999', fill_alpha=0.8, source=cds, legend='Quakes 1965-2016')

    #style the map plot
    #Title
    p.title.align = 'center'
    p.title.text_font_size = '20pt'
    p.title.text_font = 'serif'

    #Legend: dark semi-transparent box; clicking an entry hides its markers
    p.legend.location = 'bottom_right'
    p.legend.background_fill_color = 'black'
    p.legend.background_fill_alpha = 0.8
    p.legend.click_policy = 'hide'
    p.legend.label_text_color = 'white'

    #projected coordinates are not meaningful to read, so hide axes and grid
    p.xaxis.visible=False
    p.yaxis.visible=False
    p.axis.axis_label=None
    p.axis.visible=False
    p.grid.grid_line_color=None

    return p

#plotMap()
    



In [50]:
#Create grid plot
from bokeh.layouts import gridplot

#Write the dashboard to a standalone HTML file
output_file('dashboard.html')

#Top row: map, magnitude trend, event-type donut. Bottom row: bar and line charts.
top_row = [plotMap(), plotMagnitude(), plotDonut()]
bottom_row = [plotBar(), plotLine()]
grid = gridplot([top_row, bottom_row])

#show the plot
show(grid)



In [51]:
#Load the 2017 records used later as out-of-sample test data
df_test = pd.read_csv(r'C:\Users\bpraf_000\Downloads\earthquakeTest.csv')

#Columns of the historical data that are not carried into the training set
unused_columns = [
    'Depth Error', 'Time', 'Depth Seismic Stations', 'Magnitude Error',
    'Magnitude Seismic Stations', 'Azimuthal Gap', 'Horizontal Distance',
    'Horizontal Error', 'Root Mean Square', 'Source', 'Location Source',
    'Magnitude Source', 'Status',
]
df_train = df_load.drop(columns=unused_columns)

#preview training dataset
df_train.head()


Unnamed: 0,Date,Latitude,Longitude,Type,Depth,Magnitude,Magnitude Type,ID,Year,coords_x,coords_y
0,01/02/1965,19.246,145.616,Earthquake,131.6,6.0,MW,ISCGEM860706,1965,16209900.0,2183920.0
1,01/04/1965,1.863,127.352,Earthquake,80.0,5.8,MW,ISCGEM860737,1965,14176760.0,207424.8
2,01/05/1965,-20.579,-173.972,Earthquake,20.0,6.2,MW,ISCGEM860762,1965,-19366470.0,-2341749.0
3,01/08/1965,-59.076,-23.557,Earthquake,15.0,5.8,MW,ISCGEM860856,1965,-2622353.0,-8196832.0
4,01/09/1965,11.938,126.427,Earthquake,15.0,5.8,MW,ISCGEM860890,1965,14073790.0,1338653.0


In [52]:
#preview test dataset
# First rows of the raw test frame, before any columns are selected or renamed
df_test.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,...,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,2017-01-01T00:04:56.020Z,32.98,-115.545833,11.5,2.68,ml,41.0,77.0,0.06553,0.26,...,2017-02-08T21:33:00.874Z,"2km W of Brawley, CA",earthquake,0.24,0.46,0.196,64.0,reviewed,ci,ci
1,2017-01-01T00:13:25.380Z,2.8327,127.5786,78.93,5.0,mb,,101.0,2.058,0.75,...,2017-03-27T23:53:16.040Z,"131km NNW of Tobelo, Indonesia",earthquake,6.8,7.1,0.065,75.0,reviewed,us,us
2,2017-01-01T00:22:02.820Z,32.973,-115.5505,9.4,2.65,ml,42.0,75.0,0.07023,0.24,...,2017-02-08T21:36:24.950Z,"2km WSW of Brawley, CA",earthquake,0.23,0.61,0.198,76.0,reviewed,ci,ci
3,2017-01-01T00:23:53.890Z,-5.9497,153.8988,10.0,4.1,mb,,185.0,2.457,0.32,...,2017-03-27T23:53:16.040Z,"180km WNW of Panguna, Papua New Guinea",earthquake,7.5,1.9,0.184,8.0,reviewed,us,us
4,2017-01-01T00:45:57.980Z,-2.9302,139.4328,49.25,4.1,mb,,132.0,7.174,0.9,...,2017-03-27T23:53:16.040Z,"132km WSW of Abepura, Indonesia",earthquake,13.5,8.4,0.166,10.0,reviewed,us,us


In [53]:
# List every column of the raw test set before choosing which ones to keep
df_test.columns

Index(['time', 'latitude', 'longitude', 'depth', 'mag', 'magType', 'nst',
       'gap', 'dmin', 'rms', 'net', 'id', 'updated', 'place', 'type',
       'horizontalError', 'depthError', 'magError', 'magNst', 'status',
       'locationSource', 'magSource'],
      dtype='object')

In [54]:
#take the fields we need and drop the rest
# Kept: timestamp, location, magnitude and depth
df_test_clean = df_test[['time', 'latitude', 'longitude', 'mag', 'depth']]

In [55]:
# Preview the reduced test set (original lower-case column names)
df_test_clean.head()

Unnamed: 0,time,latitude,longitude,mag,depth
0,2017-01-01T00:04:56.020Z,32.98,-115.545833,2.68,11.5
1,2017-01-01T00:13:25.380Z,2.8327,127.5786,5.0,78.93
2,2017-01-01T00:22:02.820Z,32.973,-115.5505,2.65,9.4
3,2017-01-01T00:23:53.890Z,-5.9497,153.8988,4.1,10.0
4,2017-01-01T00:45:57.980Z,-2.9302,139.4328,4.1,49.25


In [56]:
#rename the fields
#Training set: replace the space in 'Magnitude Type' with an underscore
df_train = df_train.rename(columns={'Magnitude Type': 'Magnitude_Type'})

#Test set: map its lower-case names onto the training set's column names
test_column_names = {
    'time': 'Date',
    'latitude': 'Latitude',
    'longitude': 'Longitude',
    'mag': 'Magnitude',
    'depth': 'Depth',
}
df_test_clean = df_test_clean.rename(columns=test_column_names)

#preview dataframes
df_train.head()

Unnamed: 0,Date,Latitude,Longitude,Type,Depth,Magnitude,Magnitude_Type,ID,Year,coords_x,coords_y
0,01/02/1965,19.246,145.616,Earthquake,131.6,6.0,MW,ISCGEM860706,1965,16209900.0,2183920.0
1,01/04/1965,1.863,127.352,Earthquake,80.0,5.8,MW,ISCGEM860737,1965,14176760.0,207424.8
2,01/05/1965,-20.579,-173.972,Earthquake,20.0,6.2,MW,ISCGEM860762,1965,-19366470.0,-2341749.0
3,01/08/1965,-59.076,-23.557,Earthquake,15.0,5.8,MW,ISCGEM860856,1965,-2622353.0,-8196832.0
4,01/09/1965,11.938,126.427,Earthquake,15.0,5.8,MW,ISCGEM860890,1965,14073790.0,1338653.0


In [57]:
# Preview the test set after its columns were renamed
df_test_clean.head()

Unnamed: 0,Date,Latitude,Longitude,Magnitude,Depth
0,2017-01-01T00:04:56.020Z,32.98,-115.545833,2.68,11.5
1,2017-01-01T00:13:25.380Z,2.8327,127.5786,5.0,78.93
2,2017-01-01T00:22:02.820Z,32.973,-115.5505,2.65,9.4
3,2017-01-01T00:23:53.890Z,-5.9497,153.8988,4.1,10.0
4,2017-01-01T00:45:57.980Z,-2.9302,139.4328,4.1,49.25


In [58]:
#create training and testing dataframes with same columns
#Both frames are cut down to the same four columns, in the same order
model_columns = ['Latitude', 'Longitude', 'Magnitude', 'Depth']
df_testing = df_test_clean[model_columns]
df_training = df_train[model_columns]

#preview dataframes
df_testing.head()


Unnamed: 0,Latitude,Longitude,Magnitude,Depth
0,32.98,-115.545833,2.68,11.5
1,2.8327,127.5786,5.0,78.93
2,32.973,-115.5505,2.65,9.4
3,-5.9497,153.8988,4.1,10.0
4,-2.9302,139.4328,4.1,49.25


In [59]:
# Preview the training frame restricted to the four model columns
df_training.head()

Unnamed: 0,Latitude,Longitude,Magnitude,Depth
0,19.246,145.616,6.0,131.6
1,1.863,127.352,5.8,80.0
2,-20.579,-173.972,6.2,20.0
3,-59.076,-23.557,5.8,15.0
4,11.938,126.427,5.8,15.0


In [60]:
#Drop all null values contained within both dataframes to avoid issues in model
# dropna() returns a new frame and leaves the original untouched, so the
# result has to be assigned back; the bare calls removed nothing.
df_training = df_training.dropna()
df_testing = df_testing.dropna()

# Display the cleaned test set (the frame this cell displayed before)
df_testing


Unnamed: 0,Latitude,Longitude,Magnitude,Depth
0,32.980000,-115.545833,2.68,11.500
1,2.832700,127.578600,5.00,78.930
2,32.973000,-115.550500,2.65,9.400
3,-5.949700,153.898800,4.10,10.000
4,-2.930200,139.432800,4.10,49.250
...,...,...,...,...
19995,-21.459800,168.774000,4.30,10.000
19996,35.239500,-97.745300,2.60,6.364
19997,42.139833,-121.692667,2.58,6.880
19998,67.461600,-158.713600,2.80,6.500


In [61]:
#create training features for Machine Learning model
# Inputs: location only
X = df_training[['Latitude', 'Longitude']]
# Targets: two outputs per row, in this column order (magnitude first, depth second)
y = df_training[['Magnitude', 'Depth']]


In [62]:
#create testing features for model
# Same input/target column layout as X and y, taken from the separate test file
X_new = df_testing[['Latitude', 'Longitude']]
y_new = df_testing[['Magnitude', 'Depth']]

In [63]:
#import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


#Use train_test_split to split our training data into train and test datasets
# random_state is fixed, presumably so the split is identical on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) #20% testing & 80% training


In [65]:
#Create a model
model_reg = RandomForestRegressor(random_state=50)
#Train the model
model_reg.fit(X_train, y_train)
#Predict y_test (magnitude and depth) using X_test features (lat and lon)
results = model_reg.predict(X_test)
#Score the model on the held-out split, scaled to a percentage
# NOTE(review): for scikit-learn regressors score() is documented as R^2 rather
# than classification accuracy; confirm and relabel the printed value if so.
score = model_reg.score(X_test, y_test) * 100

print(score)



86.80160970029225




In [67]:
#preview predicted earthquakes
# Each row holds two predicted values, in the column order of y (magnitude, depth)
results

array([[  5.85 ,  56.96 ],
       [  5.82 ,  26.57 ],
       [  5.52 ,  32.121],
       ...,
       [  6.1  ,  31.89 ],
       [  5.68 , 589.37 ],
       [  6.53 ,  27.96 ]])

In [68]:
#improve the model accuracy by automating hyperparameter tuning
# Candidate values for n_estimators; this dict is handed to GridSearchCV in the next cell
parameters = {'n_estimators': [10, 20, 50, 100, 200, 500]}

In [69]:
#create gridsearchcv model
# Wraps model_reg together with the candidate grid defined in `parameters`
grid_obj = GridSearchCV(model_reg, parameters)
#train the model
# NOTE(review): six n_estimators values (up to 500 trees) are fitted here; likely the slowest cell
grid_fit = grid_obj.fit(X_train, y_train)
#Select the best fitted model
best_fit = grid_fit.best_estimator_






In [70]:
# Re-predict the held-out split with the tuned model (overwrites the earlier `results`)
results = best_fit.predict(X_test)
#check the model score on the same held-out split, as a percentage
score = best_fit.score(X_test, y_test) * 100
print(score)

87.68697774753744




In [71]:
#validate accuracy of model with the new data (out of sample)
#predict the earthquakes in 2017
# X_new / y_new come from the separate test file, not from the train/test split
final_results = best_fit.predict(X_new)
#check model score on the new data, as a percentage
final_score = best_fit.score(X_new, y_new) * 100

print(final_score)

82.30454596176573




In [73]:
#store the prediction results in lists
# Each prediction row is [magnitude, depth], the column order of the targets in y.
# Iterating the rows directly replaces the unused loop variable and manual counter.
lst_Magnitudes = [row[0] for row in final_results]
lst_Depth = [row[1] for row in final_results]
    

In [74]:
#create prediction dataframe
# .copy() makes df_results an independent frame. Adding columns to a bare
# selection of X_new triggers pandas' SettingWithCopyWarning because the
# selection may still be tied to the frame it was taken from.
df_results = X_new[['Latitude', 'Longitude']].copy()
df_results['Magnitude'] = lst_Magnitudes
df_results['Depth'] = lst_Depth
# The same overall score and year are repeated on every row
df_results['Score'] = final_score
df_results['Year'] = 2017

In [75]:
#preview prediction dataset
# Latitude/Longitude are the inputs; Magnitude/Depth are the model's predictions
df_results.head()

Unnamed: 0,Latitude,Longitude,Magnitude,Depth,Score,Year
0,32.98,-115.545833,5.80424,11.394514,82.304546,2017
1,2.8327,127.5786,5.9424,57.8042,82.304546,2017
2,32.973,-115.5505,5.80424,11.394514,82.304546,2017
3,-5.9497,153.8988,5.897,47.3562,82.304546,2017
4,-2.9302,139.4328,5.7686,62.8152,82.304546,2017


In [81]:
import pandas as pd
import numpy as np
from bokeh.io import output_notebook, output_file
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models.tools import HoverTool # Allows interactivity with the graphs
import math
from math import pi
from bokeh.palettes import Category20c
from bokeh.transform import cumsum
from bokeh.tile_providers import CARTODBPOSITRON, STAMEN_TERRAIN
from bokeh.themes import built_in_themes
from bokeh.io import curdoc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV # Improve accuracy

#Read the historical earthquake records
df_load = pd.read_csv(r"C:\Users\bpraf_000\Downloads\database.csv")

#Remove three rows by index label, then derive the year from the first column:
#it is the third '/'-separated field of each date string
df_load = df_load.drop([3378, 7512, 20650])
df_load['Year'] = df_load.iloc[:, 0].map(lambda date_text: int(date_text.split('/')[2]))

#Distinct years, in the order they occur in the Year column
lst_years = list(df_load['Year'].unique())

#Number of records per year, in the same order as lst_years
count_years = [len(df_load[df_load['Year'] == year]) for year in lst_years]

#Frequency table: one row per year with its record count
df_quake_freq = pd.DataFrame(dict(Years=lst_years, Counts=count_years))

#Wrap the table for Bokeh, then pull both columns back out as plain lists
source_freq = ColumnDataSource(df_quake_freq)
years_list, counts_list = (source_freq.data[column].tolist() for column in ('Years', 'Counts'))

#Send the visuals to a standalone HTML file
output_file('dashboard.html')

#Use the dark theme for the document
curdoc().theme = 'dark_minimal'

#create custom style functions to design our plots
def style(p):
    """Apply the shared plot styling to ``p`` in place and return it.

    Sets a centred 20pt serif title, bold 14pt axis labels and 12pt tick
    labels on both axes, and places the legend at the top left.
    """
    #Title
    title = p.title
    title.align = 'center'
    title.text_font_size = '20pt'
    title.text_font = 'serif'

    #Axis titles and tick labels: identical settings on the x and y axes
    for axis_name in ('xaxis', 'yaxis'):
        axis = getattr(p, axis_name)
        axis.axis_label_text_font_size = '14pt'
        axis.axis_label_text_font_style = 'bold'
        axis.major_label_text_font_size = '12pt'

    #Legend
    p.legend.location = 'top_left'

    return p

#Define the style of our donut using a custom style function
def style2(p):
    """Apply the donut chart's styling to ``p`` in place and return it.

    Same title, axis-label and tick-label settings as the other plots, with
    the legend placed at the top right.
    """
    #(part of the figure, attribute, value), applied in order
    settings = (
        #Title
        ('title', 'align', 'center'),
        ('title', 'text_font_size', '20pt'),
        ('title', 'text_font', 'serif'),
        #Axis titles
        ('xaxis', 'axis_label_text_font_size', '14pt'),
        ('xaxis', 'axis_label_text_font_style', 'bold'),
        ('yaxis', 'axis_label_text_font_size', '14pt'),
        ('yaxis', 'axis_label_text_font_style', 'bold'),
        #Tick labels
        ('xaxis', 'major_label_text_font_size', '12pt'),
        ('yaxis', 'major_label_text_font_size', '12pt'),
        #Legend
        ('legend', 'location', 'top_right'),
    )
    for part, attribute, value in settings:
        setattr(getattr(p, part), attribute, value)

    return p


#Create the bar chart
def plotBar():
    """Return a bar chart of the number of records per year (not shown here).

    Uses the notebook-level ``years_list``, ``counts_list`` and
    ``df_quake_freq`` plus the shared ``style`` helper.
    """
    #Data behind the bars: one entry per year
    columns = {'yrs': years_list, 'numQuakes': counts_list}
    source = ColumnDataSource(data=columns)

    #Text shown when hovering a bar
    hover_text = [
        ("Year", " @yrs"),
        ("Number of earthquakes", " @numQuakes")
    ]

    #The y axis ends 100 above the largest yearly count
    highest = df_quake_freq['Counts'].max()

    chart = figure(
        title='Frequency of Earthquakes by Year',
        plot_height=400,
        plot_width=1000,
        x_axis_label='Years',
        y_axis_label='Number of Occurances',
        x_minor_ticks=2,
        y_range=(0, highest + 100),
        toolbar_location=None,
        tooltips=hover_text,
    )

    #Draw the bars from the baseline up to each year's count
    chart.vbar(
        x='yrs', bottom=0, top='numQuakes',
        color='#009999', width=0.75,
        legend='Year', source=source,
    )

    #Shared styling, then return
    chart = style(chart)
    return chart


#Create Line Chart
def plotLine():
    """Return a line chart of the number of records per year (not shown here).

    Uses the notebook-level ``years_list``, ``counts_list`` and
    ``df_quake_freq`` plus the shared ``style`` helper.
    """
    #Data behind the line: one entry per year
    columns = {'yrs': years_list, 'numQuakes': counts_list}
    source = ColumnDataSource(data=columns)

    #Text shown when hovering a point
    hover_text = [
        ("Year", " @yrs"),
        ("Number of earthquakes", " @numQuakes")
    ]

    #The y axis ends 100 above the largest yearly count
    highest = df_quake_freq['Counts'].max()

    chart = figure(
        title='Earthquakes Trend by Year',
        plot_width=800,
        plot_height=400,
        x_axis_label='Years',
        y_axis_label='Number of Occurances',
        x_minor_ticks=2,
        y_range=(0, highest + 100),
        toolbar_location=None,
        tooltips=hover_text,
    )

    #Trend line first, then a white-filled marker on each year
    shared = dict(x='yrs', y='numQuakes', color='#009999', source=source)
    chart.line(line_width=2, legend='Yearly Trend', **shared)
    chart.circle(size=8, fill_color='white', **shared)

    chart = style(chart)
    return chart


#Create the donut chart
def plotDonut():
    """Build a donut chart of record counts per event ``Type`` and return the figure.

    Nothing is displayed here. Reads the notebook-level ``df_load`` and
    finishes with the ``style2`` helper.
    """
    #grab all the types of occurrences and their count
    x = dict(df_load['Type'].value_counts())

    #convert the dict to a dataframe with one row per type
    pie_data = pd.Series(x).reset_index(name='value').rename(columns={'index':'type'})

    #add angles: each type's share of the full circle
    pie_data['angle'] = pie_data['value']/pie_data['value'].sum() * 2*pi

    #add colors: Category20c has no palette for fewer than 3 entries, so
    #Category20c[len(x)] raised KeyError for 1 or 2 types. Take the smallest
    #palette that is large enough and keep only as many colors as there are types.
    n_types = len(x)
    pie_data['color'] = list(Category20c[max(n_types, 3)][:n_types])

    #Create figure
    p = figure(title='Types of Earthquakes (1965-2016)',
              plot_height=400,
              toolbar_location=None,
              tools='hover',
              tooltips='@type: @value',
              x_range=(-0.5, 1.0))

    #add the donut chart: each wedge starts where the previous one ended
    p.annular_wedge(x=0, y=1, inner_radius=0.2, outer_radius=0.35,
                   start_angle=cumsum('angle', include_zero=True),
                   end_angle=cumsum('angle'), line_color='white',
                   fill_color='color', legend='type', source=pie_data)

    #a donut has no meaningful axes or grid
    p.axis.axis_label=None
    p.axis.visible=False
    p.grid.grid_line_color=None

    p = style2(p)

    return p


#Create the magnitude plot with predictions
def plotMagnitude():
    """Plot yearly maximum/average magnitude plus the 2017 predictions; return the figure.

    Nothing is displayed here. Side effects on notebook-level frames:
    ``df_quake_freq`` gains (or has overwritten) ``Magnitude``, ``Depth`` and
    ``Max_Magnitude``; ``df_results`` gains ``Mean_Magnitude`` and
    ``Max_Magnitude``, each holding one value repeated on every row.
    """
    #Per-year aggregates. groupby returns them sorted by year, so each result
    #is reindexed to the row order of df_quake_freq before it is assigned;
    #a positional assignment would silently mislabel years if that order
    #ever differed from sorted order. pandas' mean() skips missing values.
    year_order = df_quake_freq['Years'].values
    by_year = df_load.groupby('Year')

    #average earthquake magnitude for each year
    df_quake_freq['Magnitude'] = by_year['Magnitude'].mean().reindex(year_order).values

    #average depth for each year
    df_quake_freq['Depth'] = by_year['Depth'].mean().reindex(year_order).values

    #maximum earthquake magnitude for each year
    df_quake_freq['Max_Magnitude'] = by_year['Magnitude'].max().reindex(year_order).values

    #Average and maximum predicted magnitude for the year 2017
    pred_mean = float(df_results['Magnitude'].mean())
    pred_max = float(df_results['Magnitude'].max())
    df_results['Mean_Magnitude'] = pred_mean
    df_results['Max_Magnitude'] = pred_max

    #load datasource
    cds = ColumnDataSource(data=dict(
        yrs = years_list,
        avg_mag = df_quake_freq['Magnitude'].values.tolist(),
        max_mag = df_quake_freq['Max_Magnitude'].values.tolist()
    ))

    #Prediction source: a single 2017 point. The original passed the full
    #df_results columns here, pairing a one-element `yrs` with columns as
    #long as df_results; every column of a data source must have the same length.
    pred_cds = ColumnDataSource(data=dict(
        yrs = [2017],
        avg_mag = [pred_mean],
        max_mag = [pred_max]
    ))

    #Tooltip
    TOOLTIPS = [
        ("Year", " @yrs"),
        ("Average Magnitude", " @avg_mag"),
        ("Maximum Magnitude", " @max_mag")
    ]

    #create figure; the y axis starts at 5 and ends one unit above the largest yearly maximum
    mp = figure(title='Maximum and Average Magnitude by Year',
               plot_width=800,
               plot_height=500,
               x_axis_label='Years',
               y_axis_label='Magnitude',
               x_minor_ticks=2,
               y_range=(5, df_quake_freq['Max_Magnitude'].max() + 1),
               toolbar_location=None,
               tooltips=TOOLTIPS)

    #create max magnitude line and circles at year marks
    mp.line(x='yrs', y='max_mag', color='#009999', line_width=2, legend='Max Magnitude', source=cds)
    mp.circle(x='yrs', y='max_mag', color='#009999', size=8, fill_color='#009999', source=cds)

    #create average mag line and circles
    mp.line(x='yrs', y='avg_mag', color='orange', line_width=2, legend='Avg Magnitude', source=cds)
    mp.circle(x='yrs', y='avg_mag', color='orange', size=8, fill_color='orange', source=cds)

    #create predicted max magnitude line and circles at year marks
    mp.line(x='yrs', y='max_mag', color='#ccff33', line_width=2, legend='Predicted Max/Avg Magnitude', source=pred_cds)
    mp.circle(x='yrs', y='max_mag', color='#ccff33', size=8, fill_color='#ccff33', source=pred_cds)

    #create predicted average mag line and circles
    mp.line(x='yrs', y='avg_mag', color='#ccff33', line_width=2, source=pred_cds)
    mp.circle(x='yrs', y='avg_mag', color='#ccff33', size=8, fill_color='#ccff33', source=pred_cds)

    #style the graph
    mp = style(mp)

    return mp
    

#Create Geo Map Plot including predicted points
def _to_web_mercator(lon_deg, lat_deg):
    """Project one longitude/latitude pair in degrees to Mercator x/y for the tile map."""
    r_major = 6378137.000
    #Constant x-scale per degree. The original derived it as x / lon, which
    #raised ZeroDivisionError for any record at longitude 0.
    scale = r_major * math.pi / 180.0
    x = scale * lon_deg
    y = 180.0/math.pi * math.log(math.tan(math.pi/4.0 +
        lat_deg * (math.pi/180.0)/2.0)) * scale
    return x, y


def plotMap():
    """Plot historical and predicted quakes on a tile map and return the figure.

    Nothing is displayed here. Side effect: adds (or overwrites) the
    ``coords_x`` and ``coords_y`` columns of the notebook-level ``df_load``
    and ``df_results`` with the projected coordinates.
    """
    #pull lats and lons out of dataset
    lat = df_load['Latitude'].values.tolist()
    lon = df_load['Longitude'].values.tolist()

    #predicted lats and lons out of dataset
    pred_lat = df_results['Latitude'].values.tolist()
    pred_lon = df_results['Longitude'].values.tolist()

    #convert lat and lons into Mercator projection for the Bokeh library
    projected = [_to_web_mercator(lon_deg, lat_deg) for lon_deg, lat_deg in zip(lon, lat)]
    pred_projected = [_to_web_mercator(lon_deg, lat_deg) for lon_deg, lat_deg in zip(pred_lon, pred_lat)]

    df_load['coords_x'] = [x for x, _ in projected]
    df_load['coords_y'] = [y for _, y in projected]

    df_results['coords_x'] = [x for x, _ in pred_projected]
    df_results['coords_y'] = [y for _, y in pred_projected]

    #Create datasources
    cds = ColumnDataSource(data=dict(
        lat=df_load['coords_y'].tolist(),
        lon=df_load['coords_x'].tolist(),
        mag=df_load['Magnitude'].tolist(),
        year=df_load['Year'].tolist()
    ))

    pred_cds = ColumnDataSource(data=dict(
        pred_lat=df_results['coords_y'].tolist(),
        pred_long=df_results['coords_x'].tolist(),
        pred_mag=df_results['Magnitude'].tolist(),
        year=df_results['Year'].tolist()
    ))

    #Create figure (hover tools are attached per layer further down)
    p = figure(title='Earthquake Map',
              plot_width=1000,
              plot_height=500,
              x_range=(-2000000, 6000000),
              y_range=(-1000000, 7000000))

    #create map
    p.add_tile(CARTODBPOSITRON)

    #create circles, sized by magnitude
    quake_layer = p.circle(x='lon', y='lat', size='mag', fill_color='#009999', fill_alpha=0.8, source=cds, legend='Quakes 1965-2016')

    #create circles for predicted earthquakes, sized by predicted magnitude
    pred_layer = p.circle(x='pred_long', y='pred_lat', size='pred_mag', fill_color='#ccff33', fill_alpha=0.8, source=pred_cds, legend='Predicted Quakes 2017')

    #Tooltips: one hover tool per layer. A single shared tooltip listed both
    #@mag and @pred_mag, but each source has only one of those columns, so
    #the other field could never resolve on hover.
    p.add_tools(
        HoverTool(renderers=[quake_layer], tooltips=[
            ("Magnitude", " @mag"),
            ("Year", " @year")
        ]),
        HoverTool(renderers=[pred_layer], tooltips=[
            ("Predicted Magnitude", " @pred_mag"),
            ("Year", " @year")
        ])
    )

    #style the map plot
    #Title
    p.title.align = 'center'
    p.title.text_font_size = '20pt'
    p.title.text_font = 'serif'

    #Legend: dark semi-transparent box; clicking an entry hides that layer
    p.legend.location = 'bottom_right'
    p.legend.background_fill_color = 'black'
    p.legend.background_fill_alpha = 0.8
    p.legend.click_policy = 'hide'
    p.legend.label_text_color = 'white'

    #projected coordinates are not meaningful to read, so hide axes and grid
    p.xaxis.visible=False
    p.yaxis.visible=False
    p.axis.axis_label=None
    p.axis.visible=False
    p.grid.grid_line_color=None

    return p
    




In [82]:
#Create grid plot
from bokeh.layouts import gridplot

#Lay the five plots out in two rows: map, magnitude and donut on top,
#bar and line charts underneath
dashboard_rows = [
    [plotMap(), plotMagnitude(), plotDonut()],
    [plotBar(), plotLine()],
]
grid = gridplot(dashboard_rows)

#show the final plot
show(grid)

