In [1]:
# Install scikit-learn and joblib from PyPI, then restart the Python process.
for pypi_package in ("scikit-learn", "joblib"):
    dbutils.library.installPyPI(pypi_package)
dbutils.library.restartPython()

In [2]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
from pyspark.sql import SQLContext
import pandas as pd
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pickle
import joblib

In [4]:
# Reuse the active Spark session if there is one, otherwise create it under app_name.
app_name = "intro-to-pyspark"
session_builder = SparkSession.builder.appName(app_name)
spark = session_builder.getOrCreate()

In [5]:
# #NOTE - We need for final demo
# # Read raw csv file and get it in spark dataframe.
# url = "/FileStore/tables/US_Accidents_May19.csv"

# df = spark.read \
#     .format("com.databricks.spark.csv") \
#     .options(header='true', inferSchema = True) \
#     .load(url)
# df.show()

In [6]:
# display(df)

In [7]:
# # Keep only the columns that need to be stored in the AWS Postgres database
# df_to_load = df.select("Severity", "TMC", "Start_Lat", "Start_Lng", "Distance(mi)", "Start_Time","End_Time", "Description", "Street", "City", "State" \
#                        ,"Zipcode", "Country", "Timezone","Weather_Timestamp", "Temperature(F)", "Wind_Chill(F)", "Humidity(%)", "Pressure(in)" \
#                        ,"Visibility(mi)", "Wind_Speed(mph)", "Precipitation(in)", "Weather_Condition")

In [8]:
# display(df_to_load)

In [9]:
# Fetch the Postgres connection secret from AWS Secrets Manager.
import os

import boto3

secret_name = "ut/secret/postgre"
region_name = "us-east-2"
# SECURITY: AWS keys must never be committed in a notebook. They are read from
# the environment instead; when the variables are unset both values are None and
# boto3 falls back to its default credential chain (instance profile, shared
# config, ...). The key pair that used to be hardcoded here must be treated as
# compromised and rotated in IAM.
access_key = os.environ.get("AWS_ACCESS_KEY_ID")
secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")

session = boto3.session.Session(aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region_name)
client = session.client('secretsmanager')
# Deliberately not echoed as the cell result: the SecretString carries the
# database password and would be persisted in the notebook output.
secret_value = client.get_secret_value(SecretId=secret_name)

In [10]:
# Get Connection
import json
def get_connection(secret_value):
  """Decode the JSON text stored under ``secret_value['SecretString']`` and return it."""
  secret_string = secret_value['SecretString']
  return json.loads(secret_string)
# Decode the secret once and keep the resulting connection settings
# (the earlier duplicate call discarded its result).
connection = get_connection(secret_value)

In [11]:
# JDBC settings for Postgres, taken from the decoded secret in `connection`.
dialect = "postgresql"
jdbcDatabase = "postgres"
jdbcHostname = connection['host']
jdbcPort = connection['port']
jdbcUsername = connection['username']
jdbcPassword = connection['password']

jdbcUrl = f"jdbc:{dialect}://{jdbcHostname}:{jdbcPort}/{jdbcDatabase}"
# Properties passed alongside the URL to Spark's JDBC reader/writer.
connectionProperties = dict(
  user=jdbcUsername,
  password=jdbcPassword,
  driver="org.postgresql.Driver",
)
# for mysql driver = com.mysql.jdbc.Driver
# jdbcUrl

In [12]:
# #NOTE - Need to comment out for Demo
# # table "US_accidents_data" contains lat, lng data in case we need it later for heat map.
# # table = "US_accidents_row"
# table = "US_accidents_data"
# mode = "overwrite" # options are: error, append, overwrite

# df_to_load.write.jdbc(jdbcUrl, table, mode, connectionProperties)

In [13]:
# # read from postgres
# df_fromPostgres = spark.read.jdbc(url = jdbcUrl, table = table, properties = connectionProperties)
# display(df_fromPostgres)

In [14]:
# # write to csv file
# df_fromPostgres.write.format("com.databricks.spark.csv").mode("overwrite").option("header", "true").save("dbfs:/FileStore/df/spark_data.csv")

In [15]:
# # print schema
# df_fromPostgres.printSchema()

In [16]:
# Load the CSV at `url` with a header row and inferred column types, then preview it.
# url = "/FileStore/df/spark.csv"
url = "/FileStore/df/spark_data.csv"

csv_reader = spark.read.format("com.databricks.spark.csv").options(header='true', inferSchema=True)
df_from_local = csv_reader.load(url)
df_from_local.show()

In [17]:
# Row count per Severity value (Severity is the label used for modelling below).
df_from_local.groupBy("Severity").count().show()

In [18]:
# Keep only the rows whose Severity is 2, 3 or 4.
severity = df_from_local["Severity"]
is_kept_severity = (severity == 2) | (severity == 3) | (severity == 4)
df_Severity = df_from_local.where(is_kept_severity)
display(df_Severity)

Severity,TMC,Start_Lat,Start_Lng,Distance(mi),Start_Time,End_Time,Description,Street,City,State,Zipcode,Country,Timezone,Weather_Timestamp,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in),Weather_Condition
3,201.0,39.865147,-84.058723,0.01,2016-02-08T05:46:00.000+0000,2016-02-08T11:00:00.000+0000,Right lane blocked due to accident on I-70 Eastbound at Exit 41 OH-235 State Route 4.,I-70 E,Dayton,OH,45424,US,US/Eastern,2016-02-08T05:58:00.000+0000,36.9,,91.0,29.68,10.0,,0.02,Light Rain
2,201.0,39.928059000000005,-82.831184,0.01,2016-02-08T06:07:59.000+0000,2016-02-08T06:37:59.000+0000,Accident on Brice Rd at Tussing Rd. Expect delays.,Brice Rd,Reynoldsburg,OH,43068-3402,US,US/Eastern,2016-02-08T05:51:00.000+0000,37.9,,100.0,29.65,10.0,,0.0,Light Rain
2,201.0,39.063148,-84.032608,0.01,2016-02-08T06:49:27.000+0000,2016-02-08T07:19:27.000+0000,Accident on OH-32 State Route 32 Westbound at Dela Palma Rd. Expect delays.,State Route 32,Williamsburg,OH,45176,US,US/Eastern,2016-02-08T06:56:00.000+0000,36.0,33.3,100.0,29.67,10.0,3.5,,Overcast
3,201.0,39.747753,-84.20558199999999,0.01,2016-02-08T07:23:34.000+0000,2016-02-08T07:53:34.000+0000,Accident on I-75 Southbound at Exits 52 52B US-35. Expect delays.,I-75 S,Dayton,OH,45417,US,US/Eastern,2016-02-08T07:38:00.000+0000,35.1,31.0,96.0,29.64,9.0,4.6,,Mostly Cloudy
2,201.0,39.627781,-84.188354,0.01,2016-02-08T07:39:07.000+0000,2016-02-08T08:09:07.000+0000,Accident on McEwen Rd at OH-725 Miamisburg Centerville Rd. Expect delays.,Miamisburg Centerville Rd,Dayton,OH,45459,US,US/Eastern,2016-02-08T07:53:00.000+0000,36.0,33.3,89.0,29.65,6.0,3.5,,Mostly Cloudy
3,201.0,40.10059,-82.92519399999999,0.01,2016-02-08T07:44:26.000+0000,2016-02-08T08:14:26.000+0000,Accident on I-270 Outerbelt Northbound near Exit 29 OH-3 State St. Expect delays.,Westerville Rd,Westerville,OH,43081,US,US/Eastern,2016-02-08T07:51:00.000+0000,37.9,35.5,97.0,29.63,7.0,3.5,0.03,Light Rain
2,201.0,39.758274,-84.23050699999999,0.0,2016-02-08T07:59:35.000+0000,2016-02-08T08:29:35.000+0000,Accident on Oakridge Dr at Woodward Ave. Expect delays.,N Woodward Ave,Dayton,OH,45417-2476,US,US/Eastern,2016-02-08T07:56:00.000+0000,34.0,31.0,100.0,29.66,7.0,3.5,,Overcast
3,201.0,39.770382,-84.194901,0.01,2016-02-08T07:59:58.000+0000,2016-02-08T08:29:58.000+0000,Accident on I-75 Southbound at Exit 54B Grand Ave. Expect delays.,N Main St,Dayton,OH,45405,US,US/Eastern,2016-02-08T07:56:00.000+0000,34.0,31.0,100.0,29.66,7.0,3.5,,Overcast
2,201.0,39.778061,-84.172005,0.0,2016-02-08T08:00:40.000+0000,2016-02-08T08:30:40.000+0000,Accident on Notre Dame Ave at Warner Ave. Expect delays.,Notre Dame Ave,Dayton,OH,45404-1923,US,US/Eastern,2016-02-08T07:58:00.000+0000,33.3,,99.0,29.67,5.0,1.2,,Mostly Cloudy
3,201.0,40.10059,-82.92519399999999,0.01,2016-02-08T08:10:04.000+0000,2016-02-08T08:40:04.000+0000,Right hand shoulder blocked due to accident on I-270 Outerbelt Westbound at Exit 29 OH-3 State St.,Westerville Rd,Westerville,OH,43081,US,US/Eastern,2016-02-08T08:28:00.000+0000,37.4,33.8,100.0,29.62,3.0,4.6,0.02,Light Rain


In [19]:
# Number of rows left after the Severity filter.
df_Severity.count()

In [20]:
# Lookup table that collapses raw Weather_Condition strings into five coarse
# categories: 'Clear', 'Cloudy', 'Low Visibility', 'Rain' and 'Slippery'.
# NOTE(review): most 'Light ...' conditions (e.g. 'Light Rain', 'Light Snow') are
# mapped to 'Clear' — confirm this is the intended grouping.
weather_dict = {
'Clear' : 'Clear',
'Overcast' : 'Cloudy',
'Mostly Cloudy' : 'Cloudy',
'Partly Cloudy' : 'Cloudy',
'Scattered Clouds' : 'Clear',
'Light Rain' : 'Clear',
'Light Snow' : 'Clear',
'Haze' : 'Low Visibility',
'Rain' : 'Rain',
'Fog' : 'Low Visibility',
'Heavy Rain' : 'Rain',
'Light Drizzle' : 'Clear',
'Light Thunderstorms and Rain' : 'Clear',
'Thunderstorm' : 'Rain',
'Snow' : 'Slippery',
'Smoke' : 'Low Visibility',
'Heavy Thunderstorms and Rain' : 'Rain',
'Thunderstorms and Rain' : 'Rain',
'Light Freezing Rain' : 'Slippery',
'Mist' : 'Low Visibility',
'Patches of Fog' : 'Low Visibility',
'Drizzle' : 'Clear',
'Heavy Snow' : 'Slippery',
'Light Freezing Fog' : 'Low Visibility',
'Shallow Fog' : 'Low Visibility',
'Light Freezing Drizzle': 'Clear',
'Blowing Snow' : 'Low Visibility',
'Light Ice Pellets' : 'Clear',
'Heavy Drizzle' : 'Rain',
'Light Rain Showers' : 'Clear',
'Rain Showers' : 'Rain',
'Ice Pellets' : 'Low Visibility',
'Squalls' : 'Low Visibility',
'Small Hail' : 'Low Visibility',
'Light Snow Showers' : 'Low Visibility',
'Volcanic Ash' : 'Low Visibility',
'Light Thunderstorms and Snow' : 'Clear',
'Sand' : 'Low Visibility',
'Funnel Cloud' : 'Low Visibility',
'Light Haze' : 'Clear',
'Heavy Thunderstorms with Small Hail' : 'Low Visibility',
'Heavy Rain Showers' : 'Rain',
'Heavy Thunderstorms and Snow' : 'Slippery',
'Low Drifting Snow' : 'Low Visibility',
'Snow Grains' : 'Slippery',
'Light Fog' : 'Clear',
'Heavy Blowing Snow' : 'Low Visibility',
'Heavy Ice Pellets' : 'Slippery',
'Light Hail' : 'Clear',
'Light Snow Grains' : 'Clear',
'Light Thunderstorm' : 'Clear',
'Light Blowing Snow' : 'Clear',
'Thunderstorms and Snow' : 'Rain',
'Hail' : 'Low Visibility',
'Heavy Freezing Rain' : 'Rain',
'Heavy Freezing Drizzle' : 'Rain',
'Snow Showers' : 'Rain',
'Blowing Sand' : 'Low Visibility',
'Dust Whirls' : 'Low Visibility',
'Heavy Smoke' : 'Low Visibility',
'Widespread Dust' : 'Low Visibility'
}


@udf(returnType=StringType())
def udf_weather_ctg(condition):
    """Map a raw Weather_Condition value to its coarse weather category.

    Returns the ``weather_dict`` entry for ``condition``. A missing (None)
    condition and a condition with no entry in ``weather_dict`` both yield
    "UNKNOWN", so the function never returns None.
    """
    if condition is None:
        return "UNKNOWN"
    # Unmapped conditions used to come back as None (a null category); use the
    # same sentinel as for a missing condition so both cases are handled alike.
    return weather_dict.get(condition, "UNKNOWN")

In [21]:
# Derive a Weather_Category column from Weather_Condition through the UDF.
weather_category = udf_weather_ctg("Weather_Condition")
df_weather_catg = df_Severity.withColumn("Weather_Category", weather_category)
display(df_weather_catg)

Severity,TMC,Start_Lat,Start_Lng,Distance(mi),Start_Time,End_Time,Description,Street,City,State,Zipcode,Country,Timezone,Weather_Timestamp,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in),Weather_Condition,Weather_Category
3,201.0,39.865147,-84.058723,0.01,2016-02-08T05:46:00.000+0000,2016-02-08T11:00:00.000+0000,Right lane blocked due to accident on I-70 Eastbound at Exit 41 OH-235 State Route 4.,I-70 E,Dayton,OH,45424,US,US/Eastern,2016-02-08T05:58:00.000+0000,36.9,,91.0,29.68,10.0,,0.02,Light Rain,Clear
2,201.0,39.928059000000005,-82.831184,0.01,2016-02-08T06:07:59.000+0000,2016-02-08T06:37:59.000+0000,Accident on Brice Rd at Tussing Rd. Expect delays.,Brice Rd,Reynoldsburg,OH,43068-3402,US,US/Eastern,2016-02-08T05:51:00.000+0000,37.9,,100.0,29.65,10.0,,0.0,Light Rain,Clear
2,201.0,39.063148,-84.032608,0.01,2016-02-08T06:49:27.000+0000,2016-02-08T07:19:27.000+0000,Accident on OH-32 State Route 32 Westbound at Dela Palma Rd. Expect delays.,State Route 32,Williamsburg,OH,45176,US,US/Eastern,2016-02-08T06:56:00.000+0000,36.0,33.3,100.0,29.67,10.0,3.5,,Overcast,Cloudy
3,201.0,39.747753,-84.20558199999999,0.01,2016-02-08T07:23:34.000+0000,2016-02-08T07:53:34.000+0000,Accident on I-75 Southbound at Exits 52 52B US-35. Expect delays.,I-75 S,Dayton,OH,45417,US,US/Eastern,2016-02-08T07:38:00.000+0000,35.1,31.0,96.0,29.64,9.0,4.6,,Mostly Cloudy,Cloudy
2,201.0,39.627781,-84.188354,0.01,2016-02-08T07:39:07.000+0000,2016-02-08T08:09:07.000+0000,Accident on McEwen Rd at OH-725 Miamisburg Centerville Rd. Expect delays.,Miamisburg Centerville Rd,Dayton,OH,45459,US,US/Eastern,2016-02-08T07:53:00.000+0000,36.0,33.3,89.0,29.65,6.0,3.5,,Mostly Cloudy,Cloudy
3,201.0,40.10059,-82.92519399999999,0.01,2016-02-08T07:44:26.000+0000,2016-02-08T08:14:26.000+0000,Accident on I-270 Outerbelt Northbound near Exit 29 OH-3 State St. Expect delays.,Westerville Rd,Westerville,OH,43081,US,US/Eastern,2016-02-08T07:51:00.000+0000,37.9,35.5,97.0,29.63,7.0,3.5,0.03,Light Rain,Clear
2,201.0,39.758274,-84.23050699999999,0.0,2016-02-08T07:59:35.000+0000,2016-02-08T08:29:35.000+0000,Accident on Oakridge Dr at Woodward Ave. Expect delays.,N Woodward Ave,Dayton,OH,45417-2476,US,US/Eastern,2016-02-08T07:56:00.000+0000,34.0,31.0,100.0,29.66,7.0,3.5,,Overcast,Cloudy
3,201.0,39.770382,-84.194901,0.01,2016-02-08T07:59:58.000+0000,2016-02-08T08:29:58.000+0000,Accident on I-75 Southbound at Exit 54B Grand Ave. Expect delays.,N Main St,Dayton,OH,45405,US,US/Eastern,2016-02-08T07:56:00.000+0000,34.0,31.0,100.0,29.66,7.0,3.5,,Overcast,Cloudy
2,201.0,39.778061,-84.172005,0.0,2016-02-08T08:00:40.000+0000,2016-02-08T08:30:40.000+0000,Accident on Notre Dame Ave at Warner Ave. Expect delays.,Notre Dame Ave,Dayton,OH,45404-1923,US,US/Eastern,2016-02-08T07:58:00.000+0000,33.3,,99.0,29.67,5.0,1.2,,Mostly Cloudy,Cloudy
3,201.0,40.10059,-82.92519399999999,0.01,2016-02-08T08:10:04.000+0000,2016-02-08T08:40:04.000+0000,Right hand shoulder blocked due to accident on I-270 Outerbelt Westbound at Exit 29 OH-3 State St.,Westerville Rd,Westerville,OH,43081,US,US/Eastern,2016-02-08T08:28:00.000+0000,37.4,33.8,100.0,29.62,3.0,4.6,0.02,Light Rain,Clear


In [22]:
# Row count per Weather_Category (up to 1000 groups are printed).
df_weather_catg.groupBy("Weather_Category").count().show(1000)
##df_weather_catg.filter("Weather_Category IS NULL").show()

In [23]:
# Replace nulls with 0 on the full column set (including lat/lng).
# NOTE(review): Spark applies a numeric fill value to numeric columns only, so
# null strings (e.g. Weather_Condition) presumably stay null — confirm.
df_with_lat = df_weather_catg.fillna(0)

In [24]:
# Write the zero-filled frame, lat/lng columns included, as one CSV partition with a header.
(
    df_with_lat.repartition(1)
    .write.format("com.databricks.spark.csv")
    .mode("overwrite")
    .option("header", "true")
    .save("dbfs:/FileStore/df/data_with_lat.csv")
)

In [25]:
# Row count before any null handling, for comparison with the dropna count below.
df_weather_catg.count()

In [26]:
# Label, location, start time and weather readings kept for the null-free export.
lat_lng_columns = [
    "Severity",
    "Start_Lat", "Start_Lng", "Start_Time", "Temperature(F)", "Visibility(mi)",
    "Wind_Speed(mph)", "Weather_Category", "Pressure(in)",
]
df_with_lat_noNull = df_weather_catg.select(*lat_lng_columns)

In [27]:
# Drop every row that has a null in any of the selected columns.
df_with_lat_noNull = df_with_lat_noNull.dropna(how="any")

In [28]:
# Rows remaining once rows with nulls have been dropped.
df_with_lat_noNull.count()

In [29]:
# Write the null-free lat/lng frame as one CSV partition with a header.
(
    df_with_lat_noNull.repartition(1)
    .write.format("com.databricks.spark.csv")
    .mode("overwrite")
    .option("header", "true")
    .save("dbfs:/FileStore/df/df_with_lat_noNull.csv")
)

In [30]:
# Convert the zero-filled Spark frame to pandas and preview the first rows.
# NOTE(review): toPandas() materialises the whole frame in driver memory —
# confirm the driver can hold it.
df_pd_lat = df_with_lat.toPandas()
df_pd_lat.head()

Unnamed: 0,Severity,TMC,Start_Lat,Start_Lng,Distance(mi),Start_Time,End_Time,Description,Street,City,State,Zipcode,Country,Timezone,Weather_Timestamp,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in),Weather_Condition,Weather_Category
0,3,201.0,39.865147,-84.058723,0.01,2016-02-08 05:46:00,2016-02-08 11:00:00,Right lane blocked due to accident on I-70 Eas...,I-70 E,Dayton,OH,45424,US,US/Eastern,2016-02-08 05:58:00,36.9,0.0,91.0,29.68,10.0,0.0,0.02,Light Rain,Clear
1,2,201.0,39.928059,-82.831184,0.01,2016-02-08 06:07:59,2016-02-08 06:37:59,Accident on Brice Rd at Tussing Rd. Expect del...,Brice Rd,Reynoldsburg,OH,43068-3402,US,US/Eastern,2016-02-08 05:51:00,37.9,0.0,100.0,29.65,10.0,0.0,0.0,Light Rain,Clear
2,2,201.0,39.063148,-84.032608,0.01,2016-02-08 06:49:27,2016-02-08 07:19:27,Accident on OH-32 State Route 32 Westbound at ...,State Route 32,Williamsburg,OH,45176,US,US/Eastern,2016-02-08 06:56:00,36.0,33.3,100.0,29.67,10.0,3.5,0.0,Overcast,Cloudy
3,3,201.0,39.747753,-84.205582,0.01,2016-02-08 07:23:34,2016-02-08 07:53:34,Accident on I-75 Southbound at Exits 52 52B US...,I-75 S,Dayton,OH,45417,US,US/Eastern,2016-02-08 07:38:00,35.1,31.0,96.0,29.64,9.0,4.6,0.0,Mostly Cloudy,Cloudy
4,2,201.0,39.627781,-84.188354,0.01,2016-02-08 07:39:07,2016-02-08 08:09:07,Accident on McEwen Rd at OH-725 Miamisburg Cen...,Miamisburg Centerville Rd,Dayton,OH,45459,US,US/Eastern,2016-02-08 07:53:00,36.0,33.3,89.0,29.65,6.0,3.5,0.0,Mostly Cloudy,Cloudy


In [31]:
# Remaining null counts for two numeric weather columns of the zero-filled frame.
for column_name in ("Temperature(F)", "Visibility(mi)"):
    print(df_pd_lat[column_name].isnull().sum())

In [32]:
# Keep only the label and the weather-related columns used for ML.
ml_columns = [
    "Severity",
    "Temperature(F)", "Visibility(mi)",
    "Wind_Speed(mph)", "Weather_Category", "Pressure(in)",
]
df_for_panda = df_weather_catg.select(*ml_columns)

In [33]:
# Preview the modelling columns before conversion to pandas.
display(df_for_panda)

Severity,Temperature(F),Visibility(mi),Wind_Speed(mph),Weather_Category,Pressure(in)
3,36.9,10.0,,Clear,29.68
2,37.9,10.0,,Clear,29.65
2,36.0,10.0,3.5,Cloudy,29.67
3,35.1,9.0,4.6,Cloudy,29.64
2,36.0,6.0,3.5,Cloudy,29.65
3,37.9,7.0,3.5,Clear,29.63
2,34.0,7.0,3.5,Cloudy,29.66
3,34.0,7.0,3.5,Cloudy,29.66
2,33.3,5.0,1.2,Cloudy,29.67
3,37.4,3.0,4.6,Clear,29.62


In [34]:
# Convert the Spark modelling frame to a pandas DataFrame and preview the first rows.
df_panda = df_for_panda.toPandas()
df_panda.head()

Unnamed: 0,Severity,Temperature(F),Visibility(mi),Wind_Speed(mph),Weather_Category,Pressure(in)
0,3,36.9,10.0,,Clear,29.68
1,2,37.9,10.0,,Clear,29.65
2,2,36.0,10.0,3.5,Cloudy,29.67
3,3,35.1,9.0,4.6,Cloudy,29.64
4,2,36.0,6.0,3.5,Cloudy,29.65


In [35]:
# pandas count(): number of non-null values per column (not a single row count).
df_panda.count()

In [36]:
# Null count for every feature column before cleaning.
for column_name in (
    "Temperature(F)",
    "Visibility(mi)",
    "Wind_Speed(mph)",
    "Weather_Category",
    "Pressure(in)",
):
    print(df_panda[column_name].isnull().sum())

In [37]:
# Summary statistics for the numeric columns.
# NOTE(review): in the recorded output the maxima (170.6 for Temperature(F),
# 822.8 for Wind_Speed(mph)) and the 0.0 minimum of Pressure(in) look like
# data-entry/sensor errors — consider handling them before modelling.
df_panda.describe()

Unnamed: 0,Severity,Temperature(F),Visibility(mi),Wind_Speed(mph),Pressure(in)
count,2243108.0,2180865.0,2171776.0,1800314.0,2185847.0
mean,2.383212,61.23114,9.124027,8.844072,30.03747
std,0.5482327,19.14641,2.986514,4.973374,0.2267387
min,2.0,-77.8,0.0,1.2,0.0
25%,2.0,48.9,10.0,5.8,29.92
50%,2.0,63.0,10.0,8.1,30.03
75%,3.0,75.9,10.0,11.5,30.15
max,4.0,170.6,140.0,822.8,33.04


In [38]:
# Histogram of Temperature(F) to inspect its distribution.
import matplotlib.pyplot as plt

# Series.plot returns an Axes, not a Figure. Create the Figure explicitly so the
# object bound to `fig` and handed to display() really is the figure.
fig, ax = plt.subplots()
df_panda["Temperature(F)"].plot(kind="hist", bins=50, ax=ax)
ax.set(title="Distribution of Temperature(F)", xlabel="Temperature(F)")
display(fig)

In [39]:
# Drop the rows in which every column is null.
# NOTE(review): Severity was filtered to 2/3/4 upstream, so no row should be
# entirely null and this is probably a no-op; it also mutates df_panda in place.
df_panda.dropna(how='all',inplace=True)

In [40]:
# Fill missing values. The text column Weather_Category gets the "UNKNOWN" label
# (the same one udf_weather_ctg emits) instead of the integer 0, which would make
# the column mixed-type and create a bogus dummy feature; numeric columns get 0.
# NOTE(review): 0 is a real reading for Temperature(F)/Wind_Speed(mph) — consider
# a median or indicator-based imputation instead.
df_panda_clean = df_panda.fillna({"Weather_Category": "UNKNOWN"}).fillna(0)

In [41]:
# Null count for every feature column after cleaning.
for column_name in (
    "Temperature(F)",
    "Visibility(mi)",
    "Wind_Speed(mph)",
    "Weather_Category",
    "Pressure(in)",
):
    print(df_panda_clean[column_name].isnull().sum())

In [42]:
# Column dtypes of the cleaned frame.
df_panda_clean.dtypes

In [43]:
# Preview the cleaned frame.
df_panda_clean.head()

Unnamed: 0,Severity,Temperature(F),Visibility(mi),Wind_Speed(mph),Weather_Category,Pressure(in)
0,3,36.9,10.0,0.0,Clear,29.68
1,2,37.9,10.0,0.0,Clear,29.65
2,2,36.0,10.0,3.5,Cloudy,29.67
3,3,35.1,9.0,4.6,Cloudy,29.64
4,2,36.0,6.0,3.5,Cloudy,29.65


In [44]:
# Convert the cleaned pandas frame back to a Spark DataFrame and show its first row.
df_source = spark.createDataFrame(df_panda_clean)
df_source.head()

In [45]:
# data_for_model.csv is the final CSV, ready for the machine learning model.
(
    df_source.repartition(1)
    .write.format("com.databricks.spark.csv")
    .mode("overwrite")
    .option("header", "true")
    .save("dbfs:/FileStore/df/data_for_model.csv")
)

In [46]:
# Feature matrix (three numeric readings plus the weather category) and the Severity label.
feature_columns = ["Temperature(F)", "Visibility(mi)", "Wind_Speed(mph)", "Weather_Category"]
X = df_panda_clean[feature_columns]
# X = df_panda_clean[["Temperature(F)", "Visibility(mi)", "Wind_Speed(mph)", "Weather_Category", "Pressure(in)"]]
y = df_panda_clean["Severity"]
print(X.shape, y.shape)

In [47]:
# Independent copy of the feature frame.
# NOTE(review): `data` is not referenced again in the visible cells — possibly dead code.
data = X.copy()

In [48]:
from sklearn.base import BaseEstimator, TransformerMixin
class PandasDummies(BaseEstimator, TransformerMixin):
    """One-hot encode non-numeric columns with ``pd.get_dummies``.

    ``fit`` records the dummy columns produced by the frame it is given and
    ``transform`` re-aligns every later frame to exactly those columns. A
    category that is missing from (or new in) a CV fold or the test split can
    therefore not change the width or column order of the matrix handed to the
    next pipeline step.
    """

    def fit(self, X=None, *_):
        """Remember the column layout ``pd.get_dummies`` yields for ``X``.

        ``X`` stays optional so that a bare ``fit()`` call keeps working; in
        that case no layout is recorded. Returns ``self``.
        """
        if X is not None:
            self.columns_ = pd.get_dummies(X).columns
        return self

    def transform(self, X, *_):
        """Return the dummy-encoded ``X``, aligned to the columns seen in ``fit``."""
        dummies = pd.get_dummies(X)
        fitted_columns = getattr(self, "columns_", None)
        if fitted_columns is None:
            # Never fitted with data: fall back to the plain encoding.
            return dummies
        # Categories absent from X become all-zero columns; unseen ones are dropped.
        return dummies.reindex(columns=fitted_columns, fill_value=0)

In [49]:
# Seeded split, stratified on the Severity label.
split_parts = train_test_split(X, y, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [50]:
# Random-forest pipeline tuned by grid search over forest size and tree depth.
# random_state pins the forest's internal sampling so reruns give the same
# scores (42 matches the seed used for the train/test split).
model1 = make_pipeline(PandasDummies(), StandardScaler(), RandomForestClassifier(random_state=42))
paramgrid = {"randomforestclassifier__n_estimators": [10, 30, 50],
             "randomforestclassifier__max_depth":[1, 5, 10] }
clf_grid_model1 = GridSearchCV(model1, paramgrid)
clf_grid_model1.fit(X_train, y_train)

In [51]:
# Score of the grid-searched random-forest pipeline on the train and test splits.
print(f"Training Accuracy with Random Forest Classifier with grid search: {clf_grid_model1.score(X_train, y_train)}")
print(f"Testing Accuracy with Random Forest Classifier with grid search: {clf_grid_model1.score(X_test, y_test)}")

In [52]:
# Hyper-parameter combination selected by the grid search.
clf_grid_model1.best_params_

In [53]:
# GridSearchCV fits clones of model1, so model1 itself is never fitted and its
# forest has no feature_importances_. Read them from the refitted best pipeline;
# its last step is the RandomForestClassifier.
rf = clf_grid_model1.best_estimator_.steps[-1][1]
rf.feature_importances_

In [54]:
# # Using best hyperparameters
# model = make_pipeline(PandasDummies(), StandardScaler(), RandomForestClassifier(n_estimators=10, max_depth=10))
# model.fit(X_train, y_train)
# print(f"Training Data Score without Random Forest Classifier: {model.score(X_train, y_train)}")
# print(f"Testing Data Score without Random Forest Classifier: {model.score(X_test, y_test)}")

In [55]:
# L1-penalised logistic regression (saga solver) tuned over the regularisation strength C.
# saga shuffles the data, so random_state is fixed to make the fit repeatable.
# NOTE(review): `paramgrid` rebinds the name used for the random-forest grid above.
lg_model = make_pipeline(PandasDummies(), StandardScaler(), LogisticRegression(random_state=42))
paramgrid = {'logisticregression__C': [ 0.1, 1],
#              'logisticregression__max_iter': [110,120,130,140],
             'logisticregression__solver': ['saga'],
             'logisticregression__penalty': ['l1']
            }
lg_grid_model1 = GridSearchCV(lg_model, paramgrid)
lg_grid_model1.fit(X_train, y_train)

In [56]:
# Score of the grid-searched logistic-regression pipeline on the train and test splits.
print(f"Training Data Score with Logistic Classifier with grid search: {lg_grid_model1.score(X_train, y_train)}")
print(f"Testing Data Score with Logistic Classifier with grid search: {lg_grid_model1.score(X_test, y_test)}")