In [1]:
pip install -U scikit-learn

# Exploratory Data Analysis

In [2]:
# Import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlite3
import numpy as np

In [3]:
# Open the SWITRS SQLite file read-only. With a plain path sqlite3 creates an
# empty database when the file is missing, so a wrong path only surfaces later
# as "no such table"; mode=ro fails right here instead and also protects the
# source data from accidental writes.
connection = sqlite3.connect(
    'file:../input/california-traffic-collision-data-from-switrs/switrs.sqlite?mode=ro',
    uri=True,
)

In [4]:
# Load only the five columns used in this notebook from the `collisions` table
query = "SELECT injured_victims, party_count, distance, population, type_of_collision FROM collisions"
collisions = pd.read_sql_query(sql=query, con=connection)

In [5]:
# Print a structural summary (columns and dtypes) of the freshly loaded frame
collisions.info()

In [6]:
# Count missing values per column before any cleaning
collisions.isna().sum()

In [7]:
# Preview the loaded frame (rendered as the cell's output)
collisions

In [8]:
# Discard every row that has a missing value in any of the selected columns
collisions = collisions.dropna(axis=0, how="any")

In [9]:
# Re-count missing values per column; every count should be 0 after the dropna above
collisions.isna().sum()

In [10]:
# Re-inspect the frame summary after dropping incomplete rows
collisions.info()

# Preprocessing

In [11]:
# Column roles for preprocessing: what is predicted, what is used as-is,
# and what gets one-hot encoded
target = "injured_victims"
numerical_columns = ["party_count", "distance"]
categorical_columns = ["population", "type_of_collision"]

In [12]:
# One-hot encode the categorical columns; values are cast to str first so
# every distinct value becomes its own indicator column
collisions_categorical = pd.get_dummies(
    data=collisions.loc[:, categorical_columns].astype(str)
)

In [13]:
# Numeric features are carried over unchanged
collisions_numerical = collisions.loc[:, numerical_columns]

In [14]:
# Pull out the column the models will predict
collisions_target = collisions.loc[:, target]

In [15]:
# Reassemble the modelling frame column-wise: target first, then the numeric
# features, then the one-hot indicator columns.
# NOTE: this rebinds `collisions`; the original categorical columns are gone
# afterwards, so the dummy-encoding cell cannot be re-run without reloading.
collisions = pd.concat(
    objs=[collisions_target, collisions_numerical, collisions_categorical],
    axis="columns",
)

In [16]:
# Pairwise Pearson correlations between all columns of the modelling frame
corr = collisions.corr(method="pearson")
corr

# Sampling

In [17]:
# Keep a seeded 50% random subset (without replacement) to cut processing time
collisions = collisions.sample(frac=0.5, replace=False, random_state=42)

# Modelling

In [18]:
# Split the frame into the target vector and the feature matrix
y = collisions.loc[:, "injured_victims"]
X = collisions.drop("injured_victims", axis=1)

In [19]:
# Hold out 20% of the rows as a test set; seeded so the split is repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [20]:
# Inspect the sampled, one-hot-encoded frame that X and y were taken from
collisions.info()

In [21]:
# Count missing values per column in the modelling frame
collisions.isna().sum()

In [22]:
# Baseline model: ordinary least squares fitted on the train split and
# scored on the held-out test split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions are rounded to whole numbers before scoring
# (injured_victims is presumably an integer count - confirm)
y_pred = np.round(lr.predict(X_test))

# RMSE computed as sqrt(MSE): the `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every release
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"RMSE: {RMSE}")

In [24]:
# Gradient-boosted trees fitted and scored on the same train/test split
from sklearn.ensemble import GradientBoostingRegressor
# mean_absolute_percentage_error is not used in this cell; the import is kept
# in case later cells rely on it
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Seeded (same seed as the sampling and split cells) so the fitted model and
# its feature importances are repeatable across runs
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)

# Predictions are rounded to whole numbers before scoring
y_pred = np.round(gbr.predict(X_test))

# RMSE computed as sqrt(MSE): the `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every release
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"RMSE: {RMSE}")

# Horizontal bar chart of the fitted booster's feature importances.
# argsort is ascending, so the least important feature ends up at the bottom.
feature_importance = gbr.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5  # bar centres at 0.5, 1.5, ...
plt.figure(figsize=(15, 7.5), dpi=80)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.array(X.columns)[sorted_idx])
plt.xlabel('Importance')
plt.title('Feature Importance')
# Render explicitly so the cell does not echo the Text(...) repr of the title
plt.show()