In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## EDA

### Initial exploration

When looking at data for the first time, try to get a high-level overview of it.

In [None]:
# Load the car data set and preview its first five rows
cars = pd.read_csv(filepath_or_buffer = "./data/cardata.csv")
cars.head(n = 5)

In [None]:
# Look at the dimensionality of the data: .shape is a (rows, columns) tuple
cars.shape

In [None]:
# Look for NAs: isna() flags every missing cell and .sum() totals the flags
# column by column. Calling the DataFrame method directly avoids going through
# np.sum, which forwards axis=None to DataFrame.sum (a version-dependent
# reduction that newer pandas warns about).
cars.isna().sum()

In [None]:
# Check the data type pandas assigned to each column
cars.dtypes

Determine how many records we have associated with each car make. 

In [None]:
# Count the rows for each distinct "Make". value_counts() returns the counts
# sorted from most to least frequent, so the leading entries are the most common makes.
car_counts = cars["Make"].value_counts()
car_counts

Visualize this

In [None]:
# Bar chart of record counts for every make. Both the bar labels (x) and the
# bar heights (y) come from the already-computed car_counts Series, so they
# cannot drift apart and value_counts() is not evaluated a second time.
ax = sns.barplot(x = car_counts.index, y = car_counts, color = "black")
ax.set(xlabel = "Make", ylabel = "Number of records");

Constrain the subset of values visualized

In [None]:
# Keep only the 20 most common makes so the tick labels stay readable.
# car_counts is sorted in descending order, so head(20) is the top 20.
top_makes = car_counts.head(20)
ax = sns.barplot(x = top_makes.index, y = top_makes, color = "black")
ax.set(xlabel = "Make", ylabel = "Number of records")
# Put the tick labels at an angle
plt.xticks(rotation=70);

In [None]:
# Top-20 makes drawn horizontally: makes on the y-axis, counts on the x-axis,
# which leaves room for the make names without rotating them.
# The trailing semicolon keeps the Axes repr out of the cell output.
top_makes = car_counts.head(20)
ax = sns.barplot(y = top_makes.index, x = top_makes, color = "black")
ax.set(xlabel = "Number of records", ylabel = "Make");

#### Exploring associations

Now let's look at the distributions of our numeric variables, as well as any associations between them.

In [None]:
# Drop variables we're uninterested in
cars_sub = cars.drop(["Number of Doors", "Engine Cylinders", "Year"], axis = 1)
cars_sub.columns

To do that, first isolate the numeric columns, since distributions and pairwise associations are only meaningful for those.

In [None]:
# Keep just the columns whose dtype is numeric
cars_sub.select_dtypes(include = [np.number])

Use bulk visualizations for a bird's-eye view of the data

In [None]:
# Show pairwise association between each numeric variable.
# corner = True draws only the lower triangle of the grid, since the upper one mirrors it.
# The trailing semicolon keeps the PairGrid repr out of the cell output.
numeric_cars = cars_sub.select_dtypes(include=np.number)
sns.pairplot(numeric_cars, corner = True);

**What are some things that you notice?**

In [None]:
# Box plot of highway MPG. The column is passed as y explicitly: depending on
# the seaborn version, a bare positional Series is read either as x or as data,
# which can put the values on the x-axis while the label below goes on y.
sns.boxplot(y = cars["highway MPG"])
plt.ylabel("Highway MPG");

In [None]:
# Pull out the row(s) whose highway MPG equals the column maximum
top_highway_mpg = cars["highway MPG"].max()
cars[cars["highway MPG"] == top_highway_mpg]

Also take a closer look at a few of the distributions

In [None]:
# Distribution of MSRP. Uses the explicit data=/x= form so the plotted axis is
# unambiguous, and ends with a semicolon to suppress the Axes repr.
sns.histplot(data = cars, x = "MSRP");

In [None]:
# Box plot of MSRP. Pass the column as y explicitly so the orientation does not
# depend on how the installed seaborn version interprets a positional Series.
sns.boxplot(y = cars["MSRP"])
plt.ylabel("MSRP");

**What could we do about this?**

In [None]:
# Histogram of MSRP on a natural-log scale. Label the axis explicitly so the
# transform is visible on the figure itself.
sns.histplot(x = np.log(cars["MSRP"]))
plt.xlabel("log MSRP");

In [None]:
# List MSRP values from lowest to highest (sort_values sorts ascending by default)
cars["MSRP"].sort_values()

A little strange but let's keep exploring the data

In [None]:
# Scatter plot of MSRP against Engine HP
sns.relplot(x = "Engine HP", y = "MSRP", data = cars, kind = "scatter");

**What do you observe in this relationship?**

In [None]:
# Same scatter, with MSRP on a natural-log scale
log_msrp = np.log(cars["MSRP"])
sns.relplot(x = cars["Engine HP"], y = log_msrp)
plt.ylabel("log MSRP");

Try to figure out what's going on with the weird grouping of data

In [None]:
# Summary statistics restricted to the rows with MSRP below 9000
cheap_cars = cars.loc[cars["MSRP"] < 9000]
cheap_cars.describe()

**Which associated variable stands out as potentially unusual?** (Use the summary statistics)

In [None]:
# Bivariate KDE of log MSRP (x) against Year (y)
sns.displot(kind = "kde", x = np.log(cars["MSRP"]), y = cars["Year"])
plt.xlabel("log MSRP");

**What do you observe in this plot?**

## Working with datetimes

Read in data containing all police dispatches in Eugene from a certain timeframe

In [None]:
# Read in logs of eugene dispatches from January 1st 2022
# NOTE(review): the file name ends in "_24" while the comment above says 2022 —
# confirm which period this extract actually covers.
dispatch = pd.read_csv("./data/eugene_dispatches_24.csv")
dispatch.head()

In [None]:
# Number of dispatch records (rows)
dispatch.shape[0]

In [None]:
# Inspect the raw "Call Time" column as it was read from the CSV
dispatch["Call Time"]

In [None]:
# Count how often each incident description occurs
incident_col = dispatch["Incident Desc"]
desc_counts = incident_col.value_counts()

# Horizontal bars: counts along x, descriptions along y
plt.figure(figsize = (10, 8))
sns.barplot(y = desc_counts.index, x = desc_counts, color = "darkblue");

In [None]:
# Show every dispatch whose description mentions theft.
# str.contains has no replacement argument: its second positional parameter is
# `case`, so only the pattern is passed here. Searching for "[tT]heft" anywhere
# in the string selects the same rows as wrapping it in ".*" with a group, and
# having no capture group avoids pandas' match-group warning.
# na = False treats missing descriptions as non-matches so the mask is purely boolean.
is_theft = dispatch["Incident Desc"].str.contains(r"[tT]heft", regex = True, na = False)
dispatch[is_theft]

In [None]:
# Collapse every description that mentions theft into the single label "Theft".
# The pattern also matches the text on either side of the keyword, so the whole
# (single-line) value is replaced rather than just the word itself.
theft_pattern = r".*[tT]heft($|.*)"
dispatch["Incident Desc"] = dispatch["Incident Desc"].str.replace(theft_pattern, "Theft", regex = True)

In [None]:
# Recompute the counts from the current "Incident Desc" column
incident_col = dispatch["Incident Desc"]
desc_counts = incident_col.value_counts()

# Horizontal bars: counts along x, descriptions along y
plt.figure(figsize = (10, 8))
sns.barplot(y = desc_counts.index, x = desc_counts, color = "darkblue");

Did any of these crimes happen nearby?

In [None]:
# Dispatches whose location text contains one of the nearby street names.
# The match is case-sensitive (str.contains default). na = False makes rows with
# a missing Location count as non-matches; without it the mask would contain NaN
# and could not be used for boolean indexing.
nearby_streets = r"13TH|FRANKLIN|UNIVERSITY"
is_nearby = dispatch["Location"].str.contains(nearby_streets, regex = True, na = False)
dispatch[is_nearby]

In [None]:
# Parse the "Call Time" strings (month/day/year hour:minute) into datetimes
dispatch["call_time_dt"] = pd.to_datetime(dispatch["Call Time"], format = "%m/%d/%Y %H:%M")
# head must be called; a bare `dispatch.head` displays the bound method, not a preview
dispatch.head()

In [None]:
# Earliest call time in the data
dispatch["call_time_dt"].min()


In [None]:
# Latest call time in the data
dispatch["call_time_dt"].max()

In [None]:
# Mean of the call timestamps
dispatch["call_time_dt"].mean()

In [None]:
# Extract the hour-of-day component of each call time into its own column
call_hours = dispatch["call_time_dt"].dt.hour
dispatch["hour"] = call_hours
dispatch.head()

In [None]:
# Calls per hour of day. Hours run 0-23, so 24 one-hour bins need 25 edges
# (0, 1, ..., 24). If the edges stop at 23 there are only 23 bins, and the last
# one, being closed on both sides, lumps hours 22 and 23 together.
hour_edges = np.arange(0, 25, 1)
sns.histplot(dispatch["hour"], bins = hour_edges, kde = True);

In [None]:
# Overlay the hourly distribution of traffic stops on that of all dispatches.
# 25 edges (0..24) give one bin per hour; edges ending at 23 would merge hours
# 22 and 23 into the final bin. Labels plus a legend tell the two layers apart.
hour_edges = np.arange(0, 25, 1)
traffic_stops = dispatch[dispatch["Incident Desc"] == "Traffic Stop"]
sns.histplot(dispatch["hour"], bins = hour_edges, kde = True, label = "All dispatches")
sns.histplot(traffic_stops["hour"], bins = hour_edges, kde = True, label = "Traffic Stop")
plt.legend();