# Dream Team Project: EDA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Mount Google Drive at /content/drive so the dataset path used below is readable.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime,
# so this cell will fail when the notebook is run elsewhere.
from google import colab
colab.drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Data: https://data.wprdc.org/dataset/allegheny-county-911-dispatches-ems-and-fire

In [4]:
# Location of the dispatch CSV on the mounted Drive. Despite the variable name,
# this is a local filesystem path, not a URL (the download source is linked above).
url = '/content/drive/MyDrive/RCEL_506/Project/allegheny_county_911_EMS_dispatches.csv'

In [5]:
# Load the dispatch records into `df`, the frame every later cell works on.
# NOTE(review): the saved run of this cell raised FileNotFoundError, i.e. the CSV
# was not present at `url`; place the file there (or update `url`) before re-running.
df = pd.read_csv(url)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/RCEL_506/Project/allegheny_county_911_EMS_dispatches.csv'

In [None]:
df.head()

In [None]:
df.info()

The output above lists the data type of each column.

In [None]:
# Cardinality check: print every column name next to its number of distinct
# values. Missing values count as a distinct value, exactly as with `.unique()`.
for column_name, column in df.items():
    print(column_name, column.nunique(dropna=False))

We can see that "service" has only 1 unique identifier, so we can drop the column as it doesn't carry any useful information.

Also, "priority", "priority_desc", "call_quarter", and "call_year" have relatively few unique entries, so let's investigate that more.

In [None]:
df.priority.unique()

In [None]:
df.priority_desc.unique()

In [None]:
df.call_quarter.unique()

In [None]:
df.call_year.unique()

It looks like "priority" and "priority_desc" may be linked one-to-one. If this is true, we want to drop one of the columns, because two columns that contain the same information could throw off our model. Keeping "priority_desc" may still be useful, but it should be kept as a static dictionary keyed by "priority", only to be used when we need to decipher what a "priority" code means.

We should turn "call_quarter" and "priority" into categories. "priority" has an obvious **ordinal** relationship, but I think "call_quarter" is better treated as **nominal**.

In [None]:
df.priority.unique()

Creating a dictionary to relate `{'priority':'priority_desc'}`

In [None]:
# Lookup table {priority code: description}, built in a single pass.
# For each priority we keep the description on the first row where that
# priority appears (same choice as scanning the frame once per priority).
# Rows with a missing priority are skipped: a NaN key can never be matched with
# `==`, so looking it up row-by-row would select nothing and fail on `[0]`.
priority_dict = (
    df.dropna(subset=['priority'])
      .drop_duplicates(subset='priority')
      .set_index('priority')['priority_desc']
      .to_dict()
)

In [None]:
priority_dict

In [None]:
df.priority.value_counts()

In [None]:
# Pareto-style chart of call priorities: bars show the number of calls per
# priority (most frequent first, as returned by value_counts) and the red line
# shows the running share of all calls.
priority_counts = df.priority.value_counts()
cumulative_percentage = (priority_counts.cumsum() / priority_counts.sum()) * 100

fig, ax1 = plt.subplots(figsize=(18, 6))

# Left axis: raw counts per priority.
ax1.bar(priority_counts.index, priority_counts.values)
ax1.set_xlabel('Priority')
ax1.set_ylabel('Count')
ax1.set_title('Chart of 911 EMS Call Priorities')
plt.setp(ax1.get_xticklabels(), rotation=45, ha='right')

# Right axis: cumulative percentage, sharing the same x positions.
ax2 = ax1.twinx()
ax2.plot(cumulative_percentage.index, cumulative_percentage.values, color='red', marker='o')
ax2.set_ylabel('Cumulative Percentage (%)')
ax2.set_ylim(0, 105)

# Label each point on the line with its percentage, slightly above the marker.
for position, percent in enumerate(cumulative_percentage):
  ax2.text(position, percent + 1, f"{percent:.1f}%", ha='center', va='bottom')

plt.show()

In [None]:
# Number of calls per quarter.
# x and y are passed explicitly: a bare positional Series is interpreted
# differently across seaborn versions, so the original call was not portable.
# Sorting by quarter keeps the bars in calendar order rather than by frequency.
quarter_counts = df.call_quarter.value_counts().sort_index()
ax = sns.barplot(x=quarter_counts.index, y=quarter_counts.values)
ax.set(xlabel='Call Quarter', ylabel='Number of Calls', title='911 EMS Calls by Quarter');

In [None]:
# Calls per year (bars, left axis) with the cumulative share of all calls
# (red line, right axis).
year_counts = df.call_year.value_counts().sort_index()
cumulative_percentage = year_counts.cumsum() / year_counts.sum() * 100

fig, ax1 = plt.subplots(figsize=(10, 6))

ax1.bar(year_counts.index, year_counts.values)
ax1.set_xlabel('Year')
ax1.set_ylabel('Number of Calls')
ax1.set_title('911 EMS Calls by Year')

ax2 = ax1.twinx()
ax2.plot(cumulative_percentage.index, cumulative_percentage.values, color='red', marker='o')
ax2.set_ylabel('Cumulative Percentage')
ax2.set_ylim(0, 105)

# Derive the tick range from the data instead of hard-coding 2015-2024, so
# years outside that window are not silently clipped from the chart.
if not year_counts.empty:
    first_year = int(year_counts.index.min())
    last_year = int(year_counts.index.max())
    ax1.set_xticks(range(first_year, last_year + 1))
    ax1.set_xlim(first_year - 0.5, last_year + 0.5)

plt.show()

In [None]:
df.city_code.unique()

In [None]:
# Lookup table {city name: city code}, built in a single pass instead of one
# full-frame scan per city name.
# Each name maps to the code on the first row where that name appears; rows
# with a missing city name are left out (they could never be matched with `==`).
city_dict = (
    df.dropna(subset=['city_name'])
      .drop_duplicates(subset='city_name')
      .set_index('city_name')['city_code']
      .to_dict()
)
city_dict


In [None]:
# Report every city code whose text appears inside another city code
# (e.g. 'P' inside 'PG'), to spot codes that could be confused with each other.
# Missing codes are removed first: NaN != NaN is True, so a NaN entry would pass
# the inequality check against itself and be reported as 'nan' within 'nan'.
unique_city_codes = df.city_code.dropna().unique()

for code1 in unique_city_codes:
  for code2 in unique_city_codes:
    # Compare as strings so non-string codes are handled uniformly.
    if code1 != code2 and str(code1) in str(code2):
      print(f"Code '{code1}' is contained within code '{code2}'")

In [None]:
df[df.city_code == 'P']

In [None]:
df[df.city_code == 'M']

In [None]:
df[df.city_code == 'MI']

In [None]:
df[df.city_code == 'PG']

In [None]:
df[df.city_code == 'OH']

In [None]:
df[df.city_code == 'AS']

In [None]:
# Missing values per column, followed by the grand total across the whole frame.
nan_counts = df.isna().sum()
print(nan_counts)
nan_counts.sum()

In [None]:
# Keep only rows that have no missing value in any column.
# A single `dropna()` is equivalent to dropping on each column in turn, and it
# returns a new frame instead of mutating the loaded data in place.
# NOTE(review): this also discards rows whose only gap is in a column that is
# dropped a few cells later - consider dropping those columns first.
df = df.dropna()

# `new_df` was previously just another name for `df` (not a copy); keep the
# name bound to the cleaned frame for any cell that still refers to it.
new_df = df

In [None]:
nan_counts = df.isna().sum()
print(nan_counts)

In [None]:
# Re-check the number of distinct values per column now that rows with
# missing values have been removed.
for column_name, column in df.items():
    print(column_name, column.nunique(dropna=False))

In [None]:
# Columns that are not carried forward into the modelling frame
# ('service' is constant and 'priority_desc' duplicates 'priority', per the notes above).
columns_to_drop = ['_id', 'call_id_hash', 'city_name', 'geoid', 'service', 'priority_desc']
df = df.drop(columns=columns_to_drop)

In [None]:
df.head(5)

In [None]:
# One-hot encode 'call_quarter': one indicator column per quarter value is
# appended to the frame and the original column is removed.
quarter_column = 'call_quarter'
quarter_dummies = pd.get_dummies(df[quarter_column])
df = pd.concat([df.drop(columns=quarter_column), quarter_dummies], axis=1)

The cell above one-hot encodes "call_quarter" as unordered (nominal) categories. The cell below converts "priority" into an ordered (ordinal) categorical.

In [None]:
from pandas.api.types import CategoricalDtype

# Encode 'priority' as an ordered categorical.
# The category order has to be fixed explicitly: `unique()` returns values in
# order of first appearance, which would make the ordinal ranking depend on how
# the rows happen to be sorted in the file.
# NOTE(review): sorted (lexicographic) order is used as a deterministic default;
# confirm it matches the intended severity ranking of the priority codes.
priority_levels = sorted(df.priority.dropna().unique())
priority_series = pd.Series(priority_levels)
cat_type = CategoricalDtype(categories=priority_levels, ordered=True)
ordered_priority_series = df.priority.astype(cat_type)

df.priority.unique()

In [None]:
ordered_priority_series

In [None]:
df['priority'] = ordered_priority_series

In [None]:
df

In [None]:
df.info()

In [None]:
df.priority.unique()

In [None]:
# Store the two text columns as pandas categoricals.
df = df.astype({'description_short': 'category', 'city_code': 'category'})

In [None]:
# Keep only priorities that occur often enough to be useful downstream.
MIN_CALLS_PER_PRIORITY = 500  # priorities with fewer calls than this are removed

value_counts = df['priority'].value_counts()
frequent_priorities = value_counts[value_counts >= MIN_CALLS_PER_PRIORITY].index

# `.copy()` gives an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
df = df[df['priority'].isin(frequent_priorities)].copy()

# Filtering rows does not shrink a categorical's list of levels; drop the
# levels that no longer occur so counts and summaries only show kept priorities.
if isinstance(df['priority'].dtype, pd.CategoricalDtype):
    df['priority'] = df['priority'].cat.remove_unused_categories()

In [None]:
df.priority.unique()

In [None]:
df.info()

In [None]:
# Show the distinct values of every column, one separator-delimited section each.
separator = '-' * 50
for column_name, column in df.items():
  print(f'Column: {column_name}:\n', column.unique())
  print(separator)

In [None]:
# Show how many distinct (non-missing) values each column holds.
separator = '-' * 50
for column_name, column in df.items():
  print(f'Column: {column_name}:\n', column.nunique())
  print(separator)

In [None]:
df.columns

In [None]:
# from sklearn.model_selection import train_test_split

# X = df.drop('priority', axis=1)
# y = df['priority']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [None]:
# Disabled draft: random-forest grid search over the prepared frame.
# NOTE(review) before re-enabling:
#   * 'priority' appears to have more than two classes, while scoring='recall'
#     and the {0: ..., 1: ...} class_weight dicts below assume a binary 0/1
#     target - confirm and switch to multi-class equivalents if so.
#   * X would still contain non-numeric columns (e.g. the categorical
#     'city_code' and 'description_short') - presumably these need encoding first.
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV

# rf_model = RandomForestClassifier(random_state=1)

# param_grid = {
#    "n_estimators": np.arange(10, 100, 2),
#    "max_depth": [None] + list(np.arange(2, 10)),
#    "min_samples_leaf": np.arange(1, 10, 2),
#    "class_weight": [{0: 0.1, 1: 0.9}, {0: 0.2, 1: 0.8}, {0: 0.3, 1: 0.7}],
#    "random_state": [1]
# }

# gscv = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='recall')
# gscv.fit(X_train, y_train)

In [None]:
# print("Best parameters:", gscv.best_params_)
# print("Best score:", gscv.best_score_)

# rf_model = gscv.best_estimator_

# from sklearn.metrics import classification_report

# print(f"Performance on TEST\n*******************\n{classification_report(y_test, rf_model.predict(X_test))}")
# print(f"Performance on TRAIN\n********************\n{classification_report(y_train, rf_model.predict(X_train))}")