In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the cleaned dataset
drug_deaths = pd.read_csv('/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/drug_deaths_cleaned.csv')

# Display the first few rows of the dataset to confirm it's loaded correctly
drug_deaths.head()


Unnamed: 0,ID,Date,DateType,Age,Sex,Race,ResidenceCity,ResidenceCounty,ResidenceState,DeathCity,...,Tramad,Morphine_NotHeroin,Hydromorphone,Other,OpiateNOS,AnyOpioid,MannerofDeath,DeathCityGeo,ResidenceCityGeo,InjuryCityGeo
0,14-0273,06/28/2014 12:00:00 AM,1.0,42.0,Unknown,Unknown,Unknown,Unknown,Unknown,,...,0,0,0,,0,0.0,Accident,"CT\n(41.575155, -72.738288)","CT\n(41.575155, -72.738288)","CT\n(41.575155, -72.738288)"
1,13-0102,03/21/2013 12:00:00 AM,0.0,48.0,Male,Black,NORWALK,Unknown,Unknown,NORWALK,...,0,0,0,,0,0.0,Accident,"Norwalk, CT\n(41.11805, -73.412906)","NORWALK, CT\n(41.11805, -73.412906)","CT\n(41.575155, -72.738288)"
2,16-0165,03/13/2016 12:00:00 AM,0.0,30.0,Female,White,SANDY HOOK,FAIRFIELD,CT,DANBURY,...,0,0,0,,0,1.0,Accident,"Danbury, CT\n(41.393666, -73.451539)","SANDY HOOK, CT\n(41.419998, -73.282501)",
3,16-0208,03/31/2016 12:00:00 AM,0.0,23.0,Male,White,RYE,WESTCHESTER,NY,GREENWICH,...,0,0,0,,0,1.0,Accident,"Greenwich, CT\n(41.026526, -73.628549)",,
4,13-0052,02/13/2013 12:00:00 AM,0.0,22.0,Male,"Asian, Other",FLUSHING,QUEENS,Unknown,GREENWICH,...,0,0,0,,0,0.0,Accident,"Greenwich, CT\n(41.026526, -73.628549)",,"CT\n(41.575155, -72.738288)"


Here are the steps we'll follow:

Handling Missing Values:
Fill or impute missing values where applicable, or drop rows/columns with excessive missing values.

Feature Engineering:
Create new features that might be relevant for predicting the presence of Fentanyl.
Extract features from existing columns if necessary.

Encoding Categorical Features:
Convert categorical features into a format suitable for machine learning models using one-hot encoding or label encoding.

Scaling Features:
Standardize or normalize the features so that they are on a similar scale, especially for models sensitive to the scale of input features.

Data Splitting:
Split the data into training and testing sets to evaluate the model's performance.

In [2]:
# Checking for missing values in each column
missing_values = drug_deaths.isnull().sum()
missing_values_percentage = (missing_values / len(drug_deaths)) * 100
missing_info = pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_values_percentage})
missing_info.sort_values(by='Missing Values', ascending=False, inplace=True)
missing_info.head(10)  # Displaying the top 10 columns with the most missing values


Unnamed: 0,Missing Values,Percentage
Other,4670,91.478942
LocationifOther,4515,88.442703
DeathCounty,1100,21.547502
ResidenceCityGeo,93,1.821743
InjuryCityGeo,78,1.527914
InjuryPlace,66,1.29285
Location,24,0.470127
AnyOpioid,6,0.117532
DeathCity,5,0.097943
Fentanyl,4,0.078355


In [3]:
# Dropping columns with high percentage of missing values
columns_to_drop = ['Other', 'LocationifOther', 'DeathCounty']
drug_deaths.drop(columns=columns_to_drop, inplace=True)

# Imputing missing values in 'AnyOpioid' and 'Fentanyl' columns with the mode
for column in ['AnyOpioid', 'Fentanyl']:
    mode_value = drug_deaths[column].mode()[0]
    drug_deaths[column].fillna(mode_value, inplace=True)

# Dropping rows with missing values in 'DeathCity' column
drug_deaths.dropna(subset=['DeathCity'], inplace=True)

# Rechecking the missing values
missing_values_updated = drug_deaths.isnull().sum()
missing_values_percentage_updated = (missing_values_updated / len(drug_deaths)) * 100
missing_info_updated = pd.DataFrame({'Missing Values': missing_values_updated, 'Percentage': missing_values_percentage_updated})
missing_info_updated.sort_values(by='Missing Values', ascending=False, inplace=True)
missing_info_updated.head(10)  # Displaying the top 10 columns with the most missing values


Unnamed: 0,Missing Values,Percentage
ResidenceCityGeo,93,1.823529
InjuryCityGeo,78,1.529412
InjuryPlace,65,1.27451
Location,20,0.392157
MannerofDeath,0,0.0
AnyOpioid,0,0.0
OpiateNOS,0,0.0
Hydromorphone,0,0.0
Morphine_NotHeroin,0,0.0
Tramad,0,0.0


In [8]:
Now moving onto feature engineering.

SyntaxError: invalid syntax (1762404901.py, line 1)

In [6]:
# Creating binary features for substance presence
substance_columns = [
    'Heroin', 'Cocaine', 'Fentanyl', 'Fentanyl_Analogue', 'Oxycodone', 
    'Oxymorphone', 'Ethanol', 'Hydrocodone', 'Benzodiazepine', 'Methadone', 
    'Amphet', 'Tramad', 'Morphine_NotHeroin', 'Hydromorphone', 'OpiateNOS', 'AnyOpioid'
]

# Converting substance columns to binary (presence or absence)
for column in substance_columns:
    drug_deaths[column] = drug_deaths[column].apply(lambda x: 1 if x > 0 else 0)

# Binning Age into categories
bins = [0, 18, 30, 40, 50, 60, 70, 80, 90, 100]
labels = ['0-18', '19-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100']
drug_deaths['AgeGroup'] = pd.cut(drug_deaths['Age'], bins=bins, labels=labels, right=False)

# Displaying the updated dataset with new features
drug_deaths.head()


Unnamed: 0,ID,Date,DateType,Age,Sex,Race,ResidenceCity,ResidenceCounty,ResidenceState,DeathCity,...,Tramad,Morphine_NotHeroin,Hydromorphone,OpiateNOS,AnyOpioid,MannerofDeath,DeathCityGeo,ResidenceCityGeo,InjuryCityGeo,AgeGroup
1,13-0102,03/21/2013 12:00:00 AM,0.0,48.0,Male,Black,NORWALK,Unknown,Unknown,NORWALK,...,0,0,0,0,0,Accident,"Norwalk, CT\n(41.11805, -73.412906)","NORWALK, CT\n(41.11805, -73.412906)","CT\n(41.575155, -72.738288)",41-50
2,16-0165,03/13/2016 12:00:00 AM,0.0,30.0,Female,White,SANDY HOOK,FAIRFIELD,CT,DANBURY,...,0,0,0,0,1,Accident,"Danbury, CT\n(41.393666, -73.451539)","SANDY HOOK, CT\n(41.419998, -73.282501)",,31-40
3,16-0208,03/31/2016 12:00:00 AM,0.0,23.0,Male,White,RYE,WESTCHESTER,NY,GREENWICH,...,0,0,0,0,1,Accident,"Greenwich, CT\n(41.026526, -73.628549)",,,19-30
4,13-0052,02/13/2013 12:00:00 AM,0.0,22.0,Male,"Asian, Other",FLUSHING,QUEENS,Unknown,GREENWICH,...,0,0,0,0,0,Accident,"Greenwich, CT\n(41.026526, -73.628549)",,"CT\n(41.575155, -72.738288)",19-30
5,14-0277,06/29/2014 12:00:00 AM,0.0,23.0,Male,White,BRISTOL,Unknown,Unknown,BRISTOL,...,0,0,0,0,0,Accident,"BRISTOL, CT\n(41.673037, -72.945791)","BRISTOL, CT\n(41.673037, -72.945791)","CT\n(41.575155, -72.738288)",19-30


In [7]:
# One-hot encoding categorical features
categorical_columns = ['Sex', 'Race', 'MannerofDeath', 'AgeGroup']
drug_deaths_encoded = pd.get_dummies(drug_deaths, columns=categorical_columns, drop_first=True)

# Displaying the updated dataset with encoded categorical features
drug_deaths_encoded.head()


Unnamed: 0,ID,Date,DateType,Age,ResidenceCity,ResidenceCounty,ResidenceState,DeathCity,Location,DescriptionofInjury,...,MannerofDeath_Unknown,MannerofDeath_accident,AgeGroup_19-30,AgeGroup_31-40,AgeGroup_41-50,AgeGroup_51-60,AgeGroup_61-70,AgeGroup_71-80,AgeGroup_81-90,AgeGroup_91-100
1,13-0102,03/21/2013 12:00:00 AM,0.0,48.0,NORWALK,Unknown,Unknown,NORWALK,Hospital,Unknown,...,False,False,False,False,True,False,False,False,False,False
2,16-0165,03/13/2016 12:00:00 AM,0.0,30.0,SANDY HOOK,FAIRFIELD,CT,DANBURY,Hospital,Substance Abuse,...,False,False,False,True,False,False,False,False,False,False
3,16-0208,03/31/2016 12:00:00 AM,0.0,23.0,RYE,WESTCHESTER,NY,GREENWICH,Hospital,substance abuse,...,False,False,True,False,False,False,False,False,False,False
4,13-0052,02/13/2013 12:00:00 AM,0.0,22.0,FLUSHING,QUEENS,Unknown,GREENWICH,Hospital,Transdermal Absorption,...,False,False,True,False,False,False,False,False,False,False
5,14-0277,06/29/2014 12:00:00 AM,0.0,23.0,BRISTOL,Unknown,Unknown,BRISTOL,Residence,Inhalation,...,False,False,True,False,False,False,False,False,False,False


The categorical features have been successfully one-hot encoded, expanding the dataset with binary columns for each category within the original categorical columns

Let's move onto scaling. Distance-based algorithms like K-Nearest Neighbors (KNN) and Support Vector Machines (SVM) require feature scaling, while tree-based algorithms like Random Forest and Decision Trees do not

We can standardize the features (subtract the mean and divide by the standard deviation) or normalize them (scale to a range between 0 and 1) depending on the model we choose later.

Since the scaling step is model-dependent, we can move on to the Data Splitting step where we'll split the data into training and testing subsets

In [9]:
from sklearn.model_selection import train_test_split

# Defining the features (X) and the target variable (y)
X = drug_deaths_encoded.drop(columns=['ID', 'Date', 'DateType', 'Age', 'ResidenceCity', 'ResidenceCounty', 
                                      'ResidenceState', 'DeathCity', 'Location', 'DescriptionofInjury', 
                                      'InjuryPlace', 'Heroin', 'Cocaine', 'Fentanyl_Analogue', 
                                      'Oxycodone', 'Oxymorphone', 'Ethanol', 'Hydrocodone', 'Benzodiazepine',
                                      'Methadone', 'Amphet', 'Tramad', 'Morphine_NotHeroin', 'Hydromorphone',
                                      'OpiateNOS', 'AnyOpioid'])  # Dropping irrelevant or redundant columns
y = drug_deaths_encoded['Fentanyl']

# Splitting the data into training (80%) and testing (20%) subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Checking the shape of the training and testing data
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


((4080, 34), (1020, 34), (4080,), (1020,))

Training data: 4080 samples with 34 features each
Testing data: 1020 samples with 34 features each

In [10]:
# Defining the paths for saving the preprocessed data
train_data_path = '/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/drug_deaths_train_data.csv'
test_data_path = '/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/drug_deaths_test_data.csv'

# Saving the training data
train_data = pd.concat([X_train, y_train], axis=1)
train_data.to_csv(train_data_path, index=False)

# Saving the testing data
test_data = pd.concat([X_test, y_test], axis=1)
test_data.to_csv(test_data_path, index=False)

# Returning the paths for user access
train_data_path, test_data_path


('/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/drug_deaths_train_data.csv',
 '/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/drug_deaths_test_data.csv')

Now let's save the scaled data as CSVs as well. 

In [11]:
# Identifying non-numeric columns
non_numeric_columns = X_train.select_dtypes(include=['object']).columns
non_numeric_columns


Index(['InjuryCity', 'InjuryCounty', 'InjuryState', 'COD', 'OtherSignifican',
       'DeathCityGeo', 'ResidenceCityGeo', 'InjuryCityGeo'],
      dtype='object')

In [16]:
from sklearn.preprocessing import StandardScaler

# Creating the StandardScaler object
scaler = StandardScaler()


# Defining the paths for saving the scaled data again
train_data_scaled_path = '/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/drug_deaths_train_data_scaled.csv'
test_data_scaled_path = '/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/drug_deaths_test_data_scaled.csv'

# Scaling the training and testing data again
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Saving the scaled training data
train_data_scaled = pd.concat([X_train_scaled, y_train.reset_index(drop=True)], axis=1)
train_data_scaled.to_csv(train_data_scaled_path, index=False)

# Saving the scaled testing data
test_data_scaled = pd.concat([X_test_scaled, y_test.reset_index(drop=True)], axis=1)
test_data_scaled.to_csv(test_data_scaled_path, index=False)

# Returning the paths for user access
train_data_scaled_path, test_data_scaled_path


('/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/drug_deaths_train_data_scaled.csv',
 '/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/drug_deaths_test_data_scaled.csv')