In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the cleaned dataset
drug_deaths = pd.read_csv('/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/drug_deaths_cleaned.csv')

# Display the first few rows of the dataset to confirm it's loaded correctly
drug_deaths.head()


Unnamed: 0,ID,Date,DateType,Age,Sex,Race,ResidenceCity,ResidenceCounty,ResidenceState,DeathCity,...,Tramad,Morphine_NotHeroin,Hydromorphone,Other,OpiateNOS,AnyOpioid,MannerofDeath,DeathCityGeo,ResidenceCityGeo,InjuryCityGeo
0,14-0273,06/28/2014 12:00:00 AM,1.0,42.0,Unknown,Unknown,Unknown,Unknown,Unknown,,...,0,0,0,,0,0.0,Accident,"CT\n(41.575155, -72.738288)","CT\n(41.575155, -72.738288)","CT\n(41.575155, -72.738288)"
1,13-0102,03/21/2013 12:00:00 AM,0.0,48.0,Male,Black,NORWALK,Unknown,Unknown,NORWALK,...,0,0,0,,0,0.0,Accident,"Norwalk, CT\n(41.11805, -73.412906)","NORWALK, CT\n(41.11805, -73.412906)","CT\n(41.575155, -72.738288)"
2,16-0165,03/13/2016 12:00:00 AM,0.0,30.0,Female,White,SANDY HOOK,FAIRFIELD,CT,DANBURY,...,0,0,0,,0,1.0,Accident,"Danbury, CT\n(41.393666, -73.451539)","SANDY HOOK, CT\n(41.419998, -73.282501)",
3,16-0208,03/31/2016 12:00:00 AM,0.0,23.0,Male,White,RYE,WESTCHESTER,NY,GREENWICH,...,0,0,0,,0,1.0,Accident,"Greenwich, CT\n(41.026526, -73.628549)",,
4,13-0052,02/13/2013 12:00:00 AM,0.0,22.0,Male,"Asian, Other",FLUSHING,QUEENS,Unknown,GREENWICH,...,0,0,0,,0,0.0,Accident,"Greenwich, CT\n(41.026526, -73.628549)",,"CT\n(41.575155, -72.738288)"


In [2]:
# Checking for missing values in each column
missing_values = drug_deaths.isnull().sum()
missing_values_percentage = (missing_values / len(drug_deaths)) * 100
missing_info = pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_values_percentage})
missing_info.sort_values(by='Missing Values', ascending=False, inplace=True)
missing_info.head(10)  # Displaying the top 10 columns with the most missing values


Unnamed: 0,Missing Values,Percentage
Other,4670,91.478942
LocationifOther,4515,88.442703
DeathCounty,1100,21.547502
ResidenceCityGeo,93,1.821743
InjuryCityGeo,78,1.527914
InjuryPlace,66,1.29285
Location,24,0.470127
AnyOpioid,6,0.117532
DeathCity,5,0.097943
Fentanyl,4,0.078355


In [3]:
# Dropping columns with high percentage of missing values
columns_to_drop = ['Other', 'LocationifOther', 'DeathCounty']
drug_deaths.drop(columns=columns_to_drop, inplace=True)

# Imputing missing values in 'AnyOpioid' and 'Fentanyl' columns with the mode
for column in ['AnyOpioid', 'Fentanyl']:
    mode_value = drug_deaths[column].mode()[0]
    drug_deaths[column].fillna(mode_value, inplace=True)

# Dropping rows with missing values in 'DeathCity' column
drug_deaths.dropna(subset=['DeathCity'], inplace=True)

# Rechecking the missing values
missing_values_updated = drug_deaths.isnull().sum()
missing_values_percentage_updated = (missing_values_updated / len(drug_deaths)) * 100
missing_info_updated = pd.DataFrame({'Missing Values': missing_values_updated, 'Percentage': missing_values_percentage_updated})
missing_info_updated.sort_values(by='Missing Values', ascending=False, inplace=True)
missing_info_updated.head(10)  # Displaying the top 10 columns with the most missing values


Unnamed: 0,Missing Values,Percentage
ResidenceCityGeo,93,1.823529
InjuryCityGeo,78,1.529412
InjuryPlace,65,1.27451
Location,20,0.392157
MannerofDeath,0,0.0
AnyOpioid,0,0.0
OpiateNOS,0,0.0
Hydromorphone,0,0.0
Morphine_NotHeroin,0,0.0
Tramad,0,0.0


In [4]:
# Creating binary features for substance presence
substance_columns = [
    'Heroin', 'Cocaine', 'Fentanyl', 'Fentanyl_Analogue', 'Oxycodone', 
    'Oxymorphone', 'Ethanol', 'Hydrocodone', 'Benzodiazepine', 'Methadone', 
    'Amphet', 'Tramad', 'Morphine_NotHeroin', 'Hydromorphone', 'OpiateNOS', 'AnyOpioid'
]

# Converting substance columns to binary (presence or absence)
for column in substance_columns:
    drug_deaths[column] = drug_deaths[column].apply(lambda x: 1 if x > 0 else 0)

# Binning Age into categories
bins = [0, 18, 30, 40, 50, 60, 70, 80, 90, 100]
labels = ['0-18', '19-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100']
drug_deaths['AgeGroup'] = pd.cut(drug_deaths['Age'], bins=bins, labels=labels, right=False)

# Displaying the updated dataset with new features
drug_deaths.head()


TypeError: '>' not supported between instances of 'str' and 'int'

In [5]:
# Converting substance columns to numeric, coercing errors to handle non-numeric entries which will be converted to NaN
for column in substance_columns:
    drug_deaths[column] = pd.to_numeric(drug_deaths[column], errors='coerce')

# Now applying the binary conversion (presence or absence)
for column in substance_columns:
    drug_deaths[column] = drug_deaths[column].apply(lambda x: 1 if x > 0 else 0)

# Binning Age into categories
bins = [0, 18, 30, 40, 50, 60, 70, 80, 90, 100]
labels = ['0-18', '19-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100']
drug_deaths['AgeGroup'] = pd.cut(drug_deaths['Age'], bins=bins, labels=labels, right=False)

# Displaying the updated dataset with new features
drug_deaths.head()


Unnamed: 0,ID,Date,DateType,Age,Sex,Race,ResidenceCity,ResidenceCounty,ResidenceState,DeathCity,...,Tramad,Morphine_NotHeroin,Hydromorphone,OpiateNOS,AnyOpioid,MannerofDeath,DeathCityGeo,ResidenceCityGeo,InjuryCityGeo,AgeGroup
1,13-0102,03/21/2013 12:00:00 AM,0.0,48.0,Male,Black,NORWALK,Unknown,Unknown,NORWALK,...,0,0,0,0,0,Accident,"Norwalk, CT\n(41.11805, -73.412906)","NORWALK, CT\n(41.11805, -73.412906)","CT\n(41.575155, -72.738288)",41-50
2,16-0165,03/13/2016 12:00:00 AM,0.0,30.0,Female,White,SANDY HOOK,FAIRFIELD,CT,DANBURY,...,0,0,0,0,1,Accident,"Danbury, CT\n(41.393666, -73.451539)","SANDY HOOK, CT\n(41.419998, -73.282501)",,31-40
3,16-0208,03/31/2016 12:00:00 AM,0.0,23.0,Male,White,RYE,WESTCHESTER,NY,GREENWICH,...,0,0,0,0,1,Accident,"Greenwich, CT\n(41.026526, -73.628549)",,,19-30
4,13-0052,02/13/2013 12:00:00 AM,0.0,22.0,Male,"Asian, Other",FLUSHING,QUEENS,Unknown,GREENWICH,...,0,0,0,0,0,Accident,"Greenwich, CT\n(41.026526, -73.628549)",,"CT\n(41.575155, -72.738288)",19-30
5,14-0277,06/29/2014 12:00:00 AM,0.0,23.0,Male,White,BRISTOL,Unknown,Unknown,BRISTOL,...,0,0,0,0,0,Accident,"BRISTOL, CT\n(41.673037, -72.945791)","BRISTOL, CT\n(41.673037, -72.945791)","CT\n(41.575155, -72.738288)",19-30


In [6]:
# One-hot encoding categorical features
categorical_columns = ['Sex', 'Race', 'MannerofDeath', 'AgeGroup']
drug_deaths_encoded = pd.get_dummies(drug_deaths, columns=categorical_columns, drop_first=True)

# Displaying the updated dataset with encoded categorical features
drug_deaths_encoded.head()


Unnamed: 0,ID,Date,DateType,Age,ResidenceCity,ResidenceCounty,ResidenceState,DeathCity,Location,DescriptionofInjury,...,MannerofDeath_Unknown,MannerofDeath_accident,AgeGroup_19-30,AgeGroup_31-40,AgeGroup_41-50,AgeGroup_51-60,AgeGroup_61-70,AgeGroup_71-80,AgeGroup_81-90,AgeGroup_91-100
1,13-0102,03/21/2013 12:00:00 AM,0.0,48.0,NORWALK,Unknown,Unknown,NORWALK,Hospital,Unknown,...,False,False,False,False,True,False,False,False,False,False
2,16-0165,03/13/2016 12:00:00 AM,0.0,30.0,SANDY HOOK,FAIRFIELD,CT,DANBURY,Hospital,Substance Abuse,...,False,False,False,True,False,False,False,False,False,False
3,16-0208,03/31/2016 12:00:00 AM,0.0,23.0,RYE,WESTCHESTER,NY,GREENWICH,Hospital,substance abuse,...,False,False,True,False,False,False,False,False,False,False
4,13-0052,02/13/2013 12:00:00 AM,0.0,22.0,FLUSHING,QUEENS,Unknown,GREENWICH,Hospital,Transdermal Absorption,...,False,False,True,False,False,False,False,False,False,False
5,14-0277,06/29/2014 12:00:00 AM,0.0,23.0,BRISTOL,Unknown,Unknown,BRISTOL,Residence,Inhalation,...,False,False,True,False,False,False,False,False,False,False


Scaling: Standardize the feature scales so they have a mean of 0 and a variance of 1. This is important for algorithms that are sensitive to the scale of the data like SVM and k-NN.
Data Splitting: Split the data into training and testing subsets to evaluate the model's performance.
We'll start by dropping non-numeric and non-relevant columns, then standardize the numerical features, and finally split the data into training and testing subsets. Our target variable is the 'Fentanyl' column.

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Dropping non-numeric and non-relevant columns
columns_to_drop = ['ID', 'Date', 'ResidenceCity', 'ResidenceCounty', 'ResidenceState', 
                   'DeathCity', 'Location', 'DescriptionofInjury', 'InjuryPlace', 
                   'DeathCityGeo', 'ResidenceCityGeo', 'InjuryCityGeo', 
                   'Age',  # Dropping 'Age' as we have 'AgeGroup'
                   'InjuryCity', 'InjuryCounty', 'InjuryState', 
                   'OtherSignifican', 'COD']  # Additional columns to drop
drug_deaths_final = drug_deaths_encoded.drop(columns=columns_to_drop)

# Separating the features (X) from the target variable (y)
X = drug_deaths_final.drop('Fentanyl', axis=1)
y = drug_deaths_final['Fentanyl']

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting the data into training and testing subsets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Checking the shape of the training and testing data
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


((4080, 41), (1020, 41), (4080,), (1020,))

In [10]:
# Identifying columns with string values of 'Unknown'
unknown_string_columns = [col for col in X.columns if X[col].astype(str).str.contains('Unknown').any()]

# Checking the unique values in these columns to understand the data
unique_values_unknown_string = {column: X[column].unique() for column in unknown_string_columns}
unique_values_unknown_string


{'InjuryCounty': array(['Unknown', 'NEW HAVEN', 'HARTFORD', 'LITCHFIELD', 'MIDDLESEX',
        'FAIRFIELD', 'NEW LONDON', 'WINDHAM', 'TOLLAND', 'WESTCHESTER',
        'DUTCHESS', 'WASHINGTON', 'PUTNAM'], dtype=object),
 'InjuryState': array(['Unknown', 'CT', 'NY', 'CONNECTICUT'], dtype=object),
 'OtherSignifican': array(['Unknown', 'RECENT COCAINE USE', 'Chronic cocaine use',
        'Chronic Alcoholism', 'Cardiomegaly', 'Coronary Artery Disease',
        'Hypertension',
        'Hypertensive Cardiovascular Disease, Pulmonary Emphysema',
        'Cocaine Abuse',
        'Atherosclerotic Coronary Artery Disease, Diabetes Mellitus',
        'Obesity, Methadone, Alprazolam and Clonazepam Intoxication',
        'Acute Heroin Intoxication',
        'Acute Intoxication due to the Combined Effects of Heroin and Methadone',
        'Atherosclerotic Cardiovascular Disease', 'Diabetes',
        'Hypertensive Cardiovascular Disease, Morbid Obesity',
        'Recent cocaine use', 'Recent Cocaine U

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Dropping non-numeric and non-relevant columns
columns_to_drop = ['ID', 'Date', 'ResidenceCity', 'ResidenceCounty', 'ResidenceState', 
                   'DeathCity', 'Location', 'DescriptionofInjury', 'InjuryPlace', 
                   'DeathCityGeo', 'ResidenceCityGeo', 'InjuryCityGeo', 
                   'Age',  # Dropping 'Age' as we have 'AgeGroup'
                   'InjuryCity', 'InjuryCounty', 'InjuryState', 
                   'OtherSignifican', 'COD']  # Additional columns to drop
drug_deaths_final = drug_deaths_encoded.drop(columns=columns_to_drop)

# Separating the features (X) from the target variable (y)
X = drug_deaths_final.drop('Fentanyl', axis=1)
y = drug_deaths_final['Fentanyl']

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting the data into training and testing subsets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Checking the shape of the training and testing data
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


((4080, 41), (1020, 41), (4080,), (1020,))

In [13]:
# Creating DataFrame from the scaled training and testing data
train_data_scaled = pd.DataFrame(X_train, columns=X.columns)
test_data_scaled = pd.DataFrame(X_test, columns=X.columns)

# Adding the target variable back to the datasets
train_data_scaled['Fentanyl'] = y_train.values
test_data_scaled['Fentanyl'] = y_test.values

# Saving the preprocessed data into CSV files
train_data_scaled.to_csv('/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/train_data_scaled.csv', index=False)
test_data_scaled.to_csv('/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/test_data_scaled.csv', index=False)

# Providing the file paths for download
file_paths = {
    'Train Data Scaled': '/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/train_data_scaled.csv',
    'Test Data Scaled': '/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/test_data_scaled.csv'
}
file_paths


{'Train Data Scaled': '/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/train_data_scaled.csv',
 'Test Data Scaled': '/Users/arka_bagchi/Desktop/Springboard/Data Storytelling/test_data_scaled.csv'}