In [90]:
# importing all the necessary libraries
import pandas as pd
import numpy as np
import pickle
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [82]:
# function to prepare the dataset into desired form

def data_prep(data):
    df = pd.read_csv(data)
    df = df.drop(columns=["INDICATOR","FLAG"])
    # Getting all the categorical variables
    # categorical_variables = df.select_dtypes(include=['object']).columns.tolist()
    # numerical_variables = df.select_dtypes(include=['number']).columns.tolist()
    # Check if there are any null values in the DataFrame
    if df.isnull().values.any():
    # Remove rows with any null values
        df = df.dropna()
    df = df.drop(columns=["PANEL_NUM","UNIT_NUM","STUB_NAME_NUM","STUB_LABEL_NUM","YEAR_NUM","AGE_NUM","STUB_NAME"])

    # Create the Gender column
    df['GENDER'] = df['STUB_LABEL'].apply(lambda x: 'Male' if 'Male' in x else 'Female' if 'Female' in x else 'Unknown')

    # Creating are you hispanic latino column 
    df['HISPANIC_LATINO_FLAG'] = df['STUB_LABEL'].apply(
            lambda x: 0 if  'Female: Not Hispanic or Latino: American Indian or Alaska Native' in x
            or 'Female: Not Hispanic or Latino: Black' in x
            or 'Female: Not Hispanic or Latino: White' in x
            or 'Male: Not Hispanic or Latino: American Indian or Alaska Native' in x
            or 'Male: Not Hispanic or Latino: Black' in x
            or 'Male: Not Hispanic or Latino: White' in x
            or 'Male: Not Hispanic or Latino: Asian or Pacific Islander' in x
            or 'Female: Not Hispanic or Latino: Asian or Pacific Islander' in x
            or 'Male: Not Hispanic or Latino: Asian' in x
            or 'Male: Not Hispanic or Latino: Native Hawaiian or Other Pacific Islander' in x
            or 'Female: Not Hispanic or Latino: Asian' in x
            or 'Female: Not Hispanic or Latino: Native Hawaiian or Other Pacific Islander' in x        
            else 1)

    df['HISPANIC_LATINO_RACE'] = df['STUB_LABEL'].apply(lambda x: 1 if 'Male: Hispanic or Latino: All races' in x or 'Female: Hispanic or Latino: All races' in x else 0)

    df['WHITE'] = df['STUB_LABEL'].apply(lambda x: 1 if 'Female: White' in x 
                                        or 'Male: White' in x 
                                        or 'Female: Not Hispanic or Latino: White' in x 
                                        or 'Male: Not Hispanic or Latino: White' in x
                                        else 0)

    df['NATIVE_BLACK'] = df['STUB_LABEL'].apply(lambda x: 1 if 'Female: Not Hispanic or Latino: Black' in x 
                                        or 'Male: Not Hispanic or Latino: Black' in x
                                        else 0)

    df['BLACK_AMERICAN'] = df['STUB_LABEL'].apply(lambda x: 1 if 'Female: Black or African American' in x 
                                        or 'Male: Black or African American' in x
                                        else 0)

    df['AMERICAN_INDIAN'] = df['STUB_LABEL'].apply(lambda x: 1 if 'Female: Not Hispanic or Latino: American Indian or Alaska Native' in x 
                                        or 'Male: Not Hispanic or Latino: American Indian or Alaska Native' in x
                                        or 'Male: American Indian or Alaska Native' in x
                                        or 'Female: American Indian or Alaska Native' in x
                                        else 0)

    df['ASIAN_PACIFIC_ISLANDER'] = df['STUB_LABEL'].apply(lambda x: 1 if 'Male: Asian or Pacific Islander' in x 
                                        or 'Female: Asian or Pacific Islander' in x
                                        or 'Male: Not Hispanic or Latino: Asian or Pacific Islander' in x
                                        or 'Female: Not Hispanic or Latino: Asian or Pacific Islander' in x
                                        else 0)

    df['ASIAN'] = df['STUB_LABEL'].apply(lambda x: 1 if 'Male: Not Hispanic or Latino: Asian' in x 
                                        or 'Female: Not Hispanic or Latino: Asian' in x
                                        else 0)

    df['NATIVE_HAWAIIAN'] = df['STUB_LABEL'].apply(lambda x: 1 if 'Male: Not Hispanic or Latino: Native Hawaiian or Other Pacific Islander' in x 
                                        or 'Female: Not Hispanic or Latino: Native Hawaiian or Other Pacific Islander' in x
                                        else 0)

    word_map = {
    "All drug overdose deaths": "All",
    "Drug overdose deaths involving any opioid": "Any_Opioid",
    "Drug overdose deaths involving natural and semisynthetic opioids": "Natural_Opioids",
    "Drug overdose deaths involving other synthetic opioids (other than methadone)": "Other_Synthetic",
    "Drug overdose deaths involving methadone": "Methadone",
    "Drug overdose deaths involving heroin": "Heroin"
    }
    # Replace values in the 'Panel' column with one-word representations
    df["PANEL"] = df["PANEL"].replace(word_map)
    df['PANEL'].value_counts()

    word_map2 = {
        "Deaths per 100,000 resident population, crude": "Crude",
        "Deaths per 100,000 resident population, age-adjusted": "Age_Adjusted",
    }
    # Replace values in the 'Unit' column with one-word representations
    df["UNIT"] = df["UNIT"].replace(word_map2)

    df = df.drop(columns="STUB_LABEL")

    #Print a list of categorical variables
    categorical_variables2 = df.select_dtypes(include=['object']).columns.tolist()

    # Perform one-hot encoding on 'STUB_LABEL' column
    df_encoded = pd.get_dummies(df, columns=categorical_variables2)

    # Convert all boolean columns to integer columns with 1 and 0
    bool_cols = df_encoded.select_dtypes(include='bool').columns
    df_encoded[bool_cols] = df_encoded[bool_cols].astype(int)

    # Change column names to lowercase and replace spaces with underscores
    df_encoded.columns = df_encoded.columns.str.lower().str.replace(' ', '_')

    df_encoded = df_encoded.drop(columns=["panel_all","age_all_ages","gender_unknown"])

    vif_data = pd.DataFrame()
    vif_data["Variable"] = df_encoded.columns
    vif_data["VIF"] = [variance_inflation_factor(df_encoded.values, i) for i in range(df_encoded.shape[1])]

    # print(vif_data)

    df_encoded = df_encoded.drop(columns=["unit_age_adjusted","unit_crude"])

    return df_encoded


In [83]:
df_cleaned = data_prep('Drug_overdose_death_rates__by_drug_type__sex__age__race__and_Hispanic_origin__United_States_20240518.csv')    

In [84]:
df_cleaned.head()

Unnamed: 0,year,estimate,hispanic_latino_flag,hispanic_latino_race,white,native_black,black_american,american_indian,asian_pacific_islander,asian,...,age_25-34_years,age_35-44_years,age_45-54_years,age_55-64_years,age_65-74_years,age_75-84_years,age_85_years_and_over,age_under_15_years,gender_female,gender_male
0,1999,6.1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2000,6.2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2001,6.8,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2002,8.2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2003,8.9,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
df_cleaned.columns

Index(['year', 'estimate', 'hispanic_latino_flag', 'hispanic_latino_race',
       'white', 'native_black', 'black_american', 'american_indian',
       'asian_pacific_islander', 'asian', 'native_hawaiian',
       'panel_any_opioid', 'panel_heroin', 'panel_methadone',
       'panel_natural_opioids', 'panel_other_synthetic', 'age_15-24_years',
       'age_25-34_years', 'age_35-44_years', 'age_45-54_years',
       'age_55-64_years', 'age_65-74_years', 'age_75-84_years',
       'age_85_years_and_over', 'age_under_15_years', 'gender_female',
       'gender_male'],
      dtype='object')

In [91]:
def model_iterations(df_encoded, cols_to_drop=None):
    if cols_to_drop is not None:
        df_encoded = df_encoded.drop(columns=cols_to_drop)
    # Preprocess the data
    X = df_encoded.drop('estimate', axis=1)  # Features
    y = df_encoded['estimate']  # Target variable

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create and train the Random Forest model
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = rf_model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    r2 = r2_score(y_test, y_pred)

    print(f"Random Forest Regressor \n")
    if cols_to_drop is not None:
        print('columns dropped are: ',cols_to_drop)
    else:
        print('no columns are dropped')
    print(f'Root Mean Squared Error: {rmse:.2f}')
    print(f'R-squared: {r2:.2f}')   

    # Assume 'model' is your trained model object
    # Serialize the model object
    with open('model.pkl', 'wb') as file:
        pickle.dump(rf_model, file)

    # Create and train the Gradient Boosting Regressor
    gb_regressor = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)
    gb_regressor.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = gb_regressor.predict(X_test)

    # Evaluate the model
    print(f"\nGradient Boosting Regressor \n")
    if cols_to_drop is not None:
        print('columns dropped are: ',cols_to_drop)
    else:
        print('no columns are dropped')
    mse = mean_squared_error(y_test, y_pred)
    print(f'Mean Squared Error: {mse:.2f}')
    print(f'R-squared: {r2:.2f}')

    # Create and train the Linear Regression model
    lin_regressor = LinearRegression()
    lin_regressor.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = lin_regressor.predict(X_test)

    # Evaluate the model
    print(f"\nLinear Regression\n")
    if cols_to_drop is not None:
        print('columns dropped are: ',cols_to_drop)
    else:
        print('no columns are dropped')
    mse = mean_squared_error(y_test, y_pred)
    print(f'Mean Squared Error: {mse:.2f}')
    print(f'R-squared: {r2:.2f}')

In [92]:
model_iterations(df_cleaned)

Random Forest Regressor 

no columns are dropped
Root Mean Squared Error: 1.33
R-squared: 0.96

Gradient Boosting Regressor 

no columns are dropped
Mean Squared Error: 12.98
R-squared: 0.96

Linear Regression

no columns are dropped
Mean Squared Error: 14.03
R-squared: 0.96
