## Imports

In [10]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

## Read dataset

In [11]:
# Load the cleaned rivers / cancer-rates table; the relative path is resolved
# against the kernel's working directory.
cancer_water = pd.read_csv(filepath_or_buffer='data/cleaned/rivers-cancer-rates.csv')

In [12]:
# Keep only the columns whose label does not start with "Unnamed"
# (presumably stray index columns saved into the CSV -- confirm).
not_unnamed_pattern = '^(?!Unnamed).*'
cancer_water = cancer_water.filter(regex=not_unnamed_pattern, axis='columns')

## Model 1: Preprocessing

In [13]:
# Columns kept out of the feature matrix: the target itself plus the
# incidence-rate / trend statistics reported alongside it.
# NOTE(review): presumably these would leak the target -- confirm.
non_feature_columns = [
    'is_rising',
    'Age-Adjusted Incidence Rate([rate note]) - cases per 100,000',
    'Lower 95% Confidence Interval',
    'Upper 95% Confidence Interval',
    'CI*Rank([rank note])',
    'Lower CI (CI*Rank)',
    'Upper CI (CI*Rank)',
    'Average Annual Count',
    'Recent Trend',
    'Recent 5-Year Trend ([trend note]) in Incidence Rates',
    'Lower 95% Confidence Interval.1',
    'Upper 95% Confidence Interval.1',
]

X = cancer_water.drop(columns=non_feature_columns)
y = cancer_water['is_rising']

# One-hot encode the whole feature frame before splitting so that the train
# and test matrices share exactly the same dummy columns.
cancer_water_dummies = pd.get_dummies(X, drop_first=True)

# Stratify on the target so both partitions keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    cancer_water_dummies,
    y,
    stratify=y,
    random_state=2022,
)

## Baseline

In [17]:
# Baseline accuracy: the share of the most frequent class, i.e. the score of
# a model that always predicts the majority label.
class_shares = y.value_counts(normalize=True)
class_shares.max()

0.8784425451092118

## Model 1: Random Forest

In [14]:
# Fit a 200-tree random forest on the training split using all CPU cores.
# The forest is seeded (same seed as the train/test split) so that bootstrap
# sampling and per-split feature selection are repeatable; without a
# random_state every Restart & Run All produces different scores, prediction
# counts and feature importances.
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=2022)
rf.fit(X_train, y_train);

## Model 1: Evaluation

In [15]:
# Mean accuracy on the data the forest was fitted on.
rf.score(X=X_train, y=y_train)

1.0

In [16]:
# Mean accuracy on the held-out split; compare with the majority-class baseline.
rf.score(X=X_test, y=y_test)

0.9278937381404174

In [18]:
# Predicted labels for the training rows.
preds = rf.predict(X=X_train)

In [19]:
# Sum of the current predictions; presumably the number of rows predicted as
# rising, assuming `is_rising` is 0/1 or boolean -- confirm.
preds.sum()

192

In [20]:
# Predicted labels for the held-out rows (this rebinds `preds`), followed by
# their sum -- presumably the count of predicted-rising rows; confirm the
# encoding of `is_rising`.
preds = rf.predict(X=X_test)
preds.sum()

26

In [21]:
# Per-feature importance scores of the fitted forest, in the column order of
# the training matrix.
rf.feature_importances_

array([2.93133843e-03, 1.66315868e-02, 1.38591996e-02, ...,
       8.05758442e-04, 9.48329074e-09, 8.47384643e-08])