In [1]:
# Initial imports.
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Load the sample dataset from the Resources folder and preview its first rows.
csv_path = Path("Resources/sample_data.csv")
data_df = pd.read_csv(csv_path)
data_df.head()

Unnamed: 0,Year,FIPS,C_S,month,cases,Pcincome,POPESTIMATE,MFratio,POPEST_MALE,POPEST_FEM,...,WA,BA,IA,AA,NA,TOM,NH,H,Cost,RUCC
0,2017,1001,"Autauga County,Alabama",1,0,27824,55448,0.951913,27041,28407,...,42606,10816,254,693,59,1020,53908,1540,186712,2
1,2017,1001,"Autauga County,Alabama",2,0,27824,55448,0.951913,27041,28407,...,42606,10816,254,693,59,1020,53908,1540,186577,2
2,2017,1001,"Autauga County,Alabama",3,0,27824,55448,0.951913,27041,28407,...,42606,10816,254,693,59,1020,53908,1540,186397,2
3,2017,1001,"Autauga County,Alabama",4,0,27824,55448,0.951913,27041,28407,...,42606,10816,254,693,59,1020,53908,1540,186839,2
4,2017,1001,"Autauga County,Alabama",5,0,27824,55448,0.951913,27041,28407,...,42606,10816,254,693,59,1020,53908,1540,187155,2


### Preprocessing the data

In [4]:
# Remove columns that hold no data at all (every value is missing).
data_df = data_df.dropna(axis=1, how="all")

In [5]:
# Remove every row that still contains at least one missing value.
data_df = data_df.dropna(axis=0, how="any")

In [6]:
# Keep only the rows from 2019 onward.
# A boolean mask filters row by row, so it stays correct even if index labels
# repeat (dropping by label would remove every row sharing a label), and
# rebinding the name avoids mutating the frame in place.
# Rows with a missing Year were already removed by the preceding dropna().
data_df = data_df[data_df["Year"] >= 2019]

In [None]:
# "C_S" is dropped because FIPS represents the same information;
# this also removes a categorical variable from the data.
data_df = data_df.drop("C_S", axis=1)

In [None]:
# Define the features set: every column except the target.
# DataFrame.drop returns a new frame, so no explicit copy() is needed first.
X = data_df.drop(columns=["Cost"])

# Define the target set as a 1-D NumPy array.
# Series.ravel() is deprecated in recent pandas releases; to_numpy() is the
# supported way to get the underlying array.
y = data_df["Cost"].to_numpy()

In [None]:
# Hold out a test set; the fixed random_state keeps the split reproducible.
split_parts = train_test_split(X, y, random_state=22)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardise the features. The scaler is fitted on the training split only,
# then the same fitted scaler is applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Transform train and test with the fitted scaler, in that order.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Build the random forest classifier: 128 trees, with a fixed seed.
# NOTE(review): the preview of `Cost` (the target) shows values such as 186712,
# which look continuous; a classifier treats every distinct value as its own
# class -- confirm that a regressor is not what is intended here.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=22,
)

In [None]:
# Train the forest on the scaled training features and their targets.
# fit() hands back the estimator, so the name is rebound to the fitted model.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict on the scaled test features, then show the predicted values.
predictions = rf_model.predict(X=X_test_scaled)
predictions

In [None]:
# Show the held-out target values for comparison with the predictions.
y_test

In [None]:
# Compare the true test targets with the model's predictions.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap the raw matrix in a DataFrame (default integer row/column labels).
cm_df = pd.DataFrame(cm)

# Fraction of test samples whose prediction matches the true target exactly.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

cm_df

In [None]:
# Report the evaluation: confusion matrix, accuracy, then the per-class report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
report_text = classification_report(y_test, predictions)
print(report_text)