In [33]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_auc_score

In [34]:
# Load the bird observation table from the CSV file under assets/data.
bird_csv_path = "assets/data/birdData.csv"
birdData = pd.read_csv(bird_csv_path)

In [35]:
# Drop the locName column and the three exoticCategory* columns from the frame.
dropped_columns = ["locName", "exoticCategoryN", "exoticCategoryP", "exoticCategoryX"]
birdData = birdData.drop(dropped_columns, axis=1)

In [None]:
# remove outliers: keep only the rows whose howMany value is below 5.
# The explicit .copy() makes the filtered frame independent of the original, so
# assigning to its columns afterwards does not trigger pandas'
# SettingWithCopyWarning.
birdData = birdData.loc[birdData["howMany"] < 5].copy()

In [36]:
# Standardise the numeric columns with StandardScaler and write the result back
# into the same columns of birdData.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before any
# train/test split, so held-out rows contribute to the mean/variance — confirm
# this is acceptable or fit on the training rows only.
numeric_columns = ["lat", "lng", "tmed", "prec", "velmedia", "year", "month", "day"]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(birdData[numeric_columns])
birdData[numeric_columns] = scaled_values

In [37]:
# Replace the sciName column with one indicator column per distinct value.
categorical_columns = ["sciName"]
birdData = pd.get_dummies(data=birdData, columns=categorical_columns)

In [None]:
# Separate the howMany target from the remaining feature columns, then hold out
# 20% of the rows as a test set (seeded so the split is repeatable).
target_column = "howMany"
y = birdData[target_column]
X = birdData.drop(target_column, axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# create a neural network to predict the howMany column
# NOTE(review): learning_rate="adaptive" is only honoured by solver="sgd"; with
# the default solver it has no effect, and learning_rate_init=1 is far above the
# library default of 0.001 — confirm these hyper-parameters are intended.
nn = MLPRegressor(
    hidden_layer_sizes=(200,),  # one hidden layer of 200 units; a bare (200) is just the int 200
    max_iter=1000,
    learning_rate_init=1,
    learning_rate="adaptive",
    verbose=True,
    random_state=42,
)
nn.fit(X_train, y_train)
y_pred = nn.predict(X_test)

# print the root mean squared error
# The square root is taken explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, so
# this form works on both old and new releases.
print(mean_squared_error(y_test, y_pred) ** 0.5)


In [None]:
# create a transformer model to predict the howMany column:
# the features are mapped to a normal distribution by a QuantileTransformer, and
# the regressor is wrapped in TransformedTargetRegressor so the target is
# quantile-transformed for fitting and mapped back for prediction.
# NOTE(review): learning_rate="adaptive" is only honoured by solver="sgd"; with
# the default solver it has no effect — confirm the hyper-parameters are intended.
nn = MLPRegressor(
    hidden_layer_sizes=(200,),  # one hidden layer of 200 units; a bare (200) is just the int 200
    max_iter=1000,
    learning_rate_init=1,
    learning_rate="adaptive",
    verbose=True,
    random_state=42,
)
model = make_pipeline(
    # random_state seeds the transformer's subsampling so reruns are reproducible,
    # matching the seeded split and regressor used elsewhere in this notebook.
    QuantileTransformer(output_distribution='normal', random_state=42),
    TransformedTargetRegressor(
        regressor=nn,
        transformer=QuantileTransformer(output_distribution='normal', random_state=42),
    ),
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# print the root mean squared error
# The square root is taken explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, so
# this form works on both old and new releases.
print(mean_squared_error(y_test, y_pred) ** 0.5)

In [None]:
# create a one class svm model on the feature matrix (X does not contain howMany)
model = OneClassSVM(verbose=True)
model.fit(X_train)
y_pred = model.predict(X_test)  # +1 marks a predicted inlier, -1 a predicted outlier

# Signed decision value per test sample (positive for inliers, negative for outliers).
y_score = model.decision_function(X_test)

# No ground-truth outlier labels exist for the test set, and an AUROC is
# undefined when the true labels contain a single class (roc_auc_score rejects
# an all-inlier label vector). Report the share of held-out samples that the
# model accepts as inliers instead.
inlier_rate = (y_pred == 1).mean()
print(f'Inlier rate: {inlier_rate}')