In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# loading training data    
data = np.load('Data/species/species_train.npz')
train_locs = data['train_locs']  # 2D array, rows are number of datapoints and 
                                 # columns are "latitude" and "longitude"
train_ids = data['train_ids']    # 1D array, entries are the ID of the species 
                                 # that is present at the corresponding location in train_locs
species = data['taxon_ids']      # list of species IDe. Note these do not necessarily start at 0 (or 1)
species_names = dict(zip(data['taxon_ids'], data['taxon_names']))  # latin names of species 

# loading test data 
data_test = np.load('Data/species/species_test.npz', allow_pickle=True)
test_locs = data_test['test_locs']    # 2D array, rows are number of datapoints 
                                     # and columns are "latitude" and "longitude"
# data_test['test_pos_inds'] is a list of lists, where each list corresponds to 
# the indices in test_locs where a given species is present, it can be assumed 
# that they are not present in the other locations 
test_pos_inds = dict(zip(data_test['taxon_ids'], data_test['test_pos_inds']))    

In [2]:
test_ids = data_test['taxon_ids']

In [3]:
data_test['taxon_ids']

array([  31529,    3117,  116872,   13392,   13456,  508972,   11114,
         73849,   11896,    7920,   41301,   43138,   13532, 1289491,
        144460,    6364,    1224,   37721,   31150,  517047,  476523,
          5947,  339674,   24832,   25003,   13851,  318747,   46296,
         22038,   14881,   46116,  558619,   14658,  144764,   46014,
         29084,   12792,    7114,   43236,    3340,   22973,   42336,
         65373,   46180,    7576,  559131, 1289689,   47062,   64970,
         70725,   14069,  144531,    4793,   13522,   23027,   10090,
        558436,  117054,    5612,   65212,   12735,   67188,   14306,
         13632,    9612,   43448,   42328,  144646,   31236,   13092,
        201178,   40281,   37920,   44104,   13171,  472770,    9832,
          8077,   14167,  979677,    3914,    5261,  517053,  979682,
         12746,   30953,    1078,   12839,   19765,   73106,   74204,
         27409,   18204,    3045,   40523,   12716,    5367,    9477,
          2071,   27

In [4]:
ids= pd.DataFrame(train_ids)
locs= pd.DataFrame(train_locs)

In [5]:
data_train = pd.concat([ids,locs],axis=1)
data_train.columns = ['id', 'lat', 'long']
data_train

Unnamed: 0,id,lat,long
0,31529,-18.286728,143.481247
1,31529,-13.099798,130.783646
2,31529,-13.965274,131.695145
3,31529,-12.853950,132.800507
4,31529,-12.196790,134.279327
...,...,...,...
272032,145031,33.716885,73.203621
272033,145031,24.600239,72.730560
272034,145031,18.849600,80.654129
272035,145031,21.073837,75.945656


# Random Forest Regressor

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

# Load your data (species, latitude, longitude)
data = data_train.copy()

# Encode the species name into numeric labels
label_encoder = LabelEncoder()
data['species_encoded'] = label_encoder.fit_transform(data['id'])

# Features (species_encoded) and targets (latitude, longitude)
X = data[['species_encoded']]
y_lat = data['lat']
y_lon = data['long']

# Split into training and test sets
X_train, X_test, y_lat_train, y_lat_test, y_lon_train, y_lon_test = train_test_split(X, y_lat, y_lon, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
lat_model = RandomForestRegressor()
lon_model = RandomForestRegressor()

# Train the models for latitude and longitude separately
lat_model.fit(X_train, y_lat_train)
lon_model.fit(X_train, y_lon_train)

# Make predictions on the test set
lat_pred = lat_model.predict(X_test)
lon_pred = lon_model.predict(X_test)

# Evaluate the models
mse_lat = mean_squared_error(y_lat_test, lat_pred)
mse_lon = mean_squared_error(y_lon_test, lon_pred)

print(f'MSE Latitude: {mse_lat}, MSE Longitude: {mse_lon}')

MSE Latitude: 68.00577609360968, MSE Longitude: 307.3298609063168


In [7]:
# Function to predict location for a new species
def predict_location(species_name):
    species_encoded = label_encoder.transform([species_name])
    lat_prediction = lat_model.predict([[species_encoded[0]]])
    lon_prediction = lon_model.predict([[species_encoded[0]]])
    return lat_prediction[0], lon_prediction[0]

# Example prediction
predicted_lat, predicted_lon = predict_location('458')
print(f'Predicted Location for species1: Latitude: {predicted_lat}, Longitude: {predicted_lon}')

Predicted Location for species1: Latitude: -23.6245171897878, Longitude: -46.57627680136457




# Mean Value

In [8]:
grouped_data = data_train.groupby('id').agg({
    'lat': 'mean',  # or 'first', 'last', etc.
    'long': 'mean'})
grouped_data

Unnamed: 0_level_0,lat,long
id,Unnamed: 1_level_1,Unnamed: 2_level_1
458,-23.635391,-46.610573
460,-31.017895,-57.912819
487,-25.073004,20.192400
585,33.882202,80.526962
871,35.205753,-2.006270
...,...,...
1289606,56.475147,-75.294388
1289647,-22.456312,-49.856724
1289681,22.039534,-79.168694
1289689,-12.044996,32.263771


In [9]:
grouped_data.loc[458]

lat    -23.635391
long   -46.610573
Name: 458, dtype: float32

In [10]:
if int('458') in grouped_data.index:
    print('hello')

hello


# Gaussian Distribution

In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.stats import multivariate_normal

# Example data
data = data_train.copy()

# Fit a bivariate Gaussian for each species
species_distributions = {}

# Group by species and fit Gaussian distribution to each group's latitude and longitude
for species, group in data.groupby('id'):
    # Calculate mean of latitude and longitude
    mean = [group['lat'].mean(), group['long'].mean()]
    
    # Calculate covariance matrix for latitude and longitude
    covariance = np.cov(group['lat'], group['long'])
    
    # Store the mean and covariance in a dictionary for each species
    species_distributions[species] = {
        'mean': mean,
        'covariance': covariance
    }

# Function to predict location for a given species by sampling from the bivariate normal distribution
def predict_location(species_name):
    if species_name in species_distributions:
        mean = species_distributions[species_name]['mean']
        covariance = species_distributions[species_name]['covariance']
        
        # Sample a new (latitude, longitude) from the bivariate Gaussian distribution
        location = multivariate_normal.rvs(mean=mean, cov=covariance)
        return location[0], location[1]  # Return latitude, longitude
    else:
        return None  # Species not found

# Example prediction
predicted_lat, predicted_lon = predict_location(458)
print(f'Predicted Location for species1: Latitude: {predicted_lat}, Longitude: {predicted_lon}')


Predicted Location for species1: Latitude: -28.13183434726232, Longitude: -52.489895386070835


# K-Nearest Neighbors

In [12]:
data_train1 = data_train[:10000]


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error


# Assuming 'df' is your DataFrame with 'species_id', 'plan', and 'lot'
# Step 1: Create dummy variables for species_id
dummies = pd.get_dummies(data_train1['id'], prefix='species')
df_with_dummies = pd.concat([data_train1.drop('id', axis=1), dummies], axis=1)

# Step 2: Prepare features and target
X = df_with_dummies.drop(columns=['lat', 'long'])  # Features (dummy variables)
y = df_with_dummies[['lat', 'long']]                # Target (plan and lot)


# Step 5: Set up KNN Regressor with MultiOutputRegressor for multiple targets
knn = KNeighborsRegressor()
multi_output_knn = MultiOutputRegressor(knn)

# Step 6: Use GridSearchCV to optimize the number of neighbors 'k'
param_grid = {'estimator__n_neighbors': list(range(1, 30))}  # Testing k from 1 to 30
grid_search = GridSearchCV(multi_output_knn, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

# Get the best model
best_knn = grid_search.best_estimator_
best_k = grid_search.best_params_['estimator__n_neighbors']
print(f"Optimal number of neighbors: {best_k}")


KeyboardInterrupt: 

In [35]:
dummies_test = pd.get_dummies(test_ids[:100], prefix='species')
y_pred = best_knn.predict(dummies_test)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- species_10090
- species_1078
- species_117054
- species_12716
- species_12735
- ...
