In [None]:
# This is the Analysis section of the project. This section will consist of 
# exploratory data analysis (EDA) and visualization of the datasets used in the project.
# These visualizations will help to understand the data and provide insights that may be useful for 
# future analysis and training of ML models.ipynb

# Sources:
# Disclaimer: GenAI was used for idea generation, suggestions, and debugging but not for full code generation.

In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import scipy
from matplotlib.pyplot import subplots
from ISLP import load_data
from ISLP.models import (ModelSpec as MS, summarize, poly)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler


In [2]:
# Import datasets with pandas
filepath = os.getcwd() + "/datasets"
dataset_4_1 = pd.read_csv(os.path.join(filepath, "4.1_final_assignments.csv"))
dataset_4_2 = pd.read_csv(os.path.join(filepath, "4.2_final_assignments.csv"))
dataset_4_3 = pd.read_csv(os.path.join(filepath, "4.3_final_assignments.csv"))
dataset_4_4 = pd.read_csv(os.path.join(filepath, "4.4_final_assignments.csv"))
dataset_4_5 = pd.read_csv(os.path.join(filepath, "4.5_final_assignments.csv"))
dataset_8_1 = pd.read_csv(os.path.join(filepath, "8.1_final_assignments.csv"))
dataset_8_2 = pd.read_csv(os.path.join(filepath, "8.2_final_assignments.csv"))
dataset_8_3 = pd.read_csv(os.path.join(filepath, "8.3_final_assignments.csv"))
dataset_8_4 = pd.read_csv(os.path.join(filepath, "8.4_final_assignments.csv"))
dataset_8_5 = pd.read_csv(os.path.join(filepath, "8.5_final_assignments.csv"))

In [14]:
# Combine Datasets into one set for visualization and analysis
combined_dataset = pd.concat([dataset_4_1, dataset_4_2, dataset_4_3, dataset_4_4, dataset_4_5, dataset_8_1, 
                              dataset_8_2, dataset_8_3, dataset_8_4, dataset_8_5], ignore_index=True)

# Drop any fully duplicated rows
combined_dataset.drop_duplicates(inplace=True)

# Drop rows where critical values are missing (essentially dropNA on key columns)
critical_cols = ['wwtp', 'wrf', 'GenomeName', 'Proteins', 'network']
combined_dataset.dropna(subset=critical_cols, inplace=True)

# Convert time and pore size collumns to appropriate data types
combined_dataset['collection.date'] = pd.to_datetime(combined_dataset['collection.date'], errors='ignore')
combined_dataset['pore.size.um'] = combined_dataset['pore.size'].str.extract(r'(\d+\.?\d*)').astype(float)
combined_dataset.drop('pore.size', axis=1, inplace=True)


# Standardize string columns and clean prediction columns
object_cols = combined_dataset.select_dtypes(include='object').columns.tolist()

for col in object_cols:
    combined_dataset[col] = combined_dataset[col].astype(str).str.lower()
    
    # Clean predictions (e.x 'taleaviricota|nan' -> 'taleaviricota')
    if 'prediction' in col:
        # Split by '|' and take the first element (the most likely prediction)
        combined_dataset[col] = combined_dataset[col].str.split('|').str[0]

# Fill nan Genome Size (Kb) with the median
median_size = combined_dataset['Size (Kb)'].median()
combined_dataset['Size (Kb)'].fillna(median_size, inplace=True)

# Fill remaining NaN values in object columns with 'unknown'
object_cols_for_filling = combined_dataset.select_dtypes(include='object').columns.tolist()
for col in object_cols_for_filling:
    combined_dataset[col].fillna('unknown', inplace=True)

# Generate CSV of combined dataset for future use
combined_dataset.to_csv(os.path.join(filepath, "combined_final_assignments.csv"), index=False)

  combined_dataset['collection.date'] = pd.to_datetime(combined_dataset['collection.date'], errors='ignore')
  combined_dataset['collection.date'] = pd.to_datetime(combined_dataset['collection.date'], errors='ignore')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_dataset['Size (Kb)'].fillna(median_size, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform 