In [None]:
import pandas as pd

# Column labels in file order. The CSV is read with header=None, so these names
# are assigned to the columns positionally.
column_names = [
    "EC_number",
    "Species",
    "smiles",
    "Compound_name",
    "Amino_encoding",
    "Kcat",
    "unit",
]

# NOTE(review): `file_path` is not defined in the cells visible here; it must
# already exist in the kernel. Confirm it is set in an earlier cell.
df = pd.read_csv(file_path, names=column_names, header=None)


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Step 1: Log-transform the Kcat values to handle skewness.
# log1p computes log(1 + x), so a Kcat of 0 maps to 0 rather than -inf.
df['Log_Kcat'] = np.log1p(df['Kcat'])

# Step 2: One-hot encoding for EC_number and Species
# Due to high cardinality, we'll just note this step for now and check dimensionality before applying

# Step 3: Text-based Features
## For smiles: cheap character-level counts. These are rough proxies computed on the
## raw string, not a chemical parse. The column names are kept as-is because later
## cells refer to them, but note what each one actually measures:
##   - smiles_atoms_count: number of ASCII letters (a two-letter element symbol such
##     as "Cl" therefore contributes 2).
##   - smiles_rings_count: number of '=' characters. In standard SMILES notation '='
##     is a double bond, not a ring marker.
##   - smiles_bonds_count: number of '+' characters. In standard SMILES notation '+'
##     is a positive formal charge, not a bond.
## The vectorised .str accessor is used instead of per-row lambdas; a missing SMILES
## value yields NaN in the count columns instead of raising a TypeError.
df['smiles_atoms_count'] = df['smiles'].str.count(r'[A-Za-z]')
df['smiles_rings_count'] = df['smiles'].str.count('=')
df['smiles_bonds_count'] = df['smiles'].str.count(r'\+')  # '+' must be escaped: the pattern is a regex

## For Amino_encoding: Compute k-mer frequency for k=2,3 and sequence length
def compute_kmer_frequencies(sequence, k):
    """Count every overlapping substring of length ``k`` in ``sequence``.

    Parameters
    ----------
    sequence : str
        The sequence to scan. It is sliced with a window that advances one
        position at a time.
    k : int
        Window length; must be at least 1.

    Returns
    -------
    dict
        Maps each k-mer to the number of times it occurs, in order of first
        appearance. Empty when ``sequence`` is shorter than ``k``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1. Without this check ``k == 0`` would count
        the empty string ``len(sequence) + 1`` times.
    """
    from collections import Counter  # local import keeps this cell self-contained

    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # A non-positive window count makes range() empty, so short inputs give {}.
    window_count = len(sequence) - k + 1
    return dict(Counter(sequence[start:start + k] for start in range(window_count)))

# Per-row k-mer count dictionaries for the amino-acid sequence (k=2, then k=3).
# `args=(k,)` forwards k as the second positional argument of compute_kmer_frequencies.
df['kmer_counts_2'] = df['Amino_encoding'].apply(compute_kmer_frequencies, args=(2,))
df['kmer_counts_3'] = df['Amino_encoding'].apply(compute_kmer_frequencies, args=(3,))

# Number of characters in each sequence.
df['sequence_length'] = df['Amino_encoding'].map(len)

# Step 4: Interaction Features
# Deferred until the importance of the individual features has been checked.

# Step 5: Feature Scaling
# Deferred to the very end, once every feature has been created.

# Preview the first rows of the numeric features engineered so far.
df.loc[:, ['Log_Kcat', 'smiles_atoms_count', 'smiles_rings_count', 'smiles_bonds_count', 'sequence_length']].head()


In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import DictVectorizer

# Step 2 Continued: One-hot encoding for EC_number and Species (if dimensionality allows)
# Upper bound on the number of one-hot columns we are willing to add (arbitrary threshold).
MAX_ONE_HOT_COLUMNS = 1000

# Columns one-hot encoding would add: one per distinct value in each categorical column.
# nunique() ignores missing values by default.
one_hot_dimension = df['EC_number'].nunique() + df['Species'].nunique()

# If the increase in dimensionality is too high, we might opt for target encoding.
# For demonstration, let's proceed with one-hot encoding when it fits under the threshold.
if one_hot_dimension < MAX_ONE_HOT_COLUMNS:
    one_hot_features = ['EC_number', 'Species']
else:
    # Bind the name in this branch too, so code that inspects `one_hot_features`
    # sees an empty selection instead of hitting a NameError.
    one_hot_features = []
    print("High dimensionality due to one-hot encoding. Consider target encoding.")

# Step 4 Continued: Interaction Features
# Demonstration interaction term: element-wise product of 'smiles_atoms_count'
# and 'sequence_length'.
df['atom_seq_interaction'] = df['smiles_atoms_count'].mul(df['sequence_length'])

# Step 5 Continued: Feature Scaling
# Only the numerical columns are scaled; one-hot encoded columns generally do not need it.
numerical_features = [
    'Log_Kcat',
    'smiles_atoms_count',
    'smiles_rings_count',
    'smiles_bonds_count',
    'sequence_length',
    'atom_seq_interaction',
]

# The scaler is fitted on the whole DataFrame and the scaled values are written back
# in place; there is no train/test split at this point.
# NOTE(review): 'Log_Kcat' is scaled together with the other columns; if it is the
# prediction target, confirm that this is intended.
# NOTE(review): this cell is not idempotent. Re-running it recomputes the interaction
# term from the already-scaled columns, so re-run the feature cells above first.
scaler = StandardScaler()
scaler.fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

# Expand the per-row k-mer count dictionaries into dense matrices with DictVectorizer:
# one column per distinct k-mer, holding its count in that row (0 when absent).
# These are count features, not one-hot indicators.
#
# One vectorizer is fitted per k. Refitting a single instance for k=3 would discard
# the k=2 vocabulary, leaving no way to tell which k-mer a "kmer_2_<i>" column stands
# for. Column i of each frame corresponds to `<vectorizer>.feature_names_[i]`.
kmer_2_vectorizer = DictVectorizer(sparse=False)
kmer_3_vectorizer = DictVectorizer(sparse=False)
kmer_2_array = kmer_2_vectorizer.fit_transform(df['kmer_counts_2'].tolist())
kmer_3_array = kmer_3_vectorizer.fit_transform(df['kmer_counts_3'].tolist())

# Backward-compatible alias: this name previously ended up fitted on the 3-mers.
dict_vectorizer = kmer_3_vectorizer

# Wrap the arrays in DataFrames that share df's index, so the column-wise concat
# below lines rows up by position even if df does not have a default RangeIndex.
kmer_2_df = pd.DataFrame(
    kmer_2_array,
    index=df.index,
    columns=[f"kmer_2_{i}" for i in range(kmer_2_array.shape[1])],
)
kmer_3_df = pd.DataFrame(
    kmer_3_array,
    index=df.index,
    columns=[f"kmer_3_{i}" for i in range(kmer_3_array.shape[1])],
)

# Concatenate the k-mer features to the original DataFrame
df_extended = pd.concat([df, kmer_2_df, kmer_3_df], axis=1)

# Show a snippet of the DataFrame with new features
df_extended.head()

In [None]:
# One-hot encode the 'EC_number' and 'Species' columns into a dense array.
# Newer scikit-learn releases renamed the `sparse` argument to `sparse_output` and
# later removed the old name; older releases only know `sparse`. Try the new
# spelling first and fall back, so the cell runs on either.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)
one_hot_encoded = one_hot_encoder.fit_transform(df[['EC_number', 'Species']])

# Convert the one-hot encoded array to a DataFrame. It reuses df's index so the
# column-wise concat below matches rows correctly even for a non-default index.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    index=df.index,
    columns=one_hot_encoder.get_feature_names_out(['EC_number', 'Species']),
)

# Concatenate the one-hot encoded columns to the DataFrame
df_final = pd.concat([df_extended, one_hot_df], axis=1)

# Show a snippet of the DataFrame with one-hot encoded features
df_final.head()
