HW4: Classification of Green Fluorescent Protein

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


In [None]:
# Load the datasets: three modelling inputs plus the sample-submission file
train_X, train_y, test_X, submission_template = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_X.csv',
        'train_y.csv',
        'test_X.csv',
        'y_sample_submission.csv',
    )
)


In [None]:
# Preprocessing the data
# Encode the amino acid sequences as integer codes (a simple featurization method).
label_encoder = LabelEncoder()

# Fit the encoder ONCE on the residue alphabet found in the train and test
# sequences. Re-fitting inside the per-row lambda would rebuild the
# letter -> integer mapping from each sequence's own set of letters, so the
# same residue could receive different codes in different rows.
residue_alphabet = sorted(
    set(''.join(train_X['ConstructedAASeq_cln']))
    | set(''.join(test_X['ConstructedAASeq_cln']))
)
label_encoder.fit(residue_alphabet)

# Transform the ConstructedAASeq_cln column into lists of integer labels
# using the single shared mapping.
train_X['Encoded_Seq'] = train_X['ConstructedAASeq_cln'].apply(
    lambda x: list(label_encoder.transform(list(x)))
)
test_X['Encoded_Seq'] = test_X['ConstructedAASeq_cln'].apply(
    lambda x: list(label_encoder.transform(list(x)))
)


In [None]:
# Create feature vectors
# Build the matrix in a single DataFrame construction (one column per sequence
# position). Calling `.apply(pd.Series)` instead creates one Series object per
# row, which is very slow for tens of thousands of sequences.
# Shorter sequences are padded with -1 rather than 0, because 0 is itself a
# valid residue code produced by the label encoder.
X = (
    pd.DataFrame(train_X['Encoded_Seq'].tolist(), index=train_X.index)
    .fillna(-1)
    .astype(int)
)
y = train_y['Brightness_Class']

# Split the data into training and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the feature vectors and labels
print(f"Feature matrix (X) shape: {X.shape}")
print(f"Target vector (y) shape: {y.shape}")

# Check the training and validation set sizes
print(f"Training set feature matrix shape: {X_train.shape}, Training set labels shape: {y_train.shape}")
print(f"Validation set feature matrix shape: {X_val.shape}, Validation set labels shape: {y_val.shape}")

# Preview a few rows of the training and validation feature matrices
print("Sample rows from the training feature matrix:\n", X_train.head())
print("Sample rows from the validation feature matrix:\n", X_val.head())


Feature matrix (X) shape: (31029, 237)
Target vector (y) shape: (31029,)
Training set feature matrix shape: (24823, 237), Training set labels shape: (24823,)
Validation set feature matrix shape: (6206, 237), Validation set labels shape: (6206,)
Sample rows from the training feature matrix:
        0    1    2    3    4    5    6    7    8    9    ...  227  228  229  \
9233    15    8    5    3    3    9    4   16    5   17  ...    7   16    6   
19468   15    8    5    3    3    9    4   16    5   17  ...   16   16    6   
12586   15    8    5    3    3    9    4   16    5   17  ...    7   16    6   
16831   15    8    5    3    3    9    4   16    5   17  ...    7   16    6   
25347   15    8    5    3    3    9    4   16    5   17  ...   17   16    6   

       230  231  232  233  234  235  236  
9233     5   10    2    3    9   19    8  
19468    5   10    2    3    9   19    8  
12586    5   10    2    3    9   19    8  
16831    5   10    2    3    9    6    8  
25347    5   10   

**Logistic regression performed best:** it reached an F1 score of 0.87 on the validation split here, and a score of 0.88073 on Kaggle.

In [None]:
# Logistic Regression Performed the best as it gave 0.87 F1 score
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Amino acid property tables (for feature mapping)
# Keys are one-letter amino-acid codes; each value maps a descriptor name
# ('Vol', 'Hydro', 'D1', 'D2', 'T1', 'VHSE1', 'Z(1)') to a number.
# featurize_sequence() emits each entry's values in insertion order, so the
# descriptor order must be the same in every entry.
amino_acid_mapping = {
    'A': {'Vol': -2.9, 'Hydro': -1.03, 'D1': -1.02, 'D2': -2.88, 'T1': -9.11, 'VHSE1': 0.15, 'Z(1)': 0.07},
    'R': {'Vol': 2.41, 'Hydro': 1.31, 'D1': 1.99, 'D2': 4.13, 'T1': 0.23, 'VHSE1': -1.47, 'Z(1)': 2.88},
    'N': {'Vol': -0.68, 'Hydro': 0.79, 'D1': -2.19, 'D2': 1.86, 'T1': -4.62, 'VHSE1': -0.99, 'Z(1)': 3.22},
    'D': {'Vol': -0.92, 'Hydro': 1.23, 'D1': -6.6, 'D2': 3.32, 'T1': -4.65, 'VHSE1': -1.15, 'Z(1)': 3.64},
    'C': {'Vol': -1.89, 'Hydro': 0.15, 'D1': 0.21, 'D2': 1.12, 'T1': -7.35, 'VHSE1': 0.18, 'Z(1)': 0.71},
    'Q': {'Vol': 0.36, 'Hydro': 1.09, 'D1': -0.47, 'D2': 1.16, 'T1': -3.0, 'VHSE1': -0.96, 'Z(1)': 2.18},
    'E': {'Vol': 0.16, 'Hydro': 1.28, 'D1': -5.39, 'D2': 0.65, 'T1': -3.03, 'VHSE1': -1.18, 'Z(1)': 3.08},
    'G': {'Vol': -4.04, 'Hydro': 0.01, 'D1': -2.86, 'D2': -5.0, 'T1': -10.61, 'VHSE1': -0.2, 'Z(1)': 2.23},
    'H': {'Vol': 0.83, 'Hydro': 1.15, 'D1': 0.73, 'D2': 2.68, 'T1': -1.01, 'VHSE1': -0.43, 'Z(1)': 2.41},
    'I': {'Vol': 0.51, 'Hydro': -1.32, 'D1': 1.91, 'D2': -3.13, 'T1': -4.25, 'VHSE1': 1.27, 'Z(1)': -4.44},
    'L': {'Vol': 0.52, 'Hydro': -1.4, 'D1': 1.64, 'D2': -2.57, 'T1': -4.38, 'VHSE1': 1.36, 'Z(1)': -4.19},
    'K': {'Vol': 0.92, 'Hydro': 1.23, 'D1': 2.47, 'D2': 1.54, 'T1': -2.59, 'VHSE1': -1.17, 'Z(1)': 2.84},
    'M': {'Vol': 0.92, 'Hydro': -1.42, 'D1': 1.93, 'D2': -0.01, 'T1': -4.08, 'VHSE1': 1.01, 'Z(1)': -2.49},
    'F': {'Vol': 2.22, 'Hydro': -1.47, 'D1': 2.68, 'D2': 0.84, 'T1': 0.49, 'VHSE1': 1.52, 'Z(1)': -4.92},
    'P': {'Vol': -1.25, 'Hydro': -0.64, 'D1': 0.45, 'D2': -2.89, 'T1': -5.11, 'VHSE1': 0.22, 'Z(1)': -1.22},
    'S': {'Vol': -2.36, 'Hydro': 0.38, 'D1': -1.76, 'D2': -0.19, 'T1': -7.44, 'VHSE1': -0.67, 'Z(1)': 1.96},
    'T': {'Vol': -1.19, 'Hydro': 0.28, 'D1': -0.55, 'D2': -0.66, 'T1': -5.97, 'VHSE1': -0.34, 'Z(1)': 0.92},
    'W': {'Vol': 4.28, 'Hydro': -0.18, 'D1': 3.88, 'D2': 1.78, 'T1': 5.73, 'VHSE1': 1.5, 'Z(1)': -4.75},
    'Y': {'Vol': 2.75, 'Hydro': -0.18, 'D1': 2.1, 'D2': 1.26, 'T1': 2.08, 'VHSE1': 0.61, 'Z(1)': -1.39},
    'V': {'Vol': -0.65, 'Hydro': -1.27, 'D1': 0.83, 'D2': -3.02, 'T1': -5.87, 'VHSE1': 0.76, 'Z(1)': -2.69}
}

# Read the three CSV inputs used below
train_X, train_y, test_X = (
    pd.read_csv(csv_name)
    for csv_name in ('train_X.csv', 'train_y.csv', 'test_X.csv')
)

# Function to featurize amino acid sequences
def featurize_sequence(sequence, aa_mapping):
    """Flatten a residue sequence into one list of descriptor values.

    Args:
        sequence: Iterable of residue symbols (e.g. a string of one-letter codes).
        aa_mapping: Dict mapping a residue symbol to a dict of descriptor values.

    Returns:
        A flat list holding, residue by residue, the descriptor values of each
        symbol in the insertion order of its dict. A symbol that is absent from
        ``aa_mapping`` contributes as many zeros as the first entry of
        ``aa_mapping`` has descriptors.
    """
    flattened = []
    for residue in sequence:
        if residue not in aa_mapping:
            # Unknown symbol: zero block sized like the first table entry.
            template = next(iter(aa_mapping.values()))
            flattened += [0] * len(template)
            continue
        flattened += list(aa_mapping[residue].values())
    return flattened

# Apply feature mapping to the dataset
train_features = train_X['ConstructedAASeq_cln'].apply(lambda x: featurize_sequence(x, amino_acid_mapping))
test_features = test_X['ConstructedAASeq_cln'].apply(lambda x: featurize_sequence(x, amino_acid_mapping))

# Convert feature lists to DataFrame (one row per sequence)
train_features_df = pd.DataFrame(train_features.tolist())
test_features_df = pd.DataFrame(test_features.tolist())

# Extract labels
y_train = train_y['Brightness_Class']

# Split the UNSCALED training data for model evaluation. Splitting before
# scaling keeps the validation rows out of the scaler statistics, so the
# validation scores are not influenced by the rows they are measured on.
raw_train_split, raw_val, y_train_split, y_val = train_test_split(
    train_features_df, y_train, test_size=0.2, random_state=42
)

# Standardize for evaluation using statistics from the training part only
eval_scaler = StandardScaler()
X_train_split = eval_scaler.fit_transform(raw_train_split)
X_val = eval_scaler.transform(raw_val)

# Train multiple models and evaluate
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Support Vector Classifier': SVC(kernel='rbf', random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5)
}

for model_name, model in models.items():
    model.fit(X_train_split, y_train_split)
    y_pred = model.predict(X_val)
    print(f'Performance of {model_name}:')
    print(classification_report(y_val, y_pred))
    print('-' * 50)

# Standardize for the final model: the scaler is fitted on ALL training rows
# and the same transformation is applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_features_df)
X_test = scaler.transform(test_features_df)

# Choose the best model and predict on test data
best_model = LogisticRegression(max_iter=1000, random_state=42)
best_model.fit(X_train, y_train)
y_test_pred = best_model.predict(X_test)

# Save predictions
# NOTE(review): this writes to 'y_sample_submission.csv', the same file that is
# loaded as `submission_template` at the top of the notebook, so the template
# is overwritten. Consider a separate output file name.
test_X['Brightness_Class'] = y_test_pred
test_X[['Id', 'Brightness_Class']].to_csv('y_sample_submission.csv', index=False)


Performance of Random Forest:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87      3777
           1       0.77      0.87      0.82      2429

    accuracy                           0.85      6206
   macro avg       0.84      0.85      0.84      6206
weighted avg       0.85      0.85      0.85      6206

--------------------------------------------------
Performance of Logistic Regression:
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      3777
           1       0.85      0.88      0.87      2429

    accuracy                           0.89      6206
   macro avg       0.89      0.89      0.89      6206
weighted avg       0.89      0.89      0.89      6206

--------------------------------------------------
Performance of Support Vector Classifier:
              precision    recall  f1-score   support

           0       0.91      0.85      0.88      3777
           1       0.79    

The table above, in which each amino acid (represented by a one-letter code such as 'A' or 'R') is associated with several descriptors (e.g., 'Vol', 'Hydro', 'D1'), was constructed from various quantitative descriptors of amino acids. I used several reference CSV files that contained these properties.

Reference Datasets: I had access to multiple CSV files containing properties for each amino acid, such as hydrophobicity (Hydro), volume (Vol), and other descriptors like VHSE and Z-scale values.

Extracting Values: I extracted these values from the reference datasets and organized them per amino acid, so that every amino acid is mapped to multiple features. This allowed me to create an integrated data structure in which each amino acid's attributes are easily accessible.

Combining Properties: These different properties from various CSVs were combined into a single dictionary (amino_acid_mapping). This dictionary is then used for feature mapping, where each amino acid is represented by a set of numerical features instead of just the sequence.

In this code, I wanted to see whether anything would change if I removed the D2 row, but I got the same logistic regression score.

In [None]:
# In this code, I wanted to see if there will be any change if I removed D2 row.
# But I got the same Logistic regression score of 0.87.
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Amino acid property tables (for feature mapping)
# Same layout as the earlier table but without the 'D2' descriptor: each
# one-letter code maps to 'Vol', 'Hydro', 'D1', 'T1', 'VHSE1' and 'Z(1)'.
# featurize_sequence() emits each entry's values in insertion order, so the
# descriptor order must be the same in every entry.
amino_acid_mapping = {
    'A': {'Vol': -2.9, 'Hydro': -1.03, 'D1': -1.02, 'T1': -9.11, 'VHSE1': 0.15, 'Z(1)': 0.07},
    'R': {'Vol': 2.41, 'Hydro': 1.31, 'D1': 1.99, 'T1': 0.23, 'VHSE1': -1.47, 'Z(1)': 2.88},
    'N': {'Vol': -0.68, 'Hydro': 0.79, 'D1': -2.19, 'T1': -4.62, 'VHSE1': -0.99, 'Z(1)': 3.22},
    'D': {'Vol': -0.92, 'Hydro': 1.23, 'D1': -6.6, 'T1': -4.65, 'VHSE1': -1.15, 'Z(1)': 3.64},
    'C': {'Vol': -1.89, 'Hydro': 0.15, 'D1': 0.21, 'T1': -7.35, 'VHSE1': 0.18, 'Z(1)': 0.71},
    'Q': {'Vol': 0.36, 'Hydro': 1.09, 'D1': -0.47, 'T1': -3.0, 'VHSE1': -0.96, 'Z(1)': 2.18},
    'E': {'Vol': 0.16, 'Hydro': 1.28, 'D1': -5.39, 'T1': -3.03, 'VHSE1': -1.18, 'Z(1)': 3.08},
    'G': {'Vol': -4.04, 'Hydro': 0.01, 'D1': -2.86, 'T1': -10.61, 'VHSE1': -0.2, 'Z(1)': 2.23},
    'H': {'Vol': 0.83, 'Hydro': 1.15, 'D1': 0.73, 'T1': -1.01, 'VHSE1': -0.43, 'Z(1)': 2.41},
    'I': {'Vol': 0.51, 'Hydro': -1.32, 'D1': 1.91, 'T1': -4.25, 'VHSE1': 1.27, 'Z(1)': -4.44},
    'L': {'Vol': 0.52, 'Hydro': -1.4, 'D1': 1.64,  'T1': -4.38, 'VHSE1': 1.36, 'Z(1)': -4.19},
    'K': {'Vol': 0.92, 'Hydro': 1.23, 'D1': 2.47, 'T1': -2.59, 'VHSE1': -1.17, 'Z(1)': 2.84},
    'M': {'Vol': 0.92, 'Hydro': -1.42, 'D1': 1.93,  'T1': -4.08, 'VHSE1': 1.01, 'Z(1)': -2.49},
    'F': {'Vol': 2.22, 'Hydro': -1.47, 'D1': 2.68,  'T1': 0.49, 'VHSE1': 1.52, 'Z(1)': -4.92},
    'P': {'Vol': -1.25, 'Hydro': -0.64, 'D1': 0.45,  'T1': -5.11, 'VHSE1': 0.22, 'Z(1)': -1.22},
    'S': {'Vol': -2.36, 'Hydro': 0.38, 'D1': -1.76,  'T1': -7.44, 'VHSE1': -0.67, 'Z(1)': 1.96},
    'T': {'Vol': -1.19, 'Hydro': 0.28, 'D1': -0.55,  'T1': -5.97, 'VHSE1': -0.34, 'Z(1)': 0.92},
    'W': {'Vol': 4.28, 'Hydro': -0.18, 'D1': 3.88,  'T1': 5.73, 'VHSE1': 1.5, 'Z(1)': -4.75},
    'Y': {'Vol': 2.75, 'Hydro': -0.18, 'D1': 2.1,  'T1': 2.08, 'VHSE1': 0.61, 'Z(1)': -1.39},
    'V': {'Vol': -0.65, 'Hydro': -1.27, 'D1': 0.83, 'T1': -5.87, 'VHSE1': 0.76, 'Z(1)': -2.69}
}

# Read the three CSV inputs used below
train_X, train_y, test_X = (
    pd.read_csv(csv_name)
    for csv_name in ('train_X.csv', 'train_y.csv', 'test_X.csv')
)

# Function to featurize amino acid sequences
def featurize_sequence(sequence, aa_mapping):
    """Flatten a residue sequence into one list of descriptor values.

    Args:
        sequence: Iterable of residue symbols (e.g. a string of one-letter codes).
        aa_mapping: Dict mapping a residue symbol to a dict of descriptor values.

    Returns:
        A flat list holding, residue by residue, the descriptor values of each
        symbol in the insertion order of its dict. A symbol that is absent from
        ``aa_mapping`` contributes as many zeros as the first entry of
        ``aa_mapping`` has descriptors.
    """
    flattened = []
    for residue in sequence:
        if residue not in aa_mapping:
            # Unknown symbol: zero block sized like the first table entry.
            template = next(iter(aa_mapping.values()))
            flattened += [0] * len(template)
            continue
        flattened += list(aa_mapping[residue].values())
    return flattened

# Apply feature mapping to the dataset
train_features = train_X['ConstructedAASeq_cln'].apply(lambda x: featurize_sequence(x, amino_acid_mapping))
test_features = test_X['ConstructedAASeq_cln'].apply(lambda x: featurize_sequence(x, amino_acid_mapping))

# Convert feature lists to DataFrame (one row per sequence)
train_features_df = pd.DataFrame(train_features.tolist())
test_features_df = pd.DataFrame(test_features.tolist())

# Extract labels
y_train = train_y['Brightness_Class']

# Split the UNSCALED training data for model evaluation. Splitting before
# scaling keeps the validation rows out of the scaler statistics, so the
# validation scores are not influenced by the rows they are measured on.
raw_train_split, raw_val, y_train_split, y_val = train_test_split(
    train_features_df, y_train, test_size=0.2, random_state=42
)

# Standardize for evaluation using statistics from the training part only
eval_scaler = StandardScaler()
X_train_split = eval_scaler.fit_transform(raw_train_split)
X_val = eval_scaler.transform(raw_val)

# Train multiple models and evaluate
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Support Vector Classifier': SVC(kernel='rbf', random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5)
}

for model_name, model in models.items():
    model.fit(X_train_split, y_train_split)
    y_pred = model.predict(X_val)
    print(f'Performance of {model_name}:')
    print(classification_report(y_val, y_pred))
    print('-' * 50)

# Standardize for the final model: the scaler is fitted on ALL training rows
# and the same transformation is applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_features_df)
X_test = scaler.transform(test_features_df)

# Choose the best model and predict on test data
best_model = LogisticRegression(max_iter=1000, random_state=42)
best_model.fit(X_train, y_train)
y_test_pred = best_model.predict(X_test)

# Save predictions
# NOTE(review): this writes to 'y_sample_submission.csv', the same file that is
# loaded as `submission_template` at the top of the notebook, so the template
# is overwritten. Consider a separate output file name.
test_X['Brightness_Class'] = y_test_pred
test_X[['Id', 'Brightness_Class']].to_csv('y_sample_submission.csv', index=False)

Performance of Random Forest:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87      3777
           1       0.77      0.87      0.82      2429

    accuracy                           0.85      6206
   macro avg       0.84      0.85      0.84      6206
weighted avg       0.85      0.85      0.85      6206

--------------------------------------------------
Performance of Logistic Regression:
              precision    recall  f1-score   support

           0       0.93      0.90      0.91      3777
           1       0.85      0.89      0.87      2429

    accuracy                           0.89      6206
   macro avg       0.89      0.89      0.89      6206
weighted avg       0.90      0.89      0.90      6206

--------------------------------------------------
Performance of Support Vector Classifier:
              precision    recall  f1-score   support

           0       0.91      0.85      0.88      3777
           1       0.78    

In this code, I also included the name of each amino acid to see whether the result would differ, but nothing changed.

In [None]:
# Amino acids including a row with the names this time
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

# Amino acid property tables (for feature mapping)
# Each one-letter code maps to a three-letter 'Name' string followed by the
# numeric descriptors 'Vol', 'Hydro', 'D1', 'T1', 'VHSE1' and 'Z(1)'.
# 'Name' is text, not a feature: featurize_sequence() pops it out before
# collecting the numeric values.
amino_acid_mapping = {
    'A': {'Name': 'Ala', 'Vol': -2.9, 'Hydro': -1.03, 'D1': -1.02, 'T1': -9.11, 'VHSE1': 0.15, 'Z(1)': 0.07},
    'R': {'Name': 'Arg', 'Vol': 2.41, 'Hydro': 1.31, 'D1': 1.99, 'T1': 0.23, 'VHSE1': -1.47, 'Z(1)': 2.88},
    'N': {'Name': 'Asn', 'Vol': -0.68, 'Hydro': 0.79, 'D1': -2.19, 'T1': -4.62, 'VHSE1': -0.99, 'Z(1)': 3.22},
    'D': {'Name': 'Asp', 'Vol': -0.92, 'Hydro': 1.23, 'D1': -6.6, 'T1': -4.65, 'VHSE1': -1.15, 'Z(1)': 3.64},
    'C': {'Name': 'Cys', 'Vol': -1.89, 'Hydro': 0.15, 'D1': 0.21, 'T1': -7.35, 'VHSE1': 0.18, 'Z(1)': 0.71},
    'Q': {'Name': 'Gln', 'Vol': 0.36, 'Hydro': 1.09, 'D1': -0.47, 'T1': -3.0, 'VHSE1': -0.96, 'Z(1)': 2.18},
    'E': {'Name': 'Glu', 'Vol': 0.16, 'Hydro': 1.28, 'D1': -5.39, 'T1': -3.03, 'VHSE1': -1.18, 'Z(1)': 3.08},
    'G': {'Name': 'Gly', 'Vol': -4.04, 'Hydro': 0.01, 'D1': -2.86, 'T1': -10.61, 'VHSE1': -0.2, 'Z(1)': 2.23},
    'H': {'Name': 'His', 'Vol': 0.83, 'Hydro': 1.15, 'D1': 0.73, 'T1': -1.01, 'VHSE1': -0.43, 'Z(1)': 2.41},
    'I': {'Name': 'Ile', 'Vol': 0.51, 'Hydro': -1.32, 'D1': 1.91, 'T1': -4.25, 'VHSE1': 1.27, 'Z(1)': -4.44},
    'L': {'Name': 'Leu', 'Vol': 0.52, 'Hydro': -1.4, 'D1': 1.64,  'T1': -4.38, 'VHSE1': 1.36, 'Z(1)': -4.19},
    'K': {'Name': 'Lys', 'Vol': 0.92, 'Hydro': 1.23, 'D1': 2.47, 'T1': -2.59, 'VHSE1': -1.17, 'Z(1)': 2.84},
    'M': {'Name': 'Met', 'Vol': 0.92, 'Hydro': -1.42, 'D1': 1.93,  'T1': -4.08, 'VHSE1': 1.01, 'Z(1)': -2.49},
    'F': {'Name': 'Phe', 'Vol': 2.22, 'Hydro': -1.47, 'D1': 2.68,  'T1': 0.49, 'VHSE1': 1.52, 'Z(1)': -4.92},
    'P': {'Name': 'Pro', 'Vol': -1.25, 'Hydro': -0.64, 'D1': 0.45,  'T1': -5.11, 'VHSE1': 0.22, 'Z(1)': -1.22},
    'S': {'Name': 'Ser', 'Vol': -2.36, 'Hydro': 0.38, 'D1': -1.76,  'T1': -7.44, 'VHSE1': -0.67, 'Z(1)': 1.96},
    'T': {'Name': 'Thr', 'Vol': -1.19, 'Hydro': 0.28, 'D1': -0.55,  'T1': -5.97, 'VHSE1': -0.34, 'Z(1)': 0.92},
    'W': {'Name': 'Trp', 'Vol': 4.28, 'Hydro': -0.18, 'D1': 3.88,  'T1': 5.73, 'VHSE1': 1.5, 'Z(1)': -4.75},
    'Y': {'Name': 'Tyr', 'Vol': 2.75, 'Hydro': -0.18, 'D1': 2.1,  'T1': 2.08, 'VHSE1': 0.61, 'Z(1)': -1.39},
    'V': {'Name': 'Val', 'Vol': -0.65, 'Hydro': -1.27, 'D1': 0.83, 'T1': -5.87, 'VHSE1': 0.76, 'Z(1)': -2.69}
}

# Read the three CSV inputs used below
train_X, train_y, test_X = (
    pd.read_csv(csv_name)
    for csv_name in ('train_X.csv', 'train_y.csv', 'test_X.csv')
)

# Function to featurize amino acid sequences
def featurize_sequence(sequence, aa_mapping):
    """Flatten a residue sequence into descriptor values plus residue names.

    Args:
        sequence: Iterable of residue symbols (e.g. a string of one-letter codes).
        aa_mapping: Dict mapping a residue symbol to a dict that holds a 'Name'
            entry and the residue's descriptor values.

    Returns:
        A ``(values, labels)`` tuple. ``values`` is the flat list of every
        non-'Name' entry of each residue, in dict order; ``labels`` holds the
        'Name' of each residue. A symbol missing from ``aa_mapping`` is
        labelled 'Unknown' and contributes zeros, one fewer than the number of
        entries in the first item of ``aa_mapping``.
    """
    values = []
    labels = []
    for residue in sequence:
        if residue not in aa_mapping:
            labels.append('Unknown')
            # The first table entry minus its 'Name' field sets the zero-block width.
            zero_width = len(next(iter(aa_mapping.values()))) - 1
            values.extend([0] * zero_width)
            continue
        entry = aa_mapping[residue]
        labels.append(entry['Name'])
        values.extend(value for key, value in entry.items() if key != 'Name')
    return values, labels

# Apply feature mapping to the dataset; each element is a (features, names) tuple
train_features_names = train_X['ConstructedAASeq_cln'].apply(lambda x: featurize_sequence(x, amino_acid_mapping))
test_features_names = test_X['ConstructedAASeq_cln'].apply(lambda x: featurize_sequence(x, amino_acid_mapping))

# Split features and names
train_features = train_features_names.apply(lambda x: x[0])
train_names = train_features_names.apply(lambda x: x[1])
test_features = test_features_names.apply(lambda x: x[0])
test_names = test_features_names.apply(lambda x: x[1])

# Convert feature lists to DataFrame (one row per sequence)
train_features_df = pd.DataFrame(train_features.tolist())
test_features_df = pd.DataFrame(test_features.tolist())

# Extract labels
y_train = train_y['Brightness_Class']

# Split the UNSCALED training data for model evaluation. Splitting before
# scaling keeps the validation rows out of the scaler statistics, so the
# validation scores are not influenced by the rows they are measured on.
raw_train_split, raw_val, y_train_split, y_val = train_test_split(
    train_features_df, y_train, test_size=0.2, random_state=42
)

# Standardize for evaluation using statistics from the training part only
eval_scaler = StandardScaler()
X_train_split = eval_scaler.fit_transform(raw_train_split)
X_val = eval_scaler.transform(raw_val)

# Train multiple models and evaluate
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Support Vector Classifier': SVC(kernel='rbf', random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5)
}

for model_name, model in models.items():
    model.fit(X_train_split, y_train_split)
    y_pred = model.predict(X_val)
    print(f'Performance of {model_name}:')
    print(classification_report(y_val, y_pred))
    print('-' * 50)

# Standardize for the final model: the scaler is fitted on ALL training rows
# and the same transformation is applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_features_df)
X_test = scaler.transform(test_features_df)

# Choose the best model and predict on test data
best_model = LogisticRegression(max_iter=1000, random_state=42)
best_model.fit(X_train, y_train)
y_test_pred = best_model.predict(X_test)

# Save predictions
# NOTE(review): this writes to 'y_sample_submission.csv', the same file that is
# loaded as `submission_template` at the top of the notebook, so the template
# is overwritten. Consider a separate output file name.
test_X['Brightness_Class'] = y_test_pred
test_X[['Id', 'Brightness_Class']].to_csv('y_sample_submission.csv', index=False)

# Save amino acid names for each sequence (comma-joined, one row per test Id)
test_X['Amino_Acid_Names'] = test_names.apply(lambda x: ','.join(x))
test_X[['Id', 'Amino_Acid_Names']].to_csv('amino_acid_names.csv', index=False)


Performance of Random Forest:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87      3777
           1       0.77      0.87      0.82      2429

    accuracy                           0.85      6206
   macro avg       0.84      0.85      0.84      6206
weighted avg       0.85      0.85      0.85      6206

--------------------------------------------------
Performance of Logistic Regression:
              precision    recall  f1-score   support

           0       0.93      0.90      0.91      3777
           1       0.85      0.89      0.87      2429

    accuracy                           0.89      6206
   macro avg       0.89      0.89      0.89      6206
weighted avg       0.90      0.89      0.90      6206

--------------------------------------------------
Performance of Support Vector Classifier:
              precision    recall  f1-score   support

           0       0.91      0.85      0.88      3777
           1       0.78    

Here I included the rows D1, D2, D3, ..., D10 to see whether the result would differ. I got a lower F1 score for all four models, but logistic regression still gave a better result compared to the other three. Thus, for this dataset, logistic regression worked best.

In [None]:
# Amino acids with the additional descriptor rows (D3-D10, Ist, 2nd, 3rd) alongside D1 and D2
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

# Amino acid property tables (for feature mapping)
# Each one-letter code maps descriptor names to numbers.
# NOTE(review): only 'A', 'R', 'N', 'D' and 'C' carry the additional
# descriptors ('D3'..'D10', 'Ist', '2nd', '3rd'); the remaining 15 entries stop
# at 'Z(1)'. The entries therefore do not all hold the same number of values,
# and any consumer that relies on a fixed number of values per residue must
# account for that.
# NOTE(review): the key 'Ist' presumably stands for '1st' - TODO confirm
# against the reference CSV.
amino_acid_mapping = {
    'A': {'Vol': -2.9, 'Hydro': -1.03, 'D1': -1.02, 'D2': -2.88, 'T1': -9.11, 'VHSE1': 0.15, 'Z(1)': 0.07, 'D3': -0.56, 'D4': 0.36, 'D5': -6.15, 'D6': -1.68, 'D7': 0.04, 'D8': -2.51, 'D9': -1.94, 'D10': -0.01, 'Ist': -0.73, '2nd': 0.2, '3rd': -0.62},
    'R': {'Vol': 2.41, 'Hydro': 1.31, 'D1': 1.99, 'D2': 4.13, 'T1': 0.23, 'VHSE1': -1.47, 'Z(1)': 2.88, 'D3': -4.41, 'D4': -1.02, 'D5': 4.78, 'D6': 3.04, 'D7': -9.06, 'D8': 6.71, 'D9': 4.41, 'D10': 0.07, 'Ist': -0.22, '2nd': 0.27, '3rd': 1},
    'N': {'Vol': -0.68, 'Hydro': 0.79, 'D1': -2.19, 'D2': 1.86, 'T1': -4.62, 'VHSE1': -0.99, 'Z(1)': 3.22, 'D3': 0.38, 'D4': -0.13, 'D5': -2.3, 'D6': 1.41, 'D7': -5.71, 'D8': -1.11, 'D9': 1.73, 'D10': -0.19, 'Ist': 0.14, '2nd': 0.2, '3rd': -0.66},
    'D': {'Vol': -0.92, 'Hydro': 1.23, 'D1': -6.6, 'D2': 3.32, 'T1': -4.65, 'VHSE1': -1.15, 'Z(1)': 3.64, 'D3': 1.61, 'D4': 0.36, 'D5': -3.25, 'D6': 1.95, 'D7': -7.36, 'D8': 0.14, 'D9': 1.24, 'D10': -0.15, 'Ist': 0.11, '2nd': -1, '3rd': -0.96},
    'C': {'Vol': -1.89, 'Hydro': 0.15, 'D1': 0.21, 'D2': 1.12, 'T1': -7.35, 'VHSE1': 0.18, 'Z(1)': 0.71, 'D3': 3.42, 'D4': -0.68, 'D5': -2.27, 'D6': -1.22, 'D7': 3.11, 'D8': -2.98, 'D9': -1.7, 'D10': 1.57, 'Ist': -0.66, '2nd': 0.26, '3rd': -0.27},
    'Q': {'Vol': 0.36, 'Hydro': 1.09, 'D1': -0.47, 'D2': 1.16, 'T1': -3.0, 'VHSE1': -0.96, 'Z(1)': 2.18},
    'E': {'Vol': 0.16, 'Hydro': 1.28, 'D1': -5.39, 'D2': 0.65, 'T1': -3.03, 'VHSE1': -1.18, 'Z(1)': 3.08},
    'G': {'Vol': -4.04, 'Hydro': 0.01, 'D1': -2.86, 'D2': -5.0, 'T1': -10.61, 'VHSE1': -0.2, 'Z(1)': 2.23},
    'H': {'Vol': 0.83, 'Hydro': 1.15, 'D1': 0.73, 'D2': 2.68, 'T1': -1.01, 'VHSE1': -0.43, 'Z(1)': 2.41},
    'I': {'Vol': 0.51, 'Hydro': -1.32, 'D1': 1.91, 'D2': -3.13, 'T1': -4.25, 'VHSE1': 1.27, 'Z(1)': -4.44},
    'L': {'Vol': 0.52, 'Hydro': -1.4, 'D1': 1.64, 'D2': -2.57, 'T1': -4.38, 'VHSE1': 1.36, 'Z(1)': -4.19},
    'K': {'Vol': 0.92, 'Hydro': 1.23, 'D1': 2.47, 'D2': 1.54, 'T1': -2.59, 'VHSE1': -1.17, 'Z(1)': 2.84},
    'M': {'Vol': 0.92, 'Hydro': -1.42, 'D1': 1.93, 'D2': -0.01, 'T1': -4.08, 'VHSE1': 1.01, 'Z(1)': -2.49},
    'F': {'Vol': 2.22, 'Hydro': -1.47, 'D1': 2.68, 'D2': 0.84, 'T1': 0.49, 'VHSE1': 1.52, 'Z(1)': -4.92},
    'P': {'Vol': -1.25, 'Hydro': -0.64, 'D1': 0.45, 'D2': -2.89, 'T1': -5.11, 'VHSE1': 0.22, 'Z(1)': -1.22},
    'S': {'Vol': -2.36, 'Hydro': 0.38, 'D1': -1.76, 'D2': -0.19, 'T1': -7.44, 'VHSE1': -0.67, 'Z(1)': 1.96},
    'T': {'Vol': -1.19, 'Hydro': 0.28, 'D1': -0.55, 'D2': -0.66, 'T1': -5.97, 'VHSE1': -0.34, 'Z(1)': 0.92},
    'W': {'Vol': 4.28, 'Hydro': -0.18, 'D1': 3.88, 'D2': 1.78, 'T1': 5.73, 'VHSE1': 1.5, 'Z(1)': -4.75},
    'Y': {'Vol': 2.75, 'Hydro': -0.18, 'D1': 2.1, 'D2': 1.26, 'T1': 2.08, 'VHSE1': 0.61, 'Z(1)': -1.39},
    'V': {'Vol': -0.65, 'Hydro': -1.27, 'D1': 0.83, 'D2': -3.02, 'T1': -5.87, 'VHSE1': 0.76, 'Z(1)': -2.69}
}

# Read the three CSV inputs used below
train_X, train_y, test_X = (
    pd.read_csv(csv_name)
    for csv_name in ('train_X.csv', 'train_y.csv', 'test_X.csv')
)

# Function to featurize amino acid sequences
def featurize_sequence(sequence, aa_mapping):
    """Flatten a residue sequence into a fixed-width block of values per residue.

    Every residue contributes exactly one value for each descriptor name that
    occurs anywhere in ``aa_mapping`` (names taken in first-seen order). This
    keeps feature columns aligned across sequences even when some table
    entries define more descriptors than others; emitting each entry's own
    values as-is would shift every later column whenever a residue with a
    longer or shorter entry appears.

    For a table whose entries all share the same descriptors in the same
    order, the result is the concatenation of each residue's values.

    Args:
        sequence: Iterable of residue symbols (e.g. a string of one-letter codes).
        aa_mapping: Dict mapping a residue symbol to a dict of descriptor values.

    Returns:
        A flat list of length ``len(sequence) * number_of_descriptor_names``.
        A descriptor that a known residue does not define is emitted as NaN
        (to be imputed downstream); a symbol missing from ``aa_mapping``
        contributes zeros for every descriptor.
    """
    # Ordered union of descriptor names; dict.fromkeys de-duplicates while
    # preserving first-seen order.
    descriptor_names = list(dict.fromkeys(
        name for props in aa_mapping.values() for name in props
    ))
    missing = float('nan')

    features = []
    for aa in sequence:
        props = aa_mapping.get(aa)
        if props is None:
            # Unknown residue: zero block of the common width.
            features.extend([0] * len(descriptor_names))
        else:
            features.extend(props.get(name, missing) for name in descriptor_names)
    return features

# Apply feature mapping to the dataset
train_features = train_X['ConstructedAASeq_cln'].apply(lambda x: featurize_sequence(x, amino_acid_mapping))
test_features = test_X['ConstructedAASeq_cln'].apply(lambda x: featurize_sequence(x, amino_acid_mapping))

# Convert feature lists to DataFrame (one row per sequence; may contain NaN)
train_features_df = pd.DataFrame(train_features.tolist())
test_features_df = pd.DataFrame(test_features.tolist())

# Extract labels
y_train = train_y['Brightness_Class']

# Split the RAW training data for model evaluation. Splitting before any
# preprocessing keeps the validation rows out of the imputer means and scaler
# statistics, so the validation scores are not influenced by the rows they
# are measured on.
raw_train_split, raw_val, y_train_split, y_val = train_test_split(
    train_features_df, y_train, test_size=0.2, random_state=42
)

# Impute and standardize for evaluation using the training part only
eval_imputer = SimpleImputer(strategy='mean')
eval_scaler = StandardScaler()
X_train_split = eval_scaler.fit_transform(eval_imputer.fit_transform(raw_train_split))
X_val = eval_scaler.transform(eval_imputer.transform(raw_val))

# Train multiple models and evaluate
# NOTE(review): the recorded output of this cell shows LogisticRegression
# stopping at the iteration limit with max_iter=1000, so that fit may not have
# converged.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Support Vector Classifier': SVC(kernel='rbf', random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5)
}

for model_name, model in models.items():
    model.fit(X_train_split, y_train_split)
    y_pred = model.predict(X_val)
    print(f'Performance of {model_name}:')
    print(classification_report(y_val, y_pred))
    print('-' * 50)

# Final preprocessing, fitted on ALL training rows and applied to the test rows.
# Handle missing values by imputing them with the mean of each column
imputer = SimpleImputer(strategy='mean')
train_features_df = imputer.fit_transform(train_features_df)
test_features_df = imputer.transform(test_features_df)

# Ensure there are no NaN values left
assert not np.isnan(train_features_df).any(), "There are still NaN values in train_features_df"
assert not np.isnan(test_features_df).any(), "There are still NaN values in test_features_df"

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(train_features_df)
X_test = scaler.transform(test_features_df)

# Choose the best model and predict on test data
best_model = LogisticRegression(max_iter=1000, random_state=42)
best_model.fit(X_train, y_train)
y_test_pred = best_model.predict(X_test)

# Save predictions
# NOTE(review): this writes to 'y_sample_submission.csv', the same file that is
# loaded as `submission_template` at the top of the notebook, so the template
# is overwritten. Consider a separate output file name.
test_X['Brightness_Class'] = y_test_pred
test_X[['Id', 'Brightness_Class']].to_csv('y_sample_submission.csv', index=False)



Performance of Random Forest:
              precision    recall  f1-score   support

           0       0.85      0.79      0.82      3777
           1       0.71      0.78      0.74      2429

    accuracy                           0.79      6206
   macro avg       0.78      0.79      0.78      6206
weighted avg       0.79      0.79      0.79      6206

--------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Performance of Logistic Regression:
              precision    recall  f1-score   support

           0       0.90      0.85      0.87      3777
           1       0.79      0.85      0.82      2429

    accuracy                           0.85      6206
   macro avg       0.84      0.85      0.84      6206
weighted avg       0.85      0.85      0.85      6206

--------------------------------------------------
Performance of Support Vector Classifier:
              precision    recall  f1-score   support

           0       0.81      0.75      0.78      3777
           1       0.66      0.73      0.69      2429

    accuracy                           0.74      6206
   macro avg       0.73      0.74      0.74      6206
weighted avg       0.75      0.74      0.75      6206

--------------------------------------------------
Performance of K-Nearest Neighbors:
              precision    recall  f1-score   support

           0       0.79      0.52      0.63      3777
           1       0.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
