# Data Understanding

In [1]:
import pandas as pd
import pickle

# Both activity tables are read without a header row, so pandas labels the
# columns with the integers 0, 1 and 2.
activity_train = pd.read_csv('activity_train.csv', header=None)
activity_test_blanked = pd.read_csv('activity_test_blanked.csv', header=None)

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open mol_bits.pkl if it comes from a trusted source.
with open('mol_bits.pkl', 'rb') as pickle_file:
    mol_bits = pickle.load(pickle_file)

separator = "------------------------------------------"

# Preview the first rows of each activity table, each followed by a separator.
for frame in (activity_train, activity_test_blanked):
    print(frame.head())
    print(separator)

# mol_bits is iterated as a mapping (ID -> fingerprint); show its size and the
# first five entries.
print(f"Number of molecular fingerprints: {len(mol_bits)}")
for chembl_id, bits in list(mol_bits.items())[:5]:
    print(separator)
    print(f"ChEMBL ID: {chembl_id}, Fingerprint: {bits}")


        0               1  2
0  O14842   CHEMBL2022243  4
1  O14842   CHEMBL2022244  6
2  O14842   CHEMBL2022245  2
3  O14842   CHEMBL2022246  1
4  O14842   CHEMBL2022247  4
------------------------------------------
        0               1  2
0  O14842   CHEMBL2022258  0
1  O14842   CHEMBL2047161  0
2  O14842   CHEMBL2047163  0
3  O14842   CHEMBL2047168  0
4  O14842   CHEMBL2047169  0
------------------------------------------
Number of molecular fingerprints: 73865
------------------------------------------
ChEMBL ID: CHEMBL2022243, Fingerprint: [10, 38, 50, 80, 107, 113, 180, 217, 315, 322, 365, 366, 389, 400, 545, 602, 650, 654, 679, 695, 713, 718, 736, 745, 797, 799, 807, 850, 875, 926, 935, 1004, 1019, 1027, 1035, 1039, 1050, 1057, 1066, 1088, 1118, 1121, 1160, 1171, 1173, 1179, 1236, 1237, 1325, 1380, 1391, 1419, 1442, 1452, 1456, 1476, 1501, 1611, 1660, 1722, 1737, 1738, 1746, 1747, 1750, 1754, 1758, 1799, 1873, 1917, 1999, 2021]
------------------------------------------
ChE

In [2]:
# Convert mol_bits to a two-column DataFrame (one row per molecule ID)
mol_bits_df = pd.DataFrame(mol_bits.items(), columns=['Molecule_ID', 'Fingerprint'])


def _attach_fingerprints(activity, fingerprints):
    """Return a new frame: `activity` plus a 'Fingerprint' column.

    Parameters
    ----------
    activity : pandas.DataFrame
        Activity table whose column ``1`` holds the molecule ID as a string.
    fingerprints : pandas.DataFrame
        Table with 'Molecule_ID' and 'Fingerprint' columns.

    Returns
    -------
    pandas.DataFrame
        A copy of `activity` with stripped molecule IDs and the matching
        'Fingerprint' joined on; the input frame is not modified.

    Raises
    ------
    ValueError
        If any molecule ID has no matching fingerprint.
    """
    # Dropping a 'Fingerprint' column left by an earlier run keeps this cell
    # re-runnable; otherwise a second merge would create
    # 'Fingerprint_x' / 'Fingerprint_y' and break the cells that follow.
    cleaned = activity.drop(columns=['Fingerprint'], errors='ignore')

    # The molecule IDs in the CSVs carry surrounding whitespace, which would
    # prevent them from matching the keys of mol_bits.
    cleaned[1] = cleaned[1].str.strip()

    merged = (
        cleaned
        .merge(fingerprints, left_on=1, right_on='Molecule_ID', how='left')
        .drop(columns=['Molecule_ID'])  # duplicate of column 1 after the join
    )

    # A left merge leaves NaN where no fingerprint exists; fail here with a
    # clear message rather than later inside the padding step.
    missing_ids = merged.loc[merged['Fingerprint'].isna(), 1]
    if not missing_ids.empty:
        raise ValueError(
            f"{len(missing_ids)} rows have no fingerprint in mol_bits, "
            f"e.g. {missing_ids.head().tolist()}"
        )
    return merged


# Merge activity datasets with molecular fingerprints
activity_train = _attach_fingerprints(activity_train, mol_bits_df)
activity_test_blanked = _attach_fingerprints(activity_test_blanked, mol_bits_df)

# Explore the data
print("Activity Train Dataset:")
print(activity_train.head())
print("Activity Test Blanked Dataset:")
print(activity_test_blanked.head())
print("Molecular Fingerprint Dataset:")
print(mol_bits_df.head())

Activity Train Dataset:
        0              1  2                                        Fingerprint
0  O14842  CHEMBL2022243  4  [10, 38, 50, 80, 107, 113, 180, 217, 315, 322,...
1  O14842  CHEMBL2022244  6  [10, 38, 50, 80, 107, 113, 180, 217, 315, 322,...
2  O14842  CHEMBL2022245  2  [10, 38, 50, 80, 104, 107, 113, 180, 184, 217,...
3  O14842  CHEMBL2022246  1  [10, 38, 50, 80, 107, 113, 118, 123, 217, 315,...
4  O14842  CHEMBL2022247  4  [10, 22, 38, 50, 66, 80, 107, 113, 160, 180, 2...
Activity Test Blanked Dataset:
        0              1  2                                        Fingerprint
0  O14842  CHEMBL2022258  0  [22, 38, 39, 50, 66, 80, 107, 113, 160, 180, 2...
1  O14842  CHEMBL2047161  0  [22, 38, 39, 50, 66, 80, 107, 113, 119, 160, 1...
2  O14842  CHEMBL2047163  0  [3, 22, 38, 39, 50, 66, 80, 107, 113, 160, 166...
3  O14842  CHEMBL2047168  0  [13, 22, 38, 66, 80, 107, 113, 133, 160, 180, ...
4  O14842  CHEMBL2047169  0  [13, 22, 38, 66, 80, 107, 113, 133, 160, 180, .

In [3]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Define function to pad or truncate fingerprint arrays to a fixed length
def pad_fingerprint(fingerprint, length=100):
    """Return `fingerprint` as a 1-D NumPy array with exactly `length` entries.

    Inputs shorter than `length` are right-padded with zeros; longer inputs
    are cut down to their first `length` entries. The result is always a
    freshly allocated ``np.ndarray`` (never a view or slice of the input), so
    every row has the same type and the caller's data cannot be mutated
    through the returned value.

    Parameters
    ----------
    fingerprint : sequence of numbers
        1-D list/tuple/array of fingerprint values.
    length : int, default 100
        Number of entries in the returned array.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(length,)`` with the dtype NumPy infers for the input.

    Raises
    ------
    TypeError
        If `fingerprint` is not one-dimensional, e.g. a scalar NaN left behind
        by a merge for a molecule that has no fingerprint.

    NOTE(review): the values printed by the loading cell look like sorted
    on-bit indices. Positional padding/truncation then treats "the k-th set
    bit" as a feature and discards bits beyond the first `length`; a multi-hot
    encoding may be what is actually wanted -- confirm with the data owner.
    """
    values = np.asarray(fingerprint)
    if values.ndim != 1:
        raise TypeError(
            "fingerprint must be a 1-D sequence, got "
            f"{type(fingerprint).__name__}: {fingerprint!r}"
        )

    # Start from zeros so the tail is already padded, then copy in as many
    # leading entries as fit.
    padded = np.zeros(length, dtype=values.dtype)
    kept = min(values.size, length)
    padded[:kept] = values[:kept]
    return padded

# Bring every fingerprint to the default fixed length of pad_fingerprint so the
# rows can be stacked into a rectangular matrix.
for frame in (activity_train, activity_test_blanked):
    frame['Fingerprint'] = frame['Fingerprint'].apply(pad_fingerprint)

# One row per activity record, one column per fingerprint position
activity_train_array = np.stack(activity_train['Fingerprint'].to_numpy())
activity_test_blanked_array = np.stack(activity_test_blanked['Fingerprint'].to_numpy())

# K-means is fitted on the training rows only; test rows are assigned to the
# nearest learned centroid. Adjust number of clusters as needed.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(activity_train_array)
activity_train['Cluster'] = kmeans.labels_
activity_test_blanked['Cluster'] = kmeans.predict(activity_test_blanked_array)

# The SVD projection is likewise learned from the training rows and then
# applied to both sets. Adjust number of components as needed.
svd = TruncatedSVD(n_components=50, random_state=42).fit(activity_train_array)
activity_train_svd, activity_test_blanked_svd = (
    svd.transform(matrix)
    for matrix in (activity_train_array, activity_test_blanked_array)
)

# Pairwise cosine similarity, train rows x test rows.
# NOTE(review): with the shapes printed below this is a ~135k x 4.6k dense
# matrix (several GB if float64) and no later cell visible here reads it --
# confirm it is needed.
cos_sim = cosine_similarity(activity_train_array, activity_test_blanked_array)

# Report the shapes of the derived matrices
for label, matrix in (
    ("Shape of activity_train_svd:", activity_train_svd),
    ("Shape of activity_test_blanked_svd:", activity_test_blanked_svd),
    ("Shape of cosine similarity matrix:", cos_sim),
):
    print(label, matrix.shape)

Shape of activity_train_svd: (135711, 50)
Shape of activity_test_blanked_svd: (4628, 50)
Shape of cosine similarity matrix: (135711, 4628)


In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import joblib

# Hold out 20% of the SVD-reduced training rows for validation; column 2 of
# activity_train is the regression target. The fixed seed keeps the split
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    activity_train_svd,
    activity_train[2],
    test_size=0.2,
    random_state=42,
)

In [6]:
# Candidate regressors, keyed by the label used in the report below.
# Stochastic estimators share one seed so the comparison is repeatable.
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}

# Fit each candidate on the training split and report its MSE on the
# validation split (lower is better).
for label, estimator in models.items():
    predictions = estimator.fit(X_train, y_train).predict(X_valid)
    validation_mse = mean_squared_error(y_valid, predictions)
    print(f"{label} - Mean Squared Error: {validation_mse}")

Random Forest - Mean Squared Error: 6.8410603456022825
Linear Regression - Mean Squared Error: 7.942160330709447
Decision Tree - Mean Squared Error: 9.625827071269388
Gradient Boosting - Mean Squared Error: 7.823133696828843


In [9]:
# NOTE: the grid search below is disabled by wrapping it in a triple-quoted
# string. Because that string is the cell's only expression, running the cell
# tunes nothing and merely echoes the string's repr as the cell output.
# To re-enable the search, remove the surrounding triple quotes.
'''# Define hyperparameters grid for each model
param_grid = {
    'Decision Tree': {'max_depth': [None, 5, 10, 20]},
    'Random Forest': {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20]},
    'Gradient Boosting': {'n_estimators': [100, 200, 300], 'learning_rate': [0.1, 0.05, 0.01]}
 }
# Perform hyperparameter tuning using grid search
best_models = {}
for name, model in models.items():
    if name in param_grid:
        grid_search = GridSearchCV(model, param_grid[name], cv=5, scoring='neg_mean_squared_error')
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_models[name] = best_model
        print(f"Best hyperparameters for {name}: {grid_search.best_params_}")
        print(f"Best mean squared error for {name}: {-grid_search.best_score_}")'''

'# Define hyperparameters grid for each model\nparam_grid = {\n    \'Decision Tree\': {\'max_depth\': [None, 5, 10, 20]},\n    \'Random Forest\': {\'n_estimators\': [100, 200, 300], \'max_depth\': [None, 10, 20]},\n    \'Gradient Boosting\': {\'n_estimators\': [100, 200, 300], \'learning_rate\': [0.1, 0.05, 0.01]}\n }\n# Perform hyperparameter tuning using grid search\nbest_models = {}\nfor name, model in models.items():\n    if name in param_grid:\n        grid_search = GridSearchCV(model, param_grid[name], cv=5, scoring=\'neg_mean_squared_error\')\n        grid_search.fit(X_train, y_train)\n        best_model = grid_search.best_estimator_\n        best_models[name] = best_model\n        print(f"Best hyperparameters for {name}: {grid_search.best_params_}")\n        print(f"Best mean squared error for {name}: {-grid_search.best_score_}")'

In [10]:
# Refit the chosen model on the full training set (no validation hold-out).
# The choice is hard-coded rather than computed: the hyperparameters repeat the
# 'Random Forest' entry of `models`, which had the lowest validation MSE in the
# comparison output above. Update this line if that ranking changes.
best_model = RandomForestRegressor(n_estimators=100, random_state=42)
best_model.fit(activity_train_svd, activity_train[2])

In [11]:
# Score the SVD-reduced test rows and overwrite column 2 of the test frame
# with the model's predictions.
test_predictions = best_model.predict(activity_test_blanked_svd)
activity_test_blanked[2] = test_predictions

# Dump the whole frame (including the 'Fingerprint' and 'Cluster' helper
# columns added earlier) without the row index.
activity_test_blanked.to_csv('activity_test_predictions.csv', index=False)

In [13]:
import pandas as pd

# Input: the full prediction dump; output: the trimmed file.
file_path = 'activity_test_predictions.csv'
output_file_path = 'activity_test_blanked_predicted.csv'  # Replace with the desired path for the output file

# Re-read the prediction dump from disk
df = pd.read_csv(file_path)

# Keep only the first three columns (positions 0, 1 and 2); any further
# columns in the dump are discarded.
df_selected = df.iloc[:, 0:3]

# Write the trimmed table without the row index
df_selected.to_csv(output_file_path, index=False)