In [1]:
# Mount the user's Google Drive at /content/gdrive so the CSV files stored
# under MyDrive can be read and written by path from this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

# Switch the working directory to the Drive root.
# NOTE: %cd is an IPython line magic, so this cell only runs inside a notebook kernel.
%cd /content/gdrive/MyDrive/


Mounted at /content/gdrive
/content/gdrive/MyDrive
[Errno 2] No such file or directory: 'PATH_TO_DATA_DIRECTORY'
/content/gdrive/MyDrive


In [2]:
import pandas as pd

# Read the labelled training set from the mounted Drive
train_df = pd.read_csv(filepath_or_buffer='/content/gdrive/MyDrive/train.csv')

# Show a preview of the rows, followed by summary statistics of the
# regression target (both printed as plain text, one after the other)
print(train_df.head(), train_df['target'].describe(), sep='\n')

   id                                           sequence     target
0   0  MESNHKSGDGLSGTQKEAALRALVQRTGYSLVQENGQRKYGGPPPG...  43.329788
1   4  MCSLGLFPPPPPRGQVTLYEHNNELVTGSSYESPPPDFRGQWINLP...  51.782791
2   5  MSKEERPGREEILECQVMWEPDSKKNTQMDRFRAAVGAACGLALES...  45.080222
3   6  MRSSCVLLTALVALAAYYVYIPLPGSVSDPWKLMLLDATFRGAQQV...  59.651078
4   7  MNYARFITAASAARNPSPIRTMTDILSRGPKSMISLAGGLPNPNMF...  55.985467
count    8148.000000
mean       52.302745
std         4.738780
min        38.520141
25%        48.608375
50%        51.620779
75%        55.496006
max        69.875198
Name: target, dtype: float64


In [3]:
# Read the unlabelled test set from the mounted Drive
test_df = pd.read_csv(filepath_or_buffer='/content/gdrive/MyDrive/test.csv')

# Preview the leading rows of the test set
print(test_df.head(n=5))

   id                                           sequence
0   1  MALVFVYGTLKRGQPNHRVLRDGAHGSAAFRARGRTLEPYPLVIAG...
1   2  MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...
2   3  MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...
3   8  MAAGVPCALVTSCSSVFSGDQLVQHILGTEDLIVEVTSNDAVRFYP...
4  11  MESESESGAAADTPPLETLSFHGDEEIIEVVELDPGPPDPDDLAQE...


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Build k-mer count features: fit a new vectorizer when none is given, otherwise reuse the fitted one
def generate_kmer_features(data, vectorizer=None, k=3):
    """Turn the sequences in ``data['sequence']`` into k-mer count features.

    Each sequence is split into its overlapping substrings of length ``k``
    (stride 1), the substrings are joined with spaces into one "document" per
    row, and the documents are passed to a count vectorizer.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'sequence'`` column of strings. It is not modified.
    vectorizer : optional
        An already-fitted vectorizer exposing ``transform``. When ``None``, a
        new ``CountVectorizer`` is created and fitted on ``data``. Pass the
        vectorizer returned for the training data when encoding test data so
        both share the same vocabulary.
    k : int, default 3
        Length of each k-mer; must be at least 1.

    Returns
    -------
    (features, vectorizer)
        ``features`` is whatever the vectorizer's ``fit_transform`` /
        ``transform`` returns for the k-mer documents; ``vectorizer`` is the
        vectorizer that produced it (newly fitted or the one passed in).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Build the k-mer documents in a local Series instead of writing a
    # 'kmer_sequence' column into the caller's frame, so the function has no
    # side effects on `data`. Sequences shorter than k yield an empty document.
    kmer_docs = data['sequence'].apply(
        lambda seq: ' '.join(seq[i:i + k] for i in range(len(seq) - k + 1))
    )

    if vectorizer is None:
        # CountVectorizer's default token pattern only keeps tokens of two or
        # more characters, which would discard every 1-mer and fail with an
        # empty vocabulary for k=1. Accept word tokens of any length instead;
        # for k >= 2 on alphanumeric sequences the vocabulary is unchanged.
        vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
        features = vectorizer.fit_transform(kmer_docs)
    else:
        # Reuse the supplied vocabulary; k-mers it has not seen are ignored.
        features = vectorizer.transform(kmer_docs)

    return features, vectorizer


In [5]:
# Regression target for every training row
y_train = train_df['target']

# Fit the 3-mer vocabulary on the training sequences; the fitted vectorizer is
# kept so the test set can later be encoded with the same vocabulary
X_train, vectorizer = generate_kmer_features(data=train_df, k=3)


In [6]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

# Hold out 20% of the rows for validation. The split is bound to new names
# (X_fit / y_fit) instead of overwriting X_train / y_train, so re-running this
# cell always splits the full training matrix rather than an already-shrunk one.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize the Gradient Boosting model with a fixed seed so repeated runs
# of the notebook produce the same fitted model and validation score.
model = GradientBoostingRegressor(random_state=42)

# Single-point parameter grid: keeps the GridSearchCV plumbing in place while
# staying cheap; add more values per key to actually search.
param_grid = {
    'n_estimators': [100],
    'learning_rate': [0.1],
    'max_depth': [3]
}

# Grid search with 3-fold cross-validation on the fitting split only, scored by
# negative MSE and run on all available cores.
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_fit, y_fit)

# Best configuration found by the search
best_model = grid_search.best_estimator_

# Evaluate the model on the held-out validation set
predictions = best_model.predict(X_val)
mse = mean_squared_error(y_val, predictions)
print(f'Mean Squared Error on Validation Set: {mse}')

Mean Squared Error on Validation Set: 17.50510916265811


In [7]:
# Encode the test sequences with the vectorizer that was fitted on the training data
X_test, _ = generate_kmer_features(data=test_df, vectorizer=vectorizer, k=3)

# Score the test set with the model selected by the grid search
test_predictions = best_model.predict(X_test)

# Pair every test 'id' with its predicted target value
predictions_df = test_df[['id']].assign(target=test_predictions)

# Write the predictions to Drive as a CSV without the index column
predictions_file_path = '/content/gdrive/MyDrive/test_predictions.csv'
predictions_df.to_csv(path_or_buf=predictions_file_path, index=False)

print(f"Predictions saved to '{predictions_file_path}'")

Predictions saved to '/content/gdrive/MyDrive/test_predictions.csv'
