# Import some packages

In [None]:
import numpy as np
import pandas as pd
import time
import torch

# Load data

In [None]:
# Read the Kaggle train and test splits from the relative data directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/kaggle/train.csv", "../data/kaggle/test.csv")
)

# Report the row/column counts, then preview the first rows of each frame.
print(f"Train dataframe has shape: {train_df.shape}")
print(f"Test dataframe has shape: {test_df.shape}")
display(train_df.head(), test_df.head())

Train dataframe has shape: (31390, 5)
Test dataframe has shape: (2413, 4)


Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5


Unnamed: 0,seq_id,protein_sequence,pH,data_source
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes


In [None]:
# Record the number of residues in every sequence as a new column on each frame.
train_df["protein_sequence_len"] = train_df["protein_sequence"].map(len)
test_df["protein_sequence_len"] = test_df["protein_sequence"].map(len)

Only use sequences of length 221 or shorter (the filter below keeps `protein_sequence_len <= 221`)

In [None]:
from scipy.sparse import csr_matrix

# Keep only the sequences that fit in 221 positions.
# The index is rebuilt with drop=True in a single expression so the cell can be
# re-run safely: reset_index(inplace=True) without drop=True adds an "index"
# column on the first run, a "level_0" column on the second, and raises on the
# third. It also mutated a filtered slice of the original frame in place.
train_df = train_df[train_df["protein_sequence_len"] <= 221].reset_index(drop=True)

# One row per protein and one column per residue position; sequences shorter
# than the longest one are padded on the right with missing values.
sequences = [list(string) for string in train_df["protein_sequence"].values.tolist()]
sequences_train = pd.DataFrame(sequences)
sequences_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,211,212,213,214,215,216,217,218,219,220
0,A,A,F,Q,V,T,S,N,E,I,...,,,,,,,,,,
1,A,A,G,G,Q,P,Q,G,A,T,...,A,Q,Q,Q,C,N,,,,
2,A,A,I,G,I,G,I,L,G,G,...,,,,,,,,,,
3,A,A,K,S,G,D,A,E,E,A,...,,,,,,,,,,
4,A,A,L,A,L,G,L,P,A,F,...,,,,,,,,,,


# Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every residue-position column and attach the regression target.
# DataFrame.apply calls fit_transform once per column, so each column gets its
# own mapping: the same integer can stand for different residues in different
# columns.
sequences_train = (
    sequences_train
    .apply(LabelEncoder().fit_transform)
    .assign(tm=train_df["tm"])
)
sequences_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,212,213,214,215,216,217,218,219,220,tm
0,0,0,4,13,17,16,15,11,3,7,...,20,19,20,20,20,20,20,20,18,49.7
1,0,0,5,5,13,12,13,5,0,16,...,13,13,13,1,11,20,20,20,18,45.1
2,0,0,7,5,7,5,7,9,5,5,...,20,19,20,20,20,20,20,20,18,62.8
3,0,0,8,15,5,2,0,3,3,0,...,20,19,20,20,20,20,20,20,18,36.3
4,0,0,9,0,9,5,9,12,0,4,...,20,19,20,20,20,20,20,20,18,83.0


# Using XGBoost Regressor

In [None]:
from sklearn.model_selection import train_test_split
# import xgboost

# Features are every column except the target; the target is kept as a
# single-column frame, so y.values has shape (n_samples, 1).
X = sequences_train.drop(columns="tm")
y = sequences_train[["tm"]]

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    test_size=0.2,
    random_state=42,
)
# # create an xgboost regression model
# model_xgboost = xgboost.XGBRegressor(n_estimators=500, max_depth=15)
# model_xgboost.fit(X_train, y_train)
# y_pred = model_xgboost.predict(X_test)

In [None]:
# from scipy import stats

# stats.spearmanr(y_test, y_pred)

# Predictions

In [None]:
from scipy.sparse import csr_matrix

# Keep the test sequences that fit in 221 positions and rebuild the index so
# that later positional/index-based assignments of seq_id line up row by row.
test_df = test_df[test_df["protein_sequence_len"] <= 221].reset_index(drop=True)
sequences = [list(string) for string in test_df["protein_sequence"].values.tolist()]

# The models are trained on per-column label encodings fitted on the TRAINING
# characters. Fitting fresh encoders on the test columns (as was done before)
# assigns unrelated integers to the same residues, so the test features would
# not mean what the models learned. Re-derive the raw training characters and
# reuse their per-column vocabularies instead.
train_chars = pd.DataFrame([list(string) for string in train_df["protein_sequence"]])
# Align the test layout with the training layout (same positions, same order).
test_chars = pd.DataFrame(sequences).reindex(columns=train_chars.columns)


def _encode_like_train(train_column, test_column):
    """Encode ``test_column`` with the label vocabulary of ``train_column``.

    A LabelEncoder is fitted on the training column, exactly as in the training
    cell, and its ``classes_`` give the residue -> integer mapping. Padding
    (missing values) is mapped to the code the encoder gave to missing values,
    and residues never seen at this position in training are mapped to -1.
    """
    encoder = LabelEncoder().fit(train_column)
    codes = {}
    padding_code = -1
    for code, label in enumerate(encoder.classes_):
        if pd.isna(label):
            padding_code = code
        else:
            codes[label] = code
    return [
        padding_code if pd.isna(residue) else codes.get(residue, -1)
        for residue in test_column
    ]


sequences_test = pd.DataFrame(
    {
        position: _encode_like_train(train_chars[position], test_chars[position])
        for position in train_chars.columns
    },
    index=test_chars.index,
)
sequences_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,211,212,213,214,215,216,217,218,219,220
0,0,0,0,0,0,0,0,0,0,0,...,7,11,7,5,1,8,13,15,2,6
1,0,0,0,0,0,0,0,0,0,0,...,7,11,7,5,1,8,13,15,2,6
2,0,0,0,0,0,0,0,0,0,0,...,10,11,6,2,5,16,11,4,4,13
3,0,0,0,0,0,0,0,0,0,0,...,7,11,7,5,1,8,13,15,2,6
4,0,0,0,0,0,0,0,0,0,0,...,7,11,7,5,1,8,13,15,2,6


In [None]:
def make_prediction(model, file, features=None, seq_ids=None):
  """Predict `tm` with a fitted model and write a submission CSV.

  Parameters
  ----------
  model : object
      Fitted estimator exposing ``predict(features)``.
  file : str
      Path of the CSV file to write (columns ``tm`` and ``seq_id``, no index).
  features : array-like, optional
      Feature matrix to predict on. Defaults to the notebook-level
      ``sequences_test.values``.
  seq_ids : array-like, optional
      Sequence ids, one per row of ``features``. Defaults to the notebook-level
      ``test_df["seq_id"]``.
  """
  if features is None:
    features = sequences_test.values
  if seq_ids is None:
    seq_ids = test_df["seq_id"]
  print(f'Saving results to {file}')
  submission = pd.DataFrame()
  # Flatten: estimators fitted on a column-vector target may return (n, 1).
  submission["tm"] = np.asarray(model.predict(features)).ravel()
  # Assign ids by position. Assigning a Series would align on its index, which
  # no longer matches 0..n-1 once test_df has been filtered, producing NaN or
  # shifted ids.
  submission["seq_id"] = np.asarray(seq_ids)
  submission.to_csv(file, index=False)

In [None]:
# make_prediction(model_xgboost, "XBGpred.csv")

# Importing Models

In [None]:
from sklearn.linear_model import LinearRegression, SGDRegressor, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor

In [None]:
def save_pred(preds, file, startAt=0):
    """Write predictions to ``file`` as a CSV with ``seq_id`` and ``tm`` columns.

    ``seq_id`` is a run of consecutive integers beginning at ``startAt``, one
    per prediction; ``tm`` holds the values of ``preds``. The index is not
    written.
    """
    print('Saving results to {}'.format(file))
    ids = range(startAt, startAt + len(preds))
    frame = pd.DataFrame({'tm': preds})
    # Put the id column in front of the predictions.
    frame.insert(loc=0, column='seq_id', value=ids)
    frame.to_csv(file, index=False)

In [None]:
# Fit each tree-based regressor and export its predictions.
# - y_train comes out of the split as a column vector; it is flattened here so
#   the estimators receive the 1-D target they expect instead of warning and
#   converting it themselves.
# - Every estimator is seeded so repeated runs write the same submission files.
y_train_flat = np.ravel(y_train)
tree_models = {
    'DTRpred.csv': DecisionTreeRegressor(random_state=42),
    'RFRpred.csv': RandomForestRegressor(random_state=42),
    'ABRpred.csv': AdaBoostRegressor(random_state=42),
    'GBRpred.csv': GradientBoostingRegressor(random_state=42),
}
for out_file, regressor in tree_models.items():
    make_prediction(regressor.fit(X_train, y_train_flat), out_file)

Saving results to DTRpred.csv


  


KeyboardInterrupt: ignored

In [None]:
def tryPipeline(scaler, regType, file):
    """Fit a scaler + regressor pipeline and write its test predictions to CSV.

    Parameters
    ----------
    scaler : callable
        Zero-argument factory for the preprocessing step (e.g. a scaler class).
    regType : callable
        Zero-argument factory for the regressor.
    file : str
        Path of the CSV file to write (columns ``tm`` and ``seq_id``, no index).

    Uses the notebook-level ``X_train``, ``y_train``, ``sequences_test`` and
    ``test_df``.
    """
    pipe = make_pipeline(scaler(), regType())
    # Flatten the target: y_train is a column vector, which makes some
    # regressors warn and others return (n, 1) predictions.
    pipe.fit(X_train, np.ravel(y_train))
    submission = pd.DataFrame()
    # Always store a flat float column, whatever shape predict() returns.
    submission["tm"] = np.ravel(pipe.predict(sequences_test.values))
    # Assign ids by position; Series assignment would align on test_df's index,
    # which no longer matches 0..n-1 once test_df has been filtered.
    submission["seq_id"] = test_df["seq_id"].to_numpy()
    submission.to_csv(file, index=False)

In [None]:
# Fit and export one standard-scaled pipeline per linear regressor.
for regressor_cls, out_file in (
    (LinearRegression, "LinRegpred.csv"),
    (SGDRegressor, "SGDRegpred.csv"),
):
    tryPipeline(StandardScaler, regressor_cls, out_file)

  y = column_or_1d(y, warn=True)
