In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Load the training dataset
train_data_path = '/data/challenge2_files/mitomycinTrain.tsv'
train_df = pd.read_csv(train_data_path, sep='\t')

# Assuming 'Mitomycin_response' is the target column
target_column = 'Mitomycin_response'

# Extract features and target variable
X = train_df.drop(columns=[target_column])
y = train_df[target_column]

# Label encode categorical columns if present
label_encoder = LabelEncoder()
for column in X.select_dtypes(include=['object']).columns:
    X[column] = label_encoder.fit_transform(X[column])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate mutual information scores
mutual_info_scores = mutual_info_classif(X_train, y_train, discrete_features='auto', random_state=42)

# Create a DataFrame to store feature names and their corresponding mutual information scores
mutual_info_df = pd.DataFrame({'Feature': X_train.columns, 'Mutual_Info_Score': mutual_info_scores})

# Sort the DataFrame by mutual information score in descending order
mutual_info_df = mutual_info_df.sort_values(by='Mutual_Info_Score', ascending=False)

# Display the top 10 features based on mutual information score
top_10_mutual_info = mutual_info_df.head(10)
print("Top 10 features based on Mutual Information Score:")
print(top_10_mutual_info)

# Train a Decision Tree model to get feature importances
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Create a DataFrame to store feature names and their corresponding importance scores from Decision Tree
dt_feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': dt_model.feature_importances_})

# Sort the DataFrame by feature importance in descending order
dt_feature_importance_df = dt_feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the top 10 features based on feature importance from Decision Tree
top_10_dt_features = dt_feature_importance_df.head(10)
print("\nTop 10 features based on Decision Tree Feature Importance:")
print(top_10_dt_features)

# Print distinct values from both lists
distinct_values = list(set(top_10_mutual_info['Feature']).union(set(top_10_dt_features['Feature'])))
print("\nDistinct Values from both lists:")
print(distinct_values)


Top 10 features based on Mutual Information Score:
               Feature  Mutual_Info_Score
6354           rna_TES           0.455737
2663       rna_CYP27A1           0.414492
11987         rna_WWOX           0.394963
11705        rna_SEPT1           0.359349
824         rna_PPAP2B           0.344170
2568        rna_STK17B           0.336564
8700   rna_TMX2-CTNND1           0.328218
11709       rna_DCTPP1           0.328084
9810         rna_TMCC3           0.325380
13950     rna_HNRNPUL1           0.318589

Top 10 features based on Decision Tree Feature Importance:
                Feature  Importance
11987          rna_WWOX    0.702108
15853        rna_ZNF75D    0.192754
5633   rna_LOC101928304    0.105138
10661          rna_COQ6    0.000000
10660        rna_ENTPD5    0.000000
10659         rna_PTGR2    0.000000
10658  rna_LOC100506476    0.000000
10657         rna_PNMA1    0.000000
10656          rna_NUMB    0.000000
10662        rna_ZNF410    0.000000

Distinct Values from both list

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE

# Load the training and test datasets

test_data_path = '/data/testData.tsv'

test_df = pd.read_csv(test_data_path, sep='\t')

# Columns to include for modeling
columns_of_interest = ['rna_WWOX', 'rna_STK17B', 'rna_CYP27A1', 'rna_TMCC3', 'rna_DCTPP1', 'rna_NUMB', 'rna_SEPT1', 'rna_TES', 'rna_TMX2-CTNND1', 
                        'rna_HNRNPUL1', 'rna_COQ6', 'rna_ZNF75D', 'type', 'rna_ENTPD5', 'rna_LOC100506476', 'rna_PPAP2B', 'rna_PTGR2', 
                        'rna_LOC101928304', 'rna_PNMA1']
test_df['Mitomycin_response'] = 0

# Ensure both dataframes have the same columns, excluding 'Mitomycin_response'
common_columns = set(train_df.columns) & set(test_df.columns)
common_columns.discard('Mitomycin_response')

# Keep only the common columns in both datasets
columns_of_interest = list(set(columns_of_interest).intersection(common_columns))

# Keep only the specified columns in both datasets
train_df = train_df[['Mitomycin_response'] + columns_of_interest]
test_df = test_df[['Mitomycin_response'] + columns_of_interest]

# Assuming 'Mitomycin_response' is the target column
target_column = 'Mitomycin_response'

# Assuming your features are all columns except the target column
features = [col for col in train_df.columns if col != target_column]

# Identify categorical columns
categorical_columns = train_df[features].select_dtypes(include=['object']).columns.tolist()

# One-hot encode categorical columns using ColumnTransformer
encoder = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), categorical_columns)
    ],
    remainder='passthrough'
)

# Apply one-hot encoding to training and test data
X_train_encoded = encoder.fit_transform(train_df[features])
X_test_encoded = encoder.transform(test_df[features])

# Standardize the data
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train_encoded)
X_test_std = scaler.transform(X_test_encoded)

# Upsample the minority class using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_std, train_df[target_column])

# Apply PCA to reduce dimensionality
pca = PCA(n_components=0.95)  # Retain 95% of the variance
X_train_pca = pca.fit_transform(X_train_resampled)
X_test_pca = pca.transform(X_test_std)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_pca, y_train_resampled, test_size=0.2, random_state=42)

# Define the base models
rf_classifier = RandomForestClassifier(random_state=42)
svm_classifier = SVC(probability=True, random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid_rf = {'n_estimators': [50, 100, 200],
                 'max_depth': [None, 10, 20, 30]}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

param_grid_svm = {'C': [0.1, 1, 10],
                  'kernel': ['linear', 'rbf']}
grid_search_svm = GridSearchCV(svm_classifier, param_grid_svm, cv=5, scoring='accuracy')
grid_search_svm.fit(X_train, y_train)

# Get the best hyperparameters from GridSearchCV
best_params_rf = grid_search_rf.best_params_
best_params_svm = grid_search_svm.best_params_

# Create the stacking classifier with the best hyperparameters
stacking_classifier = StackingClassifier(
    estimators=[('rf', RandomForestClassifier(**best_params_rf, random_state=42)),
                ('svm', SVC(**best_params_svm, probability=True, random_state=42))],
    final_estimator=RandomForestClassifier(n_estimators=100, random_state=42),
    stack_method='auto',
    passthrough=True,
    cv=5
)

# Train the stacking classifier
stacking_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_test_pred = stacking_classifier.predict(X_test_pca)
y_test_pred_prob = stacking_classifier.predict_proba(X_test_pca)[:, 1]

# Display the predictions and the updated test_df with row numbers
test_df['Mitomycin_response'] = y_test_pred
result_df = test_df[['Mitomycin_response']]
print(result_df)

# Evaluate the model on the validation set
y_val_pred = stacking_classifier.predict(X_val)
y_val_pred_prob = stacking_classifier.predict_proba(X_val)[:, 1]

# Print additional evaluation metrics
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"Precision: {precision_score(y_val, y_val_pred)}")
print(f"Recall: {recall_score(y_val, y_val_pred)}")
print(f"F1 Score: {f1_score(y_val, y_val_pred)}")
print(f"AUC-ROC: {roc_auc_score(y_val, y_val_pred_prob)}")
a = result_df['Mitomycin_response'].tolist()


                 Mitomycin_response
BrainCNS_VFF746                   1
Bowel_KI64                        1
Bowel_QEVT62                      1
Lung_TXY395                       0
Lung_JIL5                         1
Blood_CB485                       1
Skin_SFL7                         0
Skin_VAK5                         0
Ovary_XT38                        0
Validation Accuracy: 0.9090909090909091
Precision: 0.8571428571428571
Recall: 1.0
F1 Score: 0.923076923076923
AUC-ROC: 0.8833333333333334


In [18]:
import pandas as pd

# Load the existing CSV file
existing_file_path = '/results/Code_sharks_model_info.csv'
existing_df = pd.read_csv(existing_file_path)

# Values for the second column
third_column_values = ['Mitomycin_response.ipynb', 'Stacking random forest with SVM', '90.9', '19', 'Mutual index score', "'rna_PNMA1', 'rna_TES', 'rna_TMCC3', 'rna_DCTPP1', 'rna_HNRNPUL1', 'rna_ZNF410', 'rna_PPAP2B', 'rna_CYP27A1', 'rna_STK17B', 'rna_ZNF75D', 'rna_SEPT1', 'rna_NUMB', 'rna_WWOX', 'rna_PTGR2', 'rna_COQ6', 'rna_LOC100506476', 'rna_ENTPD5', 'rna_LOC101928304', 'rna_TMX2-CTNND1'", "'rna_WWOX', 'rna_STK17B', 'rna_CYP27A1', 'rna_TMCC3', 'rna_DCTPP1', 'rna_NUMB', 'rna_SEPT1', 'rna_TES', 'rna_TMX2-CTNND1','rna_HNRNPUL1', 'rna_COQ6', 'rna_ZNF75D', 'type', 'rna_ENTPD5', 'rna_LOC100506476', 'rna_PPAP2B', 'rna_PTGR2', 'rna_LOC101928304', 'rna_PNMA1'", "'rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6'", "'rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6'", "'rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6'", "'rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6'", "'rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6'", "'rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6'", "'rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6'", "'rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6'"]

# Add the second column to the DataFrame
existing_df['Mitomycin_response'] = third_column_values
# Save the updated DataFrame to the same CSV file
existing_df.to_csv(existing_file_path, index=False)

# Display the updated DataFrame
print(existing_df)


                   Unnamed: 0  \
0                    Filename   
1              Algorithm used   
2           Training accuracy   
3          Number of Features   
4   Feature evaluation method   
5         Overall key feature   
6          Breast key feature   
7        BrainCNS key feature   
8           Bowel key feature   
9           Blood key feature   
10           Skin key feature   
11           Lung key feature   
12          Ovary key feature   
13       Prostate key feature   
14         Kidney key feature   

                                 Fulvestrant_response  \
0                          Fulvestrant_response.ipynb   
1                                    sklearn_adaboost   
2                                                  95   
3                                                  30   
4                                  Mutual Index score   
5   'rna_LOC100505938', 'rna_ITGA1', 'rna_ZNF296',...   
6   'rna_LOC100505938', 'rna_ITGA1', 'rna_ZNF296',...   
7   'rna_LOC100

In [7]:
# Get feature importances for each sample in the test set
feature_names_per_sample = []

# Assuming that RandomForestClassifier is the first estimator in your stacking classifier
rf_base_model = stacking_classifier.named_estimators_['rf']

for i, sample in enumerate(X_test_pca):  # Using PCA-transformed data
    # Get feature importances for the current sample from the RandomForestClassifier
    sample_importances = rf_base_model.feature_importances_

    # Get the indices of the top features (adjust n_top_features as needed)
    n_top_features = 5  # Adjust this number to get the top N features
    top_feature_indices = sample_importances.argsort()[-n_top_features:][::-1]

    # Get the feature names for the top features
    top_feature_names = [features[idx] for idx in top_feature_indices]

    # Store the top feature names in a list
    feature_names_per_sample.append({'SampleIndex': i, 'TopFeatures': top_feature_names})

# Display the list of top feature names for each sample
for sample_info in feature_names_per_sample:
    print(f"Sample {sample_info['SampleIndex']} Top Features:")
    print(sample_info['TopFeatures'])
    print("\n")


Sample 0 Top Features:
['rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6']


Sample 1 Top Features:
['rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6']


Sample 2 Top Features:
['rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6']


Sample 3 Top Features:
['rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6']


Sample 4 Top Features:
['rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6']


Sample 5 Top Features:
['rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6']


Sample 6 Top Features:
['rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6']


Sample 7 Top Features:
['rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6']


Sample 8 Top Features:
['rna_ENTPD5', 'rna_ZNF75D', 'rna_TMCC3', 'type', 'rna_COQ6']




In [16]:
import pandas as pd

# Load the existing CSV file
existing_file_path = '/results/CodeSharks_model_output.csv'
existing_df = pd.read_csv(existing_file_path)

# Add two additional columns without overwriting existing ones
existing_df['Mitomycin_response'] = a
existing_df['Mitomycin_response_features'] = 'rna_ENTPD5'

# Save the updated DataFrame to the same CSV file
existing_df.to_csv(existing_file_path, index=False)

# Display the updated DataFrame
print(existing_df)


             Unnamed: 0  Fulvestrant_response Fulvestrant_response_features  \
0    Breast key feature                     0              rna_LOC100505938   
1  BrainCNS key feature                     0              rna_LOC100505938   
2     Bowel key feature                     0              rna_LOC100505938   
3     Blood key feature                     0              rna_LOC100505938   
4      Skin key feature                     1              rna_LOC100505938   
5      Lung key feature                     0              rna_LOC100505938   
6     Ovary key feature                     0              rna_LOC100505938   
7  Prostate key feature                     0              rna_LOC100505938   
8    Kidney key feature                     0              rna_LOC100505938   

   Gefitinib_response Gefitinib_response_features  Mitomycin_response  \
0                   0                 rna_B3GALT1                   1   
1                   0                 rna_B3GALT1              

In [17]:
existing_df


Unnamed: 0.1,Unnamed: 0,Fulvestrant_response,Fulvestrant_response_features,Gefitinib_response,Gefitinib_response_features,Mitomycin_response,Mitomycin_response_features
0,Breast key feature,0,rna_LOC100505938,0,rna_B3GALT1,1,rna_ENTPD5
1,BrainCNS key feature,0,rna_LOC100505938,0,rna_B3GALT1,1,rna_ENTPD5
2,Bowel key feature,0,rna_LOC100505938,0,rna_B3GALT1,1,rna_ENTPD5
3,Blood key feature,0,rna_LOC100505938,1,rna_B3GALT1,0,rna_ENTPD5
4,Skin key feature,1,rna_LOC100505938,1,rna_B3GALT1,1,rna_ENTPD5
5,Lung key feature,0,rna_LOC100505938,0,rna_B3GALT1,1,rna_ENTPD5
6,Ovary key feature,0,rna_LOC100505938,0,rna_B3GALT1,0,rna_ENTPD5
7,Prostate key feature,0,rna_LOC100505938,1,rna_B3GALT1,0,rna_ENTPD5
8,Kidney key feature,0,rna_LOC100505938,0,rna_B3GALT1,0,rna_ENTPD5
