<a href="https://colab.research.google.com/github/boteny02/Research_Outcome/blob/main/Machine_Learning_for_Cancer_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#data handling
import pandas as pd
import numpy as np

#data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import MinMaxScaler

#feature selection
from sklearn.feature_selection import mutual_info_classif

#classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# performance metrics
from sklearn.metrics import balanced_accuracy_score,f1_score,precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay # import ConfusionMatrixDisplay instead of plot_confusion_matrix
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import roc_auc_score

In [None]:
# prompt: #read data directly from a github repository
url = 'https://github.com/vappiah/Machine-Learning-Tutorials/raw/main/datasets/cancer_gene_expression.zip'
df = pd.read_csv(url)


In [None]:
#print(dataframe.shape())
#print(dataframe.head())

In [None]:
# prompt: let's check the number of samples and features

print(df.shape)


(801, 8001)


In [None]:
print(df.head())

   gene_1  gene_2  gene_3    gene_4  gene_5  gene_6  gene_7  gene_8    gene_9  \
0     0.0     0.0     0.0  2.088413     0.0     0.0     0.0     0.0  0.550605   
1     0.0     0.0     0.0  3.205955     0.0     0.0     0.0     0.0  0.425244   
2     0.0     0.0     0.0  4.746646     0.0     0.0     0.0     0.0  2.639417   
3     0.0     0.0     0.0  1.173191     0.0     0.0     0.0     0.0  1.527371   
4     0.0     0.0     0.0  1.366532     0.0     0.0     0.0     0.0  0.000000   

    gene_10  ...  gene_7992  gene_7993  gene_7994  gene_7995  gene_7996  \
0  2.815760  ...  11.558803   8.881802   6.014840   6.643534  11.740624   
1  2.354396  ...  11.062829   9.032864   5.054193   6.432320  12.104985   
2  1.657091  ...  12.497640   7.198160   0.943434   7.371690  11.202356   
3  2.732899  ...  11.261713   8.725676   6.300418   6.036451  11.732303   
4  3.388355  ...  12.241965   7.685204   5.142948   6.355788  11.493950   

   gene_7997  gene_7998  gene_7999  gene_8000  Cancer_Type  
0

In [None]:
print(df.columns[0:3])

Index(['gene_1', 'gene_2', 'gene_3'], dtype='object')


In [None]:
print(df.columns)

Index(['gene_1', 'gene_2', 'gene_3', 'gene_4', 'gene_5', 'gene_6', 'gene_7',
       'gene_8', 'gene_9', 'gene_10',
       ...
       'gene_7992', 'gene_7993', 'gene_7994', 'gene_7995', 'gene_7996',
       'gene_7997', 'gene_7998', 'gene_7999', 'gene_8000', 'Cancer_Type'],
      dtype='object', length=8001)


In [None]:
#check for missing values
dfnull=df.isnull().sum()
df_type=df['Cancer_Type'].head()
#print(dfnull)
print(df_type)


0    KIRC
1    KIRC
2    BRCA
3    KIRC
4    COAD
Name: Cancer_Type, dtype: object


In [None]:
# prompt: #check for missing values with for loop

for column in df.columns:
  print(f"Column: {column}, Missing values: {df[column].isnull().sum()}")



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Column: gene_3002, Missing values: 0
Column: gene_3003, Missing values: 0
Column: gene_3004, Missing values: 0
Column: gene_3005, Missing values: 0
Column: gene_3006, Missing values: 0
Column: gene_3007, Missing values: 0
Column: gene_3008, Missing values: 0
Column: gene_3009, Missing values: 0
Column: gene_3010, Missing values: 0
Column: gene_3011, Missing values: 0
Column: gene_3012, Missing values: 0
Column: gene_3013, Missing values: 0
Column: gene_3014, Missing values: 0
Column: gene_3015, Missing values: 0
Column: gene_3016, Missing values: 0
Column: gene_3017, Missing values: 0
Column: gene_3018, Missing values: 0
Column: gene_3019, Missing values: 0
Column: gene_3020, Missing values: 0
Column: gene_3021, Missing values: 0
Column: gene_3022, Missing values: 0
Column: gene_3023, Missing values: 0
Column: gene_3024, Missing values: 0
Column: gene_3025, Missing values: 0
Column: gene_3026, Missing values: 0
Column: ge

In [None]:
# prompt: explain iloc for data processing

# iloc is primarily used for integer-location based indexing for selection by position.
# Here are some examples demonstrating its use for data processing:


# Select the first 5 rows of the dataframe
print(df.iloc[:5])
print("=============")


# Select rows with index 1, 3, and 5:
print(df.iloc[[1, 3, 5]])
print("=============")

# Select rows from index 10 to 20 (inclusive) and columns from index 2 to 4 (inclusive)
print(df.iloc[10:21, 2:5])
print("=============")
# Selecting specific rows and columns using lists
print(df.iloc[[1, 2, 4], [0, 2]])
print("=============")

# Replacing a specific cell value using .iloc
df.iloc[0, 0] = 'New Value'
print("=============")
# Note: df.iloc is zero-based indexing meaning the first row has an index of 0.






      gene_1  gene_2  gene_3    gene_4  gene_5  gene_6  gene_7  gene_8  \
0  New Value     0.0     0.0  2.088413     0.0     0.0     0.0     0.0   
1        0.0     0.0     0.0  3.205955     0.0     0.0     0.0     0.0   
2        0.0     0.0     0.0  4.746646     0.0     0.0     0.0     0.0   
3        0.0     0.0     0.0  1.173191     0.0     0.0     0.0     0.0   
4        0.0     0.0     0.0  1.366532     0.0     0.0     0.0     0.0   

     gene_9   gene_10  ...  gene_7992  gene_7993  gene_7994  gene_7995  \
0  0.550605  2.815760  ...  11.558803   8.881802   6.014840   6.643534   
1  0.425244  2.354396  ...  11.062829   9.032864   5.054193   6.432320   
2  2.639417  1.657091  ...  12.497640   7.198160   0.943434   7.371690   
3  1.527371  2.732899  ...  11.261713   8.725676   6.300418   6.036451   
4  0.000000  3.388355  ...  12.241965   7.685204   5.142948   6.355788   

   gene_7996  gene_7997  gene_7998  gene_7999  gene_8000  Cancer_Type  
0  11.740624   7.065012   9.932659   6

In [None]:
print(df['Cancer_Type'].value_counts())

Cancer_Type
BRCA    300
KIRC    146
LUAD    141
PRAD    136
COAD     78
Name: count, dtype: int64


In [None]:
prst=df['Cancer_Type']=='PRAD'
prst.value_counts()

Unnamed: 0_level_0,count
Cancer_Type,Unnamed: 1_level_1
False,665
True,136
