# <center> ***MACHINE LEARNING BASED LEUKAEMIA CANCER PREDICTION SYSTEM USING PROTEIN SEQUENTIAL DATA*** </center>


## ***Note:***
#### *The features are extracted using the Di-peptide Composition technique*

# ***Table of Content***
1. Importing Libraries.

2. Collecting information about dataset.
3. Splitting dataset into X and Y.
4. Checking Correlation.
5. Outliers Identification
6. Outlier Removal.
7. Null Value Checking.
8. Dropping some Features.
9. Filling NaN values
10. Data Augmentation:

    - Merging X and Y dataset

    - Then split into Positive and Negative Dataframes
    - Performing Augmentation on Positive Dataset only
    - Performing Augmentation on Negative Dataset only
    - Merge the Positive and Negative & split into X and Y
    

11. Passing data through Machine Learning Algorithms

    - SVM = 90% ~ 93%
    
    - Random Forest = 90% ~ 92%
    - K Neighbour Classifier = 83% ~ 84%
    - XG-Boost = 84% ~ 85%
    - Decision Tree = 81% ~ 84%
    - Logistic Regression = 66% ~ 67%
    - ROC Curve for all the above algorithms

### ***Importing Libraries***

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix

### ***Reading Dataset csv file***

In [None]:
# Load the feature table from the CSV file and preview its first three rows.
csv_path = 'CML_PAAC_Combined_1.csv'
df = pd.read_csv(csv_path)
df.head(3)

### ***Checking Shape of Dataset***

In [None]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
df.shape

### ***Getting some information***

In [None]:
# Print the per-column summary (names, non-null counts, dtypes) of the DataFrame.
df.info()

### ***Splitting into Input & Output***

In [None]:
# Features are the first 25 columns by position; the target is the last column.
X_input = df.iloc[:, :25]
Y_output = df.iloc[:, -1]
print(f'Shape of X_input:{X_input.shape}\nShape of Y_output:{Y_output.shape}')

### ***Input data sample***

In [None]:
# Show three randomly chosen feature rows (no random_state, so rows differ per run).
print('X_input sample:')
X_input.sample(3)

### ***Output data sample***

In [None]:
# Show three randomly chosen target values (no random_state, so values differ per run).
print('Y_output sample:')
Y_output.sample(3)

## ***Checking Outliers***

In [None]:
# Box plot of the raw input features, used to look for outliers.
sns.boxplot(data=X_input)

In [None]:
# Box plot of the features after pd.melt reshapes them to long form
# (presumably pooling all feature values into one box — confirm on the plot).
sns.boxplot(data=pd.melt(X_input))

### ***Removing Outliers***

In [None]:
# Per-column bounds: 95th percentile (upper) and 1st percentile (lower).
# NOTE(review): the bounds are asymmetric (0.95 vs 0.01) — confirm this is intended.
max_threshold = X_input.quantile(0.95)
min_threshold = X_input.quantile(0.01)
# Values strictly inside the bounds are kept; everything else becomes NaN.
# No rows are removed here — the NaNs are handled by later cells.
inside_bounds = X_input.lt(max_threshold) & X_input.gt(min_threshold)
X_input_removed_outliers = X_input.where(inside_bounds)

### ***Results***

In [None]:
# Box plot of the features after out-of-range values were masked to NaN.
sns.boxplot(data=X_input_removed_outliers)

In [None]:
# Same check on the long-form (pd.melt) version of the masked features.
sns.boxplot(data=pd.melt(X_input_removed_outliers))

## ***Checking Correlation***

In [None]:
# Pearson correlation between every pair of raw input feature columns.
corr1 = X_input.corr(method='pearson')
fig, ax = plt.subplots(figsize=(10, 10))
# Draw the annotated heatmap on the prepared axes. The Axes object that
# seaborn returns is not printed: its repr only adds noise to the output.
sns.heatmap(corr1, annot=True, linewidths=2, ax=ax)
plt.show()

In [None]:
# Pairwise scatter-plot grid of the raw input features.
sns.pairplot(X_input)

## ***Now Checking Null values***

In [None]:
# Count the NaN entries per column (created by the outlier masking) and keep
# the counts as a one-column DataFrame.
nan_counts = X_input_removed_outliers.isna().sum()
null_checking = nan_counts.to_frame()

### ***Dropping Columns***

In [None]:
# Drop the columns Var1_21 .. Var1_25 from the masked features
# (presumably chosen from the null counts above — confirm).
# Each label is listed exactly once; 'Var1_23' used to appear twice.
columns_to_drop = ['Var1_21', 'Var1_22', 'Var1_23', 'Var1_24', 'Var1_25']
X_input_dropped = X_input_removed_outliers.drop(columns=columns_to_drop)

### ***Checking Mean values of each column***

In [None]:
# Column-wise means of the remaining features (pandas skips NaN by default).
mean=X_input_dropped.mean()
mean

### ***Filling Null values of each column with its mean***

In [None]:
# Replace every NaN with the mean of its own column; fillna matches the
# entries of the `mean` Series to the columns by label.
X_input_new=X_input_dropped.fillna(mean)

In [None]:
# Display the (rows, columns) dimensions of the imputed feature table.
X_input_new.shape

### ***After filling Null values:***
#### - ***we will make sure there is no NaN values in dataset***

In [None]:
# Per-column NaN counts after imputation; every count is expected to be 0.
X_input_new.isnull().sum()

## ***DATA AUGMENTATION***

### ***Merging dataset***
- Merging the input and output data

In [None]:
# Put the imputed features and the target side by side (aligned on the index).
frames_to_join = [X_input_new, Y_output]
dataset_merge = pd.concat(frames_to_join, axis='columns')
dataset_merge.head(3)

In [None]:
# Second name for the merged frame. This is an alias, not a copy: in-place
# changes made through either name affect the same DataFrame.
data_aug=dataset_merge

## ***Splitting Positive DATA SAMPLE***

In [None]:
# Keep every row whose Label is not 0 (the positive samples); the Label
# column stays in the frame.
is_not_negative = data_aug.Label != 0
positive = data_aug.loc[is_not_negative].copy()
positive.head(3)

### ***DATA AUGMENTATION ON POSITIVE DATA SAMPLES***
##### ***saving each column name with their standard deviation value***

In [None]:
# Feature columns Var1_1 .. Var1_20 of the positive samples, in order.
positive_columns = [positive[f'Var1_{k}'] for k in range(1, 21)]
(var1p, var2p, var3p, var4p, var5p,
 var6p, var7p, var8p, var9p, var10p,
 var11p, var12p, var13p, var14p, var15p,
 var16p, var17p, var18p, var19p, var20p) = positive_columns
labelp = 1
##############
# Standard deviation of each of those columns, as computed by np.std.
(vars1p, vars2p, vars3p, vars4p, vars5p,
 vars6p, vars7p, vars8p, vars9p, vars10p,
 vars11p, vars12p, vars13p, vars14p, vars15p,
 vars16p, vars17p, vars18p, vars19p, vars20p) = [
    np.std(column) for column in positive_columns
]

#### ***performing full data augmentation on POSITIVE samples***

In [None]:
# Build the augmented positive set: the original rows followed by 50 noisy
# copies of every row, all labelled 1. The result stays a list of row dicts
# so that it can still be passed to pd.DataFrame afterwards.
feature_cols = [f'Var1_{i}' for i in range(1, 21)]
positive_features = positive[feature_cols]

# Per-column population standard deviation, used as the half-width of the
# noise. A column without a defined deviation gets zero noise.
noise_width = positive_features.std(ddof=0).fillna(0.0).to_numpy()

augmented_frames = [positive_features]
for _ in range(50):
    # np.random.uniform takes (low, high, size). Passing the deviation as the
    # only argument sets just `low`, which draws from [std, 1.0) and shifts
    # every feature upward; the noise here is symmetric around zero instead.
    noise = np.random.uniform(
        -noise_width, noise_width, size=positive_features.shape
    )
    augmented_frames.append(positive_features + noise)

dataset_positive = (
    pd.concat(augmented_frames, ignore_index=True)
    .assign(Label=1)
    .to_dict('records')
)

print(f'Data size before Performing DataAugmentation:{len(positive)}\n \nData size after performing Data Augmentation:{len(dataset_positive)}')

### ***making list into dataframe***

In [None]:
# Turn the list of positive row dicts into a DataFrame (one column per key).
dataset_positive1=pd.DataFrame(dataset_positive)

## ***Splitting Negative DATA SAMPLE***

In [None]:
# Keep every row that is not labelled 1 (the negative samples) and drop the
# Label column. The rows come from the cleaned and imputed frame (data_aug),
# the same source as the positive split, rather than from the raw `df`, so
# both classes go through identical preprocessing.
negative = data_aug.loc[data_aug.Label != 1].drop(columns='Label')
negative.head(3)

### ***DATA AUGMENTATION ON NEGATIVE DATA SAMPLES***
##### ***saving each column name with their standard deviation value***

In [None]:
# Feature columns Var1_1 .. Var1_20 of the negative samples, in order.
negative_columns = [negative[f'Var1_{k}'] for k in range(1, 21)]
(var1n, var2n, var3n, var4n, var5n,
 var6n, var7n, var8n, var9n, var10n,
 var11n, var12n, var13n, var14n, var15n,
 var16n, var17n, var18n, var19n, var20n) = negative_columns
##############
# Standard deviation of each of those columns, as computed by np.std.
(vars1n, vars2n, vars3n, vars4n, vars5n,
 vars6n, vars7n, vars8n, vars9n, vars10n,
 vars11n, vars12n, vars13n, vars14n, vars15n,
 vars16n, vars17n, vars18n, vars19n, vars20n) = [
    np.std(column) for column in negative_columns
]
labeln = 0

#### ***performing full data augmentation on NEGATIVE samples***

In [None]:
# Build the augmented negative set: the original rows followed by 50 noisy
# copies of every row, all labelled 0. The result stays a list of row dicts
# so that it can still be passed to pd.DataFrame afterwards.
feature_cols = [f'Var1_{i}' for i in range(1, 21)]
negative_features = negative[feature_cols]

# Per-column population standard deviation, used as the half-width of the
# noise. A column without a defined deviation gets zero noise.
noise_width = negative_features.std(ddof=0).fillna(0.0).to_numpy()

augmented_frames = [negative_features]
for _ in range(50):
    # np.random.uniform takes (low, high, size). Passing the deviation as the
    # only argument sets just `low`, which draws from [std, 1.0) and shifts
    # every feature upward; the noise here is symmetric around zero instead.
    noise = np.random.uniform(
        -noise_width, noise_width, size=negative_features.shape
    )
    augmented_frames.append(negative_features + noise)

dataset_negative = (
    pd.concat(augmented_frames, ignore_index=True)
    .assign(Label=0)
    .to_dict('records')
)


print(f'Data size before Performing DataAugmentation:{len(negative)}\n \nData size after performing Data Augmentation:{len(dataset_negative)}')

### ***making list into dataframe***

In [None]:
# Turn the list of negative row dicts into a DataFrame (one column per key).
dataset_negative1=pd.DataFrame(dataset_negative)

## ***MERGING POSITIVE AND NEGATIVE DATA SAMPLES INTO ONE DATASETS***

In [None]:
# Stack the positive rows on top of the negative rows. pd.concat is used
# because DataFrame.append was removed in pandas 2.0; ignore_index gives the
# combined frame a fresh, unique 0..n-1 index.
dataset_full = pd.concat([dataset_positive1, dataset_negative1], ignore_index=True)


## ***SPLITTING FINAL DATASET INTO INPUT (X) AND OUTPUT (Y)***

In [None]:
# Inputs: the first 20 columns by position; target: the last column.
X_final = dataset_full.iloc[:, :20]
Y_final = dataset_full.iloc[:, -1]

## <center>***Passing Data to Machine Learning Algorithms***</center>

### ***Implementing Decision Tree Classifier***

In [None]:
# Hold out 25% of the augmented data, fit a decision tree and report metrics.
# NOTE(review): X_final holds each source row together with its noisy copies,
# and this random split can put copies of one source row on both sides, so
# the scores may be optimistic — confirm whether a group-aware split is needed.
X_train, X_test, Y_train, Y_test = train_test_split(X_final, Y_final, test_size=0.25)
model = DecisionTreeClassifier(random_state=20)
model.fit(X_train, Y_train)
Y_predicted = model.predict(X_test)

# Each metric receives the true labels (Y_test) first, then the predictions.
score = accuracy_score(Y_test, Y_predicted)
print(f'the accuracy score is:{score}')
f1 = f1_score(Y_test, Y_predicted)
print(f'the f1-score is:{f1}')
rcl = recall_score(Y_test, Y_predicted)
print(f'the recall-score is:{rcl}')
con_matrix = confusion_matrix(Y_test, Y_predicted)
print(f'the confusion_matrix is:{con_matrix}')
# Binary confusion matrix layout: [[tn, fp], [fn, tp]].
(tn, fp), (fn, tp) = con_matrix
specificity = tn / (tn + fp)
print('Specificity is:', specificity)

### ***ROC_Curve for Decision Tree***

In [None]:
# ROC scores: the second column of predict_proba for the current `model`.
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(Y_test, y_pred_proba)

# Plot true-positive rate against false-positive rate.
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

### ***Implementing Random Forest Classifier***

In [None]:
# Fit a 50-tree random forest on the X_train/Y_train split left by the most
# recently executed train_test_split cell and report metrics on X_test.
model = RandomForestClassifier(n_estimators=50)
model.fit(X_train, Y_train)
Y_predicted = model.predict(X_test)

# Each metric receives the true labels (Y_test) first, then the predictions.
score = accuracy_score(Y_test, Y_predicted)
print(f'the accuracy score is:{score}')
f1 = f1_score(Y_test, Y_predicted)
print(f'the f1-score is:{f1}')
rcl = recall_score(Y_test, Y_predicted)
print(f'the recall-score is:{rcl}')
con_matrix = confusion_matrix(Y_test, Y_predicted)
print(f'the confusion_matrix is:{con_matrix}')
# Binary confusion matrix layout: [[tn, fp], [fn, tp]].
(tn, fp), (fn, tp) = con_matrix
specificity = tn / (tn + fp)
print('Specificity is:', specificity)

### ***ROC_Curve for Random Forest***

In [None]:
# ROC scores: the second column of predict_proba for the current `model`.
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(Y_test, y_pred_proba)

# Plot true-positive rate against false-positive rate.
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

### ***Implementing Logistic Regression***

In [None]:
# Split the preprocessed, augmented data (X_final / Y_final) — the same data
# the decision-tree cell splits — instead of the raw X_input, which skips the
# outlier masking, column dropping, imputation and augmentation steps.
# NOTE(review): augmented copies of one source row can land on both sides of
# this random split, so the scores may be optimistic.
X_train, X_test, Y_train, Y_test = train_test_split(X_final, Y_final, test_size=0.2)
model = LogisticRegression(C=10, penalty='l2', tol=0.1).fit(X_train, Y_train)
Y_predicted = model.predict(X_test)

# Each metric receives the true labels (Y_test) first, then the predictions.
score = accuracy_score(Y_test, Y_predicted)
print(f'the accuracy score is:{score}')
f1 = f1_score(Y_test, Y_predicted)
print(f'the f1-score is:{f1}')
# Recall (aka Sensitivity)
rcl = recall_score(Y_test, Y_predicted)
print(f'the recall-score is:{rcl}')
con_matrix = confusion_matrix(Y_test, Y_predicted)
print(f'the confusion_matrix is:{con_matrix}')
# Binary confusion matrix layout: [[tn, fp], [fn, tp]].
tn, fp, fn, tp = con_matrix.ravel()
specificity = tn / (tn + fp)
print('Specificity is:', specificity)

### ***ROC_Curve for Logistic Regression***

In [None]:
# ROC scores: the second column of predict_proba for the current `model`.
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(Y_test, y_pred_proba)

# Plot true-positive rate against false-positive rate.
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

### ***Implementing Support Vector Classifier***

In [None]:
# Fit an RBF-kernel SVM on the X_train/Y_train split left by the most recently
# executed train_test_split cell. probability=True enables predict_proba,
# which the ROC cell needs. The `degree` argument is omitted because it only
# applies to the polynomial kernel and has no effect with kernel='rbf'.
# NOTE(review): gamma=1000000 is extremely large — confirm it is intended.
model = SVC(kernel='rbf', C=1000, gamma=1000000, probability=True).fit(X_train, Y_train)
Y_predicted = model.predict(X_test)

# Each metric receives the true labels (Y_test) first, then the predictions.
score = accuracy_score(Y_test, Y_predicted)
print(f'the accuracy score is:{score}')
f1 = f1_score(Y_test, Y_predicted)
print(f'the f1-score is:{f1}')
# Recall (aka Sensitivity)
rcl = recall_score(Y_test, Y_predicted)
print(f'the recall-score is:{rcl}')
con_matrix = confusion_matrix(Y_test, Y_predicted)
print(f'the confusion_matrix is:{con_matrix}')
# Binary confusion matrix layout: [[tn, fp], [fn, tp]].
tn, fp, fn, tp = con_matrix.ravel()
specificity = tn / (tn + fp)
print('Specificity is:', specificity)

### ***ROC_Curve for SVC***

In [None]:
# ROC scores: the second column of predict_proba for the current `model`.
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(Y_test, y_pred_proba)

# Plot true-positive rate against false-positive rate.
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

### ***Implementing K Neighbour Classifier***

In [None]:

# Fit a 1-nearest-neighbour classifier on the X_train/Y_train split left by
# the most recently executed train_test_split cell and report metrics.
model = KNeighborsClassifier(n_neighbors=1, weights="uniform")
model.fit(X_train, Y_train)
Y_predicted = model.predict(X_test)

# Each metric receives the true labels (Y_test) first, then the predictions.
score = accuracy_score(Y_test, Y_predicted)
print(f'the accuracy score is:{score}')
f1 = f1_score(Y_test, Y_predicted)
print(f'the f1-score is:{f1}')
# Recall (aka Sensitivity)
rcl = recall_score(Y_test, Y_predicted)
print(f'the recall-score is:{rcl}')
con_matrix = confusion_matrix(Y_test, Y_predicted)
print(f'the confusion_matrix is:{con_matrix}')
# Binary confusion matrix layout: [[tn, fp], [fn, tp]].
(tn, fp), (fn, tp) = con_matrix
specificity = tn / (tn + fp)
print('Specificity is:', specificity)

### ***Implementing XG Boost***

In [None]:
# Split the preprocessed, augmented data (X_final / Y_final) — the same data
# the decision-tree cell splits — instead of the raw X_input, which skips the
# outlier masking, column dropping, imputation and augmentation steps.
# NOTE(review): augmented copies of one source row can land on both sides of
# this random split, so the scores may be optimistic.
X_train, X_test, Y_train, Y_test = train_test_split(X_final, Y_final, test_size=0.2)
model = XGBClassifier().fit(X_train, Y_train)
Y_predicted = model.predict(X_test)

# Each metric receives the true labels (Y_test) first, then the predictions.
score = accuracy_score(Y_test, Y_predicted)
print(f'the accuracy score is:{score}')
f1 = f1_score(Y_test, Y_predicted)
print(f'the f1-score is:{f1}')
rcl = recall_score(Y_test, Y_predicted)
print(f'the recall-score is:{rcl}')
con_matrix = confusion_matrix(Y_test, Y_predicted)
print(f'the confusion_matrix is:{con_matrix}')
# Binary confusion matrix layout: [[tn, fp], [fn, tp]].
tn, fp, fn, tp = con_matrix.ravel()
specificity = tn / (tn + fp)
print('Specificity is:', specificity)





### ***ROC_Curve for XG Boost***

In [None]:
# ROC scores: the second column of predict_proba for the current `model`.
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(Y_test, y_pred_proba)

# Plot true-positive rate against false-positive rate.
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

******