# Cancer grade extraction from pathology reports of renal cell carcinoma patients
- aim of the project is to design deep learning models extracting Fuhrman grade from pathology reports of renal cell carcinoma patients in The Cancer Genome Atlas (TCGA).
- training data is csv file that contains the patient id and there cancer type and a csv file with all the patients and there pathology reports.

## data preprocessing 
- merge the patient reports with their cancer types.
- remove stopwords for the reports.
- vectorize the report
- check that the inputs and outputs have the same number of rows.
- split the training set

In [None]:
import numpy as np
import pandas as pd

In [2]:
df=pd.read_table("dataT.csv",sep=",")
df.set_index('patient_id',inplace=True)
df

Unnamed: 0_level_0,report
patient_id,Unnamed: 1_level_1
TCGA-2K-A9WE,final pathologic diagnosis: left kidney and ur...
TCGA-2Z-A9J1,"left renal mass, partial nefhpectomy tumor typ..."
TCGA-2Z-A9J2,specimen: left kidney tumor type: papillary re...
TCGA-2Z-A9J3,"right kidney, radical nephrectomy: renal cell ..."
TCGA-2Z-A9J6,"tumor type: renal cell carcinoma, papillary ty..."
...,...
TCGA-Y8-A896,"research dx left kidney, radical nephrectomy: ..."
TCGA-Y8-A897,case summary for nephrectomy for renal cell ca...
TCGA-Y8-A8RY,case summary for nephrectomy for renal cell ca...
TCGA-Y8-A8S0,procedure : laparoscopic partial nephrectomy s...


In [4]:
tf=pd.read_csv("trainT.csv")
ds=tf.merge(df,on="patient_id")
ds

Unnamed: 0,patient_id,g1,g2,g3,g4,report
0,TCGA-2Z-A9J1,0,0,1,0,"left renal mass, partial nefhpectomy tumor typ..."
1,TCGA-2Z-A9J7,0,0,1,0,procedure: right radical nephrectomy tumor typ...
2,TCGA-2Z-A9J8,0,0,1,0,left kidney tumor tumor type: renal cell carci...
3,TCGA-2Z-A9JE,0,1,0,0,"right renal mass, heminephrectomy: tumor type:..."
4,TCGA-2Z-A9JG,0,0,1,0,"final diagnosis a. left kidney, partial nephre..."
...,...,...,...,...,...,...
400,TCGA-WN-AB4C,0,0,1,0,"final diagnosis: a. kidney, ""right kidney mass..."
401,TCGA-Y8-A896,0,0,0,1,"research dx left kidney, radical nephrectomy: ..."
402,TCGA-Y8-A897,0,1,0,0,case summary for nephrectomy for renal cell ca...
403,TCGA-Y8-A8S0,0,1,0,0,procedure : laparoscopic partial nephrectomy s...


In [5]:
import neattext as nt
import neattext.functions as nfx

In [6]:
reports=ds["report"].apply(nfx.remove_stopwords)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
tfid=TfidfVectorizer()
inputs=tfid.fit_transform(reports)

In [10]:
outputs=np.array(ds[['g1','g2','g3','g4']])

array([[0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       ...,
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0]], dtype=int64)

In [45]:
inputs.shape,outputs.shape

((405, 1684), (405, 4))

In [11]:
from sklearn.model_selection import train_test_split
Xtrain,Xtest,ytrain,ytest=train_test_split(inputs,outputs,test_size=0.2,random_state=42,stratify=outputs)

# model selection and evaluation
- use SDG classifier inside OneVsRest classifier for multiclass classification
- fit the data to SDG classifier since it gave higher accuracy after trying multiple models.
- used score metric for evaluation.


In [12]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
sv=SGDClassifier(loss="log_loss")
ov=OneVsRestClassifier(sv)
ov.fit(Xtrain, ytrain)
ov.predict(Xtest)

array([[0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0,

In [48]:
print(ov.score(Xtrain,ytrain))
print(ov.score(Xtest,ytest))

0.9753086419753086
0.5802469135802469


## prepairing test data for kaggle
- predict_proba() is used to redict the probability of each cancer type.

In [49]:
dT=pd.read_table("testT.csv",sep=",")
Td=dT.merge(df,on="patient_id")

In [51]:
report=tfid.transform(Td["report"])
pred=ov.predict_proba(report)
pred=np.array(pred)

In [52]:
g1=[]
g2=[]
g3=[]
g4=[]
for i in range(len(pred)):
    split=np.array_split(pred[i],4)
    g1.append(split[0])
    g2.append(split[1])
    g3.append(split[2])
    g4.append(split[3])
    
    

In [53]:
g1=np.hstack(g1)
g2=np.hstack(g2)
g3=np.hstack(g3)
g4=np.hstack(g4)

In [54]:
import csv
P_id=dT["patient_id"]
p_id=np.array(P_id)
with open('NLPF.csv','w+') as file:
    myfile=csv.writer(file)
    myfile.writerow(['pateint_id','likelihood_G1', 'likelihood_G2', 'likelihood_G3','likelihood_G4'])
    for i in range(len(p_id)):
        myfile.writerow([P_id[i],g1[i],g2[i],g3[i],g4[i]])

## kaggle result 
- evaluation metric negative log loss

![kaggle result negative log loss](NLPres.png)