In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, f1_score
from sklearn.metrics import accuracy_score, classification_report

In [2]:
data = pd.read_csv("C:\Aniruddh\Python\Jupyter\RCC\CSV FILES\Cleaned data\RCC_cleaned_data.csv")
data

Unnamed: 0,AGE,SEX,DM,HTN,CKD,INCIDENTAL,PAIN,HEMATURIA,MASS,LOW,...,T,N,M,IVC INV,MAX DIA,APPROACH,HISTOPATHOLOGY,GRADE,MONTHS,RECURRENCE
0,32,M,0,0,0,0,1,0,0,0.0,...,2,0,1,0,8.0,2,clear cell,2.0,95,0
1,45,F,0,0,0,0,1,1,0,0.0,...,4,1,1,0,4.9,1,clear cell,2.0,95,0
2,47,M,0,0,0,1,0,0,0,0.0,...,3b,0,1,1,9.0,1,clear cell,3.0,95,0
3,58,M,0,0,0,0,0,1,0,0.0,...,3a,0,0,0,7.0,2,clear cell,2.0,95,1
4,22,M,0,0,0,0,1,0,0,0.0,...,2b,0,0,0,10.4,1,clear cell,2.0,94,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690,64,M,0,0,0,0,1,0,0,0.0,...,3a,0,0,0,8.0,1,clear cell,4.0,18,0
691,49,M,0,0,0,0,0,1,0,0.0,...,2a,0,0,0,10.0,1,clear cell,3.0,18,0
692,73,M,0,1,0,0,0,1,0,0.0,...,3b,0,0,1,7.5,2,clear cell,1.0,18,0
693,65,F,0,0,0,0,1,0,0,0.0,...,3a,0,0,0,10.0,1,clear cell,1.0,18,0


In [4]:
data.loc[0, "T"], data.loc[64, "LOW"]

('2', nan)

In [5]:
red_data = data.drop([0, 64], axis = 0).reset_index(drop = True)
red_data

Unnamed: 0,AGE,SEX,DM,HTN,CKD,INCIDENTAL,PAIN,HEMATURIA,MASS,LOW,...,T,N,M,IVC INV,MAX DIA,APPROACH,HISTOPATHOLOGY,GRADE,MONTHS,RECURRENCE
0,45,F,0,0,0,0,1,1,0,0.0,...,4,1,1,0,4.9,1,clear cell,2.0,95,0
1,47,M,0,0,0,1,0,0,0,0.0,...,3b,0,1,1,9.0,1,clear cell,3.0,95,0
2,58,M,0,0,0,0,0,1,0,0.0,...,3a,0,0,0,7.0,2,clear cell,2.0,95,1
3,22,M,0,0,0,0,1,0,0,0.0,...,2b,0,0,0,10.4,1,clear cell,2.0,94,0
4,42,M,0,1,0,0,1,0,0,0.0,...,1a,0,0,0,3.0,1,clear cell,,94,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,64,M,0,0,0,0,1,0,0,0.0,...,3a,0,0,0,8.0,1,clear cell,4.0,18,0
689,49,M,0,0,0,0,0,1,0,0.0,...,2a,0,0,0,10.0,1,clear cell,3.0,18,0
690,73,M,0,1,0,0,0,1,0,0.0,...,3b,0,0,1,7.5,2,clear cell,1.0,18,0
691,65,F,0,0,0,0,1,0,0,0.0,...,3a,0,0,0,10.0,1,clear cell,1.0,18,0


In [8]:
for col in red_data.columns:
    print(f"{col}: {red_data[col].dtype}")

AGE: int64
SEX: object
DM: int64
HTN: int64
CKD: int64
INCIDENTAL: int64
PAIN: int64
HEMATURIA: int64
MASS: int64
LOW: float64
FEVER: int64
SITE: object
T: object
N: int64
M: int64
IVC INV: int64
MAX DIA: float64
APPROACH: int64
HISTOPATHOLOGY: object
GRADE: float64
MONTHS: int64
RECURRENCE: int64


In [6]:
missing_cols = [col for col in red_data.columns if red_data[col].isna().any()]
missing_cols

['GRADE']

In [7]:
red_data['GRADE'].value_counts()

GRADE
2.0    277
1.0    226
3.0    123
4.0     56
Name: count, dtype: int64

In [9]:
y = red_data.pop('RECURRENCE')

In [10]:
obj_cols = []
for col in data.columns:
    if data[col].dtype == 'object':
        obj_cols.append(col)
print(obj_cols)
num_cols = [col for col in red_data.columns if col not in obj_cols]

['SEX', 'SITE', 'T', 'HISTOPATHOLOGY']


In [11]:
for col in obj_cols:
    print(f"{col}: {red_data[col].nunique()}")

SEX: 2
SITE: 3
T: 8
HISTOPATHOLOGY: 2


In [12]:
num_cols

['AGE',
 'DM',
 'HTN',
 'CKD',
 'INCIDENTAL',
 'PAIN',
 'HEMATURIA',
 'MASS',
 'LOW',
 'FEVER',
 'N',
 'M',
 'IVC INV',
 'MAX DIA',
 'APPROACH',
 'GRADE',
 'MONTHS']

In [13]:
OH_cols = [col for col in obj_cols if red_data[col].nunique() < 5]
ord_cols = list(set(obj_cols) - set(OH_cols))
OH_cols, ord_cols

(['SEX', 'SITE', 'HISTOPATHOLOGY'], ['T'])

In [14]:
sorted(red_data.columns) == sorted(num_cols + OH_cols + ord_cols)

True

In [53]:
X_train, X_valid, y_train, y_valid = train_test_split(red_data, y, train_size = 0.75, test_size = 0.25, random_state = 0) # NO STRATIFY!

In [27]:
# my god i forgot scaling??
numerical_transformer = SimpleImputer(strategy = 'most_frequent')
onehot_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', OneHotEncoder()) # handle_unknown = 'ignore'))
])
ordinal_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('ordinal', OrdinalEncoder()) # handle_unknown = 'use_encoded_value'))
])
preprocessor = ColumnTransformer(transformers = [
    ('num', numerical_transformer, num_cols),
    ('OH', onehot_transformer, OH_cols),
    ('ord', ordinal_transformer, ord_cols),
])
model = RandomForestClassifier(n_estimators = 100, random_state = 0, class_weight = 'balanced')
clf = Pipeline(steps = [
    ('preprocessor', preprocessor),
    # ('model', model)
])

In [28]:
clf.fit(X_train, y_train)

In [31]:
X_train_ = clf.transform(X_train)
X_valid_ = clf.transform(X_valid)

In [19]:
X_train_

array([[60.,  0.,  0., ...,  1.,  0.,  4.],
       [37.,  0.,  0., ...,  1.,  0.,  0.],
       [55.,  0.,  0., ...,  1.,  0.,  3.],
       ...,
       [42.,  0.,  0., ...,  1.,  0.,  0.],
       [57.,  0.,  0., ...,  1.,  0.,  4.],
       [68.,  0.,  0., ...,  1.,  0.,  7.]])

In [20]:
X_train_.shape

(519, 25)

In [21]:
list(enumerate(X_train_[0]))

[(0, 60.0),
 (1, 0.0),
 (2, 0.0),
 (3, 0.0),
 (4, 1.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 7.1),
 (14, 2.0),
 (15, 2.0),
 (16, 55.0),
 (17, 0.0),
 (18, 1.0),
 (19, 0.0),
 (20, 1.0),
 (21, 0.0),
 (22, 1.0),
 (23, 0.0),
 (24, 4.0)]

In [22]:
list(enumerate(X_train.columns))

[(0, 'AGE'),
 (1, 'SEX'),
 (2, 'DM'),
 (3, 'HTN'),
 (4, 'CKD'),
 (5, 'INCIDENTAL'),
 (6, 'PAIN'),
 (7, 'HEMATURIA'),
 (8, 'MASS'),
 (9, 'LOW'),
 (10, 'FEVER'),
 (11, 'SITE'),
 (12, 'T'),
 (13, 'N'),
 (14, 'M'),
 (15, 'IVC INV'),
 (16, 'MAX DIA'),
 (17, 'APPROACH'),
 (18, 'HISTOPATHOLOGY'),
 (19, 'GRADE'),
 (20, 'MONTHS')]

In [None]:
# T is 12

In [23]:
print(X_train)

     AGE SEX  DM  HTN  CKD  INCIDENTAL  PAIN  HEMATURIA  MASS  LOW  ...  \
271   60   M   0    0    0           1     0          0     0  0.0  ...   
380   37   M   0    0    0           1     0          0     0  0.0  ...   
485   55   M   0    0    0           0     1          0     0  0.0  ...   
520   35   M   0    0    0           0     0          1     0  0.0  ...   
12    63   M   1    1    0           1     0          0     0  0.0  ...   
..   ...  ..  ..  ...  ...         ...   ...        ...   ...  ...  ...   
359   34   M   0    0    0           0     1          0     0  0.0  ...   
192   34   M   0    0    0           1     0          0     0  0.0  ...   
629   42   M   0    0    0           0     1          0     0  0.0  ...   
559   57   F   0    0    0           0     0          0     0  1.0  ...   
684   68   M   0    0    0           0     0          1     0  0.0  ...   

      SITE   T  N  M  IVC INV  MAX DIA  APPROACH  HISTOPATHOLOGY GRADE  MONTHS  
271   left  3a  0 

In [25]:
clf?

[1;31mType:[0m        Pipeline
[1;31mString form:[0m
Pipeline(steps=[('preprocessor',
           ColumnTransformer(transformers=[('num',
           <...>                OrdinalEncoder())]),
           ['T'])]))])
[1;31mLength:[0m      1
[1;31mFile:[0m        c:\users\aniru\anaconda3\lib\site-packages\sklearn\pipeline.py
[1;31mDocstring:[0m  
Pipeline of transforms with a final estimator.

Sequentially apply a list of transforms and a final estimator.
Intermediate steps of the pipeline must be 'transforms', that is, they
must implement `fit` and `transform` methods.
The final estimator only needs to implement `fit`.
The transformers in the pipeline can be cached using ``memory`` argument.

The purpose of the pipeline is to assemble several steps that can be
cross-validated together while setting different parameters. For this, it
enables setting parameters of the various steps using their names and the
parameter name separated by a `'__'`, as in the example below. A step's
est

In [35]:
model.fit(X_train_, y_train)

In [36]:
preds = model.predict(X_valid_)
print("MAE:", mean_absolute_error(y_valid, preds))

MAE: 0.10919540229885058


In [26]:
preds = clf.predict(X_valid)
print("MAE:", mean_absolute_error(y_valid, preds))

AttributeError: 'ColumnTransformer' object has no attribute 'predict'

In [38]:
print(np.mean(y_valid == preds) * 100)

89.08045977011494


In [43]:
np.sum(preds == 0), len(y_valid)

(170, 174)

In [44]:
# most are zero again

In [45]:
train_preds = model.predict(X_train_)
print("MAE:", mean_absolute_error(y_train, train_preds))
print(np.mean(y_train == train_preds) * 100)

MAE: 0.0
100.0


In [47]:
np.sum(train_preds == 0), np.sum(y_train == 0), len(y_train)

(453, 453, 519)

In [18]:
train_preds = clf.predict(X_train)
print("MAE:", mean_absolute_error(y_train, train_preds))
print(np.mean(y_train == train_preds) * 100)

MAE: 0.0
100.0


In [48]:
y.value_counts()

RECURRENCE
0    606
1     87
Name: count, dtype: int64

In [49]:
f1 = f1_score(y_valid, preds)
print(f1)

0.24


In [50]:
f1 = f1_score(y_train, train_preds)
print(f1)

1.0


In [52]:
y_train.sum()/len(y_train), y_valid.sum()/len(y_valid)

(0.12716763005780346, 0.1206896551724138)

In [54]:
a, b, c, d = train_test_split(red_data, y, train_size = 0.75, test_size = 0.25, random_state = 0, stratify = y)

In [55]:
c.sum()/len(c), d.sum()/len(d)

(0.1252408477842004, 0.12643678160919541)

In [63]:
X = red_data.copy()
X.loc[X[X.GRADE.isna() == True].index, 'GRADE'] = 2

In [66]:
T_dict = {'1a':1, '1b':2, '2a':3, '2b':4, '3a':5, '3b':6, '3c':7, '4':8}
X['T'] = X['T'].replace(T_dict)

In [69]:
X = pd.get_dummies(X, columns = ['SEX', 'SITE', 'HISTOPATHOLOGY', 'APPROACH'])

In [70]:
X.rename(columns = {'SITE_b/l':'SITE_BL', 'SITE_left':'SITE_LEFT', 'SITE_right':'SITE_RIGHT',
                    'HISTOPATHOLOGY_clear cell': 'HISTOPATHOLOGY_CC', 'HISTOPATHOLOGY_papillary':'HISTOPATHOLOGY_PAP'}, inplace = True)

In [71]:
for col in X.columns:
    if X[col].dtype == 'bool':
        X[col] = X[col].astype(np.int64)

In [73]:
one_hot_cols = []
for col in X.columns[-10:]:
    one_hot_cols.append(col)
one_hot_cols

['SEX_F',
 'SEX_M',
 'SITE_BL',
 'SITE_LEFT',
 'SITE_RIGHT',
 'HISTOPATHOLOGY_CC',
 'HISTOPATHOLOGY_PAP',
 'APPROACH_1',
 'APPROACH_2',
 'APPROACH_3']

In [75]:
num_cont_cols = ['AGE', 'MAX DIA', 'MONTHS']
num_ord_cols = ['T', 'GRADE']
num_nom_cols = []
for col in X.columns[:-10]:
    if col not in num_cont_cols and col not in num_ord_cols:
        num_nom_cols.append(col)
num_nom_cols

['DM',
 'HTN',
 'CKD',
 'INCIDENTAL',
 'PAIN',
 'HEMATURIA',
 'MASS',
 'LOW',
 'FEVER',
 'N',
 'M',
 'IVC INV']

In [90]:
Xtrain, Xvalid, ytrain, yvalid = train_test_split(X, y, test_size = 0.25)

In [91]:
a, b, c, d = Xtrain.copy(), Xvalid.copy(), ytrain.copy(), yvalid.copy()

In [92]:
scaler = StandardScaler()
Xtrain[num_cont_cols] = scaler.fit_transform(Xtrain[num_cont_cols])
Xvalid[num_cont_cols] = scaler.transform(Xvalid[num_cont_cols])

In [93]:
model1 = RandomForestClassifier(random_state = 42)
model1.fit(Xtrain, ytrain)
ypred = model1.predict(Xvalid)
print("MAE:", mean_absolute_error(yvalid, ypred))
print(np.mean(yvalid == ypred) * 100)
f1 = f1_score(yvalid, ypred)
print(f1)
accuracy = accuracy_score(yvalid, ypred)
report = classification_report(yvalid, ypred)
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

MAE: 0.09770114942528736
90.22988505747126
0.1904761904761905
Accuracy: 0.9022988505747126
Classification Report:
               precision    recall  f1-score   support

           0       0.90      1.00      0.95       155
           1       1.00      0.11      0.19        19

    accuracy                           0.90       174
   macro avg       0.95      0.55      0.57       174
weighted avg       0.91      0.90      0.87       174



In [94]:
(ypred == 1).sum()

2

In [101]:
model2 = RandomForestClassifier(random_state = 42)
model2.fit(a, c)
p = model2.predict(b)
print("MAE:", mean_absolute_error(d, p))
print(np.mean(d == p) * 100)
f1 = f1_score(d, p)
print(f1)
accuracy = accuracy_score(d, p)
report = classification_report(d, p)
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

MAE: 0.09770114942528736
90.22988505747126
0.1904761904761905
Accuracy: 0.9022988505747126
Classification Report:
               precision    recall  f1-score   support

           0       0.90      1.00      0.95       155
           1       1.00      0.11      0.19        19

    accuracy                           0.90       174
   macro avg       0.95      0.55      0.57       174
weighted avg       0.91      0.90      0.87       174



In [97]:
new_scaler = StandardScaler()
xxx = X.copy()
xxx[num_cont_cols] = new_scaler.fit_transform(xxx[num_cont_cols])
xxx

Unnamed: 0,AGE,DM,HTN,CKD,INCIDENTAL,PAIN,HEMATURIA,MASS,LOW,FEVER,...,SEX_F,SEX_M,SITE_BL,SITE_LEFT,SITE_RIGHT,HISTOPATHOLOGY_CC,HISTOPATHOLOGY_PAP,APPROACH_1,APPROACH_2,APPROACH_3
0,-0.709194,0,0,0,0,1,1,0,0.0,1,...,1,0,0,0,1,1,0,1,0,0
1,-0.553369,0,0,0,1,0,0,0,0.0,0,...,0,1,0,0,1,1,0,1,0,0
2,0.303667,0,0,0,0,0,1,0,0.0,0,...,0,1,0,0,1,1,0,0,1,0
3,-2.501178,0,0,0,0,1,0,0,0.0,0,...,0,1,0,1,0,1,0,1,0,0
4,-0.942931,0,1,0,0,1,0,0,0.0,0,...,0,1,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,0.771141,0,0,0,0,1,0,0,0.0,0,...,0,1,0,0,1,1,0,1,0,0
689,-0.397544,0,0,0,0,0,1,0,0.0,0,...,0,1,0,0,1,1,0,1,0,0
690,1.472352,0,1,0,0,0,1,0,0.0,0,...,0,1,0,0,1,1,0,0,1,0
691,0.849054,0,0,0,0,1,0,0,0.0,0,...,1,0,0,1,0,1,0,1,0,0


In [104]:
model3 = RandomForestClassifier(random_state = 42)
model3.fit(xxx, y)
xxxpred = model3.predict(xxx)
accuracy = accuracy_score(y, xxxpred)
report = classification_report(y, xxxpred)
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       606
           1       1.00      1.00      1.00        87

    accuracy                           1.00       693
   macro avg       1.00      1.00      1.00       693
weighted avg       1.00      1.00      1.00       693

