In [1]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/pharma_data/Training_set_begs.csv')
import numpy as np
from sklearn import preprocessing, svm, neighbors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Data Description:
- ID_Patient_Care_Situation: Care situation of a patient during treatment
- Diagnosed_Condition: The diagnosed condition of the patient
- ID_Patient: Patient identifier number
- Treatment_with_drugs: Class of drugs used during treatment
- Survived_1_year: If the patient survived after one year (0 means did not survive; 1 means survived)
- Patient_Age: Age of the patient
- Patient_Body_Mass_Index: A calculated value based on the patient’s weight, height, etc.
- Patient_Smoker: If the patient was a smoker or not
- Patient_Rural_Urban: If the patient stayed in Rural or Urban part of the country
- Previous_Condition: Condition of the patient before the start of the treatment ( This variable is splitted into 8 columns - A, B, C, D, E, F, Z and Number_of_prev_cond. A, B, C, D, E, F and Z are the previous conditions of the patient. Suppose for one patient, if the entry in column A is 1, it means that the previous condition of the patient was A. If the patient didn't have that condition, it is 0 and same for other conditions. If a patient has previous condition as A and C , columns A and C will have entries as 1 and 1 respectively while the other column B, D, E, F, Z will have entries 0, 0, 0, 0, 0 respectively. The column Number_of_prev_cond will have entry as 2 i.e. 1 + 0 + 1 + 0 + 0 + 0 + 0 + 0 = 2 in this case. )
- Feel free to google 'Diagnose' and 'Body Mass Index' if you don't know about these terms.

## Label Classes:
We have to predict the chances of survival of a patient after one year. <br/>
label: Survived_1_year
- 0 -> Not survived
- 1 -> survived

## Preprocessing Data

In [2]:
df

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,22374,8,3333,DX6,56,18.479385,YES,URBAN,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
1,18164,5,5740,DX2,36,22.945566,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,6283,23,10446,DX6,48,27.510027,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,5339,51,12011,DX1,5,19.130976,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,33012,0,12513,,128,1.348400,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23092,15613,3,1548,DX2 DX4,14,18.643448,NO,RURAL,Stable,1.0,1.0,0.0,1.0,1.0,1.0,0.0,5.0,0
23093,15503,41,2769,DX6,55,23.684585,NO,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
23094,2167,24,7671,DX6,63,27.500039,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
23095,31646,11,5764,DX3,25,23.719125,YES,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1


In [3]:
df.dtypes

ID_Patient_Care_Situation      int64
Diagnosed_Condition            int64
Patient_ID                     int64
Treated_with_drugs            object
Patient_Age                    int64
Patient_Body_Mass_Index      float64
Patient_Smoker                object
Patient_Rural_Urban           object
Patient_mental_condition      object
A                            float64
B                            float64
C                            float64
D                            float64
E                            float64
F                            float64
Z                            float64
Number_of_prev_cond          float64
Survived_1_year                int64
dtype: object

### The function performs the following functions on the dataframe
- Removes missing Data from DF column
- Replacing text values in the dataframe
- Replace text values with int

In [4]:
def preprocess_data(df):
    """
        The function preprocesses the dataframe by performing the following operations:
            - Removes missing Data from DF column
            - Replacing text values in the dataframe
            - Replace text values with int
    """
    df.fillna(value=-99999, inplace=True)        #Removing missing data
    ### Replacing text with int
    df['Patient_Smoker'] = df['Patient_Smoker'].map({'YES': 1, 'NO': 0, 'Cannot say': -99999})
    df['Patient_Rural_Urban'] = df['Patient_Rural_Urban'].map({'URBAN': 1, 'RURAL': 0})
    
    drugs =df['Treated_with_drugs']
    for i in range(len(drugs)):
        #drug_str = drug    # string
        drug_str = drugs[i]
        #print(drug_str)
        if drug_str == -99999:
            continue
        drug_int = ([ int(x) for x in drug_str if x.isdigit() ])
        #print(drug_int)
        drugs[i] = sum(drug_int)
    df['Treated_with_drugs'] = drugs
    
    # Removing unecessary data
    df.drop(['ID_Patient_Care_Situation', 'Patient_ID', 'Patient_mental_condition'], axis=1, inplace=True)
    
preprocess_data(df)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Diagnosed_Condition,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,8,6,56,18.479385,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
1,5,2,36,22.945566,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,23,6,48,27.510027,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,51,1,5,19.130976,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,0,-99999,128,1.348400,-99999,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23092,3,6,14,18.643448,0,0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,5.0,0
23093,41,6,55,23.684585,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
23094,24,6,63,27.500039,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
23095,11,3,25,23.719125,1,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1


#### Replacing text values in the dataframe

In [5]:
print("Data Size:", len(df))

Data Size: 23097


### Removing unnecessary data
We will filter are the fields which are unnecessary for our feature set such as patient id <br/>
From the below we see that all patients are stable. So we will remove this field too

In [6]:
df.dtypes

Diagnosed_Condition          int64
Treated_with_drugs          object
Patient_Age                  int64
Patient_Body_Mass_Index    float64
Patient_Smoker               int64
Patient_Rural_Urban          int64
A                          float64
B                          float64
C                          float64
D                          float64
E                          float64
F                          float64
Z                          float64
Number_of_prev_cond        float64
Survived_1_year              int64
dtype: object

### Data Split
Spliting the data into features and labels and converting them to numpy array. <br/>
We will then split our training and testing data

In [7]:
X = np.array(df.drop(['Survived_1_year'], 1))        # Features
y = np.array(df['Survived_1_year'])                  # Label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
print("Size of train data:", len(X_train), len(y_train))
print("Size of test data:", len(X_test), len(y_test))

Size of train data: 18477 18477
Size of test data: 4620 4620


## Training

In [9]:
df

Unnamed: 0,Diagnosed_Condition,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,8,6,56,18.479385,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
1,5,2,36,22.945566,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,23,6,48,27.510027,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,51,1,5,19.130976,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,0,-99999,128,1.348400,-99999,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23092,3,6,14,18.643448,0,0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,5.0,0
23093,41,6,55,23.684585,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
23094,24,6,63,27.500039,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
23095,11,3,25,23.719125,1,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1


In [10]:
clf = neighbors.KNeighborsClassifier()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
accuracy

0.7164502164502164

## Testing

In [11]:
test_new = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/pharma_data/Testing_set_begs.csv')

In [12]:
test_new.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond
0,19150,40,3709,DX3,16,29.443894,NO,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
1,23216,52,986,DX6,24,26.836321,NO,URBAN,Stable,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
2,11890,50,11821,DX4 DX5,63,25.52328,NO,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
3,7149,32,3292,DX6,42,27.171155,NO,URBAN,Stable,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0
4,22845,20,9959,DX3,50,25.556192,NO,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Preprocess test data

In [13]:
preprocess_data(test_new)
test_new

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Diagnosed_Condition,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,A,B,C,D,E,F,Z,Number_of_prev_cond
0,40,3,16,29.443894,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
1,52,6,24,26.836321,0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
2,50,9,63,25.523280,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
3,32,6,42,27.171155,0,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0
4,20,3,50,25.556192,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9298,11,6,28,29.106314,0,0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0
9299,7,1,4,20.616673,0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
9300,16,13,20,24.727357,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9301,50,4,33,17.517426,0,1,1.0,1.0,1.0,1.0,0.0,1.0,0.0,5.0


In [14]:
test_data = np.array(test_new)
test_data

array([[40, 3, 16, ..., 0.0, 0.0, 2.0],
       [52, 6, 24, ..., 0.0, 0.0, 2.0],
       [50, 9, 63, ..., 0.0, 0.0, 2.0],
       ...,
       [16, 13, 20, ..., 0.0, 0.0, 1.0],
       [50, 4, 33, ..., 1.0, 0.0, 5.0],
       [51, 6, 52, ..., 0.0, 0.0, 1.0]], dtype=object)

In [15]:
test_new
Survived_1_year = clf.predict(test_new)        # Prediction
Survived_1_year[:10]

array([0, 1, 1, 1, 1, 1, 0, 1, 0, 0], dtype=int64)

### Saving prediction results

In [16]:
res = pd.DataFrame(Survived_1_year)
res.index = test_new.index
res.columns = ["prediction"]
res.to_csv("prediction_results_HP.csv")      # the csv file will be saved locally on the same location where this notebook is located.