In [1]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/pharma_data/Training_set_begs.csv')
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Data Description:
- ID_Patient_Care_Situation: Care situation of a patient during treatment
- Diagnosed_Condition: The diagnosed condition of the patient
- ID_Patient: Patient identifier number
- Treatment_with_drugs: Class of drugs used during treatment
- Survived_1_year: If the patient survived after one year (0 means did not survive; 1 means survived)
- Patient_Age: Age of the patient
- Patient_Body_Mass_Index: A calculated value based on the patient’s weight, height, etc.
- Patient_Smoker: If the patient was a smoker or not
- Patient_Rural_Urban: If the patient stayed in Rural or Urban part of the country
- Previous_Condition: Condition of the patient before the start of the treatment ( This variable is splitted into 8 columns - A, B, C, D, E, F, Z and Number_of_prev_cond. A, B, C, D, E, F and Z are the previous conditions of the patient. Suppose for one patient, if the entry in column A is 1, it means that the previous condition of the patient was A. If the patient didn't have that condition, it is 0 and same for other conditions. If a patient has previous condition as A and C , columns A and C will have entries as 1 and 1 respectively while the other column B, D, E, F, Z will have entries 0, 0, 0, 0, 0 respectively. The column Number_of_prev_cond will have entry as 2 i.e. 1 + 0 + 1 + 0 + 0 + 0 + 0 + 0 = 2 in this case. )
- Feel free to google 'Diagnose' and 'Body Mass Index' if you don't know about these terms.

## Label Classes:
We have to predict the chances of survival of a patient after one year. <br/>
label: Survived_1_year
- 0 -> Not survived
- 1 -> survived

## Preprocessing Data

In [2]:
df.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,22374,8,3333,DX6,56,18.479385,YES,URBAN,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
1,18164,5,5740,DX2,36,22.945566,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,6283,23,10446,DX6,48,27.510027,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,5339,51,12011,DX1,5,19.130976,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,33012,0,12513,,128,1.3484,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1


In [3]:
df.dtypes

ID_Patient_Care_Situation      int64
Diagnosed_Condition            int64
Patient_ID                     int64
Treated_with_drugs            object
Patient_Age                    int64
Patient_Body_Mass_Index      float64
Patient_Smoker                object
Patient_Rural_Urban           object
Patient_mental_condition      object
A                            float64
B                            float64
C                            float64
D                            float64
E                            float64
F                            float64
Z                            float64
Number_of_prev_cond          float64
Survived_1_year                int64
dtype: object

In [4]:
df.fillna(value=-99999, inplace=True)        #Removing missing data

#### Replacing text values in the dataframe

In [None]:
### Replacing smoker col with int
df['Patient_Smoker'] = df['Patient_Smoker'].map({'YES': 1, 'NO': 0, 'Cannot say': -99999})

In [6]:
## Replacing Treated_with_drugs col with int
drugs =df['Treated_with_drugs'] 
for i in range(len(drugs)):
    #drug_str = drug    # string
    drug_str = drugs[i]
    #print(drug_str)
    if drug_str == -99999:
        continue
    drug_int = "".join([ str(x) for x in drug_str if x.isdigit() ])
    #print(int(drug_int))
    drugs[i] = int(drug_int)

drugs

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


0             6
1             2
2             6
3             1
4        -99999
          ...  
23092        24
23093         6
23094         6
23095         3
23096         6
Name: Treated_with_drugs, Length: 23097, dtype: object

In [7]:
drugs

0             6
1             2
2             6
3             1
4        -99999
          ...  
23092        24
23093         6
23094         6
23095         3
23096         6
Name: Treated_with_drugs, Length: 23097, dtype: object

In [8]:
df

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,22374,8,3333,6,56,18.479385,1,URBAN,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
1,18164,5,5740,2,36,22.945566,1,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,6283,23,10446,6,48,27.510027,1,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,5339,51,12011,1,5,19.130976,0,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,33012,0,12513,-99999,128,1.348400,-99999,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23092,15613,3,1548,24,14,18.643448,0,RURAL,Stable,1.0,1.0,0.0,1.0,1.0,1.0,0.0,5.0,0
23093,15503,41,2769,6,55,23.684585,0,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
23094,2167,24,7671,6,63,27.500039,1,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
23095,31646,11,5764,3,25,23.719125,1,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1


In [9]:
#df.fillna(value=-99999, inplace=True)        #Removing missing data

In [10]:
print("Data Size:", len(df))

Data Size: 23097


We will filter are the fields which are unnecessary for our feature set such as patient id <br/>
From the below we see that all patients are stable. So we will remove this field too

In [11]:
n = np.array(df['Patient_mental_condition'])
np.where(n != 'Stable')

(array([], dtype=int64),)

In [12]:
df.drop(['Patient_ID', 'Patient_Rural_Urban', 'Patient_mental_condition'], axis=1, inplace=True)
df

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,22374,8,6,56,18.479385,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
1,18164,5,2,36,22.945566,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,6283,23,6,48,27.510027,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,5339,51,1,5,19.130976,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,33012,0,-99999,128,1.348400,-99999,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23092,15613,3,24,14,18.643448,0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,5.0,0
23093,15503,41,6,55,23.684585,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
23094,2167,24,6,63,27.500039,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
23095,31646,11,3,25,23.719125,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1


In [13]:
df.dtypes

ID_Patient_Care_Situation      int64
Diagnosed_Condition            int64
Treated_with_drugs            object
Patient_Age                    int64
Patient_Body_Mass_Index      float64
Patient_Smoker                 int64
A                            float64
B                            float64
C                            float64
D                            float64
E                            float64
F                            float64
Z                            float64
Number_of_prev_cond          float64
Survived_1_year                int64
dtype: object

Spliting the data into features and labels and converting them to numpy array. <br/>
We will then split our training and testing data

In [14]:
X = np.array(df.drop(['Survived_1_year'], 1))        # Features
y = np.array(df['Survived_1_year'])                  # Label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [15]:
print("Size of train data:", len(X_train), len(y_train))
print("Size of test data:", len(X_test), len(y_test))

Size of train data: 18477 18477
Size of test data: 4620 4620


## Training And Testing

In [None]:
clf = svm.SVR()   # classifier
clf.fit(X_train, y_train)

In [20]:
confidence = clf.score(X_test, y_test)


In [21]:
print(confidence)

0.09724208737702411


In [None]:
for k in ['linear','poly','rbf','sigmoid']:
    clf = svm.SVR(kernel=k)
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print(k,confidence)