# Reading the data and analysing the dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,classification_report
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# reading the dataset using pandas
train_data=pd.read_parquet(r"D:\certificates\Structured_Data_Assignment\Structured_Data_Assignment\train.parquet")
train_data

Unnamed: 0,Patient-Uid,Date,Incident
0,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2019-03-09,PRIMARY_DIAGNOSIS
1,a0dc93f2-1c7c-11ec-9cd2-16262ee38c7f,2015-05-16,PRIMARY_DIAGNOSIS
3,a0dc94c6-1c7c-11ec-a3a0-16262ee38c7f,2018-01-30,SYMPTOM_TYPE_0
4,a0dc950b-1c7c-11ec-b6ec-16262ee38c7f,2015-04-22,DRUG_TYPE_0
8,a0dc9543-1c7c-11ec-bb63-16262ee38c7f,2016-06-18,DRUG_TYPE_1
...,...,...,...
29080886,a0ee9f75-1c7c-11ec-94c7-16262ee38c7f,2018-07-06,DRUG_TYPE_6
29080897,a0ee1284-1c7c-11ec-a3d5-16262ee38c7f,2017-12-29,DRUG_TYPE_6
29080900,a0ee9b26-1c7c-11ec-8a40-16262ee38c7f,2018-10-18,DRUG_TYPE_10
29080903,a0ee1a92-1c7c-11ec-8341-16262ee38c7f,2015-09-18,DRUG_TYPE_6


# Basic analysis of the data


In [2]:
# analysing the shape of the dataset
train_data.shape

(3220868, 3)

In [3]:
# columns in the dataset
train_data.columns

Index(['Patient-Uid', 'Date', 'Incident'], dtype='object')

# Data Preprocessing

In [4]:
# checking for the missing values
train_data.isnull().sum()

Patient-Uid    0
Date           0
Incident       0
dtype: int64

In [5]:
#checking for duplicates in the dataset
train_data.duplicated().sum()

35571

In [6]:
# removing the duplicates from the dataset
train_data=train_data.drop_duplicates()

In [7]:
train_data.duplicated().sum()

0

In [8]:
# analysing the dataset after removing the dplicates
train_data.shape

(3185297, 3)

In [9]:
# these are the unique values that are present in the dataset
train_data['Incident'].unique()

array(['PRIMARY_DIAGNOSIS', 'SYMPTOM_TYPE_0', 'DRUG_TYPE_0',
       'DRUG_TYPE_1', 'DRUG_TYPE_2', 'TEST_TYPE_0', 'DRUG_TYPE_3',
       'DRUG_TYPE_4', 'DRUG_TYPE_5', 'DRUG_TYPE_6', 'DRUG_TYPE_8',
       'DRUG_TYPE_7', 'SYMPTOM_TYPE_1', 'DRUG_TYPE_10', 'SYMPTOM_TYPE_29',
       'SYMPTOM_TYPE_2', 'DRUG_TYPE_11', 'DRUG_TYPE_9', 'DRUG_TYPE_13',
       'SYMPTOM_TYPE_5', 'TEST_TYPE_1', 'SYMPTOM_TYPE_6', 'TEST_TYPE_2',
       'SYMPTOM_TYPE_3', 'SYMPTOM_TYPE_8', 'DRUG_TYPE_14', 'DRUG_TYPE_12',
       'SYMPTOM_TYPE_9', 'SYMPTOM_TYPE_10', 'SYMPTOM_TYPE_7',
       'SYMPTOM_TYPE_11', 'TEST_TYPE_3', 'DRUG_TYPE_15', 'SYMPTOM_TYPE_4',
       'SYMPTOM_TYPE_14', 'SYMPTOM_TYPE_13', 'SYMPTOM_TYPE_16',
       'SYMPTOM_TYPE_17', 'SYMPTOM_TYPE_15', 'SYMPTOM_TYPE_18',
       'SYMPTOM_TYPE_12', 'SYMPTOM_TYPE_20', 'SYMPTOM_TYPE_21',
       'DRUG_TYPE_17', 'SYMPTOM_TYPE_22', 'TEST_TYPE_4',
       'SYMPTOM_TYPE_23', 'DRUG_TYPE_16', 'TEST_TYPE_5',
       'SYMPTOM_TYPE_19', 'SYMPTOM_TYPE_24', 'SYMPTOM_TYPE_25',
   

In [10]:
train_data['Patient-Uid'].value_counts()

Patient-Uid
a0ddfd2c-1c7c-11ec-876d-16262ee38c7f    1645
a0ea618f-1c7c-11ec-93fb-16262ee38c7f    1316
a0df4809-1c7c-11ec-be0b-16262ee38c7f    1093
a0ec2afe-1c7c-11ec-befd-16262ee38c7f    1068
a0e553c4-1c7c-11ec-83f1-16262ee38c7f     994
                                        ... 
a0ecc127-1c7c-11ec-92b5-16262ee38c7f      31
a0eb794b-1c7c-11ec-92d7-16262ee38c7f      29
a0f02cd8-1c7c-11ec-96a0-16262ee38c7f      28
a0efac48-1c7c-11ec-9daa-16262ee38c7f      25
a0f0d0b5-1c7c-11ec-9901-16262ee38c7f      24
Name: count, Length: 27033, dtype: int64

In [114]:
#  creating an object named positive result and storing the data that are tested with target drugs
positive_result_tr=train_data[train_data['Incident']=='TARGET DRUG']

In [115]:
positive_result_tr

Unnamed: 0,Patient-Uid,Date,Incident
3294791,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2020-04-09,TARGET DRUG
3296990,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2018-06-12,TARGET DRUG
3305387,a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f,2019-06-11,TARGET DRUG
3309423,a0ecc615-1c7c-11ec-aa31-16262ee38c7f,2019-11-15,TARGET DRUG
3309494,a0ea612f-1c7c-11ec-8cf0-16262ee38c7f,2020-03-18,TARGET DRUG
...,...,...,...
29074998,a0ef2b6d-1c7c-11ec-9172-16262ee38c7f,2018-10-12,TARGET DRUG
29075105,a0ebe423-1c7c-11ec-a5e0-16262ee38c7f,2019-07-02,TARGET DRUG
29075494,a0ebc713-1c7c-11ec-bd53-16262ee38c7f,2019-05-21,TARGET DRUG
29080031,a0ee1bdb-1c7c-11ec-90ba-16262ee38c7f,2018-06-07,TARGET DRUG


In [116]:
# creating an object named negative result and storing the data that are  not tested with target drug
negative_data_tr=train_data[~train_data['Patient-Uid'].isin(positive_result['Patient-Uid'])]
negative_result_tr = negative_data.groupby('Patient-Uid').tail(1)

In [117]:
negative_result_tr

Unnamed: 0,Patient-Uid,Date,Incident
57,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-12-01,TEST_TYPE_0
208,a0f9e9f9-1c7c-11ec-b565-16262ee38c7f,2016-06-22,DRUG_TYPE_9
305,a0f9ea43-1c7c-11ec-aa10-16262ee38c7f,2019-07-21,DRUG_TYPE_6
420,a0f9ea7c-1c7c-11ec-af15-16262ee38c7f,2016-06-15,DRUG_TYPE_6
497,a0f9eab1-1c7c-11ec-a732-16262ee38c7f,2018-11-22,DRUG_TYPE_6
...,...,...,...
1372381,a102720c-1c7c-11ec-bd9a-16262ee38c7f,2020-01-07,DRUG_TYPE_6
1372432,a102723c-1c7c-11ec-9f80-16262ee38c7f,2019-07-06,DRUG_TYPE_3
1372543,a102726b-1c7c-11ec-bfbf-16262ee38c7f,2018-12-31,DRUG_TYPE_0
1372607,a102729b-1c7c-11ec-86ba-16262ee38c7f,2019-04-02,DRUG_TYPE_3


In [118]:
#to ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [119]:
# to get the count of previous prescriptions within specific time intervals
positive_result_tr['Prescription_Count'] = positive_result_tr.groupby('Patient-Uid')['Date'].cumcount()
negative_result_tr['Prescription_Count'] = negative_result_tr.groupby('Patient-Uid')['Date'].cumcount()

In [121]:
positive_result_tr

Unnamed: 0,Patient-Uid,Date,Incident,Prescription_Count
3294791,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2020-04-09,TARGET DRUG,0
3296990,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2018-06-12,TARGET DRUG,0
3305387,a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f,2019-06-11,TARGET DRUG,0
3309423,a0ecc615-1c7c-11ec-aa31-16262ee38c7f,2019-11-15,TARGET DRUG,0
3309494,a0ea612f-1c7c-11ec-8cf0-16262ee38c7f,2020-03-18,TARGET DRUG,0
...,...,...,...,...
29074998,a0ef2b6d-1c7c-11ec-9172-16262ee38c7f,2018-10-12,TARGET DRUG,4
29075105,a0ebe423-1c7c-11ec-a5e0-16262ee38c7f,2019-07-02,TARGET DRUG,9
29075494,a0ebc713-1c7c-11ec-bd53-16262ee38c7f,2019-05-21,TARGET DRUG,10
29080031,a0ee1bdb-1c7c-11ec-90ba-16262ee38c7f,2018-06-07,TARGET DRUG,14


In [122]:
negative_result_tr

Unnamed: 0,Patient-Uid,Date,Incident,Prescription_Count
57,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-12-01,TEST_TYPE_0,0
208,a0f9e9f9-1c7c-11ec-b565-16262ee38c7f,2016-06-22,DRUG_TYPE_9,0
305,a0f9ea43-1c7c-11ec-aa10-16262ee38c7f,2019-07-21,DRUG_TYPE_6,0
420,a0f9ea7c-1c7c-11ec-af15-16262ee38c7f,2016-06-15,DRUG_TYPE_6,0
497,a0f9eab1-1c7c-11ec-a732-16262ee38c7f,2018-11-22,DRUG_TYPE_6,0
...,...,...,...,...
1372381,a102720c-1c7c-11ec-bd9a-16262ee38c7f,2020-01-07,DRUG_TYPE_6,0
1372432,a102723c-1c7c-11ec-9f80-16262ee38c7f,2019-07-06,DRUG_TYPE_3,0
1372543,a102726b-1c7c-11ec-bfbf-16262ee38c7f,2018-12-31,DRUG_TYPE_0,0
1372607,a102729b-1c7c-11ec-86ba-16262ee38c7f,2019-04-02,DRUG_TYPE_3,0


In [123]:
# to get the difference between the most recent prescription and the prediction date.
prediction_date = pd.to_datetime('today') + pd.DateOffset(days=30)
positive_result_tr['Time_diff'] = (prediction_date - positive_result_tr.groupby('Patient-Uid')['Date'].transform('max')).dt.days
negative_result_tr['Time_diff'] = (prediction_date - negative_result_tr.groupby('Patient-Uid')['Date'].transform('max')).dt.days

In [124]:
positive_result_tr

Unnamed: 0,Patient-Uid,Date,Incident,Prescription_Count,Time_diff
3294791,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2020-04-09,TARGET DRUG,0,1197
3296990,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2018-06-12,TARGET DRUG,0,1451
3305387,a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f,2019-06-11,TARGET DRUG,0,1476
3309423,a0ecc615-1c7c-11ec-aa31-16262ee38c7f,2019-11-15,TARGET DRUG,0,1181
3309494,a0ea612f-1c7c-11ec-8cf0-16262ee38c7f,2020-03-18,TARGET DRUG,0,1197
...,...,...,...,...,...
29074998,a0ef2b6d-1c7c-11ec-9172-16262ee38c7f,2018-10-12,TARGET DRUG,4,1717
29075105,a0ebe423-1c7c-11ec-a5e0-16262ee38c7f,2019-07-02,TARGET DRUG,9,1213
29075494,a0ebc713-1c7c-11ec-bd53-16262ee38c7f,2019-05-21,TARGET DRUG,10,1190
29080031,a0ee1bdb-1c7c-11ec-90ba-16262ee38c7f,2018-06-07,TARGET DRUG,14,1197


In [125]:
negative_result_tr

Unnamed: 0,Patient-Uid,Date,Incident,Prescription_Count,Time_diff
57,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-12-01,TEST_TYPE_0,0,2183
208,a0f9e9f9-1c7c-11ec-b565-16262ee38c7f,2016-06-22,DRUG_TYPE_9,0,2710
305,a0f9ea43-1c7c-11ec-aa10-16262ee38c7f,2019-07-21,DRUG_TYPE_6,0,1586
420,a0f9ea7c-1c7c-11ec-af15-16262ee38c7f,2016-06-15,DRUG_TYPE_6,0,2717
497,a0f9eab1-1c7c-11ec-a732-16262ee38c7f,2018-11-22,DRUG_TYPE_6,0,1827
...,...,...,...,...,...
1372381,a102720c-1c7c-11ec-bd9a-16262ee38c7f,2020-01-07,DRUG_TYPE_6,0,1416
1372432,a102723c-1c7c-11ec-9f80-16262ee38c7f,2019-07-06,DRUG_TYPE_3,0,1601
1372543,a102726b-1c7c-11ec-bfbf-16262ee38c7f,2018-12-31,DRUG_TYPE_0,0,1788
1372607,a102729b-1c7c-11ec-86ba-16262ee38c7f,2019-04-02,DRUG_TYPE_3,0,1696


In [126]:
# concating two dataframes into single dataframe for assining it to the model
new_data=pd.concat([positive_result_tr,negative_result_tr])
new_data

Unnamed: 0,Patient-Uid,Date,Incident,Prescription_Count,Time_diff
3294791,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2020-04-09,TARGET DRUG,0,1197
3296990,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2018-06-12,TARGET DRUG,0,1451
3305387,a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f,2019-06-11,TARGET DRUG,0,1476
3309423,a0ecc615-1c7c-11ec-aa31-16262ee38c7f,2019-11-15,TARGET DRUG,0,1181
3309494,a0ea612f-1c7c-11ec-8cf0-16262ee38c7f,2020-03-18,TARGET DRUG,0,1197
...,...,...,...,...,...
1372381,a102720c-1c7c-11ec-bd9a-16262ee38c7f,2020-01-07,DRUG_TYPE_6,0,1416
1372432,a102723c-1c7c-11ec-9f80-16262ee38c7f,2019-07-06,DRUG_TYPE_3,0,1601
1372543,a102726b-1c7c-11ec-bfbf-16262ee38c7f,2018-12-31,DRUG_TYPE_0,0,1788
1372607,a102729b-1c7c-11ec-86ba-16262ee38c7f,2019-04-02,DRUG_TYPE_3,0,1696


In [127]:
# creating train test split
x_train,x_test,y_train,y_test =train_test_split(new_data[['Prescription_Count','Time_diff']],new_data['Incident']=='TARGET DRUG',test_size=0.3,random_state=42)

In [128]:
# model building for the predictions
model =XGBClassifier()
model.fit(x_train,y_train)
train_predict =model.predict(x_train)
test_predict =model.predict(x_test)

In [129]:
# evaluating the model using accuracy score
print('Accuracy Score :',accuracy_score(y_test,test_predict))

Accuracy Score : 0.9634476916560779


In [130]:
# evaluating the model using the f1 score
print('F1 score :',f1_score(y_test,test_predict))

F1 score : 0.9784373984958649


In [131]:
# evaluating using confusion matrix
print('confusion Matrix :',confusion_matrix(y_test,test_predict))

confusion Matrix : [[ 3167   303]
 [  560 19580]]


In [132]:
# evaluating using confusion matrix
print(classification_report(y_test,test_predict))

              precision    recall  f1-score   support

       False       0.85      0.91      0.88      3470
        True       0.98      0.97      0.98     20140

    accuracy                           0.96     23610
   macro avg       0.92      0.94      0.93     23610
weighted avg       0.96      0.96      0.96     23610



# Testing dataset

In [91]:
# reading the test dataset
test_data =pd.read_parquet(r"D:\certificates\Structured_Data_Assignment\Structured_Data_Assignment\test.parquet")
test_data

Unnamed: 0,Patient-Uid,Date,Incident
0,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2016-12-08,SYMPTOM_TYPE_0
1,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-10-17,DRUG_TYPE_0
2,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-12-01,DRUG_TYPE_2
3,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-12-05,DRUG_TYPE_1
4,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-11-04,SYMPTOM_TYPE_0
...,...,...,...
1372854,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-05-11,DRUG_TYPE_13
1372856,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2018-08-22,DRUG_TYPE_2
1372857,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-02-04,DRUG_TYPE_2
1372858,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-09-25,DRUG_TYPE_8


## Basic Analysis of data

In [93]:
test_data.shape

(1065524, 3)

In [95]:
test_data.columns

Index(['Patient-Uid', 'Date', 'Incident'], dtype='object')

## Data Preprocessing

In [96]:
# checking for any missing value
test_data.isnull().sum()

Patient-Uid    0
Date           0
Incident       0
dtype: int64

In [97]:
# checking for data types
test_data.dtypes

Patient-Uid            object
Date           datetime64[ns]
Incident               object
dtype: object

In [101]:
# checking for duplicates in dataset
test_data.duplicated().sum()

12100

In [102]:
# removing the duplicates from the dataset
test_data =test_data.drop_duplicates()


In [103]:
# checking for duplicates after droping the duplicates
test_data.duplicated().sum()

0

In [133]:
# creating an object named positive result and storing the data that are tested with target drug
positive_result_ts =test_data[test_data['Incident']=='TARGET DRUG']

In [155]:
positive_result_ts

Unnamed: 0,Patient-Uid,Date,Incident,Prescription_Count,Time_diff


In [135]:
# creating an object named negative result and storing the data that are not tested with target drug
negative_data=test_data[~test_data['Patient-Uid'].isin(positive_result['Patient-Uid'])]
negative_result_ts = negative_data.groupby('Patient-Uid').tail(1)

In [136]:
negative_result_ts


Unnamed: 0,Patient-Uid,Date,Incident
57,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-12-01,TEST_TYPE_0
208,a0f9e9f9-1c7c-11ec-b565-16262ee38c7f,2016-06-22,DRUG_TYPE_9
305,a0f9ea43-1c7c-11ec-aa10-16262ee38c7f,2019-07-21,DRUG_TYPE_6
420,a0f9ea7c-1c7c-11ec-af15-16262ee38c7f,2016-06-15,DRUG_TYPE_6
497,a0f9eab1-1c7c-11ec-a732-16262ee38c7f,2018-11-22,DRUG_TYPE_6
...,...,...,...
1372381,a102720c-1c7c-11ec-bd9a-16262ee38c7f,2020-01-07,DRUG_TYPE_6
1372432,a102723c-1c7c-11ec-9f80-16262ee38c7f,2019-07-06,DRUG_TYPE_3
1372543,a102726b-1c7c-11ec-bfbf-16262ee38c7f,2018-12-31,DRUG_TYPE_0
1372607,a102729b-1c7c-11ec-86ba-16262ee38c7f,2019-04-02,DRUG_TYPE_3


In [137]:
# to get the count of previous prescriptions within specific time intervals
positive_result_ts['Prescription_Count'] = positive_result_ts.groupby('Patient-Uid')['Date'].cumcount()
negative_result_ts['Prescription_Count'] = negative_result_ts.groupby('Patient-Uid')['Date'].cumcount()
     

In [138]:
positive_result_ts

Unnamed: 0,Patient-Uid,Date,Incident,Prescription_Count


In [139]:
negative_result_ts

Unnamed: 0,Patient-Uid,Date,Incident,Prescription_Count
57,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-12-01,TEST_TYPE_0,0
208,a0f9e9f9-1c7c-11ec-b565-16262ee38c7f,2016-06-22,DRUG_TYPE_9,0
305,a0f9ea43-1c7c-11ec-aa10-16262ee38c7f,2019-07-21,DRUG_TYPE_6,0
420,a0f9ea7c-1c7c-11ec-af15-16262ee38c7f,2016-06-15,DRUG_TYPE_6,0
497,a0f9eab1-1c7c-11ec-a732-16262ee38c7f,2018-11-22,DRUG_TYPE_6,0
...,...,...,...,...
1372381,a102720c-1c7c-11ec-bd9a-16262ee38c7f,2020-01-07,DRUG_TYPE_6,0
1372432,a102723c-1c7c-11ec-9f80-16262ee38c7f,2019-07-06,DRUG_TYPE_3,0
1372543,a102726b-1c7c-11ec-bfbf-16262ee38c7f,2018-12-31,DRUG_TYPE_0,0
1372607,a102729b-1c7c-11ec-86ba-16262ee38c7f,2019-04-02,DRUG_TYPE_3,0


In [141]:
prediction_date_ts =pd.to_datetime('today') + pd.DateOffset(days=30)
positive_result_ts['Time_diff'] =(prediction_date - positive_result_ts.groupby('Patient-Uid')['Date'].transform('max')).dt.days
negative_result_ts['Time_diff'] = (prediction_date - negative_result_ts.groupby('Patient-Uid')['Date'].transform('max')).dt.days

In [142]:
positive_result_ts

Unnamed: 0,Patient-Uid,Date,Incident,Prescription_Count,Time_diff


In [143]:
negative_result_ts

Unnamed: 0,Patient-Uid,Date,Incident,Prescription_Count,Time_diff
57,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-12-01,TEST_TYPE_0,0,2183
208,a0f9e9f9-1c7c-11ec-b565-16262ee38c7f,2016-06-22,DRUG_TYPE_9,0,2710
305,a0f9ea43-1c7c-11ec-aa10-16262ee38c7f,2019-07-21,DRUG_TYPE_6,0,1586
420,a0f9ea7c-1c7c-11ec-af15-16262ee38c7f,2016-06-15,DRUG_TYPE_6,0,2717
497,a0f9eab1-1c7c-11ec-a732-16262ee38c7f,2018-11-22,DRUG_TYPE_6,0,1827
...,...,...,...,...,...
1372381,a102720c-1c7c-11ec-bd9a-16262ee38c7f,2020-01-07,DRUG_TYPE_6,0,1416
1372432,a102723c-1c7c-11ec-9f80-16262ee38c7f,2019-07-06,DRUG_TYPE_3,0,1601
1372543,a102726b-1c7c-11ec-bfbf-16262ee38c7f,2018-12-31,DRUG_TYPE_0,0,1788
1372607,a102729b-1c7c-11ec-86ba-16262ee38c7f,2019-04-02,DRUG_TYPE_3,0,1696


In [144]:
# creating new dataset by concating positive and negative sets
new_data_ts = pd.concat([positive_result_ts, negative_result_ts])


In [145]:
new_data_ts

Unnamed: 0,Patient-Uid,Date,Incident,Prescription_Count,Time_diff
57,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-12-01,TEST_TYPE_0,0,2183
208,a0f9e9f9-1c7c-11ec-b565-16262ee38c7f,2016-06-22,DRUG_TYPE_9,0,2710
305,a0f9ea43-1c7c-11ec-aa10-16262ee38c7f,2019-07-21,DRUG_TYPE_6,0,1586
420,a0f9ea7c-1c7c-11ec-af15-16262ee38c7f,2016-06-15,DRUG_TYPE_6,0,2717
497,a0f9eab1-1c7c-11ec-a732-16262ee38c7f,2018-11-22,DRUG_TYPE_6,0,1827
...,...,...,...,...,...
1372381,a102720c-1c7c-11ec-bd9a-16262ee38c7f,2020-01-07,DRUG_TYPE_6,0,1416
1372432,a102723c-1c7c-11ec-9f80-16262ee38c7f,2019-07-06,DRUG_TYPE_3,0,1601
1372543,a102726b-1c7c-11ec-bfbf-16262ee38c7f,2018-12-31,DRUG_TYPE_0,0,1788
1372607,a102729b-1c7c-11ec-86ba-16262ee38c7f,2019-04-02,DRUG_TYPE_3,0,1696


In [146]:
new_data_ts.shape

(11482, 5)

In [147]:
# predicting the test data
test_data_predict =model.predict(new_data_ts[['Prescription_Count','Time_diff']])

In [165]:
test_data_predict

array([0, 0, 1, ..., 0, 0, 0])

In [162]:
# final submission file after completing prediction
Final_submission = pd.DataFrame({'Patient-Uid': new_data_ts['Patient-Uid'], 'Prediction': test_data_predict})
Final_submission
     

Unnamed: 0,Patient-Uid,Prediction
57,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,0
208,a0f9e9f9-1c7c-11ec-b565-16262ee38c7f,0
305,a0f9ea43-1c7c-11ec-aa10-16262ee38c7f,1
420,a0f9ea7c-1c7c-11ec-af15-16262ee38c7f,0
497,a0f9eab1-1c7c-11ec-a732-16262ee38c7f,0
...,...,...
1372381,a102720c-1c7c-11ec-bd9a-16262ee38c7f,1
1372432,a102723c-1c7c-11ec-9f80-16262ee38c7f,0
1372543,a102726b-1c7c-11ec-bfbf-16262ee38c7f,0
1372607,a102729b-1c7c-11ec-86ba-16262ee38c7f,0
