### Import modules

In [1]:
import pandas as pd
import numpy as np

from preprocessing.preprocessing import preprocessing

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

### Import data

In [2]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test_challenge.csv')

### Preprocess data

In [3]:
df_train_cln = preprocessing(df_train).set_index('NHC')
df_test_cln = preprocessing(df_test).set_index('NHC')

In [4]:
df_train_cln.days_between = df_train_cln.days_between.astype(int)
df_test_cln.days_between = df_test_cln.days_between.astype(int)

### Check data

In [5]:
df_train_cln = df_train_cln.drop(['start_neutropenico','start_FN','birth_year','Gender','Past_positive_result_from','ID'], axis=1)

In [6]:
df_train_cln.head(5)


Unnamed: 0_level_0,MDR,days_between,days_in_hospital,hospital_stay_w_FN,prev_hospital_stay,emergency,num_movements,num_consult,share_room_MDR,dummy_LAM,...,num_rooms_b,gender__female,gender__male,Past_positive__Blood culture,Past_positive__Both,Past_positive__Culture,Past_positive__NEGATIVE,Past_positive__NEITHER,days_neutropenic_wo_fn,dummie_days_neutropenic_wo_fn
NHC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
404,0,0,28,1,3,0,0,10,0,0,...,1.0,1,0,0,0,1,0,0,21,1
1897,0,0,8,1,6,0,0,0,0,0,...,1.0,1,0,0,0,0,1,0,4,1
556,0,0,2,1,1,0,0,1,0,0,...,1.0,0,1,0,0,0,1,0,0,0
454,0,0,1,1,9,1,1,0,0,0,...,3.0,0,1,0,0,0,1,0,0,0
1615,0,0,17,1,5,0,2,2,0,0,...,2.0,1,0,0,0,1,0,0,15,1


In [7]:
feature = 'days_between'

datasets=[df_train_cln,df_test_cln]

for dataset in datasets:
    print('max:',dataset[feature].max())
    print('min:',dataset[feature].min())
    print('mean:',dataset[feature].mean())
    print('nulls',dataset[feature].isnull().sum())

max: 1307
min: 0
mean: 39.22387136672851
nulls 0
max: 3368
min: 0
mean: 61.85228677379481
nulls 0


### Split target and features

In [8]:
y = df_train_cln['MDR']
X = df_train_cln.drop(['MDR'], axis=1)

### Split train and test

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0)

### Create model

In [10]:
lr = LinearRegression(normalize=True)

### Fit model

In [11]:
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [12]:
y_test_pred = lr.predict(X_test)
y_train_pred = lr.predict(X_train)
print('Train positives:',(y_train_pred > 0.5).sum())
print('Test positives:',(y_test_pred > 0.5).sum()) 

Train positives: 3
Test positives: 4


In [13]:
print('train ROC AUC:', roc_auc_score(y_train, y_train_pred))
print('test ROC AUC:', roc_auc_score(y_test, y_test_pred))

train ROC AUC: 0.7979740632283512
test ROC AUC: 0.5934065934065934
