In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
#import pydicom
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder,PowerTransformer
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

In [51]:
train = pd.read_csv('/content/our_train.csv')
test = pd.read_csv('/content/our_test.csv')

In [52]:
train.shape, test.shape

((1223, 7), (312, 7))

In [53]:
train.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker


In [54]:
test.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00023637202179104603099,-3,1536,65.306122,71,Female,Ex-smoker
1,ID00023637202179104603099,3,1368,58.163265,71,Female,Ex-smoker
2,ID00023637202179104603099,5,1361,57.865646,71,Female,Ex-smoker
3,ID00023637202179104603099,7,1465,62.287415,71,Female,Ex-smoker
4,ID00023637202179104603099,9,1681,71.471088,71,Female,Ex-smoker


In [55]:
# Using Weeks, Age, Sex and Smoking Status columns from train data
X = train[['Weeks','Age','Sex','SmokingStatus']].copy()
y = train['FVC'].copy()

# save the stats for future use
stats = X.describe().T

# One hot encoding on Sex and SmokingStatus columns
#pd.get_dummies - assign numerical value to these categorical inputs. 
X = pd.get_dummies(X, columns =['Sex','SmokingStatus'],drop_first=True)

#Scaling numeric features 
# scaling the numeric features
for col in ['Weeks', 'Age']:
    X[col] = (X[col] - stats.loc[col, 'min']) / (stats.loc[col, 'max'] - stats.loc[col, 'min'])

In [56]:
X.head()

Unnamed: 0,Weeks,Age,Sex_Male,SmokingStatus_Ex-smoker,SmokingStatus_Never smoked
0,0.007246,0.789474,1,1,0
1,0.072464,0.789474,1,1,0
2,0.086957,0.789474,1,1,0
3,0.101449,0.789474,1,1,0
4,0.115942,0.789474,1,1,0


In [57]:
X = train.copy()
y = train['FVC'].copy()

X['base_week'] = X.groupby('Patient')['Weeks'].transform('min')
X['base_FVC'] = X.groupby('Patient')['FVC'].transform('first')

# save the stats for future use
stats = X.describe().T

# one hot encoding for categorical features
X = pd.get_dummies(data=X, columns=['Sex','SmokingStatus'], drop_first=True)

# Scaling numeric columns
num_cols = ['Age','Weeks','base_week','base_FVC']

# Min-max scaling
for col in num_cols:
    X[col] = (X[col]-stats.loc[col,'min']) / (stats.loc[col,'max'] - stats.loc[col,'min'])
    
# printing the correlation of all features with FVC
print(X.corr()['FVC'].abs().sort_values(ascending=False)[1:])

base_FVC                      0.962155
Percent                       0.663128
Sex_Male                      0.483409
SmokingStatus_Never smoked    0.183468
Age                           0.149372
SmokingStatus_Ex-smoker       0.119985
Weeks                         0.023979
base_week                     0.006648
Name: FVC, dtype: float64


In [58]:
# removing unnecesary columns after transformations
#drop patient, as we are building a model based on our values 
X.drop(['Patient','FVC'], axis=1, inplace=True)
X.head()

Unnamed: 0,Weeks,Percent,Age,base_week,base_FVC,Sex_Male,SmokingStatus_Ex-smoker,SmokingStatus_Never smoked
0,0.007246,58.253649,0.789474,0.011905,0.241456,1,1,0
1,0.072464,55.712129,0.789474,0.011905,0.241456,1,1,0
2,0.086957,51.862104,0.789474,0.011905,0.241456,1,1,0
3,0.101449,53.950679,0.789474,0.011905,0.241456,1,1,0
4,0.115942,52.063412,0.789474,0.011905,0.241456,1,1,0


In [59]:
y.head()

0    2315
1    2214
2    2061
3    2144
4    2069
Name: FVC, dtype: int64

In [60]:
# Checking the score on transformed data nowla
cross_val_score(LinearRegression(),X,y,cv=3,scoring='neg_root_mean_squared_error')

array([-178.66715958, -191.27543824, -196.99707731])

In [61]:
# fit on the train dataset
lr = LinearRegression().fit(X, y)

**Making prediction on test data**

In [62]:
X_test = test.copy()
y_test = test['FVC'].copy()

X_test['base_week'] = X_test.groupby('Patient')['Weeks'].transform('min')
X_test['base_FVC'] = X_test.groupby('Patient')['FVC'].transform('first')

# save the stats for future use
stats = X_test.describe().T

# one hot encoding for categorical features
X_test = pd.get_dummies(data=X_test, columns=['Sex','SmokingStatus'], drop_first=True)

# Scaling numeric columns
num_cols = ['Age','Weeks','base_week','base_FVC']

# Min-max scaling
for col in num_cols:
    X_test[col] = (X_test[col]-stats.loc[col,'min']) / (stats.loc[col,'max'] - stats.loc[col,'min'])


In [63]:
X_test.drop(['Patient','FVC'], axis=1, inplace=True)
X_test.head()

Unnamed: 0,Weeks,Percent,Age,base_week,base_FVC,Sex_Male,SmokingStatus_Ex-smoker,SmokingStatus_Never smoked
0,0.009434,65.306122,0.540541,0.019231,0.044037,0,1,0
1,0.066038,58.163265,0.540541,0.019231,0.044037,0,1,0
2,0.084906,57.865646,0.540541,0.019231,0.044037,0,1,0
3,0.103774,62.287415,0.540541,0.019231,0.044037,0,1,0
4,0.122642,71.471088,0.540541,0.019231,0.044037,0,1,0


In [64]:
pred_test = test.copy()
pred_test['FVC'] = lr.predict(X_test)

In [65]:
pred_test['Patient_Week'] = pred_test.Patient + "_" + pred_test.Weeks.astype(str)

In [66]:
pred_test

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Patient_Week
0,ID00023637202179104603099,-3,1171.565110,65.306122,71,Female,Ex-smoker,ID00023637202179104603099_-3
1,ID00023637202179104603099,3,1071.578076,58.163265,71,Female,Ex-smoker,ID00023637202179104603099_3
2,ID00023637202179104603099,5,1062.181743,57.865646,71,Female,Ex-smoker,ID00023637202179104603099_5
3,ID00023637202179104603099,7,1107.000253,62.287415,71,Female,Ex-smoker,ID00023637202179104603099_7
4,ID00023637202179104603099,9,1206.522027,71.471088,71,Female,Ex-smoker,ID00023637202179104603099_9
...,...,...,...,...,...,...,...,...
307,ID00421637202311550012437,21,3273.552127,84.471603,68,Male,Ex-smoker,ID00421637202311550012437_21
308,ID00421637202311550012437,23,3278.930297,85.460101,68,Male,Ex-smoker,ID00421637202311550012437_23
309,ID00421637202311550012437,29,3213.855429,81.356338,68,Male,Ex-smoker,ID00421637202311550012437_29
310,ID00421637202311550012437,41,3218.251736,84.861011,68,Male,Ex-smoker,ID00421637202311550012437_41


In [67]:
test['Patient_Week'] = test.Patient + "_" + test.Weeks.astype(str)

In [68]:
test

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Patient_Week
0,ID00023637202179104603099,-3,1536,65.306122,71,Female,Ex-smoker,ID00023637202179104603099_-3
1,ID00023637202179104603099,3,1368,58.163265,71,Female,Ex-smoker,ID00023637202179104603099_3
2,ID00023637202179104603099,5,1361,57.865646,71,Female,Ex-smoker,ID00023637202179104603099_5
3,ID00023637202179104603099,7,1465,62.287415,71,Female,Ex-smoker,ID00023637202179104603099_7
4,ID00023637202179104603099,9,1681,71.471088,71,Female,Ex-smoker,ID00023637202179104603099_9
...,...,...,...,...,...,...,...,...
307,ID00421637202311550012437,21,2820,84.471603,68,Male,Ex-smoker,ID00421637202311550012437_21
308,ID00421637202311550012437,23,2853,85.460101,68,Male,Ex-smoker,ID00421637202311550012437_23
309,ID00421637202311550012437,29,2716,81.356338,68,Male,Ex-smoker,ID00421637202311550012437_29
310,ID00421637202311550012437,41,2833,84.861011,68,Male,Ex-smoker,ID00421637202311550012437_41


In [69]:
s1 = pd.merge(pred_test, test, how='inner', on=['Patient_Week'])

In [70]:
s1.head()

Unnamed: 0,Patient_x,Weeks_x,FVC_x,Percent_x,Age_x,Sex_x,SmokingStatus_x,Patient_Week,Patient_y,Weeks_y,FVC_y,Percent_y,Age_y,Sex_y,SmokingStatus_y
0,ID00023637202179104603099,-3,1171.56511,65.306122,71,Female,Ex-smoker,ID00023637202179104603099_-3,ID00023637202179104603099,-3,1536,65.306122,71,Female,Ex-smoker
1,ID00023637202179104603099,3,1071.578076,58.163265,71,Female,Ex-smoker,ID00023637202179104603099_3,ID00023637202179104603099,3,1368,58.163265,71,Female,Ex-smoker
2,ID00023637202179104603099,5,1062.181743,57.865646,71,Female,Ex-smoker,ID00023637202179104603099_5,ID00023637202179104603099,5,1361,57.865646,71,Female,Ex-smoker
3,ID00023637202179104603099,7,1107.000253,62.287415,71,Female,Ex-smoker,ID00023637202179104603099_7,ID00023637202179104603099,7,1465,62.287415,71,Female,Ex-smoker
4,ID00023637202179104603099,9,1206.522027,71.471088,71,Female,Ex-smoker,ID00023637202179104603099_9,ID00023637202179104603099,9,1681,71.471088,71,Female,Ex-smoker


In [71]:
s1 = s1.rename(columns={"FVC_y": "TrueFVC", "FVC_x": "PredictedFVC"})

In [72]:
from sklearn.metrics import mean_squared_error
from math import sqrt

y_test = s1.TrueFVC
y_pred = s1.PredictedFVC

rms = sqrt(mean_squared_error(y_pred, y_test))
print(rms)

513.0357584435505
