In [1]:
# Here we import our libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# load data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!unzip "drive/MyDrive/train&test.zip"

Archive:  drive/MyDrive/train&test.zip
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: test.csv                
replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: train.csv               


In [4]:
# Read data sets into pandas data frames for training and testing.
d_train = pd.read_csv("train.csv")
d_test = pd.read_csv("test.csv")

# Dealing with train dataset

In [5]:
d_train

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.000000,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.355850,0,12.234987
...,...,...,...,...,...,...,...,...
6035995,6035996,125749,50,10,2.504603,1.489714,1,3.869032
6035996,6035997,125749,50,10,2.537961,1.488497,1,3.869032
6035997,6035998,125749,50,10,2.571408,1.558978,1,3.798729
6035998,6035999,125749,50,10,2.604744,1.272663,1,4.079938


In [6]:
# Check data types
d_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6036000 entries, 0 to 6035999
Data columns (total 8 columns):
 #   Column     Dtype  
---  ------     -----  
 0   id         int64  
 1   breath_id  int64  
 2   R          int64  
 3   C          int64  
 4   time_step  float64
 5   u_in       float64
 6   u_out      int64  
 7   pressure   float64
dtypes: float64(3), int64(5)
memory usage: 368.4 MB


In [7]:
# Drop outliers
mean = d_train.mean()
std_dev = d_train.std()
maxx = mean + 3 * std_dev
minn = mean - 3 * std_dev
outliers = d_train[(d_train > maxx) | (d_train < minn)].stack()
d_train = d_train.drop(outliers.index.get_level_values(0))

In [8]:
# drop nulls
d_train = d_train.dropna()

In [9]:
# split the data from the labels and drop irrelevant columns
y = d_train['pressure']
X = d_train.drop(['pressure','breath_id'],axis=1)

In [10]:
# Normalize data using min-max scaling
for column in X:
    if column != X.columns[0]:  # exclude "id" column
        X[column] = (X[column].iloc[1:] - X[column].iloc[1:].min()) / (X[column].iloc[1:].max() - X[column].iloc[1:].min())
 
display(X)

Unnamed: 0,id,R,C,time_step,u_in,u_out
0,1,,,,,
1,2,0.333333,1.0,0.011457,0.385990,0.0
2,3,0.333333,1.0,0.022986,0.472629,0.0
3,4,0.333333,1.0,0.034571,0.478918,0.0
4,5,0.333333,1.0,0.046219,0.532398,0.0
...,...,...,...,...,...,...
6035995,6035996,1.000000,0.0,0.852707,0.031280,1.0
6035996,6035997,1.000000,0.0,0.864064,0.031254,1.0
6035997,6035998,1.000000,0.0,0.875451,0.032734,1.0
6035998,6035999,1.000000,0.0,0.886801,0.026722,1.0


In [11]:
X = X.fillna(X.mean())

In [12]:
# Split the data into training and validation
x_train,x_val,y_train,y_val= train_test_split(X,y,test_size=0.2,random_state= 42)

# KNeighbour Model

### ( I tried other models but this gave best accuracy)

In [13]:
from sklearn.neighbors import KNeighborsRegressor

In [14]:
model1 = KNeighborsRegressor() 

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [16]:
import time 

start = time.time()
model1.fit(X_train, y_train)
end = time.time() 

print('Time to train {}'.format(end - start))


Time to train 13.979872465133667


# dealing with test dataset

In [17]:
d_test

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out
0,1,0,5,20,0.000000,0.000000,0
1,2,0,5,20,0.031904,7.515046,0
2,3,0,5,20,0.063827,14.651675,0
3,4,0,5,20,0.095751,21.230610,0
4,5,0,5,20,0.127644,26.320956,0
...,...,...,...,...,...,...,...
4023995,4023996,125748,20,10,2.530117,4.971245,1
4023996,4023997,125748,20,10,2.563853,4.975709,1
4023997,4023998,125748,20,10,2.597475,4.979468,1
4023998,4023999,125748,20,10,2.631134,4.982648,1


In [18]:
d_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024000 entries, 0 to 4023999
Data columns (total 7 columns):
 #   Column     Dtype  
---  ------     -----  
 0   id         int64  
 1   breath_id  int64  
 2   R          int64  
 3   C          int64  
 4   time_step  float64
 5   u_in       float64
 6   u_out      int64  
dtypes: float64(2), int64(5)
memory usage: 214.9 MB


In [19]:
# Drop the same column from test too
d_test = d_test.drop( ['breath_id'],axis=1)

In [20]:
# Remove nulls from test set
#null_counts2 = d_test.isnull().sum()
d_test = d_test.dropna()

In [21]:
# Perform the same data processing on test set to optimize our results
mean = d_test.mean()
std_dev = d_test.std()
maxx = mean + 3 * std_dev
minn = mean - 3 * std_dev
outliers = d_test[(d_test > maxx) | (d_test < minn)].stack()
d_test = d_test.drop(outliers.index.get_level_values(0))

In [22]:
# Normalize data using min-max scaling
for column in d_test:
    if column != d_test.columns[0]:  # exclude the first column which is the id
        d_test[column] = (d_test[column].iloc[1:] - d_test[column].iloc[1:].min()) / (d_test[column].iloc[1:].max() - d_test[column].iloc[1:].min())

# view normalized data   
display(d_test)

Unnamed: 0,id,R,C,time_step,u_in,u_out
0,1,,,,,
1,2,0.000000,0.25,0.010869,0.157000,0.0
2,3,0.000000,0.25,0.021745,0.306094,0.0
3,4,0.000000,0.25,0.032622,0.443537,0.0
4,5,0.000000,0.25,0.043487,0.549882,0.0
...,...,...,...,...,...,...
4023995,4023996,0.333333,0.00,0.861990,0.103856,1.0
4023996,4023997,0.333333,0.00,0.873484,0.103950,1.0
4023997,4023998,0.333333,0.00,0.884939,0.104028,1.0
4023998,4023999,0.333333,0.00,0.896406,0.104094,1.0


In [23]:
d_test = d_test.fillna(d_test.mean())

In [24]:
d_test

Unnamed: 0,id,R,C,time_step,u_in,u_out
0,1,0.499156,0.394196,0.455012,0.115107,0.637977
1,2,0.000000,0.250000,0.010869,0.157000,0.000000
2,3,0.000000,0.250000,0.021745,0.306094,0.000000
3,4,0.000000,0.250000,0.032622,0.443537,0.000000
4,5,0.000000,0.250000,0.043487,0.549882,0.000000
...,...,...,...,...,...,...
4023995,4023996,0.333333,0.000000,0.861990,0.103856,1.000000
4023996,4023997,0.333333,0.000000,0.873484,0.103950,1.000000
4023997,4023998,0.333333,0.000000,0.884939,0.104028,1.000000
4023998,4023999,0.333333,0.000000,0.896406,0.104094,1.000000


# model prediction and accuracy

In [25]:
# Predict the pressures of the test dataset
y_pred = model1.predict(d_test)

In [26]:
# Create a new df similar to sample submission(only id and pressure) and save it in drive

final_result = pd.DataFrame({"id": d_test.id, "pressure": y_pred})

final_result.to_csv("Assign_submission.csv", index=False) 
final_result

Unnamed: 0,id,pressure
0,1,10.716461
1,2,10.716461
2,3,10.716461
3,4,10.716461
4,5,12.713042
...,...,...
4023995,4023996,6.343667
4023996,4023997,6.329607
4023997,4023998,6.315546
4023998,4023999,6.357728


In [27]:
# Model Score
score = model1.score(X_val, y_val)
print(score)

0.9295892466888871


In [28]:
print('Model score:', score*100 ,"%")

Model score: 92.95892466888871 %


# Name: Ahmed Mohamed Ayman Mahmoud
# Faculty of computer and data science
# university ID: 20201374584