In [11]:
# import statements
import pandas as pd
from datetime import time
import numpy as np

In [12]:
# Load the patient table and the three smartwatch exports, one DataFrame per csv file.
patient_info, smartwatch1, smartwatch2, smartwatch3 = map(
    pd.read_csv,
    (
        'AsthmaFiles/patient_info.csv',
        'AsthmaFiles/smartwatch1.csv',
        'AsthmaFiles/smartwatch2.csv',
        'AsthmaFiles/smartwatch3.csv',
    ),
)

In [13]:
# Holds the user keys of patients with enough data to be worth processing.
list_of_user_keys = []
# Clean the data by dropping patients with insignificant data: a patient is kept
# when the span between pef_start_date and pef_end_date is at least 50 and a
# miband_end_date is present.
for _, patient in patient_info[["user_key"]+list(patient_info)[-10:-2]].iterrows():
    has_long_pef_record = patient["pef_end_date"] - patient["pef_start_date"] >= 50
    # read_csv stores missing cells as float NaN, which never equals the string
    # "NaN"; the previous `!= "NaN"` comparison was therefore always True and
    # filtered nothing. pd.notna performs the intended missing-value check.
    has_miband_data = pd.notna(patient["miband_end_date"])
    if has_long_pef_record and has_miband_data:
        # user fits the significant-data criteria, so remember their key
        list_of_user_keys.append(patient["user_key"])
list_of_user_keys

[190.0, 294.0, 343.0, 447.0, 473.0, 514.0, 625.0, 702.0, 808.0, 939.0]

In [14]:
#Gets peakflow data related to user key
def get_peakflow_data(user_key):
    """Read AsthmaFiles/peakflow.csv and return one user's peak-flow readings.

    Only the rows whose "user_key" equals ``user_key`` are kept, reduced to the
    "date", "hour" and "pef_max" columns.
    """
    all_readings = pd.read_csv("AsthmaFiles/peakflow.csv")
    belongs_to_user = all_readings["user_key"]==user_key
    return all_readings[belongs_to_user][["date","hour","pef_max"]]


In [15]:
#Gets environmental data for a user key, restricted to the dates on which their peak flow was recorded
def pair_weather(id,dates):
    """Read AsthmaFiles/environment.csv and return one user's rows for given dates.

    A row is kept when its "user_key" equals ``id`` and its "date" is one of
    the values in ``dates``. All columns of the file are returned.
    """
    environment = pd.read_csv("AsthmaFiles/environment.csv")
    user_rows = environment[environment['user_key'] == id]
    on_requested_dates = user_rows['date'].isin(dates)
    return user_rows[on_requested_dates]

In [16]:
#try_catch looks up a weather value for the row's date; if the lookup raises an IndexError (e.g. no weather row matches), the default value is returned instead of crashing the program
def try_catch(row,default,weather,x):
    """Return column ``x`` of the first ``weather`` row sharing ``row``'s date.

    ``default`` is returned when the lookup raises IndexError, which is what
    happens when no weather row has the same "date" as ``row``.
    """
    try:
        same_date = weather['date'] == row["date"]
        first_match = weather.loc[same_date].iloc[0]
        value = first_match[x]
    except IndexError:
        value = default
    return value

In [17]:
#separates data for each user key among the 3 smartwatch files (sorting through 2.5 million datapoints is why it takes a while)
def seperate_for_key(user_key,smartwatch1,smartwatch2,smartwatch3):
    """Collect one user's heart-rate rows from the three smartwatch frames.

    Each frame is filtered to rows whose "user_key" equals ``user_key`` and
    reduced to the "date", "time" and "hr" columns; the three results are
    concatenated in the order smartwatch1, smartwatch2, smartwatch3.
    """
    wanted_columns = ["date","time","hr"]
    per_file = [
        readings.loc[readings["user_key"]==user_key][wanted_columns]
        for readings in (smartwatch1, smartwatch2, smartwatch3)
    ]
    return pd.concat(per_file)

#Parses through data for specific date given
def seperate_for_date(date,seperate_for_key):
    """Return the "time" and "hr" columns of the rows whose "date" equals ``date``."""
    on_requested_date = seperate_for_key["date"]==date
    day_readings = seperate_for_key.loc[on_requested_date]
    return day_readings[["time","hr"]]

#parse through data for specific time given
def seperate_for_time(time,seperate_for_date):
    """Return the highest heart rate recorded during a given hour.

    Parameters
    ----------
    time : int, float or str
        Hour to look up. A one-character value is left-padded with "0" so
        that 7 matches time strings beginning with "07".
    seperate_for_date : pd.DataFrame
        Readings with a string "time" column and an "hr" column.

    Returns
    -------
    The maximum of "hr" over the rows whose "time" starts with the hour
    prefix; NaN when no row matches.
    """
    # DataFrame.apply(axis=1) can upcast an integer hour to a float (7 -> 7.0).
    # str(7.0) is "7.0", which is never padded and never matches "07:...", so
    # whole-number floats are converted back to int before building the prefix.
    if isinstance(time, (float, np.floating)) and float(time).is_integer():
        time = int(time)
    hour_prefix = str(time)
    if len(hour_prefix) == 1:
        hour_prefix = "0" + hour_prefix
    in_requested_hour = seperate_for_date["time"].str.startswith(hour_prefix)
    return seperate_for_date.loc[in_requested_hour]["hr"].max()

#lambda method to be used in apply method that will set the value of rows in the peakflow dataframe based on date and time ranges
def lambda_method(row,key,sm1,sm2,sm3):
    """Look up the heart-rate value for one peak-flow row.

    Narrows the three smartwatch frames to user ``key``, then to the row's
    "date", and returns what seperate_for_time yields for the row's "hour".
    Intended as the per-row callable of DataFrame.apply(axis=1).
    """
    readings_for_user = seperate_for_key(key,sm1,sm2,sm3)
    readings_for_day = seperate_for_date(row["date"],readings_for_user)
    hour_of_reading = row["hour"]
    return seperate_for_time(hour_of_reading,readings_for_day)



In [18]:
# Value stored when no weather row exists for a peak-flow date.
default = "NA"
# Build one csv file per viable user: peak flow + weather + heart rate.
for user_key in list_of_user_keys:
    # Peak-flow readings for this user; copied so the column assignments below
    # operate on an independent frame rather than on a filtered slice.
    peak_flow_data = get_peakflow_data(user_key).copy()
    # Weather rows for this user on the dates a peak flow was recorded.
    weather = pair_weather(user_key, peak_flow_data["date"])
    # Add every weather column the peak-flow frame does not already have.
    for column in list(weather.columns):
        if column not in peak_flow_data.columns:
            peak_flow_data[column] = peak_flow_data.apply(
                lambda row: try_catch(row, default, weather, column), axis=1
            )
    # Filter the three smartwatch files ONCE per user. Doing this inside the
    # per-row callable (as lambda_method does) repeats the same scan of all
    # smartwatch rows for every single peak-flow reading.
    hr_for_user = seperate_for_key(user_key, smartwatch1, smartwatch2, smartwatch3)
    # Heart rate for each reading: narrow to the row's date, then to its hour.
    peak_flow_data["hr"] = peak_flow_data.apply(
        lambda row: seperate_for_time(
            row["hour"], seperate_for_date(row["date"], hr_for_user)
        ),
        axis=1,
    )
    # Save the combined data to a csv file named after the user key.
    peak_flow_data.to_csv(f"AsthmaFiles/{int(user_key)}.csv",index=False)

In [19]:
# cleans data by removing rows with NaN values
def clean_dataset(df):
    """Return a float64 copy of ``df`` without NaN or infinite rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is NOT modified; earlier versions dropped rows
        from the caller's frame in place as a side effect.

    Returns
    -------
    pd.DataFrame
        Rows of ``df`` that contain no NaN, inf or -inf, cast to float64.
        The original index labels of the surviving rows are kept.

    Raises
    ------
    AssertionError
        If ``df`` is not a pandas DataFrame.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() returns a new frame, leaving the caller's data untouched.
    without_nan = df.dropna()
    has_bad_value = without_nan.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return without_nan[~has_bad_value].astype(np.float64)

# reads all csv files and merges them into one to parse through
def read_all_and_merge():
    """Merge the per-user csv files into one DataFrame.

    For every key in ``list_of_user_keys`` the file AsthmaFiles/<key>.csv is
    read, the three pollen columns are dropped, rows with missing or infinite
    values are removed by ``clean_dataset``, "pef_max" is divided by the
    integer part of the user's "max_pef_expected" from patient_info.csv, and
    the "date", "user_key" and "no" columns are dropped.

    Returns
    -------
    pd.DataFrame
        The per-user frames concatenated in key order (each keeps its own row
        index, so index labels may repeat). An empty DataFrame is returned
        when there are no user keys.
    """
    # The patient table does not change between users, so read it once.
    user_info = pd.read_csv("AsthmaFiles/patient_info.csv")
    per_user_frames = []
    for i in list_of_user_keys:
        dataset = pd.read_csv(f"AsthmaFiles/{int(i)}.csv")
        dataset = dataset.drop(["weed_pollen","tree_pollen","grass_pollen"],axis=1)
        dataset = clean_dataset(dataset)
        ex_for_user = user_info.loc[user_info["user_key"]==i]["max_pef_expected"].iloc[0]
        # Express peak flow as a fraction of the expected maximum for the user.
        dataset["pef_max"] = dataset["pef_max"] / int(ex_for_user)
        dataset = dataset.drop(["date","user_key","no"],axis=1)
        per_user_frames.append(dataset)

    # pd.concat raises on an empty list; keep returning an empty frame instead.
    if not per_user_frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the growing frame per user.
    return pd.concat(per_user_frames)

# builds the merged dataset from all per-user csv files (it is written to merged.csv further down)
x = read_all_and_merge()

# Function to remove rows with outliers in a specific column
def remove_outliers(df, column_name, iqr_multiplier=3.7):
    """Drop the rows whose value in ``column_name`` lies outside the IQR fence.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter; it is not modified.
    column_name : str
        Column whose values are tested.
    iqr_multiplier : float, optional
        Width of the fence in multiples of the interquartile range. The
        default of 3.7 is the value this function always used.

    Returns
    -------
    pd.DataFrame
        Rows with Q1 - m*IQR <= value <= Q3 + m*IQR (bounds inclusive). Rows
        whose value is NaN fail the comparison and are dropped as well.
    """
    Q1 = df[column_name].quantile(0.25)
    Q3 = df[column_name].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - iqr_multiplier * IQR
    upper_bound = Q3 + iqr_multiplier * IQR

    # between() is inclusive on both ends, matching the original >= / <= test.
    return df[df[column_name].between(lower_bound, upper_bound)]

# Outlier filtering is applied to 'humidity' first and then to 'so2' only.
# The remaining feature columns (temperature, temperature_min, temperature_max,
# pressure, wind_speed, wind_deg, aqi, co, no2, o3, pm2_5, pm10, nh3, hr) are
# deliberately left unfiltered; pass them through remove_outliers to experiment.
x = remove_outliers(remove_outliers(x, 'humidity'), 'so2')

# Write the merged table to disk while it still contains the target column.
x.to_csv("AsthmaFiles/merged.csv",index=False)

# y is the target column; x keeps every other column as the features.
y = x["pef_max"]
x = x.drop(columns=["pef_max"])
x


Unnamed: 0,hour,temperature,temperature_min,temperature_max,pressure,humidity,wind_speed,wind_deg,aqi,co,no2,o3,so2,pm2_5,pm10,nh3,hr
0,7.0,20.95,19.05,22.10,1022.0,83.0,2.15,201.0,5.0,270.37,9.08,43.63,0.91,53.96,57.75,3.90,94.0
1,7.0,16.64,15.44,17.99,1016.0,83.0,4.83,124.0,4.0,300.41,7.11,75.82,2.06,38.70,45.04,1.82,100.0
2,7.0,17.73,16.63,19.10,1007.0,82.0,3.30,160.0,4.0,263.69,8.40,55.08,1.31,25.41,29.47,3.07,89.0
3,7.0,17.65,17.02,18.39,1008.0,91.0,4.59,196.0,2.0,250.34,7.54,40.77,1.67,16.41,18.56,1.95,82.0
4,10.0,19.84,18.85,20.44,1014.0,90.0,5.24,268.0,1.0,243.66,4.67,37.55,1.13,3.74,4.80,1.79,93.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,6.0,10.83,9.73,12.62,1022.0,93.0,2.06,220.0,1.0,188.59,4.58,49.35,1.43,6.58,11.06,0.98,91.0
184,5.0,9.58,7.80,12.06,1016.0,92.0,0.85,168.0,1.0,168.56,14.22,2.41,2.15,6.61,9.24,2.66,75.0
185,5.0,10.52,8.87,13.70,1005.0,86.0,0.89,248.0,1.0,173.57,12.00,11.44,1.49,3.62,4.90,1.77,77.0
186,5.0,9.49,8.35,10.37,1004.0,90.0,2.06,260.0,1.0,196.93,2.55,50.78,1.00,1.29,1.54,0.97,100.0


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Imported once here instead of on every pass through the loop below.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Number of random train/test splits to average over. The loop bound and every
# average below use this one constant so they cannot drift apart.
N_SPLITS = 1000
# A split counts as "good" when its test R^2 exceeds this value.
R2_THRESHOLD = .4

count = 0
r2sum = 0
maesum = 0
msesum = 0
rmsesum = 0
for i in range(N_SPLITS):
    # A different seed per iteration gives a different 75/25 split each time.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=i)
    model = LinearRegression()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    if r2 > R2_THRESHOLD:
        count += 1
    r2sum += r2
    maesum += mae
    msesum += mse
    rmsesum += rmse

# Number of splits above the R^2 threshold, then the mean R^2, MAE, MSE and RMSE.
# After the loop, model / x_train / x_test / y_train / y_test / y_pred hold the
# values from the final split (random_state = N_SPLITS - 1).
print(count)
print(r2sum/N_SPLITS)
print(maesum/N_SPLITS)
print(msesum/N_SPLITS)
print(rmsesum/N_SPLITS)



46
0.33791795365315047
0.11888384487269904
0.020308647607914093
0.14243268105105944


In [21]:
# Display the test-split features left over from the last iteration of the evaluation loop
x_test

Unnamed: 0,hour,temperature,temperature_min,temperature_max,pressure,humidity,wind_speed,wind_deg,aqi,co,no2,o3,so2,pm2_5,pm10,nh3,hr
74,22.0,2.54,-0.77,4.84,1040.0,98.0,0.51,0.0,2.0,283.72,20.39,21.99,2.27,11.64,16.40,0.39,83.0
90,16.0,15.46,14.34,17.27,1017.0,79.0,4.12,190.0,1.0,236.99,1.41,50.07,0.60,2.84,5.63,1.08,99.0
43,11.0,26.15,23.97,29.37,1023.0,60.0,2.56,124.0,3.0,210.29,1.99,123.02,1.73,11.64,12.78,2.06,100.0
58,6.0,10.25,8.29,12.19,1023.0,93.0,7.91,319.0,1.0,283.72,4.88,51.50,2.09,1.78,4.25,0.59,88.0
118,7.0,14.75,12.93,15.77,1012.0,93.0,3.09,310.0,1.0,290.39,3.90,72.96,1.37,1.67,3.02,1.47,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,14.0,8.05,7.38,9.42,990.0,92.0,3.09,200.0,1.0,243.66,2.91,55.79,0.31,0.51,0.58,0.67,98.0
47,19.0,10.08,8.99,11.32,1010.0,88.0,14.08,304.0,1.0,303.75,2.61,57.22,1.27,0.97,2.94,0.17,85.0
47,7.0,19.16,17.27,20.88,1025.0,49.0,0.45,107.0,2.0,230.31,5.83,94.41,3.28,1.89,4.67,1.01,89.0
66,22.0,15.43,14.26,17.72,1012.0,87.0,1.70,174.0,2.0,220.30,10.28,20.21,3.79,11.28,12.13,2.50,88.0


In [22]:
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# # Assuming you have y_pred and y_test as NumPy arrays or Pandas Series
# mae = mean_absolute_error(y_test, y_pred)
# mse = mean_squared_error(y_test, y_pred)
# rmse = np.sqrt(mse)
# r2 = r2_score(y_test, y_pred)

# print(f'Mean Absolute Error: {mae}')
# print(f'Mean Squared Error: {mse}')
# print(f'Root Mean Squared Error: {rmse}')
# print(f'R-squared (R^2) Score: {r2}')



In [23]:
import joblib

# R^2 score of the model on its own training split (x_train vs y_train).
# It is printed explicitly: a bare expression is only displayed when it is the
# last statement of a cell, so the score was previously computed and discarded.
train_r2 = model.score(x_train,y_train)
print(f"Training R^2: {train_r2}")

# `model` is whatever estimator was fitted last, i.e. the one from the final
# train/test split of the evaluation loop, not a model refit on all the data.
joblib.dump(model, 'asthma_model.joblib')

['asthma_model.joblib']