# Data Parse

In [25]:
import subprocess
import requests
import pandas as pd
import json

def fetch_nutrients_dataframe():
    command = ['python', 'https://github.com/calvinkochunisg/HSG-CS-Project/blob/dev/api/spoonacular.py']

    # Run the command and capture the output
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    # Check if the script executed successfully
    if result.returncode == 0:
        # The script executed successfully, process the stdout
        url_output = result.stdout.strip()
    else:
        # There was an error executing the script, process the stderr
        print(f"Error executing script: {result.stderr}")
        return None

    if url_output:
        url = url_output
        print(f"URL: {url}")
    else:
        print("No URL captured from the script output.")
        return None

    # Make a GET request to the URL
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()
    else:
        print(f"Failed to fetch data, status code: {response.status_code}")
        return None

    # Initialize a list to hold structured nutrient data
    structured_nutrients = []

    # Iterate over each day in the data
    for day, content in data['week'].items():
        # Extract nutrients data and include the day of the week
        nutrients_data = {
            'Day': day.capitalize(),
            'Calories': content['nutrients']['calories'],
            'Protein': content['nutrients']['protein'],
            'Fat': content['nutrients']['fat'],
            'Carbohydrates': content['nutrients']['carbohydrates']
        }
        # Append the nutrients data to the list
        structured_nutrients.append(nutrients_data)

    # Convert the list of nutrients data into a DataFrame
    df_nutrients = pd.DataFrame(structured_nutrients)
    return df_nutrients

# Call the function and get the nutrients DataFrame
df_nutrients = fetch_nutrients_dataframe()

if df_nutrients is not None:
    print(df_nutrients)

Error executing script: python: can't open file '//https://github.com/calvinkochunisg/HSG-CS-Project/blob/dev/api/spoonacular.py': [Errno 2] No such file or directory



In [26]:
import subprocess

# Define the command to run your script using the absolute path
command = ['python', 'https://github.com/calvinkochunisg/HSG-CS-Project/blob/dev/api/spoonacular.py']

# Run the command and capture the output
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

# Check if the script executed successfully
if result.returncode == 0:
    # The script executed successfully, process the stdout
    url_output = result.stdout.strip()
    if url_output:
        print(f"URL: {url_output}")  # Assuming the script outputs only the URL
    else:
        print("No URL captured from the script output.")
else:
    # There was an error executing the script, process the stderr
    print(f"Error executing script: {result.stderr}")


Error executing script: python: can't open file '//https://github.com/calvinkochunisg/HSG-CS-Project/blob/dev/api/spoonacular.py': [Errno 2] No such file or directory



In [28]:
import requests

# URL to the raw version of the script
url = 'https://github.com/calvinkochunisg/HSG-CS-Project/blob/dev/api/spoonacular.py'

# Fetch the script
response = requests.get(url)
if response.status_code == 200:
    with open('spoonacular.py', 'w') as file:
        file.write(response.text)
else:
    print("Failed to download the file")

# Now you have "spoonacular.py" locally, and you can execute it as needed


OSError: [Errno 30] Read-only file system: 'spoonacular.py'

# Modelling

### Preparation

In [15]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer

import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# URL to the raw CSV file on GitHub
url = "https://raw.githubusercontent.com/calvinkochunisg/HSG-CS-Project/dev/Data/diet_data%202.csv"
response = requests.get(url, verify=False)
if response.status_code == 200:
    data = StringIO(response.text)
    data = pd.read_csv(data)
    print(df.head())
else:
    print('Failed to retrieve data:', response.status_code)

        Date  Stone  Pounds  Ounces  weight_oz  calories cals_per_oz  \
0  7/30/2018   12.0     2.0     6.0     2726.0    1950.0        0.72   
1  7/31/2018   12.0     0.0     8.0     2696.0    2600.0        0.96   
2   8/1/2018   12.0     1.0     0.0     2704.0    2500.0        0.92   
3   8/2/2018   12.0     1.0     0.0     2704.0    1850.0        0.68   
4   8/3/2018   11.0    12.0     8.0     2664.0    2900.0        1.09   

   five_donuts  walk  run  wine  prot  weight  change  
0          1.0   1.0  0.0   0.0   0.0     0.0   -30.0  
1          1.0   0.0  0.0   0.0   0.0     0.0     8.0  
2          1.0   1.0  0.0   0.0   0.0     0.0     0.0  
3          1.0   1.0  0.0   1.0   0.0     0.0   -40.0  
4          1.0   1.0  0.0   0.0   0.0     0.0    14.0  




### Cleaning

In [16]:
print(data.head())
data.info()

#convert to datetime
data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%Y')
data.set_index('Date', inplace=True)
print(data.head())

# converting ounces to kilogram which is a more popular measurement for countries outsides of U.S.
conversion_factor = 0.0283495

data["kg"] = data["weight_oz"]*conversion_factor

# dropping the features with existing information to avoid multicolinearity, 
# "cals_per_oz" and "prot" are dropped as not provided in the API
data_clean = data.drop(["Stone", "Pounds", "Ounces", "five_donuts", 
                        "weight_oz", "cals_per_oz", "prot", "change"], axis=1)
print(data_clean.head())
print(data_clean.info())
print(data_clean.tail(10))

#shifting the "kg" column up because this result is measured before exercising, 
# but it is only useful at the day end to see the result

data_clean['kg'] = data_clean['kg'].shift(-1)

# Remove the last 8 rows of the DataFrame because they are pure NaN
data_clean = data_clean[:-8]
print(data_clean.head())
print(data_clean.tail())

        Date  Stone  Pounds  Ounces  weight_oz  calories cals_per_oz  \
0  7/30/2018   12.0     2.0     6.0     2726.0    1950.0        0.72   
1  7/31/2018   12.0     0.0     8.0     2696.0    2600.0        0.96   
2   8/1/2018   12.0     1.0     0.0     2704.0    2500.0        0.92   
3   8/2/2018   12.0     1.0     0.0     2704.0    1850.0        0.68   
4   8/3/2018   11.0    12.0     8.0     2664.0    2900.0        1.09   

   five_donuts  walk  run  wine  prot  weight  change  
0          1.0   1.0  0.0   0.0   0.0     0.0   -30.0  
1          1.0   0.0  0.0   0.0   0.0     0.0     8.0  
2          1.0   1.0  0.0   0.0   0.0     0.0     0.0  
3          1.0   1.0  0.0   1.0   0.0     0.0   -40.0  
4          1.0   1.0  0.0   0.0   0.0     0.0    14.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         150 non-null    object 


### KNN Imputer

In [None]:
# using two weeks for test data, since the dataset at hand is really small
# too out the last week as anomalie, as a huge dip is observed, maybe the recorder feels motivated than ever as Christmas approaches
data_train = data_clean[data_clean.index < "2018-12-06"]
data_test = data_clean[(data_clean.index >= "2018-12-06") & (data_clean.index <= "2018-12-12")]

# KNN imputer to give values to the NaN
imputer = KNNImputer(n_neighbors=5, weights='uniform')
imputed_train_1 = imputer.fit_transform(data_train)
imputed_train = pd.DataFrame(imputed_train_1, index=data_train.index, columns=data_train.columns)
print(imputed_train.shape)

imputed_test_1 = imputer.fit_transform(data_test)
imputed_test = pd.DataFrame(imputed_test_1, index=data_test.index, columns=data_test.columns)
print(imputed_test.shape)

missing_values = imputed_train.isnull().sum().sum()
print("Missing values in the DataFrame:", missing_values)

### XGBoost

In [None]:
# Splitting the data into features and target
X_train = imputed_train.drop("kg", axis=1)
y_train = imputed_train["kg"]
X_test = imputed_test.drop("kg", axis=1)
y_test = imputed_test["kg"]

# Initialize and train the XGBoost model
model_xgb = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators=100, seed=42)
model_xgb.fit(X_train, y_train)

# Predictions and Evaluation
y_pred = model_xgb.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")