# import packages

In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split, cross_validate
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from sklearn.metrics import root_mean_squared_log_error
from sklearn.linear_model import LinearRegression

In [None]:
# The competition archive lives in the "data" directory that sits beside the
# current working directory (i.e. under the cwd's parent).
data_dir = os.path.join(os.path.dirname(os.getcwd()), "data")

# os.listdir() returns entries in arbitrary order and may include non-archive
# files, so keep only real zip files and pick the first one in sorted order.
zip_names = sorted(
    name for name in os.listdir(data_dir)
    if zipfile.is_zipfile(os.path.join(data_dir, name))
)
if not zip_names:
    raise FileNotFoundError(f"No zip archive found in {data_dir}")
data_path = os.path.join(data_dir, zip_names[0])

# Read the three CSV members straight from the archive without extracting it.
with zipfile.ZipFile(data_path) as z:
    with z.open("sample_submission.csv") as f:
        sample_submission = pd.read_csv(f)
    with z.open("test.csv") as f:
        test = pd.read_csv(f)
    with z.open("train.csv") as f:
        train = pd.read_csv(f)

In [None]:
# Show the (rows, columns) of the training and test frames side by side.
train.shape,test.shape

In [None]:
# Print a per-column summary of the training frame (dtypes, non-null counts).
train.info()

In [None]:
# Peek at the first two training rows.
train.head(2)

In [None]:
def get_dtypes(df: pd.DataFrame) -> tuple[list, list]:
    """Split the columns of *df* into object-typed and all other columns.

    Args:
        df: Frame whose columns should be partitioned.

    Returns:
        A pair ``(object_features, numerical_features)``. The first list holds
        the columns whose dtype name is ``'object'``; the second holds every
        remaining column (whatever its dtype). Both keep the column order of
        ``df.columns``.
    """
    dtype_by_column = df.dtypes.to_dict()
    object_features, numerical_features = [], []
    # Single pass: each column lands in exactly one of the two lists.
    for column in df.columns:
        if dtype_by_column[column].name == 'object':
            object_features.append(column)
        else:
            numerical_features.append(column)
    return object_features, numerical_features
   

In [None]:
# Partition the training columns into object-typed columns and all the others.
object_features, numerical_features = get_dtypes(train)

In [None]:
# Summary statistics for the training frame.
train.describe()

In [None]:
# Histogram of each plottable column, 50 bins each, on a 12x12 figure.
train.hist(bins=50, figsize=(12, 12))

In [None]:
# Bucket heart rate into five ordinal bands (<=80, 80-90, 90-100, 100-110,
# >110) so the later train/test splits can stratify on the band.
labels = list(range(5))
bins = [-np.inf, 80, 90, 100, 110, np.inf]
train["hr_category"] = pd.cut(train["Heart_Rate"], bins=bins, labels=labels)


In [None]:
# Hold out 20% of the data as the final test set, stratified on the
# heart-rate band. The remaining 80% is bound to ``train_validation_set``,
# the name the following split and shape check read. ``train`` itself is left
# untouched, so re-running this cell does not keep shrinking the source frame.
train_validation_set, test_set = train_test_split(
    train, test_size=0.2, stratify=train[["hr_category"]], random_state=42)

In [None]:
# Carve a 20% validation set out of the train+validation pool, again
# stratified on the heart-rate band and with a fixed seed.
train_set, validation_set = train_test_split(
    train_validation_set,
    test_size=0.20,
    stratify=train_validation_set[["hr_category"]],
    random_state=42,
)

In [None]:
# Sanity-check the sizes of every split.
train_set.shape, validation_set.shape, test_set.shape, train_validation_set.shape

In [None]:
# Pairwise relationship plots across the columns of the training split.
sns.pairplot(train_set)

In [None]:
# List the columns available in the training split.
train_set.columns

In [None]:
# Name of the column used as the regression target in the cells below.
target = "Calories"

In [None]:
# Heart rate vs. target; the very low alpha presumably lets point density show
# through heavy overplotting.
train_set.plot(kind="scatter", x="Heart_Rate", y=target, grid=True, alpha=0.005)

# new features

In [None]:
# Interaction feature: heart rate multiplied by duration, row by row.
train_set["heart_rate_duration"] = (
    train_set["Heart_Rate"] * train_set["Duration"]
)

In [None]:
# New feature vs. target, nearly transparent points.
train_set.plot(kind="scatter", x="heart_rate_duration", y=target, grid=True, alpha=0.005)

In [None]:
# Same scatter with default (opaque) markers.
train_set.plot(kind="scatter", x="heart_rate_duration", y=target, grid=True)

In [None]:
# Linear model fitted without an intercept term, i.e. forced through the origin.
lin_reg = LinearRegression(fit_intercept=False)

In [None]:
# 10-fold cross-validated scores of the single-feature linear model.
# The scorer is the *negated* RMSLE, so values are <= 0 and closer to 0 is better.
linear_rmsles = cross_val_score(
    lin_reg,
    train_set["heart_rate_duration"].values.reshape(-1, 1),  # 2-D feature matrix
    train_set[target].values,
    scoring="neg_root_mean_squared_log_error",
    cv=10,
)

In [None]:
# Average score over the folds (negative, because the scorer is negated RMSLE).
linear_rmsles.mean()

In [None]:
# Same 10-fold evaluation via cross_validate, additionally requesting the
# training-fold scores so train and test scores can be compared.
linear_results = cross_validate(
    lin_reg,
    train_set["heart_rate_duration"].values.reshape(-1, 1),  # 2-D feature matrix
    train_set[target].values,
    scoring="neg_root_mean_squared_log_error",
    cv=10,
    return_train_score=True,
)

In [None]:
# Display the cross_validate result.
linear_results