In [1]:
import numpy as np
import pandas as pd

In [2]:
nutrients = pd.read_csv("nutrients_csvfile.csv")
nutrients.head()

Unnamed: 0,Food,Measure,Grams,Calories,Protein,Fat,Sat.Fat,Fiber,Carbs,Category
0,Cows' milk,1 qt.,976,660,32,40,36,0.0,48,Dairy products
1,Milk skim,1 qt.,984,360,36,t,t,0.0,52,Dairy products
2,Buttermilk,1 cup,246,127,9,5,4,0.0,13,Dairy products
3,"Evaporated, undiluted",1 cup,252,345,16,20,18,0.0,24,Dairy products
4,Fortified milk,6 cups,1419,1373,89,42,23,1.4,119,Dairy products


In [3]:
nutrients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335 entries, 0 to 334
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Food      335 non-null    object
 1   Measure   335 non-null    object
 2   Grams     335 non-null    object
 3   Calories  334 non-null    object
 4   Protein   335 non-null    object
 5   Fat       335 non-null    object
 6   Sat.Fat   333 non-null    object
 7   Fiber     335 non-null    object
 8   Carbs     335 non-null    object
 9   Category  335 non-null    object
dtypes: object(10)
memory usage: 26.3+ KB


Todas las variables tienen tipo de datos `object`, se debe hacer preprocesamiento de datos para conseguir tipos de datos numéricos con excepción de las variables categóricas Food, Measure y Category.

In [4]:
# nutrients['Food'].value_counts() # muy pocos tipos de alimentos se repiten, no se considera como variable para crear el modelo
# nutrients['Measure'].value_counts() # muchas categorías, 
nutrients['Category'].value_counts().head() # se tomará en cuenta para muestreo estratificado

Breads, cereals, fastfood,grains    45
Meat, Poultry                       30
Desserts, sweets                    29
Vegetables A-E                      28
Vegetables R-Z                      28
Name: Category, dtype: int64

In [5]:
nutrients.drop_duplicates(keep='first', inplace=True) 

In [6]:
num_features = ["Grams", "Calories", "Protein",	"Fat", "Sat.Fat", "Fiber",	"Carbs"]
nutrients_num = pd.DataFrame(nutrients[num_features])
nutrients_num.replace("t", 0, inplace=True)
nutrients_num.replace("t'", 0, inplace=True)
nutrients_num.replace(",", ".",regex=True, inplace=True)
nutrients_num.replace(",", "", regex=True, inplace=True)
nutrients_num.replace("a", "", regex=True, inplace=True)
nutrients_num['Calories'][91]=(8+44)/2 # necesario para luego poder pasar a valores numéricos


In [7]:
nutrients_num[:10]

Unnamed: 0,Grams,Calories,Protein,Fat,Sat.Fat,Fiber,Carbs
0,976.0,660.0,32,40,36,0.0,48
1,984.0,360.0,36,0,0,0.0,52
2,246.0,127.0,9,5,4,0.0,13
3,252.0,345.0,16,20,18,0.0,24
4,1.419,1.373,89,42,23,1.4,119
5,103.0,515.0,27,28,24,0.0,39
6,85.0,290.0,30,0,0,0.0,42
7,85.0,290.0,30,0,0,1.0,42
8,244.0,165.0,8,10,8,0.0,11
9,540.0,690.0,24,24,22,0.0,70


In [8]:
for name in num_features:
    nutrients_num[name] = pd.to_numeric(nutrients_num[name])

nutrients_num.dtypes

Grams       float64
Calories    float64
Protein       int64
Fat           int64
Sat.Fat     float64
Fiber       float64
Carbs       float64
dtype: object

In [9]:
nutrients_num.isnull().any()  # se consulta si hay valores nulos

Grams       False
Calories     True
Protein     False
Fat         False
Sat.Fat      True
Fiber        True
Carbs       False
dtype: bool

In [10]:
nutrients_num.dropna(inplace=True)
nutrients_num.shape

(331, 7)

In [11]:
nutrients_num.isnull().any()

Grams       False
Calories    False
Protein     False
Fat         False
Sat.Fat     False
Fiber       False
Carbs       False
dtype: bool

In [12]:
nutrients_num_norm = nutrients_num.apply(lambda x: (x/nutrients_num["Grams"]), axis = 0)
nutrients_num_norm.head()

Unnamed: 0,Grams,Calories,Protein,Fat,Sat.Fat,Fiber,Carbs
0,1.0,0.67623,0.032787,0.040984,0.036885,0.0,0.04918
1,1.0,0.365854,0.036585,0.0,0.0,0.0,0.052846
2,1.0,0.51626,0.036585,0.020325,0.01626,0.0,0.052846
3,1.0,1.369048,0.063492,0.079365,0.071429,0.0,0.095238
4,1.0,0.967583,62.720226,29.598309,16.208598,0.98661,83.861875


In [13]:
# Ahora podemos empezar a separar variable objetivo de regresores y a separar datos
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(nutrients_num_norm, test_size=0.2, random_state=42)
print(train_set.shape)
print(test_set.shape)

(264, 7)
(67, 7)


In [14]:
calories_train = train_set['Calories']
calories_test = test_set['Calories']
nutrients_train = train_set[["Grams", "Protein",	"Fat", "Sat.Fat", "Fiber",	"Carbs"]]
nutrients_test = test_set[["Grams", "Protein",	"Fat", "Sat.Fat", "Fiber",	"Carbs"]]

print(nutrients_train.shape)
print(nutrients_test.shape)

(264, 6)
(67, 6)


In [15]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(nutrients_train, calories_train)


LinearRegression()

In [27]:
calories_pred = lin_reg.predict(nutrients_test)
print("calorias predichas", calories_pred[:5])
print("calorias reales", calories_test[:5])

calorias predichas [2.03523581 4.84360996 0.35986756 1.84438248 1.03031445]
calorias reales 25     3.750000
311    6.071429
74     1.058824
223    0.002698
58     1.400000
Name: Calories, dtype: float64


In [17]:
from sklearn.metrics import mean_squared_error

mse_lin_reg = mean_squared_error(calories_test, calories_pred)
mse_lin_reg

1.3310062435230068

In [18]:
rmse_lin_reg = np.sqrt(mse_lin_reg)
rmse_lin_reg

1.1536924388774534

In [19]:
calories_test.mean()
calories_test.std()

1.8949772939907892

In [20]:
from sklearn.linear_model import Ridge

In [21]:
ridge_reg = Ridge(alpha=10)

In [22]:
ridge_reg.fit(nutrients_train, calories_train)

Ridge(alpha=10)

In [23]:
calories_pred_ridge = ridge_reg.predict(nutrients_test)

In [24]:
mse_ridge_reg = mean_squared_error(calories_test, calories_pred_ridge)
rmse_ridge_reg = np.sqrt(mse_ridge_reg)
rmse_ridge_reg

1.3999573604678468