In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): called with no style argument; the plotting theme is configured again
# further down via sns.set_theme(style='darkgrid', ...) — confirm this call is still needed.
sns.set_style()
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for file_path in (os.path.join(dirname, filename) for filename in filenames):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import warnings
# Notebook-wide settings: silence warnings, pick the plot theme, and lift the
# pandas display limits on columns and rows.
warnings.filterwarnings(action='ignore')
sns.set_theme(palette='colorblind', style='darkgrid')
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)
from sklearn.preprocessing import LabelEncoder
# Shared encoder instance, used later to turn text columns into integer codes.
le = LabelEncoder()

In [None]:
# Load the fast-food nutrition menu into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/fast-food/FastFoodNutritionMenu.csv')

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Column names, non-null counts and dtypes in one summary.
df.info()

In [None]:
# Per-column dtypes as loaded (before the numeric conversion done further down).
df.dtypes

In [None]:
# Summary statistics, transposed so that each column becomes a row.
df.describe().T

In [None]:
# (rows, columns) of the raw data.
df.shape

In [None]:
# Number of menu rows per company.
df['Company'].value_counts()

In [None]:
# Missing-value count for every column.
df.isna().sum()

In [None]:
# Same missing-value counts, drawn as a bar chart.
df.isna().sum().plot(kind = 'bar')

# REMOVE NULL

In [None]:
# Remove the Weight Watchers points column before dropping incomplete rows.
df = df.drop(columns=['Weight Watchers\nPnts'])

In [None]:
# Keep only rows that have a value in every remaining column.
df = df.dropna(axis=0, how='any')

# REMOVE DUPLICATE

In [None]:
# Flag rows that exactly repeat an earlier row (the first occurrence is not flagged).
is_repeat = df.duplicated(keep='first')
duplicate_rows = df.loc[is_repeat]
# How many repeated rows there are
num_duplicates = len(duplicate_rows)
print(f"Number of duplicate rows: {num_duplicates}")
# Show the repeated rows themselves
duplicate_rows

In [None]:
# Remove the repeated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [None]:
# Re-check missing values after the clean-up steps above.
df.isna().sum()

# visualization

In [None]:
# Nutrition columns that have to be numeric for plotting and modelling.
col_conv = [
    'Calories', 'Calories from\nFat', 'Total Fat\n(g)',
    'Saturated Fat\n(g)', 'Trans Fat\n(g)', 'Cholesterol\n(mg)',
    'Sodium \n(mg)', 'Carbs\n(g)', 'Fiber\n(g)', 'Sugars\n(g)',
    'Protein\n(g)',
]
# Convert them in one pass; entries that cannot be parsed as numbers become NaN.
df[col_conv] = df[col_conv].apply(pd.to_numeric, errors='coerce')

In [None]:
# Total fat per company as a bar plot.
sns.barplot(data=df, x='Company', y='Total Fat\n(g)')

In [None]:
# Pairwise correlations between the numeric nutrition columns.
# 'Company' and 'Item' still hold text at this point, and pandas >= 2.0 raises on
# non-numeric columns instead of silently skipping them, so the numeric columns are
# selected explicitly (this also works on older pandas versions).
Corr_Matrix = df.select_dtypes(include='number').corr()

# Set up the figure and plot the heatmap
plt.figure(figsize=(45, 45))
sns.heatmap(Corr_Matrix, annot=True, cmap='coolwarm', center=0)
plt.show()

In [None]:
# Box plot of the protein column over all rows.
sns.catplot(data=df, y="Protein\n(g)",  kind="box")  # the plain (default) form

In [None]:
# Distribution of protein content (histogram with a KDE overlay)
protein_col = 'Protein\n(g)'
plt.figure(figsize=(10, 6))
sns.histplot(df[protein_col], kde=True)
plt.title('Distribution of Protein\n(g)')
plt.xlabel(protein_col)
plt.ylabel('Count')
plt.show()

In [None]:
def plot_correlation_heatmaps_by_company(dataframe):
    """Draw one correlation heatmap per company.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'Company' column. For each distinct company, the
        correlations are computed over the numeric columns of that company's rows.

    Each figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Work on the frame that was passed in. The previous body read the global `df`
    # and silently ignored the argument.
    for company in dataframe['Company'].unique():
        company_df = dataframe[dataframe['Company'] == company]
        # Numeric columns only: text columns such as 'Company' make
        # DataFrame.corr() raise on pandas >= 2.0.
        corr_matrix = company_df.select_dtypes(include='number').corr()

        plt.figure(figsize=(8, 6))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, linewidths=.5)
        plt.title(f'Correlation Heatmap - {company}')
        plt.show()

# Draw the per-company heatmaps for the current state of df.
plot_correlation_heatmaps_by_company(df)

In [None]:
import plotly.express as px
# Distinct companies, used to fix the category order on the x-axis.
companies = df['Company'].unique()
# Nutrient columns (grams) compared side by side.
nutritional_columns = ['Total Fat\n(g)', 'Protein\n(g)', 'Carbs\n(g)']

# One box per nutrient for every company.
fig = px.box(
    df,
    x='Company',
    y=nutritional_columns,
    title='Nutritional Values by Company',
    labels={'value': 'Grams'},
    category_orders={'Company': companies},
)
fig.update_layout(
    xaxis={'categoryorder': 'array', 'categoryarray': companies},
    xaxis_title='Company',
    yaxis_title='Grams',
)

fig.show()

In [None]:
sns.set(style="whitegrid")
# One histogram panel of Calories per company, two panels per row.
g = sns.FacetGrid(df, col="Company", col_wrap=2, height=5)
(
    g.map(plt.hist, 'Calories', bins=20, edgecolor='black')
     .set_titles("Distribution of Calories - {col_name}")
     .set_axis_labels("Calories", "Frequency")
)
plt.tight_layout()
plt.show()

In [None]:
# Drop rows with missing values again; the earlier numeric conversion with
# errors='coerce' can leave NaN where a value could not be parsed.
df = df.dropna(axis=0, how='any')

# categorical

In [None]:
# Encode the two text columns as integer codes.
# 'Company' gets its own encoder so that its label mapping survives: previously the
# shared `le` was fitted on 'Company' and then re-fitted on 'Item', which discarded
# the company classes and made the company codes impossible to map back to names.
# `le` still ends up fitted on 'Item', exactly as before.
company_le = LabelEncoder()
df['Company'] = company_le.fit_transform(df['Company'])
df['Item'] = le.fit_transform(df['Item'])

In [None]:
# Preview the frame after 'Company' and 'Item' were replaced by integer codes.
df.head()

# Top 5 Most Positively Correlated

In [None]:
print('Top 5 Most Positively Correlated to the Target Variable')
# The target's correlation with itself is always 1.0, so it is left out of the
# ranking; otherwise it takes the first of the five slots.
Corr_Matrix['Total Fat\n(g)'].drop('Total Fat\n(g)').sort_values(ascending=False).head(5)

# Top 5 Most Negatively Correlated

In [None]:
print('Top 5 Most Negatively Correlated to the Target Variable')
# Ascending order puts the smallest (most negative) correlations first.
target_correlations = Corr_Matrix['Total Fat\n(g)']
target_correlations.sort_values().head(5)

# DROP LOW Correlated

In [None]:
# Columns whose absolute correlation with total fat is below 0.5.
target_corr = Corr_Matrix.loc['Total Fat\n(g)']
columns_to_drop = target_corr[target_corr.abs() < 0.5].index.tolist()
columns_to_drop

In [None]:
# Remove the weakly correlated columns and show the resulting (rows, columns).
df = df.drop(columns=columns_to_drop)
df.shape

In [None]:
# NOTE(review): df.dropna() was already applied before the encoding step, and only
# label encoding and column drops happened since — verify whether this call still
# removes anything.
df=df.dropna()

# split 

In [None]:
# Separate the target (total fat) from the predictors, then hold out 20% for testing.
y = df['Total Fat\n(g)']
X = df.drop('Total Fat\n(g)', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
# Display the shapes of the resulting datasets
for split_label, split_data in (("X_train", X_train), ("X_test", X_test),
                                ("y_train", y_train), ("y_test", y_test)):
    print(f"{split_label} shape:", split_data.shape)

# MODEL

In [None]:
# Candidate regressors, all trained on the same split and scored on the test set.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}
best_model = None
# Start below any attainable score: R2 is negative for a model that does worse than
# predicting the mean, so a baseline of 0 could leave `best_model` as None.
best_r2 = -np.inf

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model on the held-out test set (each metric is computed once)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Sample of true vs. predicted target values. The labels are generic because the
    # target is a nutrition value, not a price as the old labels suggested.
    submit = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).reset_index()
    print(submit.head(8))

    if r2 > best_r2:
        best_r2 = r2
        best_model = model.__class__.__name__

    print(f'{model_name}:')
    print(f'R2 Score: {r2:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
    print('----------------------------------------')
# The reported figure is the R2 score, not a classification accuracy.
print(f"The best performing model is: {best_model} with R2 score: {best_r2:.2f}")

# feature_importances

In [None]:
# Take the fitted gradient-boosting model explicitly instead of whatever the training
# loop's `model` variable held last. LinearRegression has no `feature_importances_`,
# so relying on the loop order was fragile.
importance_model = models['Gradient Boosting']
importances = importance_model.feature_importances_
feature_names = X.columns
feature_importance_dict = dict(zip(feature_names, importances))
# Most important feature first
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance:.2f}")
plt.figure(figsize=(12, 7))
# zip(*pairs) splits the (name, importance) pairs into the two sequences barh expects
plt.barh(*zip(*sorted_feature_importance), alpha=0.9, color='teal')
plt.title('Feature Importance', fontsize=15)
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

# forward_selection with column (Total Fat\n(g))

In [None]:
import pandas as pd
import statsmodels.api as sm

# Your DataFrame
# df = ...

# Target is total fat; every other remaining column is a candidate feature.
y = df['Total Fat\n(g)']
X = df.drop('Total Fat\n(g)', axis=1)

def forward_selection(df, target, significance_level=0.05):
    """Greedy forward feature selection driven by OLS p-values.

    Starting from no features, each round fits one OLS model (with an added
    constant) per remaining candidate column, on top of the features chosen so
    far, and keeps the candidate whose coefficient has the smallest p-value.
    Selection stops when that smallest p-value is not below
    ``significance_level`` or when no candidate is left.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate feature columns.
    target : array-like
        Response values, passed to ``sm.OLS`` as the dependent variable.
    significance_level : float, default 0.05
        A candidate is accepted only if its p-value is strictly below this value.

    Returns
    -------
    list
        Selected column names, in the order they were added.
    """
    initial_features = df.columns.tolist()
    best_features = []
    while True:
        # Preserve the original column order. A set difference has arbitrary order,
        # which makes tie-breaking between equal p-values non-reproducible.
        remaining_features = [col for col in initial_features if col not in best_features]
        if not remaining_features:
            break
        # Explicit float dtype: an index-only Series defaults to object dtype on
        # pandas >= 2.0, where idxmin() is rejected for object data.
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(df[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        # A NaN minimum compares as False here, which also ends the search.
        if new_pval.min() < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features

# Run the selection on the X / y defined earlier in this cell and report the result.
selected_features = forward_selection(df=X, target=y)
print("Selected features:", selected_features)

In [None]:
# Hard-coded feature list, presumably copied from an earlier forward-selection run.
# NOTE(review): this list was never used. Its capitalised name does not match the
# `selected_features` variable indexed below, so the models have always been trained
# on the freshly computed selection. That behaviour is kept; a difference between the
# two lists is now reported instead of passing silently.
Selected_features = [ 'Saturated Fat\n(g)', 'Calories', 'Company', 'Sodium \n(mg)','Calories from\nFat']
if set(Selected_features) != set(selected_features):
    print("Note: computed selected_features differ from the hard-coded list:", selected_features)

X = df[selected_features]
y = df['Total Fat\n(g)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Display the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
# Same three regressors, now trained on the forward-selected feature subset.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}
best_model = None
# Start below any attainable score: R2 is negative for a model that does worse than
# predicting the mean, so a baseline of 0 could leave `best_model` as None.
best_r2 = -np.inf

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model on the held-out test set (each metric is computed once)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Sample of true vs. predicted target values. The labels are generic because the
    # target is a nutrition value, not a price as the old labels suggested.
    submit = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).reset_index()
    print(submit.head(8))

    if r2 > best_r2:
        best_r2 = r2
        best_model = model.__class__.__name__

    print(f'{model_name}:')
    print(f'R2 Score: {r2:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
    print('----------------------------------------')
# The reported figure is the R2 score, not a classification accuracy.
print(f"The best performing model is: {best_model} with R2 score: {best_r2:.2f}")

# forward_selection with column (Calories from\nFat)

In [None]:
import pandas as pd
import statsmodels.api as sm

# Your DataFrame
# df = ...

# Target is calories from fat; every other remaining column is a candidate feature.
y = df['Calories from\nFat']
X = df.drop('Calories from\nFat', axis=1)

# NOTE(review): this re-defines forward_selection, which an earlier cell already
# defines with the same signature; consider keeping a single definition.
def forward_selection(df, target, significance_level=0.05):
    """Greedy forward feature selection driven by OLS p-values.

    Starting from no features, each round fits one OLS model (with an added
    constant) per remaining candidate column, on top of the features chosen so
    far, and keeps the candidate whose coefficient has the smallest p-value.
    Selection stops when that smallest p-value is not below
    ``significance_level`` or when no candidate is left.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate feature columns.
    target : array-like
        Response values, passed to ``sm.OLS`` as the dependent variable.
    significance_level : float, default 0.05
        A candidate is accepted only if its p-value is strictly below this value.

    Returns
    -------
    list
        Selected column names, in the order they were added.
    """
    initial_features = df.columns.tolist()
    best_features = []
    while True:
        # Preserve the original column order. A set difference has arbitrary order,
        # which makes tie-breaking between equal p-values non-reproducible.
        remaining_features = [col for col in initial_features if col not in best_features]
        if not remaining_features:
            break
        # Explicit float dtype: an index-only Series defaults to object dtype on
        # pandas >= 2.0, where idxmin() is rejected for object data.
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(df[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        # A NaN minimum compares as False here, which also ends the search.
        if new_pval.min() < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features

# Run the selection on the X / y defined earlier in this cell and report the result.
selected_features = forward_selection(df=X, target=y)
print("Selected features:", selected_features)

In [None]:
# Hard-coded feature list, presumably copied from an earlier forward-selection run.
# NOTE(review): this list was never used. Its capitalised name does not match the
# `selected_features` variable indexed below, so the models have always been trained
# on the freshly computed selection. That behaviour is kept; a difference between the
# two lists is now reported instead of passing silently.
Selected_features = [ 'Total Fat\n(g)', 'Saturated Fat\n(g)', 'Sodium \n(mg)', 'Calories', 'Company']
if set(Selected_features) != set(selected_features):
    print("Note: computed selected_features differ from the hard-coded list:", selected_features)

X = df[selected_features]
y = df['Calories from\nFat']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Display the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
# Same three regressors, trained to predict calories from fat on the selected features.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}
best_model = None
# Start below any attainable score: R2 is negative for a model that does worse than
# predicting the mean, so a baseline of 0 could leave `best_model` as None.
best_r2 = -np.inf

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model on the held-out test set (each metric is computed once)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Sample of true vs. predicted target values. The labels are generic because the
    # target is a nutrition value, not a price as the old labels suggested.
    submit = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).reset_index()
    print(submit.head(8))

    if r2 > best_r2:
        best_r2 = r2
        best_model = model.__class__.__name__

    print(f'{model_name}:')
    print(f'R2 Score: {r2:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
    print('----------------------------------------')
# The reported figure is the R2 score, not a classification accuracy.
print(f"The best performing model is: {best_model} with R2 score: {best_r2:.2f}")

In [None]:
# Disabled experiment: a decision-tree classifier predicting 'Item'. The code is held
# in a string literal, so none of it runs.
# NOTE(review): consider deleting this cell or moving the idea into a markdown note.
'''from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Split data into features (X) and target (y)
X = df.drop(columns=['Item', 'Company'])
y = df['Item']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a decision tree classifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, predictions)
print("Model Accuracy:", accuracy)'''