# Project Description

The objectives of this project are:

1.   Use the Open Food Facts dataset to show the characteristics of certain product groups and the similarities between products and product groups, in order to provide a global view of the dataset and to exhibit salient features that are of interest to an analyst or stakeholder in this sector.

2.   Use some machine learning algorithms to : 

        *   predict the **nutriscore_grade** of a product given nutritional values and possibly other fields (as few as possible)
        *   predict the **nova_group** of a product given nutritional values and possibly other fields (as few as possible),
        *   predict the **pnns_groups_1** of a product given nutritional values and possibly other fields (as few as possible),
        *   predict the **pnns_groups_2** of a product given nutritional values and possibly other fields (as few as possible),
        *   predict the **categories** (either atomic categories or lists of categories) of a product given nutritional values and possibly other fields (as few as possible),
        *   predict one or more **nutritional values** (ex: sugars_100g) given nutritional values and possibly other fields (as few as possible)
        






# Importing the Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6 and the
# old name was later removed; use whichever name this matplotlib version provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
from scipy.stats import norm, skew
import numpy as np
import seaborn as sns
from sklearn.preprocessing import  StandardScaler,  LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
import statsmodels.api as sm
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
import lightgbm as lgb
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor


# Importing the dataset

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the tab-separated dataset file from the mounted Drive into a DataFrame.
data = pd.read_csv('/content/drive/My Drive/off_complete.csv', sep = '\t')

In [None]:
# (number of rows, number of columns) of the loaded dataset.
data.shape

In [None]:
# Preview the first rows of the dataset.
data.head()


# Data Visualisation

In [None]:
plt.rcParams['figure.figsize'] = (30, 20)
# Correlation is only defined for numeric data. Select those columns explicitly:
# DataFrame.corr() raises on text columns in pandas >= 2.0 instead of silently
# skipping them.
numeric_data = data.select_dtypes(include = ['number', 'bool'])
sns.heatmap(numeric_data.corr(), annot = True)
# The plot is a correlation heatmap, not a histogram.
plt.title('Correlation Heatmap of the Dataset', fontsize = 30)
plt.xticks(size = 25)
plt.yticks(size = 25)
plt.show()

**Note :**
From the heatmap, we can see that **Nutriscore_score** and **Nutriscore_score_fr** are perfectly correlated, as are **Sodium** and **Salt**, so we can drop one column from each pair.

In [None]:
# Drop the redundant columns in place. They are passed through the `columns=`
# keyword because the positional `axis` argument of DataFrame.drop
# (`drop(labels, 1)`) is no longer accepted by pandas >= 2.0.
data.drop(columns=['salt_100g', 'nutrition-score-fr_100g'], inplace=True)

**Comparison of the nutriscore grade for every code**

In [None]:
plt.rcParams['figure.figsize'] = (10, 6)
# Pass the column explicitly as `x=`: in seaborn >= 0.12 the first positional
# argument of countplot is `data`, so a bare Series would not be counted per category.
sns.countplot(x = data['nutriscore_grade'], palette = 'pink')
plt.title('Most Existing Nutriscore Grade ', fontsize = 20)
plt.show()

**comparison of nova_group for every code**

In [None]:
plt.rcParams['figure.figsize'] = (10, 5)
# Pass the column explicitly as `x=`: in seaborn >= 0.12 the first positional
# argument of countplot is `data`, so a bare Series would not be counted per category.
sns.countplot(x = data['nova_group'])
plt.title('Most Existing nova_group ', fontsize = 20)
plt.show()

Different pnns_groups_1 values across the products

In [None]:
plt.rcParams['figure.figsize'] = (40, 25)

plt.style.use('fivethirtyeight')
# Pass the column explicitly as `x=`: in seaborn >= 0.12 the first positional
# argument of countplot is `data`, so a bare Series would not be counted per category.
ax = sns.countplot(x = data['pnns_groups_1'], palette = 'bone')
ax.set_xlabel(xlabel = 'Different pnns_groups_1', fontsize = 40)
ax.set_ylabel(ylabel = 'Count of Products', fontsize = 40)
ax.set_title(label = 'Comparison of pnns_groups_1 and products', fontsize = 40)
plt.xticks(size = 25)
plt.yticks(size = 30)
plt.show()

**Comparing the product's nutriscore_score**

In [None]:
import warnings
# Silence all warnings from here on.
warnings.filterwarnings('ignore')

plt.rcParams['figure.figsize'] = (25, 15)
# histplot replaces the deprecated distplot. With kde=True it draws the histogram
# plus a density curve, and its y axis shows counts, which matches the y label below
# (distplot plotted a normalised density instead).
sns.histplot(x = data['nutriscore_score'], kde = True, color = 'blue')
plt.xlabel('nutriscore_score Range for Products', fontsize = 16)
plt.ylabel('Count of the Products', fontsize = 16)
plt.title('Distribution of nutriscore_score of Products', fontsize = 20)
plt.xticks(rotation = 90)
plt.show()

Distribution of fat_100g across the products

In [None]:
# histplot replaces the deprecated distplot. With kde=True it draws the histogram
# plus a density curve, and its y axis shows counts, which matches the y label below.
sns.histplot(x = data['fat_100g'], kde = True, color = 'pink')
plt.title('Different fat_100g of the products')
plt.xlabel('fat_100g associated with the Products')
plt.ylabel('count of Products')
plt.show()

`show Different countries tags that these products come from`

In [None]:
# Count products per countries_tags value and keep the 20 most frequent.
top_country_counts = data['countries_tags'].value_counts().head(20)
top_country_counts.plot.bar(color = 'orange', figsize = (20, 7))
plt.title('Different countries tags that these products come from', fontsize = 30, fontweight = 20)
plt.xlabel('Name of The Country')
plt.ylabel('count')
plt.xticks(size = 25)
plt.show()

**show Different brands of the products**

In [None]:
# Count products per brand and keep the 20 most frequent.
top_brand_counts = data['brands'].value_counts().head(20)
top_brand_counts.plot.bar(color = 'orange', figsize = (15, 7))
plt.title('Different brands of the products', fontsize = 30, fontweight = 20)
plt.xlabel('Name of The Brand')
plt.ylabel('count')
plt.xticks(size = 25)
plt.show()

Product with the highest energy-kcal_100g in each nutriscore_grade, with its product_name, nova_group, pnns_groups_1 and pnns_groups_2


In [None]:
# Index label of the highest-energy product within each nutriscore_grade.
# Rows without an energy value are dropped first so that a grade containing only
# missing values cannot produce a NaN label.
top_energy_labels = (data.dropna(subset=['energy-kcal_100g'])
                         .groupby('nutriscore_grade')['energy-kcal_100g'].idxmax())
# idxmax() returns index *labels*, so the rows are fetched with label-based .loc;
# position-based .iloc only picks the same rows while the index is the default 0..n-1.
data.loc[top_energy_labels, ['nutriscore_grade', 'product_name', 'nova_group', 'pnns_groups_1',
                             'pnns_groups_2', 'energy-kcal_100g']].style.background_gradient('Reds')


Product with the highest proteins_100g in each nutriscore_grade, with its product_name, nova_group, pnns_groups_1 and pnns_groups_2


In [None]:
# Index label of the highest-protein product within each nutriscore_grade.
# Rows without a protein value are dropped first so that a grade containing only
# missing values cannot produce a NaN label.
top_protein_labels = (data.dropna(subset=['proteins_100g'])
                          .groupby('nutriscore_grade')['proteins_100g'].idxmax())
# idxmax() returns index *labels*, so the rows are fetched with label-based .loc;
# position-based .iloc only picks the same rows while the index is the default 0..n-1.
data.loc[top_protein_labels, ['nutriscore_grade', 'product_name', 'nova_group', 'pnns_groups_1',
                              'pnns_groups_2', 'proteins_100g']].style.background_gradient('Blues')

**Product with the highest carbohydrates_100g in each nutriscore_grade, with its product_name, nova_group, pnns_groups_1 and pnns_groups_2**

In [None]:
# Index label of the highest-carbohydrate product within each nutriscore_grade.
# Rows without a carbohydrate value are dropped first so that a grade containing only
# missing values cannot produce a NaN label.
top_carb_labels = (data.dropna(subset=['carbohydrates_100g'])
                       .groupby('nutriscore_grade')['carbohydrates_100g'].idxmax())
# idxmax() returns index *labels*, so the rows are fetched with label-based .loc;
# position-based .iloc only picks the same rows while the index is the default 0..n-1.
data.loc[top_carb_labels, ['nutriscore_grade', 'product_name', 'nova_group', 'pnns_groups_1',
                           'pnns_groups_2', 'carbohydrates_100g']].style.background_gradient('Reds')


**picking up the countries_tags with highest number of products**

In [None]:
# The 8 most frequent countries_tags values with their product counts.
data['countries_tags'].value_counts().head(8)

**The most frequent countries_tags and the nutriscore_score of their products**

In [None]:
# countries_tags values to compare (presumably the most frequent ones -- verify
# against the value_counts listing).
some_countries = ('en:france', 'en:united-states', 'en:spain', 'en:belgium', 'en:united-kingdom', 'en:germany',
                  'en:canada', 'en:france,en:germany')
# Keep products from the selected countries that have a nutriscore_score.
# notna() is used instead of AND-ing the raw float column into the mask: a numeric
# column used as a boolean treats 0 as False and would silently drop every product
# whose score is exactly 0.
has_score = data['nutriscore_score'].notna()
data_countries = data.loc[data['countries_tags'].isin(some_countries) & has_score]

plt.rcParams['figure.figsize'] = (15, 7)
ax = sns.violinplot(x = data_countries['countries_tags'], y = data_countries['nutriscore_score'], palette = 'Reds')
ax.set_xlabel(xlabel = 'countries_tags', fontsize = 9)
ax.set_ylabel(ylabel = 'nutriscore_scores', fontsize = 9)
ax.set_title(label = 'Distribution of nutriscore_score of products from different countries_tags', fontsize = 20)
plt.xticks(size = 20)
plt.show()

**Every countries_tags's Product and their energy-kcal_100g**

In [None]:
# countries_tags values to compare (presumably the most frequent ones -- verify
# against the value_counts listing).
some_countries = ('en:france', 'en:united-states', 'en:spain', 'en:belgium', 'en:united-kingdom', 'en:germany',
                  'en:canada', 'en:france,en:germany')
# Keep products from the selected countries that have an energy value.
# notna() is used instead of AND-ing the raw float column into the mask: a numeric
# column used as a boolean treats 0 as False and would silently drop every product
# with 0 kcal, biasing the per-country bars upwards.
has_energy = data['energy-kcal_100g'].notna()
data_countries = data.loc[data['countries_tags'].isin(some_countries) & has_energy]

plt.rcParams['figure.figsize'] = (15, 7)
ax = sns.barplot(x = data_countries['countries_tags'], y = data_countries['energy-kcal_100g'],palette = 'Purples')
ax.set_xlabel(xlabel = 'countries_tags', fontsize = 9)
ax.set_ylabel(ylabel = 'energy-kcal_100g', fontsize = 9)
ax.set_title(label = 'Distribution of energy-kcal_100g of products from different countries_tags', fontsize = 20)
plt.xticks(size = 20)
plt.show()


**The most frequent product names**

In [None]:
# The 10 most frequent product_name values with their counts.
data['product_name'].value_counts().head(10)

In [None]:
# Product names to compare.
some_products = ('Pâte à sucre', 'The Madelaine Chocolate Company, Solid Milk Chocolate', 'Glaçage fondant'
                 , 'Colorant alimentaire', 'Pain aux 2 lins', 'Miel',
             'The Madelaine Chocolate Company, Solid Dark Chocolate', 'Crème dessert chocolat', 'Vitória crackers')

# Keep the selected products that have a sugars_100g value.
# notna() is used instead of AND-ing the raw float column into the mask: a numeric
# column used as a boolean treats 0 as False and would silently drop every
# sugar-free (0 g) entry from the distribution.
has_sugars = data['sugars_100g'].notna()
data_products = data.loc[data['product_name'].isin(some_products) & has_sugars]

ax = sns.boxplot(x = data_products['product_name'], y = data_products['sugars_100g'], palette = 'inferno')
ax.set_xlabel(xlabel = 'Some Popular product_name', fontsize = 9)
ax.set_ylabel(ylabel = ' sugars_100g', fontsize = 9)
ax.set_title(label = 'Distribution of sugars_100g  in Different popular product_name', fontsize = 20)
# Rotate and enlarge the tick labels in a single call.
plt.xticks(rotation = 90, size = 20)
plt.show()

**Distribution of nutriscore_score in some Popular products**

In [None]:
# Product names to compare.
some_products = ('Pâte à sucre', 'The Madelaine Chocolate Company, Solid Milk Chocolate', 'Glaçage fondant', 'Colorant alimentaire',
                 'Pain aux 2 lins', 'Miel','The Madelaine Chocolate Company, Solid Dark Chocolate', 'Crème dessert chocolat', 'Vitória crackers')

# Keep the selected products that have a nutriscore_score.
# notna() is used instead of AND-ing the raw float column into the mask: a numeric
# column used as a boolean treats 0 as False and would silently drop every product
# whose score is exactly 0.
has_score = data['nutriscore_score'].notna()
data_products = data.loc[data['product_name'].isin(some_products) & has_score]

plt.rcParams['figure.figsize'] = (16, 8)
ax = sns.violinplot(x = 'product_name', y = 'nutriscore_score', data = data_products, palette = 'bright')
ax.set_xlabel(xlabel = 'Names of some popular products', fontsize = 10)
ax.set_ylabel(ylabel = 'Distribution of nutriscore_score', fontsize = 10)
ax.set_title(label = 'Disstribution of nutriscore_score  in some Popular product_name', fontsize = 20)
plt.xticks(rotation = 90)
plt.show()

Finding the 15 products poorest in calcium


In [None]:
# Columns shown in the ranking table.
calcium_low_cols = ['product_name', 'nutriscore_grade', 'code', 'nutriscore_score',
                    'countries_tags', 'calcium_100g']
# Sort by calcium in ascending order and keep the first 15 rows.
calcium_low_rows = data.sort_values(by = 'calcium_100g', ascending = True)[calcium_low_cols].head(15)
calcium_low_rows.style.background_gradient('viridis')

Finding the 15 products richest in calcium

In [None]:

# Columns shown in the ranking table.
calcium_high_cols = ['product_name', 'nutriscore_grade', 'code', 'nutriscore_score',
                     'countries_tags', 'calcium_100g']
# Sort by calcium in descending order and keep the first 15 rows.
calcium_high_rows = data.sort_values(by = 'calcium_100g', ascending = False)[calcium_high_cols].head(15)
calcium_high_rows.style.background_gradient('viridis')

Finding the 15 products poorest in energy

In [None]:
# Columns shown in the ranking table.
energy_low_cols = ['product_name', 'nutriscore_grade', 'code', 'nutriscore_score',
                   'countries_tags', 'energy-kcal_100g']
# Sort by energy in ascending order and keep the first 15 rows.
energy_low_rows = data.sort_values(by = 'energy-kcal_100g', ascending = True)[energy_low_cols].head(15)
energy_low_rows.style.background_gradient('viridis')

**Finding the 15 products richest in energy**

In [None]:
# Columns shown in the ranking table.
energy_high_cols = ['product_name', 'nutriscore_grade', 'code',
                    'nutriscore_score', 'countries_tags', 'energy-kcal_100g']
# Sort by energy in descending order and keep the first 15 rows.
energy_high_rows = data.sort_values(by = 'energy-kcal_100g', ascending = False)[energy_high_cols].head(15)
energy_high_rows.style.background_gradient('viridis')

In [None]:
def productdata(x):
    """Return the row(s) of the global ``data`` frame selected by index label(s) ``x``.

    All columns are kept; the lookup is label-based (``.loc``).
    """
    selected_rows = data.loc[x, :]
    return selected_rows

# Allow up to 200 rows to be displayed when the frame is printed.
pd.set_option('display.max_rows', 200)
# Fetch the row with index label 233 and wrap it in a DataFrame before printing.
x = pd.DataFrame(productdata(233))
print(x)