In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the survey CSV. This is an absolute path, so adjust it when
# running the notebook on another machine.
DATA_PATH = "/content/food_coded.csv"
df = pd.read_csv(DATA_PATH)

The first few rows of the CSV file are shown below. The data appears to come from a survey about people's food habits and preferences.

In [None]:
# Preview the first five rows of the loaded data.
df.head(5)

In [None]:
# List every column label in the dataframe.
df.columns

All the columns seem to have meaningful names, so for now there is no need to rename them.

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Per-column dtype and non-null count; used to spot columns with missing values.
df.info()

The NaN values in the columns below can be replaced with the mode of each column, and the datatype can then be converted to int. This follows from inspecting the unique values in each of these columns.

In [None]:
df.tortilla_calories.unique() # Distinct values; presumably calorie figures for a tortilla item -- TODO confirm against the survey codebook.

In [None]:
df.sports.unique() # Assumed coding: 1 = plays sports, 2 = does not play sports -- TODO confirm against the survey codebook.

In [None]:
df.soup.unique() # Assumed coding: 1 = has soup, 2 = does not -- TODO confirm against the survey codebook.

In [None]:
# Distinct values in the self_perception_weight column.
df.self_perception_weight.unique()

In [None]:
df.persian_food.unique() # Presumably a rating of Persian food on a 1-5 scale -- TODO confirm against the survey codebook.

In [None]:
df.on_off_campus.unique()  # NOTE(review): read here as days per week on campus; the codes may instead be living-arrangement categories -- confirm against the codebook.

In [None]:
df.mother_education.unique()  # Presumably five education-level categories -- TODO confirm against the survey codebook.

In [None]:
df.marital_status.unique()  # Three distinct codes appear to be in use; the meanings below are assumptions.
# 1 - presumably married
# 2 - presumably not married
# 4 - presumably "other" / prefers not to say
# TODO: confirm all three meanings against the survey codebook.

In [None]:
df.life_rewarding.unique()   # Presumably a self-reported rating of how rewarding life is -- TODO confirm the scale.

In [None]:
df.income.unique()  # NOTE(review): read here as working days per week; codes 1-6 may instead be income brackets -- confirm against the codebook.

array([5, 4, 6, 1, 3, 2])

In [None]:
df.fav_food.unique()   # NOTE(review): assumed to be a rating; it could equally be a category code -- confirm against the codebook.

In [None]:
df.father_education.unique()  # Presumably five education-level categories -- TODO confirm against the survey codebook.

In [None]:
df.employment.unique()   # Distinct employment codes; what each code means should be checked against the survey codebook.

array([3, 2, 1])

In [None]:
df.exercise.unique() # NOTE(review): read here as types of exercise; the codes may instead be exercise-frequency levels -- confirm against the codebook.

In [None]:
df.drink.unique()
# Assumed coding: 1 = drinks, 2 = does not drink -- TODO confirm against the survey codebook.

In [None]:
df.cuisine.unique()
# NOTE(review): read here as days per week a cuisine is eaten; the codes may instead be cuisine categories -- confirm against the codebook.


In [None]:
df.cook.unique()
# Presumably a rating related to cooking -- TODO confirm the scale and its meaning against the codebook.

In [None]:
# Distinct values in the calories_scone column.
df.calories_scone.unique()

In [None]:
# Distinct values in the calories_day column.
df.calories_day.unique()

In [None]:
# Columns inspected above: each holds a small set of numeric codes plus some
# NaNs. Every NaN is replaced with the column's most frequent value (mode) and
# the column is then cast to int.
col = ['calories_day', 'calories_scone', 'cook', 'cuisine', 'drink', 'exercise',
       'employment', 'father_education', 'fav_food', 'income', 'life_rewarding',
       'marital_status', 'mother_education', 'on_off_campus', 'persian_food',
       'self_perception_weight', 'soup', 'sports', 'tortilla_calories']
for i in col:
    # Assign the filled column back instead of calling
    # df[i].fillna(..., inplace=True): the in-place form operates on an
    # intermediate object, emits a chained-assignment FutureWarning on
    # pandas 2.x and leaves df unchanged under Copy-on-Write.
    df[i] = df[i].fillna(df[i].mode().iloc[0]).astype(int)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) after the imputation step.
df.describe()

In [None]:
# Distinct raw values in the weight column, inspected before cleaning.
df.weight.unique()

The weight should be numeric (int or float). The next cell converts it to float and replaces the NaN values with the column mean.

In [None]:
# Convert 'weight' to float; entries that cannot be parsed as numbers become NaN.
weight_numeric = pd.to_numeric(df['weight'], errors='coerce').astype('float')

# Replace NaN with the mean of the parsable values. The result is assigned back
# to the column rather than using df['weight'].fillna(..., inplace=True), which
# acts on an intermediate object and does not update df under Copy-on-Write.
mean_weight = weight_numeric.mean()
df['weight'] = weight_numeric.fillna(mean_weight)


In [None]:
# Distinct raw values in the GPA column, inspected before cleaning.
df['GPA'].unique()

GPA is normally a float. In the next cell it is converted to a float datatype, with NaN values replaced by the column mean.

In [None]:
# Work on the string form so that entries mixing numbers and text can be cleaned.
gpa_text = df['GPA'].astype(str)

# Keep only digits and dots, then parse. Anything that still is not a number
# (including entries left empty after stripping) becomes NaN. Note that the
# pattern also strips a leading minus sign, so negative inputs are not detected
# as invalid; they are parsed as positive values.
gpa_numeric = pd.to_numeric(gpa_text.str.replace('[^0-9.]', '', regex=True), errors='coerce')

# Fill the unparsable entries with the mean of the parsed values. The result is
# assigned back to the column rather than using df.GPA.fillna(..., inplace=True),
# which acts on an intermediate object and does not update df under Copy-on-Write.
mean = gpa_numeric.mean()
df['GPA'] = gpa_numeric.fillna(mean)

df['GPA'].unique()


In [None]:
# Distinct values in the "comfort_food_reasons_coded.1" column.
df["comfort_food_reasons_coded.1"].unique()

In [None]:
# Distinct values in the "comfort_food_reasons_coded" column, for comparison with the ".1" variant.
df.comfort_food_reasons_coded.unique()

In [None]:
# Rows where the two coded columns disagree. A NaN in either column also
# counts as a disagreement, because NaN - x compares unequal to 0.
mismatch_mask = (df["comfort_food_reasons_coded.1"] - df["comfort_food_reasons_coded"]) != 0
diff = df[mismatch_mask]

# Side-by-side view of just the two columns for the mismatching rows.
new_df = {
    column_name: diff[column_name]
    for column_name in ("comfort_food_reasons_coded.1", "comfort_food_reasons_coded")
}
pd.DataFrame(new_df)


The above cell shows the rows in which the "comfort_food_reasons_coded.1" and "comfort_food_reasons_coded" columns differ. Apart from the rows where the latter is NaN, the two columns differ in only one row. Since "comfort_food_reasons_coded.1" also has an int datatype, which is more suitable, we can simply delete the latter column from our dataframe.


In [None]:
# Drop the older "comfort_food_reasons_coded" column and give the ".1" variant
# its plain name.
# The guard makes the cell safe to re-run: without it, a second execution would
# drop the column that the first execution had just renamed.
if 'comfort_food_reasons_coded.1' in df.columns:
    df = (
        df.drop(columns=['comfort_food_reasons_coded'])
          .rename(columns={'comfort_food_reasons_coded.1': 'comfort_food_reasons_coded'})
    )

The NaN values in the columns with dtype object can be replaced with "Not Known". This is done in the next cell.

In [None]:
# Snapshot of the text (dtype object) columns.
object_columns = df.select_dtypes(include=['object'])

# Write the snapshot back with every missing entry replaced by "Not Known".
df[object_columns.columns] = object_columns.fillna("Not Known")

Now the data is mostly clean: all suitable columns have been converted to int and all null values have been handled. The relevant columns with dtype object already have coded counterparts, so they could be removed from the data, but I have kept them to make these parameters easier to understand.

In [None]:
# Re-check dtypes and non-null counts after cleaning.
df.info()

In [None]:
# Mean of each calorie column, in the same order as the display labels below.
calorie_columns = ['calories_chicken', 'calories_scone', 'tortilla_calories',
                   'turkey_calories', 'waffle_calories']
x1 = [df[column_name].mean() for column_name in calorie_columns]
data = {
    'Values': x1,
    'Items': ['calories\nchicken', 'calories\nscone', 'tortilla\ncalories', 'turkey\ncalories', 'waffle\ncalories'],
}
df_new = pd.DataFrame(data)

# Bars are ordered from the largest mean to the smallest.
bar_order = df_new.sort_values('Values', ascending=False)['Items']
ax = sns.barplot(data=df_new, x='Items', y='Values', order=bar_order)
ax.set_title('Average Calory intake of various food items')
plt.show()

From the above graph, the average calorie values for the **waffle** and the **tortilla** are much higher than those for the other items.




In [None]:
# Mean of each food-type column, in the same order as the display labels below.
food_columns = ['ethnic_food', 'greek_food', 'indian_food',
                'italian_food', 'persian_food', 'thai_food']
x1 = [df[column_name].mean() for column_name in food_columns]
# In this frame 'x' holds the mean values and 'y' holds the labels.
data = {
    'x': x1,
    'y': ['ethnic\nfood', 'greek\nfood', 'indian\nfood', 'italian\nfood', 'persian\nfood', 'thai\nfood'],
}
dfn = pd.DataFrame(data)

# Bars are ordered from the highest mean to the lowest.
bar_order = dfn.sort_values('x', ascending=False)['y']
ax = sns.barplot(data=dfn, x='y', y='x', order=bar_order)
ax.set_title('Average Ratings of different food types')
ax.set_xlabel("Food types")
ax.set_ylabel("Ratings")
ax.set_yticks([0, 1, 2, 3, 4, 5])
plt.show()

**Italian food** is the most popular among the respondents, and **Persian food** is the least popular.

In [None]:
# Draw both columns against the row index on the same axes.
for y_column, legend_label in (
    ('healthy_feeling', 'Healthy Feeling'),
    ('life_rewarding', 'Life Rewarding'),
):
    sns.lineplot(data=df, x=df.index, y=y_column, label=legend_label)

# Axis labels, title and legend for the shared axes.
plt.xlabel('Index')
plt.ylabel('Values')
plt.title('Line Plot of Multiple Columns')
plt.legend()

plt.show()

We can see that for a large number of people the life-rewarding score is high even though their healthy-feeling score is low.

In [None]:
df.fav_cuisine.unique()
# A pie chart of favourite cuisines by count would be useful here, but some of these strings are not in a form that can be counted directly, so it is not plotted. The same applies to type_sports.

In [None]:
# GPA against the coded breakfast answer, one point per row.
ax = sns.scatterplot(data=df, x="breakfast", y="GPA")
plt.show()

Here 1 represents a person who has breakfast and 2 represents a person who does not. We can see that people who have breakfast tend to have a good GPA.

In [None]:
# One line per habit column, all against nutritional_check, without error bands.
for x_column, legend_label in (
    ('veggies_day', 'Veggies per day'),
    ('fruit_day', 'Fruits per day'),
    ('eating_out', 'Eating out'),
):
    sns.lineplot(data=df, x=x_column, y='nutritional_check', label=legend_label, errorbar=None)

# Shared axis labels and legend placement.
plt.xlabel("Values")
plt.ylabel('Nutritional Values')
plt.legend(loc='lower right')

plt.show()

It can be inferred that as veggies and fruits per day increase, the nutritional value increases.
Also, as the eating-out value increases, the nutritional value decreases.

In [None]:
# Coded ideal diet as a function of coded current diet.
ax = sns.lineplot(data=df, x='diet_current_coded', y='ideal_diet_coded')
ax.set_xlabel('Current Diet (Coded value)')
ax.set_ylabel('Ideal Diet (Coded value)')
plt.show()

It seems that the current diet followed by the people is not ideal for them. A current-diet code of around 4 seems to have the highest ideal-diet value.

In [None]:
# Self-perception code plotted against the cleaned numeric weight.
ax = sns.lineplot(data=df, x='weight', y='self_perception_weight')
ax.set_xlabel('Actual Weight')
ax.set_ylabel('Self Perception Weight')
plt.show()

In [None]:
# GPA per exercise code; ticks are pinned to the three integer codes.
ax = sns.lineplot(data=df, x='exercise', y='GPA')
ax.set_xticks([1, 2, 3])
plt.show()

People who do the third type of exercise are observed to have a lower GPA.

In [None]:
# GPA per on_off_campus code; ticks are pinned to the integer codes 1-4.
ax = sns.lineplot(data=df, x='on_off_campus', y='GPA')
ax.set_xticks([1, 2, 3, 4])
ax.set_xlabel("No. of days they stay on campus")
plt.show()

As expected, the more days people stay on campus, the higher their GPA, and the error band is narrow.

In [None]:
# Number of rows for each employment code.
ax = sns.countplot(data=df, x='employment')
plt.show()

The numbers are encoded values for the following:


1.   Not employed
2.   Part time employment
3.   Full time employment

We can see that most of the people have **part-time employment**.



In [None]:
# GPA per employment code; ticks are pinned to the three integer codes.
ax = sns.lineplot(data=df, x="employment", y="GPA")
ax.set_xticks([1, 2, 3])
plt.show()

As shown above, people who are not employed at all have a higher GPA, and people with full-time employment have a lower GPA.

In [None]:
# Row counts per ideal-diet code, split by the Gender column.
ax = sns.countplot(data=df, x='ideal_diet_coded', hue="Gender")
plt.show()

More people fall under the smaller ideal-diet values than under the larger ones.

In [None]:
# GPA against the coded Gender column, one point per row.
ax = sns.scatterplot(data=df, x='Gender', y="GPA")
plt.show()

Here 1 represents Male and 2 represents Female. We can see that the GPA of males is clustered around 3.75, while for females it is less concentrated there and more spread out.

In [None]:
# Pairwise correlation between the numeric columns only.
# The text (dtype object) columns are still in df, and since pandas 2.0
# DataFrame.corr() no longer skips them by default: without numeric_only=True
# it raises a ValueError when it tries to convert the strings to float.
correlation = df.corr(numeric_only=True)

# Fix the colour scale to the full [-1, 1] correlation range.
plt.figure(figsize=(12,10))
sns.heatmap(correlation, vmax = 1, vmin =-1)
plt.show()

The whitish spots on the above heat map show that the various food types are correlated with each other. There are no notably dark spots in the heat map, which would indicate an inverse relation between two parameters.