# Week 6 Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display


In [None]:
# Load the cereals dataset and preview only its first rows rather than
# rendering the whole table in the notebook output.
df = pd.read_csv('Cereals.csv')
display(df.head())

In [None]:
# Normalise the raw table: every '?' cell becomes NaN, after which the three
# nutrient columns below are cast to float.
print('Cleaning up the dataset...')
df = df.replace('?', np.nan).astype(
    {'carbo': float, 'sugars': float, 'potass': float}
)

# Keep one row per cereal name and report how many rows remain.
dup = df.drop_duplicates(subset='name')
print(len(dup), 'unique brands are there')

In [None]:
# Bar chart of how many rows each value of the 'mfr' column has.
mfr_counts = df['mfr'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=mfr_counts.index, y=mfr_counts.values, palette='Set2', ax=ax)
ax.set(
    title='Number of Cereals Listed Per Manufacturer',
    xlabel='Manufacturer',
    ylabel='Number of Cereals',
)
fig.tight_layout()
plt.show()

In [None]:
# Pie chart showing the share of each value in the 'type' column.
typecount = df['type'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
typecount.plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    startangle=90,
    colors=['lightblue', 'orange'],
)
ax.set_title('Distribution of Hot vs Cold Cereals')
ax.set_ylabel('')  # blank out the y-axis label
plt.show()

In [None]:
# Rows holding the highest and the lowest value of the 'rating' column.
best_cereal = df.loc[df['rating'].idxmax()]
worst_cereal = df.loc[df['rating'].idxmin()]

# Report both extremes in the same format, best first.
extremes = {'Best Cereal': best_cereal, 'Worst Cereal': worst_cereal}
for label, row in extremes.items():
    print(f"{label}: {row['name']} with Rating: {row['rating']}")

# Side-by-side bars for the two ratings.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=list(extremes),
    y=[row['rating'] for row in extremes.values()],
    ax=ax,
)
ax.set_title('Best and Worst Cereal Ratings')
ax.set_ylabel('Rating')
plt.show()

In [None]:
# Extremes of the fiber and sugar columns; every row tied at the extreme
# value is listed.
max_fiber = df['fiber'].max()
min_sugar = df['sugars'].min()
highest_fiber_cereals = df.loc[df['fiber'].eq(max_fiber)]
lowest_sugar_cereals = df.loc[df['sugars'].eq(min_sugar)]

print(f"Cereals with highest fiber ({max_fiber}g of fiber):")
print(highest_fiber_cereals.loc[:, ['name', 'fiber']])
print(f"\nCereals with lowest sugar ({min_sugar}g of sugar):")
print(lowest_sugar_cereals.loc[:, ['name', 'sugars']])

# Spread of ratings for each value of the 'type' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='type', y='rating', palette='Set2', ax=ax)
ax.set(
    title='Rating Comparison for Hot vs Cold Cereals',
    xlabel='Cereal Type',
    ylabel='Rating',
)
plt.show()

In [None]:
# Cereals whose 'protein' value is strictly greater than 3, one bar per name.
high_protein_cereals = df.loc[df['protein'].gt(3)]

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='protein', y='name', data=high_protein_cereals, palette='Blues', ax=ax)
ax.set(
    title='Cereals with More Than 3 Grams of Protein',
    xlabel='Protein (g)',
    ylabel='Cereal Name',
)
fig.tight_layout()
plt.show()

# Number of rows for each value of the 'shelf' column.
shelf_counts = df['shelf'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=shelf_counts.index, y=shelf_counts.values, palette='Set3', ax=ax)
ax.set(
    title='Cereals by Display Shelf',
    xlabel='Shelf Position',
    ylabel='Number of Cereals',
)
fig.tight_layout()
plt.show()

In [None]:
# Distribution of the 'sugars' column within each manufacturer.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='mfr', y='sugars', palette='Set3', ax=ax)
ax.set(
    title='Sugar Content Variation Across Different Brands',
    xlabel='Manufacturer',
    ylabel='Sugar Content (g)',
)
plt.xticks(rotation=45)
plt.show()

# Mean of the 'calories' column per manufacturer.
avg_calories_mfr = df.groupby('mfr')['calories'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=avg_calories_mfr.index, y=avg_calories_mfr.values, palette='BuPu', ax=ax)
ax.set(
    title='Average Calories in Cereals per Manufacturer',
    xlabel='Manufacturer',
    ylabel='Average Calories',
)
fig.tight_layout()
plt.show()

In [None]:
# Column-wise mean of every nutritional measure, drawn as one bar each.
nutrient_columns = [
    'calories', 'protein', 'fat', 'sodium', 'fiber',
    'carbo', 'sugars', 'potass', 'vitamins',
]
avg_nutrition = df[nutrient_columns].mean()

ax = avg_nutrition.plot.bar(figsize=(10, 6))
ax.set_title('Average Nutritional Content Across All Cereals')
ax.set_ylabel('Average Amount')
ax.set_xlabel('Nutritional Component')
plt.tight_layout()
plt.show()

# Sugar against calories, with colour and marker both keyed on 'type'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df, x='sugars', y='calories',
    hue='type', style='type', palette='Set2', ax=ax,
)
ax.set(
    title='Relationship Between Sugar and Calories',
    xlabel='Sugar Content (g)',
    ylabel='Calories',
)
plt.show()

In [None]:
# Split cereals at the median rating: strictly above the median is
# 'High Rating', everything else is 'Low Rating'.
rating_median = df['rating'].median()
above_median = df['rating'].gt(rating_median)
df['rating_group'] = above_median.map({True: 'High Rating', False: 'Low Rating'})

# Sugar distribution within each rating group.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='rating_group', y='sugars', data=df, palette='Set2', ax=ax)
ax.set(
    title='Sugar Content Comparison for High vs Low Rated Cereals',
    xlabel='Rating Group',
    ylabel='Sugar Content (g)',
)
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between fiber, sugars and rating, shown as an
# annotated heatmap on a fixed -1..1 colour scale.
corr_matrix = df[['fiber', 'sugars', 'rating']].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr_matrix, annot=True, cmap='coolwarm',
    vmin=-1, vmax=1, linewidths=0.5, ax=ax,
)
ax.set_title('Correlation Matrix: Fiber, Sugar, and Rating')
fig.tight_layout()
plt.show()

# One scatter plot per nutrient against rating, coloured by manufacturer.
for column, label in (('fiber', 'Fiber'), ('sugars', 'Sugar')):
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        x=column, y='rating', data=df,
        hue='mfr', palette='Set2', alpha=0.7, ax=ax,
    )
    ax.set(
        title=f'Scatter Plot: {label} vs Rating',
        xlabel=f'{label} Content (g)',
        ylabel='Rating',
    )
    fig.tight_layout()
    plt.show()

In [None]:
# Distribution of ratings for each value of the 'shelf' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='shelf', y='rating', palette='Set1', ax=ax)
ax.set(
    title='Cereal Ratings by Display Shelf',
    xlabel='Shelf Position',
    ylabel='Rating',
)
plt.show()

## MTCARS Dataset Analysis

In [None]:
# Load the mtcars dataset (this rebinds `df`, replacing the cereals frame)
# and preview only its first rows rather than rendering the whole table.
df = pd.read_csv('mtcars.csv')
display(df.head())

In [None]:
# Distribution of 'mpg': a 10-bin histogram followed by a box plot.
mpg = df['mpg']
mpg.plot.hist(bins=10, title='Histogram Representation of MPG')
plt.show()
mpg.plot.box()
plt.show()

# Full row of the car with the highest 'mpg' value.
maxmpg = df.loc[mpg.idxmax()]
display(maxmpg)

In [None]:
# Two-column table (value of 'am', number of rows) used for the bar chart.
am_counts = df['am'].value_counts()
category_counts = pd.DataFrame({'am': am_counts.index, 'count': am_counts.values})

ax = sns.barplot(x='am', y='count', data=category_counts)
ax.set_title('Automatic vs Manual Cars')
plt.show()

In [None]:
# Full row of the car with the smallest 'hp' value.
minhp = df.loc[df['hp'].idxmin()]
print(minhp)

# Five-number summary of 'disp', taken from the describe() output.
summary_labels = ['min', '25%', '50%', '75%', 'max']
five_num_summary = df['disp'].describe().loc[summary_labels]
print(five_num_summary)

df['disp'].plot.box()
plt.show()

In [None]:
# Heaviest car: the row with the largest 'wt' value.
heavy = df.loc[df['wt'].idxmax()]
print("The heaviest car is", heavy['model'], "with a weight of", heavy['wt'], ". Gears", heavy['gear'])

# In the standard mtcars data 'qsec' is the quarter-mile time in seconds, so
# the best value is the smallest one; idxmax would pick the slowest car.
qsec = df.loc[df['qsec'].idxmin()]
print("Car with best qsec:", qsec['model'])

In [None]:
# Split the cars on the 'am' flag: 1 is treated as manual, 0 as automatic.
manual_cars = df.loc[df['am'].eq(1)]
auto_cars = df.loc[df['am'].eq(0)]
transmission_groups = (('manual', manual_cars), ('automatic', auto_cars))

# Mean mpg of each group first, then one box plot per group.
for label, cars in transmission_groups:
    print(f"Mean mpg of {label} cars", cars['mpg'].mean())
for _, cars in transmission_groups:
    cars['mpg'].plot.box()
    plt.show()

In [None]:
# Two-column copy (mpg, wt) with a fresh 0..n-1 index, plotted as a scatter.
dfx = df.loc[:, ['mpg', 'wt']].reset_index(drop=True)
dfx.plot.scatter(x='mpg', y='wt', title='Scatter plot of mpg and wt')
plt.show()

In [None]:
def categorize_gears(num_gears):
    """Return the gear bucket label for a gear count.

    Values up to and including 4 map to 'Low gears', values up to and
    including 6 map to 'Medium gears', and anything that satisfies neither
    comparison maps to 'High gears'.
    """
    # Thresholds are checked in ascending order; the first match wins.
    for upper_bound, label in ((4, 'Low gears'), (6, 'Medium gears')):
        if num_gears <= upper_bound:
            return label
    return 'High gears'
# Label every car with its gear bucket and preview the first rows.
df['gear_category'] = df['gear'].map(categorize_gears)
display(df.head())

In [None]:
# mpg against number of gears, drawn at the default figure size.
ax = sns.scatterplot(x='gear', y='mpg', data=df)
ax.set(
    title='Fuel Efficiency (MPG) vs Number of Gears',
    xlabel='Number of Gears',
    ylabel='Fuel Efficiency (MPG)',
)
plt.show()

# Horsepower against number of cylinders on an 8x6 figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='cyl', y='hp', data=df, ax=ax)
ax.set(
    title='Horsepower (HP) vs Number of Cylinders',
    xlabel='Number of Cylinders',
    ylabel='Horsepower (HP)',
)
plt.show()

In [None]:
# Boolean masks: strictly above the median mpg, and strictly above the
# median horsepower.
good_fuel_efficiency = df['mpg'].gt(df['mpg'].median())
high_speed = df['hp'].gt(df['hp'].median())

# Cars that satisfy both conditions at once.
selected_cars = df.loc[good_fuel_efficiency & high_speed]
print("Cars with good fuel efficiency and high horsepower:")
print(selected_cars.loc[:, ['model', 'mpg', 'hp']])

# Horsepower against displacement for the full dataset.
ax = sns.scatterplot(x='hp', y='disp', data=df)
ax.set(
    title='Horsepower (HP) vs Displacement',
    xlabel='HP',
    ylabel='Displacement',
)
plt.show()