Importing the dataset:

In [None]:
# pandas is imported in this cell because it runs before the notebook's main
# import cell; without it `pd` is undefined on a fresh kernel (Restart & Run All).
import pandas as pd

# Load the cereal dataset from a path relative to the working directory.
df = pd.read_csv('cereal.csv')

# Print column dtypes / non-null counts, then the number of nulls per column.
df.info()
print("\nNull values in each column:\n", df.isnull().sum())


Import necessary libraries for machine learning:

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Handling missing values:

In [None]:
# Visualise and impute missing values, but only when the frame contains any.
if df.isnull().sum().sum() > 0:
    plt.figure(figsize=(10, 8))
    sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
    plt.title('Missing Values Heatmap (Before)')
    plt.show()

    # Mean imputation is only defined for numeric data, so the imputer is
    # restricted to numeric columns. Columns that are entirely NaN are skipped:
    # they have no mean, and SimpleImputer would drop them from its output,
    # which would break the column-wise write-back below.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    imputable_cols = [col for col in numeric_cols if df[col].notna().any()]

    if imputable_cols:
        imputer = SimpleImputer(strategy='mean')
        # fit_transform returns a NumPy array; assign it back into the selected
        # columns so `df` remains a DataFrame for the rest of the notebook.
        df[imputable_cols] = imputer.fit_transform(df[imputable_cols])

    # Non-numeric columns are left untouched, so any nulls they contain will
    # still be reported and drawn here.
    print("\nNull values after imputation:\n", df.isnull().sum())

    plt.figure(figsize=(10, 8))
    sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
    plt.title('Missing Values Heatmap (After)')
    plt.show()


Handling non-numeric columns by selecting only the numeric columns (categorical data is converted later, in the Data Preprocessing step):

In [None]:
# Keep just the numeric-dtype columns; this subset feeds the correlation heatmap.
numeric_df = df.select_dtypes(include='number')

Plotting the Correlation Heatmap

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
corr_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap")
ax.set_xlabel('Columns')
ax.set_ylabel('Columns')
plt.show()

Plotting Box Plot for Sugars by Cereal Type

In [None]:
# One box per value of 'type', showing the spread of the 'sugars' column.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='type', y='sugars', ax=ax)
ax.set(
    xlabel='Cereal Type',
    ylabel='Sugars (grams)',
    title='Distribution of Sugars by Cereal Type (Box Plot)',
)
plt.show()

Plotting Pairplot for Numeric Columns

In [None]:
# Scatter-matrix of every numeric column against every other numeric column.
numeric_only = df.select_dtypes(include=[np.number])
sns.pairplot(numeric_only)
plt.show()

Data Preprocessing


In [None]:
# Encode 'type' as a binary flag: 1 where the value is 'C', 0 otherwise.
df['type'] = (df['type'] == 'C').astype(int)

# Show the distinct values of 'mfr' before it is one-hot encoded below.
print("\nUnique values in 'mfr':\n", df['mfr'].unique())

# Treat -1 as a missing-value marker and convert it to NaN across the frame.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df = df.replace(-1, np.nan)

# Fill the resulting gaps in these three columns with each column's own mean.
for col in ['carbo', 'sugars', 'potass']:
    df[col] = df[col].fillna(df[col].mean())

# 'name' is not needed for the analysis, so drop it. Reassignment is used
# instead of inplace=True so the cell reads as a chain of explicit steps.
df = df.drop(columns='name')

# One-hot encode 'mfr' (one integer 0/1 column per distinct value, named after
# the value itself), append the indicator columns, and drop the original column.
dummy = pd.get_dummies(df['mfr'], dtype=int)
df = pd.concat([df, dummy], axis=1).drop(columns='mfr')

Separating features and target variable :

In [None]:
# Feature matrix: every column except 'rating'. Target vector: 'rating'.
X = df.drop(columns='rating')
y = df['rating']

Standardizing the Feature Variables :

In [None]:
# Standardise every feature column with StandardScaler and rebuild the DataFrame.
# fit_transform returns a bare NumPy array, so both the column names and the
# original row index are restored; keeping the index means X's row labels still
# match those of y.
# NOTE(review): the scaler is fit on all rows before the train/test split that
# follows, so test rows influence the scaling statistics — consider fitting on
# the training split only.
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X), columns=X.columns, index=X.index)

Splitting the Data into Training and Testing Sets :

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)