In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the beer-servings dataset; the first CSV column is used as the row index.
# NOTE(review): this is an absolute, environment-specific path — confirm it
# exists wherever the notebook is re-run.
df = pd.read_csv(
    "/content/beer-servings (1).csv",
    index_col=0,
)


In [None]:
# Rich preview of the first five rows as a quick sanity check of the load.
display(df.head(n=5))

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0.0,0.0,0.0,0.0,Asia
1,Albania,89.0,132.0,54.0,4.9,Europe
2,Algeria,25.0,0.0,14.0,0.7,Africa
3,Andorra,245.0,138.0,312.0,12.4,Europe
4,Angola,217.0,57.0,45.0,5.9,Africa


In [None]:
# Count fully identical rows (every column equal), keeping the first
# occurrence as the non-duplicate.
duplicates = df.duplicated(keep="first").sum()
print(f"Duplicate rows before dropping: {duplicates}")

# Remove the repeats and re-count to confirm none are left.
df = df.drop_duplicates(keep="first")
print(f"Duplicate rows after dropping: {df.duplicated().sum()}")


Duplicate rows before dropping: 0
Duplicate rows after dropping: 0


In [None]:
print("\n---- Missing Value Handling ----")
# Fill the NaNs of every numeric column with that column's own median and
# report the NaN count before and after the fill.
# NOTE(review): the numeric selection also covers
# `total_litres_of_pure_alcohol`, so missing values in that column are imputed
# as well — confirm that is intended.
numeric_columns = df.select_dtypes(include=np.number).columns
for col in numeric_columns:
    column_median = df[col].median()
    missing_before = df[col].isna().sum()
    df[col] = df[col].fillna(value=column_median)
    missing_after = df[col].isna().sum()
    print(f"{col}: missing before = {missing_before}, after = {missing_after}")



---- Missing Value Handling ----
beer_servings: missing before = 8, after = 0
spirit_servings: missing before = 8, after = 0
wine_servings: missing before = 6, after = 0
total_litres_of_pure_alcohol: missing before = 1, after = 0


In [None]:
print("\n---- Outlier Handling ----")
# For each numeric column, clip values lying more than 1.5 * IQR below the
# first quartile or above the third quartile back onto those bounds, and
# report how many values were outside the bounds before and after clipping.
for col in df.select_dtypes(include=np.number).columns:
    Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    is_outlier = (df[col] < lower) | (df[col] > upper)
    outliers_before = is_outlier.sum()

    df[col] = df[col].clip(lower=lower, upper=upper)

    # Re-evaluate against the same bounds; nothing should remain outside them.
    outliers_after = ((df[col] < lower) | (df[col] > upper)).sum()
    print(f"{col}: outliers before = {outliers_before}, after = {outliers_after}")


---- Outlier Handling ----
beer_servings: outliers before = 0, after = 0
spirit_servings: outliers before = 5, after = 0
wine_servings: outliers before = 26, after = 0
total_litres_of_pure_alcohol: outliers before = 0, after = 0


In [None]:
# Features: everything except the target and the two label columns
# (`country`, `continent`).
X = df.drop(columns=["total_litres_of_pure_alcohol", "country", "continent"])
# Target: total litres of pure alcohol.
y = df.loc[:, "total_litres_of_pure_alcohol"]

In [None]:
print("\n---- Min-Max Scaling ----")
# Learn each feature's min/max from all rows of X, then rescale X with them.
# NOTE(review): the scaler here is fit on every row of X — confirm that is
# intended for anything later evaluated on held-out data.
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# The transform returns a bare array, so re-attach the column names for the
# preview.
scaled_preview = pd.DataFrame(X_scaled, columns=X.columns)
print(scaled_preview.head())



---- Min-Max Scaling ----
   beer_servings  spirit_servings  wine_servings
0       0.000000           0.0000       0.000000
1       0.236702           0.4224       0.369863
2       0.066489           0.0000       0.095890
3       0.651596           0.4416       1.000000
4       0.577128           0.1824       0.308219


In [None]:
# Split the raw (unscaled) features first, then fit the scaler on the training
# rows only. Splitting features that were scaled with a MinMaxScaler fit on
# every row lets the test rows' min/max shape the training features (data
# leakage), which can make the held-out score look better than it should.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Re-fit `scaler` on the training split so it matches the features the models
# are trained on; the test split is only transformed, never fit. Scaled test
# values may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# Candidate regressors keyed by the display name used in the results table.
# Insertion order is the order in which they are fitted and reported.
models = dict(
    [
        ("Linear Regression", LinearRegression()),
        ("Lasso Regression", Lasso(alpha=0.01)),
        ("Ridge Regression", Ridge(alpha=1.0)),
    ]
)

In [None]:
# One [model name, train R2, test R2] row per candidate model.
results = []

for name, model in models.items():
    # Fit on the training split only.
    model.fit(X_train, y_train)

    # Predict on both splits so train and test R2 can be compared side by side.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    results.append(
        [
            name,
            r2_score(y_train, y_train_pred),
            r2_score(y_test, y_test_pred),
        ]
    )

In [None]:
# Tabulate the collected rows: one row per model, train vs. test R2.
result_columns = ["Model Name", "Train R2", "Test R2"]
results_df = pd.DataFrame(data=results, columns=result_columns)

print("\n---- Model Performance (R2) ----")
print(results_df)



---- Model Performance (R2) ----
          Model Name  Train R2   Test R2
0  Linear Regression  0.827096  0.890826
1   Lasso Regression  0.826907  0.887676
2   Ridge Regression  0.824894  0.880753
