In [11]:
# model_training.py
import os
import pickle

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Load dataset
# The CSV location can be overridden with the BEER_SERVINGS_CSV environment
# variable so the notebook also runs on machines other than the author's.
# When the variable is unset, the original absolute path is used unchanged.
DATA_PATH = os.environ.get(
    "BEER_SERVINGS_CSV",
    r"C:\Users\rahul\OneDrive\Desktop\anitha\DSA\dsa assignments\streamlitappdeploy\beer-servings.csv",
)

# Read the file and drop every row that contains at least one missing value.
# Reassigning the chained result (instead of dropna(inplace=True)) gives the
# same frame without mutating an object in place.
# NOTE(review): the recorded output shows an "Unnamed: 0" column, presumably an
# index that was written into the CSV -- confirm whether it should be dropped.
df = pd.read_csv(DATA_PATH).dropna()
print(df.head())



   Unnamed: 0      country  beer_servings  spirit_servings  wine_servings  \
0           0  Afghanistan            0.0              0.0            0.0   
1           1      Albania           89.0            132.0           54.0   
2           2      Algeria           25.0              0.0           14.0   
3           3      Andorra          245.0            138.0          312.0   
4           4       Angola          217.0             57.0           45.0   

   total_litres_of_pure_alcohol continent  
0                           0.0      Asia  
1                           4.9    Europe  
2                           0.7    Africa  
3                          12.4    Europe  
4                           5.9    Africa  


In [12]:
# Features & target
# X holds the two categorical and three numeric predictor columns; y is the
# column the regressors are trained to predict.
X = df[["country", "beer_servings", "spirit_servings", "wine_servings", "continent"]]
y = df["total_litres_of_pure_alcohol"]

# Categorical & numerical
categorical = ["country", "continent"]
numerical = ["beer_servings", "spirit_servings", "wine_servings"]


def make_preprocessor():
    """Build a new, unfitted ColumnTransformer for the feature frame.

    The columns listed in ``categorical`` are one-hot encoded (categories not
    seen during fit are ignored at transform time) and the columns listed in
    ``numerical`` are passed through unchanged, in that order.

    Returns:
        ColumnTransformer: a fresh transformer instance, so that every
        pipeline owns its own fitted state instead of sharing one object.
    """
    return ColumnTransformer(
        [
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
            # Explicit passthrough of the numeric columns; output column order
            # (encoded categoricals first, numerics after) matches what
            # remainder="passthrough" produced for this X.
            ("num", "passthrough", numerical),
        ]
    )


# Preprocessor
# Kept as a module-level name for any cell that still refers to it.
preprocessor = make_preprocessor()

# Pipelines
# Each pipeline gets its own transformer: Pipeline.fit fits its steps in
# place, so a shared instance would be refitted by whichever pipeline is
# trained last.
pipe_lr = Pipeline([("pre", make_preprocessor()), ("model", LinearRegression())])

pipe_rf = Pipeline(
    [("pre", make_preprocessor()), ("model", RandomForestRegressor(random_state=42))]
)

In [13]:
# Train/test split
# 20% of the rows are held out for evaluation; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train both models
for _pipe in (pipe_lr, pipe_rf):
    _pipe.fit(X_train, y_train)

# Evaluate
# Score each fitted pipeline on the held-out rows.
pred_lr = pipe_lr.predict(X_test)
pred_rf = pipe_rf.predict(X_test)
r2_lr = r2_score(y_test, pred_lr)
r2_rf = r2_score(y_test, pred_rf)

print(f"Linear R2: {r2_lr:.3f} | Random Forest R2: {r2_rf:.3f}")



Linear R2: 0.836 | Random Forest R2: 0.822


In [14]:
# Hyperparameter tuning for Random Forest
# The "model__" prefix addresses the pipeline step named "model"; the grid
# covers three forest sizes crossed with three depth limits.
params = {
    "model__n_estimators": [50, 100, 200],
    "model__max_depth": [None, 5, 10],
}
grid = GridSearchCV(
    pipe_rf,
    param_grid=params,
    cv=5,
    scoring="r2",
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Report the selected configuration's R2 on the held-out rows.
tuned_rf_pred = grid.best_estimator_.predict(X_test)
print("Best Random Forest R2:", r2_score(y_test, tuned_rf_pred))


Best Random Forest R2: 0.8218393896092901


In [15]:
# Save best model
# The tuned random forest is not automatically the strongest candidate (the
# recorded run shows the linear pipeline scoring higher on the held-out rows),
# so compare both on the test split and persist whichever has the higher R2.
# Ties keep the tuned random forest, matching the previous behaviour.
# NOTE(review): this selects on the same test split used for reporting, so the
# reported score of the saved model is slightly optimistic.
r2_rf_tuned = r2_score(y_test, grid.best_estimator_.predict(X_test))
if r2_lr > r2_rf_tuned:
    best_model, best_name, best_r2 = pipe_lr, "Linear Regression", r2_lr
else:
    best_model, best_name, best_r2 = grid.best_estimator_, "Random Forest", r2_rf_tuned

# Either candidate is a fitted Pipeline, so code that loads model.pkl gets the
# same kind of object as before.
with open("model.pkl", "wb") as f:
    pickle.dump(best_model, f)

print(f"Saved {best_name} pipeline to model.pkl (test R2: {best_r2:.3f})")