In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [18]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


In [10]:
# Download necessary NLTK data
nltk.download('vader_lexicon')

# Load the dataset
df = pd.read_csv('/kaggle/input/bash-beer/train.csv')

# Initialize Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [11]:
tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words='english')

# Create TF-IDF features for "review/text"
tfidf_text = tfidf_vectorizer.fit_transform(df['review/text'].fillna(''))

# Convert to DataFrame and add to original dataset
tfidf_df = pd.DataFrame(tfidf_text.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
df = pd.concat([df.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

In [12]:
df['sentiment'] = df['review/text'].fillna('').apply(lambda x: sia.polarity_scores(x)['compound'])

# Select features including ratings, beer info, and derived sentiment
features = [
    'beer/ABV', 'review/appearance', 'review/aroma', 'review/palate', 'review/taste',
    'sentiment'
]

# Include TF-IDF features as well
features += tfidf_vectorizer.get_feature_names_out().tolist()

In [16]:
df.head()

Unnamed: 0,index,beer/ABV,beer/beerId,beer/brewerId,beer/name,beer/style,review/appearance,review/aroma,review/overall,review/palate,...,wonderful,wood,woody,worth,wouldn,wow,year,yeast,yellow,sentiment
0,40163,5.0,46634,14338,Chiostro,Herbed / Spiced Beer,4.0,4.0,4.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0258
1,8135,11.0,3003,395,Bearded Pat's Barleywine,American Barleywine,4.0,3.5,3.5,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7227
2,10529,4.7,961,365,Naughty Nellie's Ale,American Pale Ale (APA),3.5,4.0,3.5,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8175
3,44610,4.4,429,1,Pilsner Urquell,Czech Pilsener,3.0,3.0,2.5,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8625
4,37062,4.4,4904,1417,Black Sheep Ale (Special),English Pale Ale,4.0,3.0,3.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9305


In [19]:
X = df[features]
y = df['review/overall']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "MLPRegressor" : MLPRegressor(hidden_layer_sizes=(64, 64), activation='relu', max_iter=500, random_state=42),
    "XGBRegressor" : XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

}

In [20]:
# Train models and evaluate performance
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Model Evaluation
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{model_name} - MSE: {mse:.4f}, MAE: {mae:.4f}, R2 Score: {r2:.4f}")


Training Linear Regression...
Linear Regression - MSE: 0.1526, MAE: 0.2976, R2 Score: 0.6898

Training Random Forest Regressor...
Random Forest Regressor - MSE: 0.1548, MAE: 0.3000, R2 Score: 0.6854

Training MLPRegressor...
MLPRegressor - MSE: 0.3289, MAE: 0.4411, R2 Score: 0.3313

Training XGBRegressor...
XGBRegressor - MSE: 0.1501, MAE: 0.2956, R2 Score: 0.6949


In [22]:
if 'Random Forest Regressor' in models:
    rf_model = models['Random Forest Regressor']
    importances = rf_model.feature_importances_
    feature_importance_df = pd.DataFrame({'feature': features, 'importance': importances})
    feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

    print("\nTop 10 Important Features:")
    print(feature_importance_df.head(10))


Top 10 Important Features:
               feature  importance
4         review/taste    0.621861
3        review/palate    0.035997
0             beer/ABV    0.012155
5            sentiment    0.007063
1    review/appearance    0.006331
2         review/aroma    0.004790
139          drinkable    0.004097
217               head    0.003875
144               easy    0.003818
41                beer    0.003690
