In [22]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Load the board game table from the cleaned CSV; the bare name on the last
# line makes the notebook render the frame as this cell's output.
boardgames = pd.read_csv("cleaned_boardgames.csv")
boardgames


Unnamed: 0,title,rating,type,min_age,max_players,max_time,weight,nlp_description
0,Brass: Birmingham,8.6,Strategy,14.0,4,120,3.87,brass birminghamis economic strategy game sequ...
1,Pandemic Legacy: Season 1,8.5,Strategy,13.0,4,60,2.83,pandemic legacyis co operative campaign game o...
2,Ark Nova,8.5,Strategy,14.0,4,150,3.78,inark nova plan design modern scientifically m...
3,Gloomhaven,8.6,Strategy,14.0,4,120,3.91,game persistent changing world ideally played ...
4,Twilight Imperium: Fourth Edition,8.6,Strategy,14.0,6,480,4.33,twilight imperium fourth edition game galactic...
...,...,...,...,...,...,...,...,...
892,Disney Villainous: Perfectly Wretched,7.6,Family,10.0,3,60,2.31,indisney villainous perfectly wretched player ...
893,A War of Whispers,7.3,Strategy,14.0,4,60,2.57,war whispersis competitive board game player f...
894,Warhammer: Invasion,7.2,Customizable,13.0,2,45,2.74,warhammer invasion card game two player card g...
895,878 Vikings: Invasions of England,7.5,Wargames,12.0,4,120,2.61,year past year viking raiding party norway den...


In [8]:
# Turn the preprocessed description text into a sparse TF-IDF matrix, capping
# the vocabulary at 1000 terms.
# NOTE(review): the vectorizer is fit on every row, before the later
# train/test split, so its vocabulary/IDF statistics also see the held-out
# rows -- consider fitting on the training rows only.
vectorizer = TfidfVectorizer(max_features=1000) 
X_text = vectorizer.fit_transform(boardgames['nlp_description'])

In [25]:
# Numeric predictors: missing values are replaced by 0 and every column is
# cast to float.
structured = (
    boardgames[['rating', 'min_age', 'max_players', 'max_time']]
    .fillna(0)
    .astype(float)
)

# One-hot encode the game type and place it next to the numeric predictors.
type_dummies = pd.get_dummies(boardgames['type'], prefix='type')
X_structured = pd.concat([structured, type_dummies], axis=1).astype(float)

# Full design matrix: TF-IDF text columns followed by the structured columns.
X_all = hstack([X_text, X_structured])

# The regression target is the numeric weight; 20% of the rows are held out
# with a fixed seed so the split is reproducible.
y = boardgames['weight']
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y, test_size=0.2, random_state=42
)

In [26]:
# Fit a ridge regression (default hyperparameters) on the training split.
# The fit call is the last expression, so the fitted estimator is shown as
# the cell output.
model = Ridge()
model.fit(X_train, y_train)

In [27]:
# Score the fitted regressor on the held-out rows.
y_pred = model.predict(X_test)

# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed, so
# this form runs on both old and new releases.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2 Score:", r2_score(y_test, y_pred))

RMSE: 0.4217795856313535
R^2 Score: 0.7457451447102826




In [31]:
# Step 1: Row labels of the held-out games, used to look up their titles
test_indices = y_test.index

# Step 2: Compute absolute error
# (y_test is a Series, so `errors` is a Series carrying the same row labels)
errors = np.abs(y_pred - y_test)

# Step 3: Create a DataFrame to store results
# The first three columns are plain arrays in held-out order; `errors` is in
# that same order, so every row describes a single game.
results_df = pd.DataFrame({
    'title': boardgames.loc[test_indices, 'title'].values,
    'actual_weight': y_test.values,
    'predicted_weight': y_pred,
    'error': errors
})

# Step 4: Sort by smallest error
closest_predictions = results_df.sort_values(by='error')

# Step 5: Export, then view
# A bare expression is only rendered when it is the last statement of the
# cell, so the export runs first and the frame is displayed last.
closest_predictions.to_csv("closest_predictions.csv", index=False)
closest_predictions

In [12]:
def label_weight(w):
    """Map a numeric complexity weight to a named weight class.

    Buckets are half-open intervals with integer upper bounds:

        w < 1.0        -> 'Light'
        1.0 <= w < 2.0 -> 'Medium Light'
        2.0 <= w < 3.0 -> 'Medium'
        3.0 <= w < 4.0 -> 'Medium Heavy'
        w >= 4.0       -> 'Heavy'

    Parameters
    ----------
    w : float or None
        The weight value to classify.

    Returns
    -------
    str or float
        The class label, or NaN when ``w`` is missing (None or NaN).

    NOTE(review): if weights never fall below 1.0 in the data, the 'Light'
    bucket stays empty -- confirm the thresholds match the intended scale.
    """
    # NaN fails every `<` comparison, so without this guard a missing weight
    # would fall through to the final branch and be labelled 'Heavy'.
    # `w != w` is true only for NaN.
    if w is None or w != w:
        return float('nan')

    if w < 1.0:
        return 'Light'
    if w < 2.0:
        return 'Medium Light'
    if w < 3.0:
        return 'Medium'
    if w < 4.0:
        return 'Medium Heavy'
    return 'Heavy'

# Attach a named complexity class to every game, derived element-wise from
# its numeric weight.
boardgames['weight_class'] = boardgames['weight'].map(label_weight)

In [21]:
# TF-IDF over unigrams and bigrams, English stop words removed, vocabulary
# capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))

# Encode class labels. LabelEncoder assigns integer codes in sorted label
# order (so 'Heavy' gets a lower code than 'Medium'); le.classes_[i] is the
# label for code i.
le = LabelEncoder()
y = le.fit_transform(boardgames['weight_class'])

# Fit one-vs-rest logistic regression: one binary classifier per class.
# OneVsRestClassifier replaces LogisticRegression(multi_class='ovr'), whose
# `multi_class` argument is deprecated since scikit-learn 1.5.
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
pipeline = make_pipeline(tfidf, model)
pipeline.fit(boardgames['nlp_description'], y)

# Get words and their weights: for every class keep the 25 terms with the
# largest positive coefficient in that class's binary classifier.
# NOTE(review): assumes more than two classes, so that there is exactly one
# fitted estimator per entry of le.classes_.
all_top_words = []
feature_names = tfidf.get_feature_names_out()
for i, class_name in enumerate(le.classes_):
    # A binary LogisticRegression stores its coefficients with shape
    # (1, n_features); row 0 is the "this class vs. the rest" direction.
    class_coefs = model.estimators_[i].coef_[0]
    top_words = sorted(zip(class_coefs, feature_names), reverse=True)[:25]
    print(f"\nTop words for {class_name} games:")
    for coef, word in top_words:
        print(f"{word}: {coef:.4f}")
        all_top_words.append({
            'weight_class': class_name,
            'word': word,
            'coefficient': coef
        })
top_words_df = pd.DataFrame(all_top_words)


Top words for Heavy games:
company: 1.1150
wine: 1.0509
factory: 0.9513
age: 0.7761
new: 0.7434
economic: 0.6234
class: 0.6025
invention: 0.5999
sie: 0.5926
energy: 0.5862
island: 0.5836
progress: 0.5614
die: 0.5608
party: 0.5530
machine: 0.5484
century: 0.5439
technology: 0.5322
trade: 0.5282
society: 0.5161
powerful: 0.5027
modern: 0.4867
driven: 0.4828
mage knight: 0.4792
renaissance: 0.4740
military: 0.4733

Top words for Medium games:
building: 1.2746
dragon: 1.0626
quest: 1.0004
deck: 0.9643
scenario: 0.7644
villain: 0.7600
glory: 0.7174
tile: 0.6923
combination: 0.6782
treasure: 0.6607
legend: 0.6607
card: 0.6511
robot: 0.6309
viking: 0.6255
defeat: 0.6222
settlement: 0.6200
empire: 0.6089
disease: 0.6044
unique: 0.6027
phase: 0.5783
game description: 0.5699
castle: 0.5688
kingdom: 0.5682
battlefield: 0.5516
special power: 0.5479

Top words for Medium Heavy games:
action: 1.4632
worker: 1.2578
great: 0.9856
god: 0.9352
nation: 0.9274
civilization: 0.9215
conflict: 0.8964
wall: 

In [20]:
# Export the per-class top words. index=False keeps the meaningless positional
# row index out of the file, matching the notebook's other CSV exports.
top_words_df.to_csv("top_weight_words.csv", index=False)
print("Exported to csv")

Exported to csv


In [24]:
# 1. Vectorize the descriptions: TF-IDF over unigrams and bigrams, English
#    stop words removed, vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))
X_tfidf = tfidf.fit_transform(boardgames['nlp_description'])

# 2. Integer-encode the weight classes (codes follow sorted label order, so
#    'Heavy' gets a lower code than 'Medium').
le = LabelEncoder()
y = le.fit_transform(boardgames['weight_class'])

# 3. Hold out 20% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)

# 4. Fit a depth-limited decision tree on the training rows.
tree_model = DecisionTreeClassifier(max_depth=20, random_state=42)
tree_model.fit(X_train, y_train)

# 5. Pair every vocabulary term with the importance the tree assigned to it.
feature_names = tfidf.get_feature_names_out()
importances = tree_model.feature_importances_

# 6. Collect the pairs in a DataFrame for easier inspection.
importance_df = pd.DataFrame({'word': feature_names, 'importance': importances})

# 7. Keep only terms the tree actually used, most important first, top 30.
top_words = (
    importance_df.loc[importance_df['importance'] > 0]
    .sort_values(by='importance', ascending=False)
    .head(30)
)

# 8. Write the ranking to disk and print it.
top_words.to_csv("decision_tree_word_importance.csv", index=False)

print(top_words)

             word  importance
41         action    0.027340
944      conflict    0.026545
4953        world    0.024111
4416         team    0.023696
1245         dice    0.023317
3116       people    0.021250
3221       played    0.020633
4791      victory    0.018005
1358         draw    0.015664
1549       europe    0.014199
488      bringing    0.013941
1131       decide    0.013926
3146        piece    0.013149
3367        point    0.012743
3200         play    0.012596
570          card    0.012395
3967       secret    0.012312
1580     existing    0.012258
1362      drawing    0.011562
3426     possible    0.011502
3973       secure    0.011365
1895     game end    0.010883
2206     humanity    0.010678
763        choose    0.010568
2775   modern age    0.010292
4713     upgraded    0.009963
1608  exploration    0.009881
1527          era    0.009741
2034        great    0.009668
4555         town    0.009628
