In [None]:
# Initial imports
from urllib.parse import quote

import pandas as pd
import sklearn as skl
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

from config import db_password

In [None]:
# Create the database engine used to connect to the PostgreSQL database.
# The password is percent-encoded before being placed in the URL: characters
# such as "@", ":" or "/" would otherwise be parsed as URL delimiters and
# produce a broken connection string.
encoded_password = quote(str(db_password), safe="")
db_string = f"postgresql://postgres:{encoded_password}@127.0.0.1:5432/BoardGames"
engine = create_engine(db_string)

In [None]:
# Load the "games" table from the database into a DataFrame and preview it
games_df = pd.read_sql_table(table_name="games", con=engine)
games_df.head(n=5)

In [None]:
# Cast the complexity and rating averages to float64 in a single call,
# then list the resulting column dtypes
games_df = games_df.astype({"Complexity Average": float, "Rating Average": float})
games_df.dtypes

In [None]:
# Report how many missing values each column contains
null_counts = games_df.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

In [None]:
# Discard every row that has at least one missing value
games_df = games_df.dropna(axis="index", how="any")

In [None]:
# Count rows that exactly duplicate an earlier row
duplicate_count = games_df.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

In [None]:
# Tally how many games fall into each domain
domain_counts = games_df["Domains"].value_counts()
domain_counts

In [None]:
# Plot the distribution of the per-domain counts as a density curve
domain_counts.plot(kind="density")

In [None]:
# Collapse rare domains into a single "Other" bucket.
# A domain with fewer games than this threshold is treated as rare.
MIN_DOMAIN_COUNT = 400
replace_domains = list(domain_counts[domain_counts < MIN_DOMAIN_COUNT].index)

# One vectorized pass over the column instead of one full-column replace per
# rare domain; exact-match semantics are the same.
is_rare_domain = games_df["Domains"].isin(replace_domains)
games_df["Domains"] = games_df["Domains"].mask(is_rare_domain, "Other")

games_df["Domains"].value_counts()

In [None]:
# Encode the domain names as integer class labels (fit and transform in one step)
le = preprocessing.LabelEncoder()
games_df["Domains"] = le.fit_transform(games_df["Domains"])

In [None]:
# Re-tally the domains now that they are integer codes
domain_counts = games_df["Domains"].value_counts()
domain_counts

In [None]:
# Remove the ID, Name, index and Year Published columns
games_df = games_df.drop(columns=["ID", "Name", "index", "Year Published"])
games_df.head()

In [None]:
# Remove all spaces from the column names (snake_case) so they are easier to
# work with during model training
snake_case_names = {
    "Min Players": "min_players",
    "Max Players": "max_players",
    "Play Time": "play_time",
    "Min Age": "min_age",
    "Users Rated": "users_rated",
    "Rating Average": "rating_average",
    "BGG Rank": "bgg_rank",
    "Complexity Average": "complexity_average",
    "Owned Users": "owned_users",
}
games_df = games_df.rename(columns=snake_case_names)
games_df

In [None]:
# Separate the target (encoded domain) from the feature columns
y = games_df["Domains"]
X = games_df.drop(columns="Domains")

In [None]:
# Hold out a test set (scikit-learn's default 25%) using a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=78
)

In [None]:
# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and testing splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Instantiate the logistic regression model
classifier = LogisticRegression(
    solver="lbfgs",
    max_iter=500,
    random_state=1,
)

In [None]:
# Fit the classifier on the scaled training features
classifier.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict domain labels for the scaled test features
y_pred = classifier.predict(X=X_test_scaled)

In [None]:
# Check the accuracy score on the held-out test set
print(accuracy_score(y_test, y_pred))

In [None]:
# Calculate the confusion matrix.
# Class names are taken from the fitted LabelEncoder instead of a hand-typed
# list, so the row/column labels always agree with the encoded classes in
# number and order. Passing `labels` keeps the matrix square over every class
# even if one of them is absent from both y_test and y_pred.
class_names = list(le.classes_)
class_ids = le.transform(le.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Create a DataFrame from the confusion matrix: rows are the actual classes,
# columns are the predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {name}" for name in class_names],
    columns=[f"Predicted {name}" for name in class_names],
)

pd.set_option('display.max_columns', None)
cm_df