In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the CSV
# files stored under /content/drive/MyDrive can be read by the loading cell.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder on the mounted Google Drive that holds the competition CSV files.
DATA_DIR = Path("/content/drive/MyDrive")

# Every input file this notebook reads, keyed by the variable it is loaded into.
# NOTE(review): submission.csv is read from /content, outside the Drive mount,
# so it is probably the file behind the recorded FileNotFoundError -- confirm
# where it is supposed to come from.
csv_paths = {
    "train": DATA_DIR / "train.csv",
    "test": DATA_DIR / "test.csv",
    "games": DATA_DIR / "games.csv",
    "turns": DATA_DIR / "turns.csv",
    "sub": Path("/content/submission.csv"),
}

# Fail fast with a single message that names *every* missing file, instead of
# stopping at the first unreadable path with no hint about the others.
missing_files = [str(path) for path in csv_paths.values() if not path.is_file()]
if missing_files:
    raise FileNotFoundError(f"Missing input file(s): {missing_files}")

train = pd.read_csv(csv_paths["train"])
test = pd.read_csv(csv_paths["test"])
games = pd.read_csv(csv_paths["games"])
turns = pd.read_csv(csv_paths["turns"])
sub = pd.read_csv(csv_paths["sub"])

FileNotFoundError: ignored

# Data inspection, feature preparation and rating model

In [None]:
# Summary (columns, dtypes, non-null counts) of the frame read from submission.csv.
sub.info()

In [None]:
# Summary (columns, dtypes, non-null counts) of the frame read from train.csv.
train.info()

In [None]:
# Summary (columns, dtypes, non-null counts) of the frame read from test.csv.
test.info()

In [None]:
# Summary (columns, dtypes, non-null counts) of the frame read from games.csv.
games.info()

In [None]:
# Summary (columns, dtypes, non-null counts) of the frame read from turns.csv.
turns.info()

In [None]:
# Lower-case the column names of all four input frames in one pass so later
# cells can refer to columns without worrying about the original casing.
train, test, turns, games = (
    frame.rename(columns=str.lower) for frame in (train, test, turns, games)
)

In [None]:
# Nicknames that identify the bot players.
bots = ["BetterBot", "STEEBot", "HastyBot"]

# Stack train and test rows, then order them by game.
brief_df = pd.concat([train, test], axis=0).sort_values(["game_id"])

# True for rows whose nickname is one of the bots, False for every other row.
played_by_bot = brief_df["nickname"].isin(bots)

# Prefix the per-player columns so the two sides stay distinguishable after the merge.
user_columns = {"nickname": "user_name", "score": "user_score", "rating": "user_rating"}
bot_columns = {"nickname": "bot_name", "score": "bot_score", "rating": "bot_rating"}

user_df = brief_df[~played_by_bot].rename(columns=user_columns)
bot_df = brief_df[played_by_bot].rename(columns=bot_columns)

# Pair the non-bot row(s) with the bot row(s) that share the same game_id.
# NOTE(review): presumably one non-bot and one bot row per game -- not checked here.
main_df = pd.merge(user_df, bot_df, on="game_id")
main_df.head()

In [None]:
# Summary (columns, dtypes, non-null counts) of the merged user/bot frame.
main_df.info()

In [None]:
from sklearn.preprocessing import LabelEncoder

# user_freq: for every row, how many rows of main_df carry the same user_name.
rows_per_user = main_df.groupby("user_name")["user_name"].transform("count")
main_df["user_freq"] = rows_per_user

# Replace the bot nickname strings with integer codes; the fitted encoder is
# kept in `encode_bots` so the codes can be mapped back to names later.
encode_bots = LabelEncoder()
encode_bots.fit(main_df["bot_name"])
main_df["bot_name"] = encode_bots.transform(main_df["bot_name"])
main_df.head()

In [None]:
# Per-column flag: does the column contain at least one null value?
column_has_null = main_df.isnull().any()
# Keep only the names of the flagged columns, in their original order.
missing_cols = [name for name, has_null in column_has_null.items() if has_null]
print(f'These labels have missing data that needs to be cleaned: {missing_cols} ')

In [None]:
# Rows where user_rating is present form the training frame (fresh 0..n-1 index).
rating_present = main_df['user_rating'].notna()
train_df = main_df.loc[rating_present].reset_index(drop=True)
train_df.head()

In [None]:
# Rows where user_rating is missing form the frame to predict on (fresh 0..n-1 index).
rating_missing = main_df['user_rating'].isna()
test_df = main_df.loc[rating_missing].reset_index(drop=True)
test_df.head()

In [None]:
# Number of rows in main_df that exactly repeat an earlier row.
int(main_df.duplicated().sum())

In [None]:
# The 30 highest-scoring rows of `train`.
# NOTE(review): if a nickname occurs more than once among these rows, seaborn's
# barplot draws a single aggregated bar for it -- confirm that is intended.
top_score = train.sort_values(by='score', ascending=False).head(30)
figure = plt.figure(figsize=(10,6))
sns.barplot(x=top_score.nickname, y=top_score.score)
# A bare plt.xticks() only returns the current ticks and changes nothing;
# rotate the labels so up to 30 nicknames stay readable instead of overlapping.
plt.xticks(rotation=90)
plt.ylabel('Scrabble Scores')
plt.xlabel('Competitor Nickname')
plt.title('Scrabble Competitors by Scores')
plt.show()

In [None]:
# Pearson correlation is only defined for numeric data. Recent pandas versions
# no longer drop non-numeric columns silently in DataFrame.corr and raise
# instead, so select the numeric (and boolean) columns explicitly. This behaves
# the same way on old and new pandas releases.
numeric_games = games.select_dtypes(include=["number", "bool"])
corr = numeric_games.corr(method='pearson')
sns.heatmap(corr);

In [None]:
# Bar chart of how many games fall under each rating_mode value.
rating_mode_counts = games["rating_mode"].value_counts()
rating_mode_counts.plot.bar(
    figsize=(8, 6),
    color=['#f5803d', '#f5005a'],
    title='Scrabble Rating Mode',
);

In [None]:
# Three side-by-side histograms (20 bins, KDE overlay), one per main_df column.
fig, axes = plt.subplots(1, 3, figsize = (25, 5))
ax = axes.flatten()

# (column to plot, bar colour) for each panel, left to right.
panels = [
    ("user_score", "#f5803d"),
    ("bot_score", "#f5483a"),
    ("bot_rating", "#f5003d"),
]
for panel_ax, (column, colour) in zip(ax, panels):
    histogram = sns.histplot(ax = panel_ax, x = main_df[column], bins = 20, kde = True, color = colour)
    histogram.set(title = f"Distribution of {column} variable")

In [None]:
# Two side-by-side scatter plots drawn from main_df.
fig, axes = plt.subplots(1, 2, figsize = (30, 6))
axes = axes.flatten()
score_ax, rating_ax = axes

# Left panel: user score against bot score.
score_plot = sns.scatterplot(data = main_df, x = "user_score", y = "bot_score",
                             color = "#f5483a", ax = score_ax)
score_plot.set(title = "Relationship between user_score VS bot_score");

# Right panel: bot score against bot rating.
rating_plot = sns.scatterplot(data = main_df, x = "bot_score", y = "bot_rating",
                              color = "#f5003d", ax = rating_ax)
rating_plot.set(title = "Relationship between bot_score VS bot_rating")


In [None]:
from sklearn.linear_model import  LinearRegression, Ridge,Lasso
from sklearn.tree import  DecisionTreeRegressor
from sklearn.ensemble import  RandomForestRegressor,GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
import lightgbm as lgb
# Fixed seed for every estimator that draws random numbers while fitting, so
# the cross-validation scores are repeatable from one run to the next.
MODEL_SEED = 42

# Candidate regressors, keyed by the label used in the score table.
model_dict = {
    "linear": LinearRegression(),
    "ridge": Ridge(),
    "lasso": Lasso(),
    "decision_tree": DecisionTreeRegressor(random_state=MODEL_SEED),
    "random_forest": RandomForestRegressor(random_state=MODEL_SEED),
    "gradient_boosting": GradientBoostingRegressor(random_state=MODEL_SEED),
    "neural_network": MLPRegressor(random_state=MODEL_SEED),
    "lgb": lgb.LGBMRegressor(random_state=MODEL_SEED),
}

In [None]:
def get_scores(model_dict, X, y, nfolds=5):
    """
    Cross-validate every model in ``model_dict`` and tabulate the mean scores.

    For each model, ``X``/``y`` are split into ``nfolds`` consecutive folds
    (``KFold`` without shuffling). A fresh clone of the model is fitted on the
    training part of each fold and scored on the held-out part. The per-fold
    R^2, RMSE and MAE are then averaged. Progress and timing are printed as
    each model finishes.

    The caller's estimator objects are never fitted: each fold works on a
    ``clone`` so no state leaks between folds or back into ``model_dict``.

    Relies on ``KFold``, ``clone``, ``time``, ``np``, ``pd``, ``r2_score``,
    ``mean_squared_error`` and ``mean_absolute_error`` being available in the
    notebook namespace when the function is called.

    Args:
        model_dict (dict): Maps a model name to an initialised (unfitted)
            scikit-learn compatible regressor.
        X (pandas.DataFrame): Feature matrix; rows are selected with ``.iloc``.
        y (pandas.Series or pandas.DataFrame): Target values aligned
            row-for-row with ``X``; rows are selected with ``.iloc``.
        nfolds (int, optional): Number of cross-validation folds. Defaults to 5.

    Returns:
        pandas.DataFrame: One row per model with the columns ``model``,
        ``(R2)``, ``(RMSE)`` and ``(MAE)`` holding the fold-averaged scores.
    """
    df_score_details = {
        "model": [],
        "(R2)": [],
        "(RMSE)": [],
        "(MAE)": [],
    }
    # The unshuffled splitter is deterministic and model-independent, so one
    # instance serves every model.
    kf = KFold(n_splits=nfolds)
    for model_key, model_template in model_dict.items():
        val_r2_scores = []
        val_rmse_scores = []
        val_mae_scores = []
        start = time.time()
        for train_index, val_index in kf.split(X):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            # Fit an unfitted copy so the estimator stored in model_dict is
            # left untouched and every fold starts from the same parameters.
            model = clone(model_template)
            model.fit(X_train, y_train)

            # Flatten in case the estimator returns a column vector.
            val_preds = np.asarray(model.predict(X_val)).reshape(-1)
            val_r2_scores.append(r2_score(y_val, val_preds))
            # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated in
            # recent scikit-learn releases and no longer accepted in newer ones.
            val_rmse_scores.append(np.sqrt(mean_squared_error(y_val, val_preds)))
            val_mae_scores.append(mean_absolute_error(y_val, val_preds))
        df_score_details["model"].append(model_key)
        df_score_details["(R2)"].append(np.mean(val_r2_scores))
        df_score_details["(RMSE)"].append(np.mean(val_rmse_scores))
        df_score_details["(MAE)"].append(np.mean(val_mae_scores))
        elapsed_time = time.time() - start
        print("-------------------------")
        print(f"model{model_key}: {df_score_details}")
        print(f"{model_key} finished in {elapsed_time:.2f} seconds")
        print("-------------------------")
    df_score = pd.DataFrame(df_score_details)
    return df_score

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold
import time
# Columns that must not be fed to the models: the raw user name and the target.
NON_FEATURE_COLUMNS = ["user_name", "user_rating"]

X_train = train_df.drop(columns=NON_FEATURE_COLUMNS)
y_train = train_df["user_rating"].copy()
X_test = test_df.drop(columns=NON_FEATURE_COLUMNS)

# Score every candidate model with 2-fold cross-validation.
df_score = get_scores(model_dict, X_train, y_train, nfolds=2)

In [None]:
# Rank the models by mean cross-validated RMSE, lowest (best) first.
df_score.sort_values("(RMSE)")

In [None]:
# Fit the final model on the full training set. The seed is fixed so that
# re-running the notebook reproduces the same submission file.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# X_test already holds the test rows with the same feature columns as X_train,
# so reuse it instead of re-deriving the matrix from test_df.
test_df["user_rating"] = model.predict(X_test)

# Submission format: one row per test game with the predicted rating.
final_sub = test_df[["game_id", "user_rating"]].rename(columns={"user_rating": "rating"})
print(final_sub.head())
final_sub.to_csv("submission.csv", index=False)
print('Submission successful!')

In [None]:
# Display the frame that was written to submission.csv (game_id, rating).
final_sub