In [None]:
import os

import pandas as pd
import numpy as np
import tensorflow
from sentence_transformers import SentenceTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [None]:
# Maps each split name to its CSV file inside the dataset repository.
splits = {'train': 'train.csv', 'test': 'test.csv'}

# Only the training split is read here; the 'test' entry is not used in this cell.
train_csv_url = "hf://datasets/agentlans/text-quality/" + splits["train"]
df = pd.read_csv(train_csv_url)

In [None]:
# Keep only the model input (raw text) and the regression target (quality score).
kept_columns = ["text", "quality"]
df = df[kept_columns]

In [None]:
def check_nulls(df):
    """Return the rows of ``df`` that have a missing value in at least one column.

    The result is empty when no row contains a null.
    """
    row_has_null = df.isna().any(axis="columns")
    return df.loc[row_has_null]

In [None]:
def check_duplicates(df):
    """Return the rows of ``df`` that repeat an earlier row.

    The first occurrence of each repeated row is not included in the result.
    """
    is_repeat = df.duplicated(keep="first")
    return df.loc[is_repeat]

In [None]:
# Data-quality checks: both frames should be empty for a clean dataset.
duplicate_rows = check_duplicates(df)
null_rows = check_nulls(df)
duplicate_rows, null_rows

In [None]:
# Summary statistics of the per-row character count of the text column.
text_lengths = df["text"].str.len()
text_lengths.describe()

In [None]:
# Pretrained sentence-embedding model used to turn each text into a fixed-size vector.
encoder_name = 'sentence-transformers/all-MiniLM-L6-v2'
encoder = SentenceTransformer(encoder_name)

In [None]:
# Encode every text in a single batched call. Calling `encode` once per row via
# `Series.apply` runs the model one sentence at a time, which is far slower than
# letting the encoder batch the inputs itself.
embeddings = encoder.encode(df["text"].tolist(), show_progress_bar=True)

# Store one 1-D vector per row, the same cell layout the per-row version produced.
df["embedding"] = list(embeddings)

In [None]:
def flatten_embeddings(df):
    """Expand the ``embedding`` column into one numeric column per dimension.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``embedding`` column whose cells are equal-length
        sequences (lists or 1-D arrays).

    Returns
    -------
    pandas.DataFrame
        A new frame with ``embedding`` removed and columns ``embedding_0``,
        ``embedding_1``, ... appended after the remaining original columns.
        The input frame is not modified and its index is preserved.
    """
    # Build the expanded frame on the caller's index. With a fresh 0..n-1 index,
    # any frame that was filtered or shuffled would be combined against the wrong
    # labels and end up with NaN or misplaced embedding values.
    embeddings_df = pd.DataFrame(df['embedding'].tolist(), index=df.index)
    embeddings_df.columns = [f'embedding_{i}' for i in range(embeddings_df.shape[1])]

    # Both frames share the same index object, so a column-wise concat lines the
    # rows up one-to-one (and stays correct even if index labels repeat).
    return pd.concat([df.drop(columns=['embedding']), embeddings_df], axis=1)

In [None]:
# Model-ready table: embedding dimensions plus the target, without the raw text.
numerical_df = flatten_embeddings(df).drop("text", axis="columns")

In [None]:
# Inspect the distribution of the target before it is rescaled.
quality_column = numerical_df["quality"]
quality_column.describe()

In [None]:
def standardize(col):
    """Min-max scale ``col`` so its smallest value maps to 0 and its largest to 1.

    Despite the name, this is min-max scaling, not z-score standardization.

    Parameters
    ----------
    col : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(col - min) / (max - min)``. When every value is identical the range
        is zero, and zeros are returned instead of the NaNs that 0/0 would give.
    """
    lowest = col.min()
    value_range = col.max() - lowest

    # Constant column: there is no spread to scale, so map every value to 0.
    if value_range == 0:
        return col - lowest

    return (col - lowest) / value_range

In [None]:
# Rescale the target with `standardize`. Author's note: training also works
# without scaling; in that case use a tanh output activation later instead.
scaled_quality = standardize(numerical_df["quality"])
numerical_df["quality"] = scaled_quality

In [None]:
# Hold out 20% of the rows for final evaluation. The fixed `random_state` makes
# the split, and therefore the reported RMSE values, reproducible across
# Restart & Run All; without it every run evaluates on a different test set.
X_train, X_test, y_train, y_test = train_test_split(
    numerical_df.drop(columns=["quality"]),  # features: the embedding columns
    numerical_df["quality"],                 # target: scaled quality score
    test_size=0.2,
    random_state=42,
)

In [None]:
# Feed-forward regressor on top of the sentence embeddings: three hidden blocks
# (Dense -> BatchNormalization -> Dropout) of decreasing width, then one output unit.
model = Sequential([
    # Declare the input explicitly and take its width from the training features
    # rather than hard-coding 384, so the model follows the encoder's embedding
    # size. NOTE(review): recent Keras versions warn when `input_shape` is passed
    # to a layer and recommend an `Input` object instead.
    tensorflow.keras.Input(shape=(X_train.shape[1],)),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    # Sigmoid keeps predictions in (0, 1), matching the min-max-scaled target.
    # Author's note: use tanh here instead if the target was NOT min-max scaled.
    Dense(1, activation='sigmoid'),
])

# Mean squared error is the training loss; evaluation below reports its square root.
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

In [None]:
# Training configuration; `validation_split` holds out part of the training
# data so validation loss is reported each epoch.
fit_options = dict(epochs=75, batch_size=32, validation_split=0.2, verbose=1)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Error on the data the model was fitted on; compare with the test RMSE to
# gauge overfitting. These are predictions of the target, so they are named
# `y_train_pred` (the previous name `X_pred` suggested feature data).
y_train_pred = model.predict(X_train)
rmse_train = root_mean_squared_error(y_train, y_train_pred)
print(f"Train RMSE: {rmse_train}")

In [None]:
# Error on the held-out rows that were not passed to `model.fit`.
y_pred = model.predict(X_test)
rmse_test = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Test RMSE: {rmse_test}")

In [None]:
# Create the output directory first so the save does not depend on `models/`
# already existing (otherwise a long training run can end in a failed save).
os.makedirs("models", exist_ok=True)

# Alternative target for a model trained WITHOUT scaled outputs:
# model.save("models/lexical_model.keras")
model.save("models/lexical_model_scaled_outputs.keras")