In [121]:
import pandas as pd
import numpy as np
import tensorflow
from sentence_transformers import SentenceTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [74]:
splits = {'train': 'train.csv', 'test': 'test.csv'}
df = pd.read_csv("hf://datasets/agentlans/text-quality/" + splits["train"])

In [75]:
df

Unnamed: 0,text,source,fineweb,nvidia,quality
0,Our razor sharp swords sliced through the roug...,allenai/c4,-0.438112,0.182096,-0.049947
1,In fact my own children attend several day cam...,allenai/c4,0.150865,0.011652,0.265714
2,The reduced inflammatory reaction causes a dec...,agentlans/wikipedia-paragraphs,0.905146,0.428094,0.792037
3,"The incredible spetaculo de la vida, the incre...",monology/pile-uncopyrighted,-0.566386,-0.072152,-0.299926
4,On the upper slopes they clashed with a force ...,agentlans/wikipedia-paragraphs,1.135600,0.052069,0.734396
...,...,...,...,...,...
89995,Also included and released as a single is a so...,agentlans/wikipedia-paragraphs,-0.322785,0.029838,-0.504086
89996,A:\n\nYou could simply move ....\nResponse.Red...,monology/pile-uncopyrighted,0.640707,-1.240916,-0.975509
89997,The next year representatives met with counter...,agentlans/wikipedia-paragraphs,0.446387,0.170391,0.063359
89998,"Anyway, I would love to hear your thoughts on ...",monology/pile-uncopyrighted,-1.005016,0.051131,-0.393147


In [76]:
df = df[["text", "quality"]]

In [77]:
def check_nulls(df):
    return df[df.isnull().any(axis=1)]

In [78]:
def check_duplicates(df):
    return df[df.duplicated()]

In [79]:
check_duplicates(df), check_nulls(df)

(Empty DataFrame
 Columns: [text, quality]
 Index: [],
 Empty DataFrame
 Columns: [text, quality]
 Index: [])

In [80]:
df["text"].str.len().describe()

count    90000.000000
mean       136.165633
std        367.651079
min          1.000000
25%         66.000000
50%        109.000000
75%        164.000000
max      59098.000000
Name: text, dtype: float64

In [81]:
encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [82]:
df["embedding"] = df["text"].apply(lambda x: encoder.encode(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["embedding"] = df["text"].apply(lambda x: encoder.encode(x))


In [87]:
def flatten_embeddings(df):
    embeddings_df = pd.DataFrame(df['embedding'].tolist())
    embeddings_df.columns = [f'embedding_{i}' for i in range(embeddings_df.shape[1])]
    df = df.drop(columns=['embedding']).join(embeddings_df)
    return df

In [88]:
numerical_df = flatten_embeddings(df).drop(columns=["text"])

In [89]:
numerical_df["quality"].describe()

count    90000.000000
mean         0.000535
std          0.681308
min         -5.241319
25%         -0.474049
50%          0.010280
75%          0.497374
max          2.271465
Name: quality, dtype: float64

In [105]:
def standardize(col):
    return (col - col.min()) / (col.max() - col.min())

In [None]:
numerical_df["quality"] = standardize(numerical_df["quality"]) # can train without scaling, if so use tanh activation later

In [110]:
X_train, X_test, y_train, y_test = train_test_split(
    numerical_df.drop(columns=["quality"]),
    numerical_df["quality"],
    test_size=0.2
)

In [111]:
model = Sequential([
    Dense(256, activation='relu', input_shape=(384,)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # use tanh activation if DIDN'T MINMAX SCALE target var
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [112]:
history = model.fit(X_train, y_train, epochs=75, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/75
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.0288 - val_loss: 0.0035
Epoch 2/75
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0041 - val_loss: 0.0031
Epoch 3/75
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0035 - val_loss: 0.0029
Epoch 4/75
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0031 - val_loss: 0.0027
Epoch 5/75
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 977us/step - loss: 0.0029 - val_loss: 0.0026
Epoch 6/75
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 979us/step - loss: 0.0028 - val_loss: 0.0025
Epoch 7/75
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 992us/step - loss: 0.0026 - val_loss: 0.0027
Epoch 8/75
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 973us/step - loss: 0.0024 - val_loss: 0.0024
Epoch 9/75
[1m1

In [113]:
X_pred = model.predict(X_train)
rmse_train = root_mean_squared_error(y_train, X_pred)
print(f"Train RMSE: {rmse_train}")

[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 308us/step
Train RMSE: 0.030407155902174198


In [114]:
y_pred = model.predict(X_test)
rmse_test = root_mean_squared_error(y_test, y_pred)
print(f"Test RMSE: {rmse_test}")

[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 398us/step
Test RMSE: 0.05078736239052238


In [None]:
#model.save("models/lexical_model.keras") <- if to save the non-scaled outputs model
model.save("models/lexical_model_scaled_outputs.keras")