In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
train=pd.read_parquet("train.parquet")

In [3]:
train.head()

Unnamed: 0,label,text
0,0,"First of all i'm not a big fan of buffet, i tr..."
1,1,Thanks Yelp. I was looking for the words to de...
2,2,Service was so-so. They were receiving a deliv...
3,2,Stamoolis Brothers is one of the Strip Distric...
4,0,I want to give a 2 stars because the service s...


In [4]:
train.shape

(10000, 2)

In [5]:
test=pd.read_parquet("test_without_label.parquet")
test.head()

Unnamed: 0,label,text
0,,"First of all i'm not a big fan of buffet, i tr..."
1,,Thanks Yelp. I was looking for the words to de...
2,,Service was so-so. They were receiving a deliv...
3,,Stamoolis Brothers is one of the Strip Distric...
4,,I want to give a 2 stars because the service s...


In [6]:
test.shape

(3000, 2)

In [7]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(train['text']).toarray()
y_train = train['label']

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [8]:
#Customized NN
model = Sequential()
model.add(Dense(128, input_dim=X_train_split.shape[1], activation='relu'))
model.add(Dropout(0.8))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.8))
model.add(Dense(5, activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [9]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [10]:
model.fit(X_train_split, y_train_split, validation_data=(X_val_split, y_val_split), epochs=10, batch_size=32, callbacks=[early_stopping])

Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.2129 - loss: 1.6078 - val_accuracy: 0.3965 - val_loss: 1.5828
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.2833 - loss: 1.5551 - val_accuracy: 0.4295 - val_loss: 1.3484
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.3651 - loss: 1.3787 - val_accuracy: 0.4650 - val_loss: 1.2212
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.4229 - loss: 1.2436 - val_accuracy: 0.4965 - val_loss: 1.1616
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.4486 - loss: 1.1890 - val_accuracy: 0.4995 - val_loss: 1.1342
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.5066 - loss: 1.1007 - val_accuracy: 0.5110 - val_loss: 1.1190
Epoch 7/10
[1m250/250

<keras.src.callbacks.history.History at 0x1b7f3803d70>

In [11]:
y_train_pred = model.predict(X_train)
y_train_pred = y_train_pred.argmax(axis=1)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [12]:
accuracy = accuracy_score(y_train, y_train_pred)
precision = precision_score(y_train, y_train_pred, average='weighted')
recall = recall_score(y_train, y_train_pred, average='weighted')
f1 = f1_score(y_train, y_train_pred, average='weighted')

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")

Accuracy: 0.702
Precision: 0.700
Recall: 0.702
F1-score: 0.701


In [13]:
X_test = vectorizer.transform(test['text']).toarray()
test_prediction = model.predict(X_test)
test_prediction = test_prediction.argmax(axis=1)
test['label'] = test_prediction

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [14]:
test.to_parquet("test_with_predicted_label.parquet")
print(test.shape)

(3000, 2)


In [16]:
test_with_predicted_label=pd.read_parquet("test_with_predicted_label.parquet")
test_with_predicted_label.head()

Unnamed: 0,label,text
0,2,"First of all i'm not a big fan of buffet, i tr..."
1,1,Thanks Yelp. I was looking for the words to de...
2,1,Service was so-so. They were receiving a deliv...
3,3,Stamoolis Brothers is one of the Strip Distric...
4,1,I want to give a 2 stars because the service s...
