In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler

In [102]:
# Define the path to the data file
data_file_path = 'farm-ads.txt'

# Read the data from the file
with open(data_file_path, 'r') as file:
    lines = file.readlines()

# Create lists to store labels and text
labels = []
text = []

# Process each line of the file
for line in lines:
    parts = line.strip().split(' ', 1)
    label = int(parts[0])
    ad_text = parts[1]
    labels.append(label)
    text.append(ad_text)

# Create a DataFrame
df = pd.DataFrame({'label': labels, 'text': text})

In [103]:
# Step 2: Text Preprocessing
X = df['text']
y = df['label']

In [104]:
# Step 3: Apply Oversampling
oversampler = RandomOverSampler(random_state=42)

# Reshape the text data to a 2D array
X_reshaped = X.values.reshape(-1, 1)

X_resampled, y_resampled = oversampler.fit_resample(X_reshaped, y)

In [122]:
# Step 3: Model Preparation
tokenizer = Tokenizer(num_words=10000)  # Consider the top 10,000 words
tokenizer.fit_on_texts(X_resampled[:, 0])
X_sequence = tokenizer.texts_to_sequences(X_resampled[:, 0])
X_padded = pad_sequences(X_sequence, maxlen=200)  # Set a reasonable sequence length

X_train, X_test, y_train, y_test = train_test_split(X_padded, y_resampled, test_size=0.2, random_state=42)

In [124]:
# Convert labels to categorical format
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

In [125]:
# Step 4: Build and Train the Model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=100, input_length=200))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dense(2, activation='softmax'))  # Output layer with 2 classes

In [126]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [127]:
model.fit(X_train, y_train_categorical, validation_data=(X_test, y_test_categorical), epochs=7, batch_size=64)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x1571509aa08>

In [128]:
# Step 5: Model Evaluation
y_pred_categorical = model.predict(X_test)
y_pred = [1 if pred[1] > pred[0] else 0 for pred in y_pred_categorical]



In [129]:
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.91      0.89       432
           1       0.91      0.88      0.89       452

    accuracy                           0.89       884
   macro avg       0.89      0.89      0.89       884
weighted avg       0.89      0.89      0.89       884

Confusion Matrix:
 [[391  41]
 [ 55 397]]


In [130]:
# Save the Keras model
model.save('sentiment_model.h5')

In [133]:
from keras.models import load_model

In [134]:
# Load the Keras model
loaded_model = load_model('sentiment_model.h5')