In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
#functions to open files, merging them, splitting them into lines and then into a dataframe
def read_file(file_path):
    with open(file_path, 'r') as file:
        return file.read()

# Function to combine contents of the two files, creating a new one
def combine_files(file1, file2, output_file):
    text1 = read_file(file1)
    text2 = read_file(file2)
    combined = text1 + "\n" + text2

    with open(output_file, 'w') as file:
        file.write(combined)
    return combined

file1_path = '/content/TrainingDataNegative.txt'
file2_path = '/content/TrainingDataPositive.txt'
output_file_path = 'reviews.txt'

combined = combine_files(file1_path, file2_path, output_file_path)
lines = combined.split('\n')
df = pd.DataFrame(lines, columns=['review'])

# Preprocessing functions and application
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = ' '.join(word for word in text.split() if word not in stopwords.words('english'))
    return text

df['cleaned_review'] = df['review'].apply(clean_text)





In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import nltk

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['cleaned_review'])
sequences = tokenizer.texts_to_sequences(df['cleaned_review'])
max_length = 100
X = pad_sequences(sequences, maxlen=max_length)
y = to_categorical(df['sentiment'])


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model creation
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_accuracy', mode='max')

# Train model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping, model_checkpoint])

# Evaluate model on test data
test_reviews = ["This is a fantastic product!", "I did not like the service."]
test_sequences = tokenizer.texts_to_sequences(test_reviews)
X_test = pad_sequences(test_sequences, maxlen=max_length)
predictions = model.predict(X_test)
print(predictions)


Epoch 1/20
Epoch 2/20


  saving_api.save_model(


Epoch 3/20
Epoch 4/20
[[[0.5010172 ]
  [0.5012164 ]
  [0.5012599 ]
  [0.50123596]
  [0.50118697]
  [0.5011326 ]
  [0.5010816 ]
  [0.50103706]
  [0.50099975]
  [0.50096923]
  [0.50094485]
  [0.5009255 ]
  [0.50091034]
  [0.5008986 ]
  [0.50088954]
  [0.5008828 ]
  [0.5008776 ]
  [0.50087374]
  [0.500871  ]
  [0.5008689 ]
  [0.5008675 ]
  [0.5008664 ]
  [0.50086576]
  [0.5008653 ]
  [0.500865  ]
  [0.50086486]
  [0.50086474]
  [0.50086474]
  [0.50086474]
  [0.50086474]
  [0.5008648 ]
  [0.50086486]
  [0.50086486]
  [0.5008649 ]
  [0.500865  ]
  [0.500865  ]
  [0.50086504]
  [0.5008651 ]
  [0.5008651 ]
  [0.5008651 ]
  [0.50086516]
  [0.50086516]
  [0.50086516]
  [0.5008652 ]
  [0.5008652 ]
  [0.5008652 ]
  [0.5008652 ]
  [0.5008652 ]
  [0.5008652 ]
  [0.5008652 ]
  [0.50086516]
  [0.50086516]
  [0.5008651 ]
  [0.5008651 ]
  [0.5008651 ]
  [0.5008651 ]
  [0.50086504]
  [0.500865  ]
  [0.500865  ]
  [0.5008649 ]
  [0.50086486]
  [0.50086486]
  [0.5008648 ]
  [0.5008648 ]
  [0.5008648 ]
  [

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Assign binary labels for sentiment (0 for negative, 1 for positive)
df['sentiment'] = np.random.randint(2, size=len(df))

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['cleaned_review'])
sequences = tokenizer.texts_to_sequences(df['cleaned_review'])
max_length = 100
X = pad_sequences(sequences, maxlen=max_length)
y = to_categorical(df['sentiment'])

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.metrics import accuracy_score, precision_score
val_predictions = model.predict(X_val)
val_pred_labels = np.argmax(val_predictions, axis=1)
val_true_labels = np.argmax(y_val, axis=1)

accuracy = accuracy_score(val_true_labels, val_pred_labels)
precision = precision_score(val_true_labels, val_pred_labels)

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation Precision: {precision:.4f}")

# Testing the model on a couple of sample sentences
sample_sentences = ["I love this product!", "This is the worst service I have ever received."]
cleaned_samples = [clean_text(sentence) for sentence in sample_sentences]
sample_sequences = tokenizer.texts_to_sequences(cleaned_samples)
sample_padded = pad_sequences(sample_sequences, maxlen=max_length)

sample_predictions = model.predict(sample_padded)
sample_pred_labels = np.argmax(sample_predictions, axis=1)

for sentence, label in zip(sample_sentences, sample_pred_labels):
    sentiment = "Positive" if label == 1 else "Negative"
    print(f"Sentence: '{sentence}' - Sentiment: {sentiment}")

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score

num_negative_reviews = sum(1 for line in open(file1_path))
num_positive_reviews = sum(1 for line in open(file2_path))
labels = [0] * num_negative_reviews + [1] * num_positive_reviews
df['label'] = labels

#split data
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_review'], df['label'], test_size=0.2, random_state=42)

# Vectorize text data
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train and evaluate model
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')

# Example review to test
example_review = "I love this product, i've been wasting my money on other stuff for so long."
cleaned_example_review = clean_text(example_review)
vectorized_example_review = vectorizer.transform([cleaned_example_review])
predicted_score = model.predict(vectorized_example_review)

print(f"Original Review: {example_review}")
print(f"Cleaned Review: {cleaned_example_review}")
print(f"Predicted Sentiment Score: {predicted_score[0]}")

FileNotFoundError: [Errno 2] No such file or directory: '/content/TrainingDataNegative.txt'