In [32]:
import pandas as pd

def parse_review_file(file_path):
    """Parse a blank-line-delimited ``key: value`` review dump into a DataFrame.

    A record is a run of consecutive non-blank lines; a blank line ends it.
    Each line is split at its first ``': '`` into a column name and a string
    value, so values may themselves contain ``': '``. If a key repeats inside
    one record, the last value wins.

    A line of the form ``key:`` is what ``key: `` (empty value) looks like
    after surrounding whitespace is stripped. It is stored as an empty string
    instead of being dropped, so an empty field does not silently become a
    missing value. Any other line without a ``': '`` separator is ignored.

    Parameters
    ----------
    file_path : str or path-like
        Path of the UTF-8 text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record and one column per key encountered. Parsed values
        are strings; a key that is absent from a record is NaN in that row.
    """
    reviews = []
    review = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Blank line: close the current record, if one is open.
                if review:
                    reviews.append(review)
                    review = {}
                continue

            key, separator, value = line.partition(': ')
            if separator:
                review[key] = value
            elif len(line) > 1 and line.endswith(':'):
                # 'key: ' lost its trailing space to strip(); keep the field
                # with an empty value rather than discarding it.
                review[line[:-1]] = ''
            # Any other line has no recognizable separator and is skipped.

    if review:  # The file did not end with a blank line.
        reviews.append(review)

    return pd.DataFrame(reviews)

# Load the raw review dump into a DataFrame and print its first five rows.
df = parse_review_file(file_path='Musical_Instruments.txt')
print(df.head(n=5))


  product/productId                                      product/title  \
0        B0009F5ZXS  Dimarzio Virtual Hot PAF Bridge - Black Finish...   
1        B000EENAE0  Danelectro DJ-20C Rocky Road Spin Speaker Mini...   
2        B000EENAE0  Danelectro DJ-20C Rocky Road Spin Speaker Mini...   
3        B000EENAE0  Danelectro DJ-20C Rocky Road Spin Speaker Mini...   
4        B000EENAE0  Danelectro DJ-20C Rocky Road Spin Speaker Mini...   

  product/price   review/userId review/profileName review/helpfulness  \
0         69.00  A3915SRVUGEXY5            plinker                2/2   
1         60.74  A31KXTOQNTWUVM         Bill Board                9/9   
2         60.74  A1EJ0E61P5F3YL           Laus Deo                3/3   
3         60.74  A30ROM67HSGKWL                 Al                2/2   
4         60.74   AGETSICLTBAQO          Dave Puzz                1/1   

  review/score review/time                                     review/summary  \
0          3.0  1336780800         

In [33]:
# parse_review_file stores every value as a string; make the score numeric so
# it can be compared against the thresholds in label_sentiment.
df['review/score'] = df['review/score'].astype('float64')

def label_sentiment(score):
    """Map a numeric review score to a sentiment label.

    Returns 'positive' for scores of 4 or more, 'negative' for scores of 2 or
    less, and 'neutral' for everything else (including values such as NaN for
    which both comparisons are false).
    """
    if score >= 4:
        return 'positive'
    return 'negative' if score <= 2 else 'neutral'

# Label every review from its numeric score.
df['sentiment'] = df['review/score'].apply(label_sentiment)
# Binary sentiment analysis: discard the neutral reviews. .copy() makes the
# filtered result an independent frame, so the column assignments made in later
# cells do not trigger pandas' SettingWithCopyWarning.
df = df[df['sentiment'] != 'neutral'].copy()



In [34]:
# Drop the product and reviewer metadata columns.
df = df.drop(columns=['product/productId', 'product/title', 'product/price', 'review/userId', 'review/profileName', 'review/time'])

# Class balance of the remaining reviews. Kept as the last expression of the
# cell so the notebook displays it; a bare expression in the middle of a cell
# is evaluated and then discarded.
df['sentiment'].value_counts()


In [36]:
# Replace 'positive' with 1 and 'negative' with 0.
# Only map while the column still holds the string labels: mapping an already
# encoded 0/1 column through this dict would turn every label into NaN, so the
# guard keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1})
print(df['sentiment'].value_counts())
df


sentiment
1    67687
0    11213
Name: count, dtype: int64


Unnamed: 0,review/helpfulness,review/score,review/summary,review/text,sentiment
1,9/9,5.0,Now I won't have to invest in a Leslie speaker!,No other way to put it: this thing is WONDERFU...,1
2,3/3,4.0,Fun Toy,This effect has an interesting sound somewhere...,1
3,2/2,5.0,Danelectro DJ-20C Rocky Road Spin Speaker Mini...,This pedal is cheaply made but it sounds terri...,1
4,1/1,5.0,Leslie in a nutshell,Danelectro DJ-20C Rocky Road Spin Speaker Mini...,1
5,1/2,5.0,Great for guitar if you time it right!,Got this thinking I could make my guitars soun...,1
...,...,...,...,...,...
85399,4/4,4.0,Good Entry Mixer,"Pros: This product is good for its price, hold...",1
85400,1/1,5.0,Nice mixer!,"It's fulfiled with features, not a regular mix...",1
85401,1/2,4.0,how does this compare to DJX750?,I used to have this mixer. I was used to using...,1
85403,8/15,5.0,BETTER THAN ADVERTISED!!!,"I PURCHACED THIS A YEAR AGO, AND I STILL ENJOY...",1


In [40]:
# df.to_csv('cleaned_reviews.csv', index=False)
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared by preprocess_text: the English stop-word list as a set, and the
# WordNet lemmatizer instance.
stop_words = {word for word in stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one review string before tokenization.

    The text is lowercased, every character other than ``a``-``z`` and
    whitespace is deleted, tokens found in ``stop_words`` are dropped, and the
    remaining tokens are lemmatized with ``lemmatizer`` and joined by single
    spaces.

    Non-string input (for example ``None``, or the NaN pandas uses for a
    missing field) returns an empty string instead of raising
    ``AttributeError``.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Characters are deleted, not replaced by a space, so "don't" becomes
    # "dont" and "well-made" becomes "wellmade".
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Clean every review body; the values are assigned back row by row in order.
df['cleaned_text'] = [preprocess_text(body) for body in df['review/text']]

In [42]:
# The raw text column is removed now that 'cleaned_text' exists.
df = df.drop(columns='review/text')

In [43]:
# Write the cleaned frame to disk without the index column.
df.to_csv(path_or_buf='cleaned_reviews.csv', index=False)

In [44]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, 'cleaned_text'],
    df.loc[:, 'sentiment'],
    random_state=42,
    test_size=0.2,
)

In [None]:
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parameters
vocab_size = 5000  # you can adjust based on your data
max_length = 100   # adjust based on typical sentence length

# Tokenization: the vocabulary is fitted on the training texts only
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert texts to sequences (train first, then test)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Padding sequences: pad and truncate at the end, to max_length
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    for sequences in (X_train_seq, X_test_seq)
)

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define the model: embedding -> two stacked LSTMs with dropout -> sigmoid unit
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length))
model.add(LSTM(64, return_sequences=True))  # full sequence out, feeds the next LSTM
model.add(Dropout(0.2))
model.add(LSTM(32))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Summarize the model architecture
model.summary()


In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train for 10 epochs in batches of 32, with validation_split=0.2 setting aside
# 20% of the training data for validation.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    verbose=1,
)


In [None]:
# Score the trained model on the held-out test set.
loss, accuracy = model.evaluate(x=X_test_pad, y=y_test, verbose=1)
print(f'Test Accuracy: {accuracy:.4f}')


In [None]:
# Example review
new_review = ["The product quality is excellent and I'm very satisfied with my purchase."]

# Preprocess the review with the same cleaning that produced the training text
# (preprocess_text: lowercase, letters only, stop words removed, lemmatized).
# The tokenizer was fitted on cleaned text, so raw text would not match its
# vocabulary.
new_review_clean = [preprocess_text(text) for text in new_review]
new_review_seq = tokenizer.texts_to_sequences(new_review_clean)
new_review_pad = pad_sequences(new_review_seq, maxlen=max_length, padding='post', truncating='post')

# Predict sentiment: the sigmoid output is thresholded at 0.5
prediction = model.predict(new_review_pad)
sentiment = 'positive' if prediction[0][0] >= 0.5 else 'negative'
print(f'Sentiment: {sentiment}')


In [None]:
from tensorflow.keras.models import load_model

# Save the model to a single .h5 file
model.save('sentiment_analysis_model.h5')

# Load the model back from that file into a separate object
loaded_model = load_model('sentiment_analysis_model.h5')
