In [96]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

df = pd.read_csv('Musical_instruments_reviews.csv')
df['sentiment'] = df['overall'].map({5: 1, 4: -1, 3: 0, 2: -1, 1: -1})

positive_df = df[df['sentiment'] == 1]
neutral_df = df[df['sentiment'] == 0]
negative_df = df[df['sentiment'] == -1]

sample_size = min(len(positive_df), len(neutral_df), len(negative_df))

positive_sample = positive_df.sample(sample_size, random_state=42)
print(positive_sample)
neutral_sample = neutral_df.sample(sample_size, random_state=42)
negative_sample = negative_df.sample(sample_size, random_state=42)
print(negative_sample)

balanced_df = pd.concat([positive_sample, neutral_sample, negative_sample])

train_df, test_df = train_test_split(balanced_df, test_size=0.2, random_state=42)

def calculate_tfidf(corpus):
    tf = {}
    for doc in corpus:
        for word in doc.split():
            tf[word] = tf.get(word, 0) + 1

    idf = {}
    for doc in corpus:
        for word in set(doc.split()):
            idf[word] = idf.get(word, 0) + 1

    tfidf = {}
    for word, tf_value in tf.items():
        idf_value = np.log(len(corpus) / (idf[word] + 1))
        tfidf[word] = tf_value * idf_value
    return tfidf

tfidf_dict = calculate_tfidf(train_df['summary'])
train_tfidf = train_df['summary'].apply(lambda x: sum(tfidf_dict.get(word, 0) for word in x.split()))

X_train = np.array(train_tfidf.tolist()).reshape(-1, 1)
y_train = train_df['sentiment'].values

model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

def predict_sentiment(review, threshold=0.32):
    review_tfidf = sum(tfidf_dict.get(word, 0) for word in review.split())
    X_new = np.array([review_tfidf]).reshape(-1, 1)

    probability_positive = model.predict_proba(X_new)[0, 1]
    #print(probability_positive)
    
    if probability_positive > threshold:
        return 'Positive'
    elif probability_positive < threshold-0.007:
        return 'Neutral'
    else:
        return 'Negative'

new_review = "Useless product"
predicted_sentiment = predict_sentiment(new_review)
print(f"Predicted Sentiment: {predicted_sentiment}")


          reviewerID        asin          reviewerName helpful  \
3272  A2KXINV90T91L8  B0002M6CVC              David T.  [0, 0]   
7843  A1JCRZEQACX4V4  B003BFYDBS        Colin Hendrick  [0, 0]   
4047  A2D0F87RY5YMHN  B0009G1KEU         Levis Ouellet  [0, 0]   
3871  A1RGF9CS1V8O14  B000960KNW             Josiah H.  [0, 0]   
9410  A1DVUFG2QSJ6IK  B005T800V2       grandpa "Randy"  [0, 0]   
...              ...         ...                   ...     ...   
8867  A109ME7C09HM2M  B004XNK7AI    Douglas W. Fielder  [1, 1]   
5770  A35XRT4BW4I6UD  B000VJJQUU     Richard R. Casper  [1, 1]   
7052  A3N93PUHEMQU34  B002024UDE  L. Su "Flying train"  [1, 2]   
2273   A1JNNQAUI5FZV  B0002GLCRC                 Cliff  [0, 0]   
1953   ADH0O8UVJOT10  B0002F4VBM              StormJH1  [1, 1]   

                                             reviewText  overall  \
3272  This are some great strings. I have bought the...      5.0   
7843  Never used this seller or site for this purpos...      5.0   
404