In [1]:
import json
import re
import pandas as pd
from reddit import RedditClass  # Importing the reddit file
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import torch
import pickle

In [2]:
# Load the labelled sentences; lines=True reads the file as one JSON object per line.
model_data = pd.read_json('sentiment_data.json', lines=True)

In [3]:
stemmer = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# per-token filter produced a fresh list for every token and scanned it linearly.
STOP_WORDS = frozenset(stopwords.words('english'))


def stemming(sent):
    """Normalise one sentence into a string of space-separated stems.

    Steps: lower-case the text, replace every character other than a-z / 0-9
    with a space, split on whitespace, drop English stop words, and reduce each
    remaining token with the Porter stemmer.

    Args:
        sent: The raw sentence (a ``str``).

    Returns:
        The surviving stems joined by single spaces; an empty string when no
        token survives.
    """
    tokens = re.sub('[^a-z0-9]', ' ', sent.lower()).split()
    return ' '.join(stemmer.stem(tok) for tok in tokens if tok not in STOP_WORDS)


model_data['stemmed_data'] = model_data['Sentences'].apply(stemming)

In [4]:
# Features are the stemmed sentences, targets are the sentiment labels.
x = model_data['stemmed_data'].values
y = model_data['Sentiment'].values

# Hold out 20% for testing; stratify on the labels so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
# Peek at the held-out split: here X_test is still the array of stemmed sentences.
X_test

array(['countri buy speaker', 'use work recent start issu',
       'metallica repress led zepplin repress press palla germani freak good',
       ..., 'return one skip',
       'turntabl built pre amp connect directli power speaker via rca aux cabl',
       'recent audit dali opticon 6 sonu faber lumina v new floor replac one'],
      dtype=object)

In [11]:
# NOTE(review): the recorded output for this cell, (1157, 5495), is a 2-D shape, so it was
# captured after the TF-IDF cell had replaced X_test. Run top to bottom, X_test is still the
# 1-D array of strings at this point; move this cell below the vectorisation cell.
print(X_test.shape)

(1157, 5495)


In [7]:
# Convert the stemmed sentences into TF-IDF feature matrices.
# The vectorizer is fitted on the training split only and then applied to the test split.
# NOTE(review): X_train / X_test are overwritten with the matrices, so this cell cannot be
# re-run without first re-running the train/test split cell.

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [9]:
# Print the vectorised test split; each output line is a (row, column) index pair and its weight.
print(X_test)

  (0, 4627)	0.3191841043847163
  (0, 1610)	0.8132933961418256
  (0, 1284)	0.4864929180370689
  (1, 5392)	0.421646115736518
  (1, 5166)	0.33159045609313414
  (1, 4692)	0.42742835943044455
  (1, 4061)	0.5267560938779224
  (1, 2809)	0.502090965492635
  (2, 3861)	0.33581217576263456
  (2, 3248)	0.5380839089854487
  (2, 2977)	0.4542667487394492
  (2, 2432)	0.2701299023685451
  (2, 2319)	0.564240592599075
  (3, 2144)	0.7071067811865476
  (3, 1570)	0.7071067811865476
  (4, 5040)	0.39196349774781053
  (4, 5009)	0.3737931191306791
  (4, 4409)	0.21489377475691307
  (4, 2934)	0.39196349774781053
  (4, 2177)	0.34273065171656186
  (4, 1073)	0.3737931191306791
  (4, 735)	0.22087780621298883
  (4, 89)	0.28877621454833124
  (4, 64)	0.34273065171656186
  (5, 5405)	0.28242397972951716
  :	:
  (1153, 860)	0.3601112385509273
  (1154, 4510)	0.7083641606020835
  (1154, 4161)	0.5873996249727315
  (1154, 3550)	0.39138458906349416
  (1155, 5219)	0.33640747961629097
  (1155, 5063)	0.20426681414750472
  (1155, 4

In [ ]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features.
# (The variable is called `log_model`, but the estimator is MultinomialNB.)
log_model = MultinomialNB().fit(X_train, y_train)

# Accuracy on the data the model was fitted on
X_train_pred = log_model.predict(X_train)
accuracy = accuracy_score(y_train, X_train_pred)
print(f'The accuracy score of the training data is: {accuracy}')

# Accuracy on the held-out test split
X_test_pred = log_model.predict(X_test)
accuracy_scr = accuracy_score(y_test, X_test_pred)
print(f'The accuracy of the test model is: {accuracy_scr}')

check3 = 0
while check3 == 0:
    match input('Do you want to save the model? ').lower():
        case 'y':
            pickle.dump(model, open('SentimentModel.pkl', 'wb'))
            check3 = 1
        case "n":
            check3 = 1
            continue
        case _:
            print('Invalid input, Try again.')

inp = input('Enter your sentence: ')
# Give the sentence the same cleaning and stemming the training sentences received.
res = stemming(inp)

# Reuse the vectorizer fitted on the training split. transform() keeps the learned
# vocabulary and IDF weights, so the columns line up with what log_model was trained on;
# fit_transform() here would refit on this single sentence and discard that vocabulary.
document = vectorizer.transform([res])
print(document)

# Predicted sentiment label for the entered sentence
prediction = log_model.predict(document)
print(prediction)

# Reference sample from the test split: its feature row and its true label
x_new = X_test[1]
print(x_new)
print(y_test[1])
