In [11]:
import pandas as pd # dataframes (tables and data management manipulation in python)
import numpy as np # math library
import tensorflow as tf # neural network
from sklearn.model_selection import train_test_split # split training and testing method
from sklearn.preprocessing import StandardScaler   # to split data into x and y x is numerical and y is target value???? # Scale values for network compatibility
import os # for system calls
import seaborn as sns # visualisation
from matplotlib import pyplot as plt # plotting
import io, re, shutil, string
import pickle

In [12]:
sentiment_df = pd.read_csv('databases/sentiment_analysis_gasp.csv', encoding='ISO-8859-1')

In [13]:
sentiment_df

Unnamed: 0,sentiment,sentence
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [14]:
def sentiment_to_num(sentiment: str):
    return {'negative': 0.0, 'neutral': 0.5, 'positive': 1.0}[sentiment]

sentiment_df['sentiment'] = sentiment_df['sentiment'].apply(sentiment_to_num)
sentiment_df

Unnamed: 0,sentiment,sentence
0,0.5,"According to Gran , the company has no plans t..."
1,0.5,Technopolis plans to develop in stages an area...
2,0.0,The international electronic industry company ...
3,1.0,With the new production plant the company woul...
4,1.0,According to the company 's updated strategy f...
...,...,...
4841,0.0,LONDON MarketWatch -- Share prices ended lower...
4842,0.5,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,0.0,Operating profit fell to EUR 35.4 mn from EUR ...
4844,0.0,Net sales of the Paper segment decreased to EU...


In [15]:
sentiment_model = tf.keras.models.Sequential()
sentiment_model

<Sequential name=sequential_1, built=False>

In [16]:
sentences = sentiment_df['sentence'].values
labels = sentiment_df['sentiment'].values

In [17]:
sentences

array(['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
       'Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .',
       'The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .',
       ...,
       'Operating profit fell to EUR 35.4 mn from EUR 68.8 mn in 2007 , including vessel sales gain of EUR 12.3 mn .',
       'Net sales of the Paper segment decreased to EUR 221.6 mn in the second quarter of 2009 from EUR 241.1 mn in the second quarter of 2008 , while operating profit excluding non-recurring items rose to EUR 8.0 mn from EUR 7.6 mn .',
       'Sales in Finland decreased by 10.5 % in Januar

In [18]:
labels

array([0.5, 0.5, 0. , ..., 0. , 0. , 0. ], shape=(4846,))

In [19]:
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2, random_state=42)

In [20]:
vocab_size = 20000
embedding_dim = 64
max_length = 40
oov_tok = "<OOV>"

In [21]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
tokenizer

with open('models/sentiment_analysis_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

train_padded = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

In [23]:
sentiment_model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_shape=(max_length,)))
sentiment_model.add(tf.keras.layers.GlobalAveragePooling1D())
sentiment_model.add(tf.keras.layers.Dense(256, activation='relu'))
sentiment_model.add(tf.keras.layers.Dense(128, activation='relu'))
sentiment_model.add(tf.keras.layers.Dropout(0.8))
sentiment_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

  super().__init__(**kwargs)
I0000 00:00:1758594921.376263    3849 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3063 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [24]:
sentiment_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# sentiment_model.build(input_shape=(None, ))

In [25]:
sentiment_model.summary()
os.listdir()

['test.ipynb',
 'test.py',
 'databases',
 'test_arnav_sentiment_analysis.ipynb',
 'gauri.py',
 'theonlyprj.py',
 'sandy.py',
 'models',
 'theonlyprj.ipynb',
 'test_arnav.py',
 'test_arnav.ipynb']

In [26]:
sentiment_model.fit(train_padded, y_train, epochs=10, validation_data=(test_padded, y_test))
sentiment_model.save('models/sentiment_analysis.keras')

Epoch 1/10


I0000 00:00:1758594923.009627    4244 service.cc:152] XLA service 0x7fec4c003320 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1758594923.009641    4244 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2025-09-22 19:35:23.045634: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1758594923.225238    4244 cuda_dnn.cc:529] Loaded cuDNN version 91100





[1m 98/122[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.2727 - loss: 0.6862

I0000 00:00:1758594926.783726    4244 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.



[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.2728 - loss: 0.6859




[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 36ms/step - accuracy: 0.2768 - loss: 0.6836 - val_accuracy: 0.2979 - val_loss: 0.6727
Epoch 2/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2869 - loss: 0.6615 - val_accuracy: 0.3402 - val_loss: 0.6246
Epoch 3/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3504 - loss: 0.5957 - val_accuracy: 0.3515 - val_loss: 0.6039
Epoch 4/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3772 - loss: 0.5460 - val_accuracy: 0.3485 - val_loss: 0.6234
Epoch 5/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3922 - loss: 0.5037 - val_accuracy: 0.3649 - val_loss: 0.6264
Epoch 6/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3875 - loss: 0.4926 - val_accuracy: 0.3557 - val_loss: 0.6273
Epoch 7/10
[1m122/122[0m [32m━━━━━━

In [27]:
# Function to predict sentiment for new sentences
def predict_sentiment(text):
    # Preprocess the text
    sequence = tokenizer.texts_to_sequences([text])
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
    
    # Predict the sentiment
    prediction = sentiment_model.predict(padded)[0][0]
    
    # Return the result
    if prediction >= 0.67:
        return f'Positive (Probability: {prediction:.4f})'
    elif prediction >= 0.33:
        return f'Neutral (Probability: {prediction:.4f})'
    else:
        return f'Negative (Probability: {prediction:.4f})'

# Example predictions
new_sentences = ["In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .", "Company profits go down in 2024", "great salary bonus", 'payroll direct deposit, great salary bonus']
for sentence in new_sentences:
    print(f"'{sentence}' -> {predict_sentiment(sentence)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step
'In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .' -> Positive (Probability: 1.0000)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
'Company profits go down in 2024' -> Negative (Probability: 0.0154)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
'great salary bonus' -> Positive (Probability: 0.6754)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
'payroll direct deposit, great salary bonus' -> Neutral (Probability: 0.5311)
